In [1]:
import pandas as pd
import joblib
import os

In [2]:
# Restore the persisted artifacts: the model stored as xgboost_model.pkl and
# the scaler stored as scaler.pkl, both located under model_path.
# NOTE(review): joblib files are pickles; only load artifacts from a trusted source.
model_path = "../models/"
xgb_model, scaler = (
    joblib.load(os.path.join(model_path, artifact_name))
    for artifact_name in ("xgboost_model.pkl", "scaler.pkl")
)

In [3]:
# Load the test dataset (it must be the same split that was held out when the
# model was trained).
# Forward slashes keep the path portable across Windows, Linux and macOS and
# match the other paths in this notebook; a backslash-separated literal only
# resolves on Windows and contains invalid string escape sequences.
X_test = pd.read_csv("../data/preprocessed dataset/X_test.csv")

In [4]:
# Re-attach the ID column so each prediction can be matched to an actual user.
# NOTE(review): X_test is read from CSV without an index column, so X_test.index
# is presumably a fresh 0..n-1 range. In that case this picks the first n IDs of
# the original file, which only lines up if the test split was not shuffled —
# confirm against the preprocessing step.
id_data = pd.read_csv("../data/original_dataset.csv")[["ID"]]  # keep only the ID column
row_positions = X_test.index
X_test["ID"] = id_data["ID"].iloc[row_positions]  # the assigned Series is aligned on index labels

In [5]:
# Feature columns used during training, assembled from smaller groups.
# The resulting order is what gets handed to the scaler and the model.
base_cols = ["LIMIT_BAL", "SEX", "EDUCATION", "MARRIAGE", "AGE"]
pay_cols = ["PAY_0", "PAY_2", "PAY_3", "PAY_4", "PAY_5", "PAY_6"]
pay_amt_cols = ["PAY_AMT1", "PAY_AMT2", "PAY_AMT3", "PAY_AMT4", "PAY_AMT5", "PAY_AMT6"]
feature_cols = [*base_cols, *pay_cols, *pay_amt_cols, "BILL_MEAN"]

# Drop everything else (e.g. the ID column) before scaling.
X_test_features = X_test.loc[:, feature_cols]

In [6]:
# Scale using the saved scaler: transform only (no refit here), so the features
# are scaled with whatever parameters were stored in scaler.pkl.
# NOTE(review): if X_test.csv was written after scaling during preprocessing,
# this would scale the features a second time — confirm against that step.
X_test_scaled = scaler.transform(X_test_features)

In [7]:
# Get the per-class probabilities, then keep column 1 as the risk score.
# NOTE(review): column 1 is presumably the positive (default) class — confirm
# the model's class ordering.
class_probs = xgb_model.predict_proba(X_test_scaled)
risk_probs = class_probs[:, 1]

In [8]:
# Cut-off for risk categorization: scores at or above it count as high risk.
threshold = 0.35


def _risk_label(score):
    """Return the risk bucket for a single score (the threshold is inclusive)."""
    if score >= threshold:
        return "High Risk"
    return "Low Risk"


# Store the raw score next to its derived category.
X_test["Risk_Score"] = risk_probs
X_test["Risk_Category"] = X_test["Risk_Score"].map(_risk_label)

In [9]:
# Write the final predictions: the ID with its score and category, without
# the DataFrame index.
output_columns = ["ID", "Risk_Score", "Risk_Category"]
final_predictions = X_test[output_columns]
final_predictions.to_csv("../data/final_predictions.csv", index=False)

print("Final predictions saved to `final_predictions.csv`")

Final predictions saved to `final_predictions.csv`
