In [35]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    classification_report,
    accuracy_score,
    roc_auc_score,
    balanced_accuracy_score,
    roc_curve,
)
import numpy as np

In [29]:
df = pd.read_csv("PastLoans.csv")

In [30]:
def replace_digital3(row):
    if row["digital3"] == 0:
        if row["digital1"] != 0 and row["digital2"] != 0:
            return (row["digital1"] + row["digital2"]) / 2
        elif row["digital1"] != 0:
            return row["digital1"]
        elif row["digital2"] != 0:
            return row["digital2"]
    return row["digital3"]


df["digital3"] = df.apply(replace_digital3, axis=1)


In [31]:
# Encode categorical variables (sex and employment)
df["sex"] = LabelEncoder().fit_transform(df["sex"])
df["employment"] = LabelEncoder().fit_transform(df["employment"])


In [32]:
df

Unnamed: 0,sex,employment,married,income,digital1,digital2,digital3,default
0,1,3,0,0,0.749267,0.422656,0.738735,0
1,1,0,1,26108,0.697428,0.596361,0.733947,0
2,1,0,1,11810,0.000000,0.652496,0.652496,0
3,1,3,1,0,0.000000,0.000000,0.000000,0
4,0,1,1,6310,0.000000,0.041123,0.203202,0
...,...,...,...,...,...,...,...,...
99995,0,2,0,4564,0.000000,0.333091,0.876951,0
99996,1,0,0,18882,0.000000,0.367979,0.556855,0
99997,1,0,1,19121,0.858688,0.508737,0.722149,0
99998,1,0,1,53563,0.736672,0.328757,0.532715,0


In [6]:
df_nonzero = df[df["digital3"] != 0]
df_zero = df[df["digital3"] == 0]

In [7]:
df_nonzero

Unnamed: 0,sex,employment,married,income,digital1,digital2,digital3,default
0,1,3,0,0,0.749267,0.422656,0.738735,0
1,1,0,1,26108,0.697428,0.596361,0.733947,0
2,1,0,1,11810,0.000000,0.652496,0.652496,0
4,0,1,1,6310,0.000000,0.041123,0.203202,0
5,0,0,1,11511,0.683078,0.694442,0.862871,0
...,...,...,...,...,...,...,...,...
99995,0,2,0,4564,0.000000,0.333091,0.876951,0
99996,1,0,0,18882,0.000000,0.367979,0.556855,0
99997,1,0,1,19121,0.858688,0.508737,0.722149,0
99998,1,0,1,53563,0.736672,0.328757,0.532715,0


In [8]:
def evaluate_model(y_true, y_pred, model_name):
    acc = accuracy_score(y_true, y_pred)
    balanced_acc = balanced_accuracy_score(y_true, y_pred)
    roc_auc = roc_auc_score(y_true, y_pred)
    report = classification_report(y_true, y_pred)

    print(f"Model: {model_name}")
    print(f"Accuracy: {acc}")
    print(f"Balanced Accuracy: {balanced_acc}")
    print(f"ROC AUC: {roc_auc}")
    print(f"Classification Report:\n{report}\n")


In [17]:
from autoviz import AutoViz_Class

In [None]:
av = AutoViz_Class()
dft = av.AutoViz("PastLoans.csv", chart_format="bokeh")

# Modeling the case with digital footprint

In [75]:
df_2 = df.copy()

In [76]:
df_2["income"] = df_2["income"].replace(0.0, 1)

df_2["income"] = np.log(df_2["income"])

In [77]:
df_2

Unnamed: 0,sex,employment,married,income,digital1,digital2,digital3,default
0,1,3,0,0.000000,0.749267,0.422656,0.738735,0
1,1,0,1,10.169997,0.697428,0.596361,0.733947,0
2,1,0,1,9.376702,0.000000,0.652496,0.652496,0
3,1,3,1,0.000000,0.000000,0.000000,0.000000,0
4,0,1,1,8.749891,0.000000,0.041123,0.203202,0
...,...,...,...,...,...,...,...,...
99995,0,2,0,8.425955,0.000000,0.333091,0.876951,0
99996,1,0,0,9.845964,0.000000,0.367979,0.556855,0
99997,1,0,1,9.858542,0.858688,0.508737,0.722149,0
99998,1,0,1,10.888614,0.736672,0.328757,0.532715,0


In [78]:
X_train, X_test, y_train, y_test = train_test_split(
    df_2.drop(columns=["default"]),
    df_2["default"],
    test_size=0.2,
    stratify=df_2["default"],
)

In [79]:
from imblearn.combine import SMOTETomek

smote_tomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smote_tomek.fit_resample(X_train, y_train)

In [80]:
# Verify class distribution after resampling
from collections import Counter

print(f"Class distribution after resampling: {Counter(y_resampled)}")


Class distribution after resampling: Counter({0: 69390, 1: 69390})


In [83]:
import xgboost as xgb

dtrain = xgb.DMatrix(X_resampled, label=y_resampled)
dtest = xgb.DMatrix(X_test, label=y_test)

param = {
    "objective": "binary:logistic",
    "max_depth": 6,
    "learning_rate": 0.01,
    "n_estimators": 1000,
    "scale_pos_weight": y_train.value_counts()[0] / y_train.value_counts()[1],
    "random_state": 42,
    "eval_metric": "auc",
}

evals = [(dtrain, "train"), (dtest, "test")]

In [84]:
xgb_model = xgb.train(
    param,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=100,
)

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.74412	test-auc:0.72181
[86]	train-auc:0.78503	test-auc:0.74139


In [85]:
y_pred = xgb_model.predict(dtest)

In [86]:
# Calculate the false positive rate, true positive rate, and thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred)

# Calculate the optimal threshold
optimal_idx = np.argmax(tpr - fpr)
optimal_threshold = thresholds[optimal_idx]

print(f"Optimal Threshold: {optimal_threshold}")

Optimal Threshold: 0.7345231175422668


In [87]:
y_pred_binary = y_pred > optimal_threshold

evaluate_model(y_test, y_pred_binary, "XGBoost")

Model: XGBoost
Accuracy: 0.6294
Balanced Accuracy: 0.6759409275584032
ROC AUC: 0.6759409275584032
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.62      0.75     18011
           1       0.17      0.73      0.28      1989

    accuracy                           0.63     20000
   macro avg       0.56      0.68      0.52     20000
weighted avg       0.88      0.63      0.70     20000




In [88]:
import numpy as np


def focal_loss(preds, dtrain, gamma=2.0, alpha=0.25):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))  # Convert to probabilities

    # Gradient and hessian calculations
    p_t = preds * labels + (1 - preds) * (1 - labels)  # Prob of true class
    grad = alpha * (preds - labels) * ((1 - p_t) ** gamma)
    hess = alpha * (1 - p_t) ** (gamma - 1) * p_t * (1 - p_t)

    return grad, hess


# Define a custom evaluation metric based on focal loss
def focal_loss_eval(preds, dtrain, gamma=2.0, alpha=0.25):
    labels = dtrain.get_label()
    preds = 1.0 / (1.0 + np.exp(-preds))

    # Calculate the focal loss
    p_t = preds * labels + (1 - preds) * (1 - labels)
    loss = -alpha * ((1 - p_t) ** gamma) * np.log(p_t)

    return "focal_loss", np.mean(loss)

In [89]:
xgb_model = xgb.train(
    param,
    dtrain,
    num_boost_round=1000,
    obj=lambda preds, dtrain: focal_loss(preds, dtrain, gamma=3.0, alpha=0.9),
    feval=lambda preds, dtest: focal_loss_eval(preds, dtest, alpha=0.9, gamma=3.0),
    evals=evals,
    early_stopping_rounds=10,
    verbose_eval=100,
)

Parameters: { "n_estimators" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.76697	train-focal_loss:0.07760	test-auc:0.72853	test-focal_loss:0.07763




[100]	train-auc:0.80966	train-focal_loss:0.06343	test-auc:0.74912	test-focal_loss:0.06551
[200]	train-auc:0.81775	train-focal_loss:0.06078	test-auc:0.74840	test-focal_loss:0.06355
[300]	train-auc:0.82127	train-focal_loss:0.05985	test-auc:0.74747	test-focal_loss:0.06294
[400]	train-auc:0.82397	train-focal_loss:0.05930	test-auc:0.74684	test-focal_loss:0.06266
[500]	train-auc:0.82584	train-focal_loss:0.05894	test-auc:0.74606	test-focal_loss:0.06252
[600]	train-auc:0.82735	train-focal_loss:0.05867	test-auc:0.74532	test-focal_loss:0.06245
[700]	train-auc:0.82959	train-focal_loss:0.05833	test-auc:0.74499	test-focal_loss:0.06234
[800]	train-auc:0.83151	train-focal_loss:0.05805	test-auc:0.74452	test-focal_loss:0.06224
[900]	train-auc:0.83335	train-focal_loss:0.05781	test-auc:0.74419	test-focal_loss:0.06218
[999]	train-auc:0.83573	train-focal_loss:0.05746	test-auc:0.74397	test-focal_loss:0.06207


In [90]:
from sklearn.metrics import precision_recall_curve

# Get predicted probabilities
y_pred_prob = xgb_model.predict(dtest)

# Calculate precision and recall at different thresholds
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

# Find the best threshold based on your desired balance of precision and recall
f1_scores = 2 * (precision * recall) / (precision + recall)
best_threshold = thresholds[np.argmax(f1_scores)]
print(f"Best threshold for highest F1-score: {best_threshold}")


Best threshold for highest F1-score: 0.5315434336662292


In [91]:
y_pred = xgb_model.predict(dtest)

y_pred_binary = y_pred > best_threshold

evaluate_model(y_test, y_pred_binary, "XGBoost with Focal Loss")


Model: XGBoost with Focal Loss
Accuracy: 0.79435
Balanced Accuracy: 0.652358626490448
ROC AUC: 0.652358626490448
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.83      0.88     18011
           1       0.24      0.48      0.31      1989

    accuracy                           0.79     20000
   macro avg       0.59      0.65      0.60     20000
weighted avg       0.87      0.79      0.82     20000


