In [1]:
import pandas as pd

data = pd.read_csv("Delinquency_prediction_dataset.csv")

In [2]:
data.head()

Unnamed: 0,Customer_ID,Age,Income,Credit_Score,Credit_Utilization,Missed_Payments,Delinquent_Account,Loan_Balance,Debt_to_Income_Ratio,Employment_Status,Account_Tenure,Credit_Card_Type,Location,Month_1,Month_2,Month_3,Month_4,Month_5,Month_6
0,CUST0001,56,165580.0,398.0,0.390502,3,0,16310.0,0.317396,EMP,18,Student,Los Angeles,Late,Late,Missed,Late,Missed,Late
1,CUST0002,69,100999.0,493.0,0.312444,6,1,17401.0,0.196093,Self-employed,0,Standard,Phoenix,Missed,Missed,Late,Missed,On-time,On-time
2,CUST0003,46,188416.0,500.0,0.35993,0,0,13761.0,0.301655,Self-employed,1,Platinum,Chicago,Missed,Late,Late,On-time,Missed,Late
3,CUST0004,32,101672.0,413.0,0.3714,3,0,88778.0,0.264794,Unemployed,15,Platinum,Phoenix,Late,Missed,Late,Missed,Late,Late
4,CUST0005,60,38524.0,487.0,0.234716,2,0,13316.0,0.510583,Self-employed,11,Standard,Phoenix,Missed,On-time,Missed,Late,Late,Late


In [3]:
data.shape

(500, 19)

In [4]:
data.describe(include='all')

Unnamed: 0,Customer_ID,Age,Income,Credit_Score,Credit_Utilization,Missed_Payments,Delinquent_Account,Loan_Balance,Debt_to_Income_Ratio,Employment_Status,Account_Tenure,Credit_Card_Type,Location,Month_1,Month_2,Month_3,Month_4,Month_5,Month_6
count,500,500.0,461.0,498.0,500.0,500.0,500.0,471.0,500.0,500,500.0,500,500,500,500,500,500,500,500
unique,500,,,,,,,,,6,,5,5,3,3,3,3,3,3
top,CUST0500,,,,,,,,,Unemployed,,Gold,Los Angeles,On-time,Late,Late,Late,Missed,Late
freq,1,,,,,,,,,93,,118,107,177,173,169,181,187,172
mean,,46.266,108379.893709,577.716867,0.491446,2.968,0.16,48654.428875,0.298862,,9.74,,,,,,,,
std,,16.187629,53662.723741,168.881211,0.197103,1.946935,0.366973,29395.537273,0.094521,,5.923054,,,,,,,,
min,,18.0,15404.0,301.0,0.05,0.0,0.0,612.0,0.1,,0.0,,,,,,,,
25%,,33.0,62295.0,418.25,0.356486,1.0,0.0,23716.5,0.233639,,5.0,,,,,,,,
50%,,46.5,107658.0,586.0,0.485636,3.0,0.0,45776.0,0.301634,,10.0,,,,,,,,
75%,,59.25,155734.0,727.25,0.63444,5.0,0.0,75546.5,0.362737,,15.0,,,,,,,,


In [5]:
data.isnull().sum()

Unnamed: 0,0
Customer_ID,0
Age,0
Income,39
Credit_Score,2
Credit_Utilization,0
Missed_Payments,0
Delinquent_Account,0
Loan_Balance,29
Debt_to_Income_Ratio,0
Employment_Status,0


### Feature engineering

In [6]:
data['Customer_ID'] = data['Customer_ID'].str.extract(r'(\d+)$').astype(int)

In [7]:
data['Employment_Status'] = data['Employment_Status'].str.lower()
mapping = {
    'emp': 1, 'employed': 1,
    'unemployed': 2,
    'self-employed': 3,
    'retired': 4
}
data['Employment_Status'] = data['Employment_Status'].map(mapping)

In [8]:
mapping2 = {'On-time' : 0, 'Late': 1, 'Missed' : 2}
data['Month_1'] = data['Month_1'].map(mapping2)
data['Month_2'] = data['Month_2'].map(mapping2)
data['Month_3'] = data['Month_3'].map(mapping2)
data['Month_4'] = data['Month_4'].map(mapping2)
data['Month_5'] = data['Month_5'].map(mapping2)
data['Month_6'] = data['Month_6'].map(mapping2)

In [9]:
# 1. Correlation between credit utilization and delinquency
if 'Credit_Utilization' in data.columns:
    corr_utilization = data[['Credit_Utilization', 'Delinquent_Account']].corr().iloc[0, 1]
    print(f"Correlation between credit_utilization and delinquency: {corr_utilization:.3f}")
    high_util_count = (data['Credit_Utilization'] > 80).sum()
    print(f"Records with utilization >80%: {high_util_count}")

# 2. Missed payment history association
month_cols = [f"Month_{i}" for i in range(1, 7)]
# Check correlations
for col in month_cols:
    if col in data.columns:
        corr_val = data[[col, 'Delinquent_Account']].corr().iloc[0, 1]
        print(f"Correlation between {col} and delinquency: {corr_val:.3f}")

# 3. Credit score correlation
if 'Credit_Score' in data.columns:
    corr_credit_score = data[['Credit_Score', 'Delinquent_Account']].corr().iloc[0, 1]
    print(f"Correlation between credit_score and delinquency: {corr_credit_score:.3f}")
    low_score_count = (data['Credit_Score'] < 600).sum()
    print(f"Records with credit_score <600: {low_score_count}")

# 4. Debt-to-income ratio correlation
if 'Debt_to_Income_Ratio' in data.columns:
    corr_dti = data[['Debt_to_Income_Ratio', 'Delinquent_Account']].corr().iloc[0, 1]
    print(f"Correlation between debt_to_income_ratio and delinquency: {corr_dti:.3f}")

# --- Unexpected anomalies ---

# 5. Credit utilization > 100%
if 'Credit_Utilization' in data.columns:
    over_100_count = (data['Credit_Utilization'] > 100).sum()
    print(f"Records with credit_utilization >100%: {over_100_count}")

# 6. Negative income
if 'Income' in data.columns:
    negative_income_count = (data['Income'] < 0).sum()
    print(f"Records with negative income: {negative_income_count}")

# 7. Credit score < 300
if 'Credit_Score' in data.columns:
    below_300_count = (data['Credit_Score'] < 300).sum()
    print(f"Records with credit_score <300: {below_300_count}")


Correlation between credit_utilization and delinquency: 0.034
Records with utilization >80%: 0
Correlation between Month_1 and delinquency: -0.039
Correlation between Month_2 and delinquency: -0.021
Correlation between Month_3 and delinquency: 0.014
Correlation between Month_4 and delinquency: 0.060
Correlation between Month_5 and delinquency: -0.033
Correlation between Month_6 and delinquency: -0.015
Correlation between credit_score and delinquency: 0.035
Records with credit_score <600: 258
Correlation between debt_to_income_ratio and delinquency: 0.034
Records with credit_utilization >100%: 0
Records with negative income: 0
Records with credit_score <300: 0


In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

target = 'Delinquent_Account'

# Encode the two categorical columns
for col in ['Credit_Card_Type', 'Location']:
    if col in data.columns:
        data[col] = data[col].fillna("Missing")
        data[col] = LabelEncoder().fit_transform(data[col])

# 1. Correlation with target
corr_with_target = data.corr()[target].sort_values(ascending=False)
print("Correlation with Delinquent_Account:\n", corr_with_target)

# 2. Feature importance with RandomForest
X = data.drop(columns=[target])
y = data[target]

rf = RandomForestClassifier(n_estimators=200, random_state=42)
rf.fit(X, y)

importances = pd.Series(rf.feature_importances_, index=X.columns).sort_values(ascending=False)
print("\nFeature importances:\n", importances)


Correlation with Delinquent_Account:
 Delinquent_Account      1.000000
Month_4                 0.060377
Income                  0.045409
Credit_Score            0.034833
Debt_to_Income_Ratio    0.034386
Credit_Utilization      0.034224
Age                     0.022508
Month_3                 0.014217
Location               -0.001995
Loan_Balance           -0.004320
Month_6                -0.015360
Month_2                -0.021050
Missed_Payments        -0.026478
Credit_Card_Type       -0.030250
Customer_ID            -0.032240
Month_5                -0.032708
Employment_Status      -0.038903
Month_1                -0.039127
Account_Tenure         -0.039829
Name: Delinquent_Account, dtype: float64

Feature importances:
 Loan_Balance            0.104849
Customer_ID             0.102004
Income                  0.099618
Credit_Score            0.096646
Debt_to_Income_Ratio    0.089397
Credit_Utilization      0.081497
Account_Tenure          0.075604
Age                     0.072937
Missed_

### Handling missing data

We observed earlier that the 'Income', 'Credit_Score', and 'Loan_Balance' columns have missing values. We will explore two common approaches to address this: simple imputation (median/mean) and more advanced imputation using `IterativeImputer`.

In [11]:
# Approach 1: Simple Imputation (Median/Mean)

# Create a copy of the dataframe to avoid modifying the original data
data_simple_imputed = data.copy()

# Impute 'Income' and 'Loan_Balance' with the median (robust to outliers)
for col in ['Income', 'Loan_Balance']:
    if col in data_simple_imputed.columns:
        median_val = data_simple_imputed[col].median()
        data_simple_imputed[col].fillna(median_val, inplace=True)

# Impute 'Credit_Score' with the mean (assuming a relatively normal distribution)
if 'Credit_Score' in data_simple_imputed.columns:
    mean_val = data_simple_imputed['Credit_Score'].mean()
    data_simple_imputed['Credit_Score'].fillna(mean_val, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_simple_imputed[col].fillna(median_val, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data_simple_imputed['Credit_Score'].fillna(mean_val, inplace=True)


In [12]:
# Approach 2: Imputation using IterativeImputer [best]

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Create a copy of the dataframe for iterative imputation
data_iterative_imputed = data.copy()

# Initialize the IterativeImputer
# We can use a regression model (like BayesianRidge) to predict missing values
imputer = IterativeImputer(random_state=42)

# Apply the imputer to the relevant columns
cols_to_impute = ['Income', 'Credit_Score', 'Loan_Balance']
if all(col in data_iterative_imputed.columns for col in cols_to_impute):
    data_iterative_imputed[cols_to_impute] = imputer.fit_transform(data_iterative_imputed[cols_to_impute])

### Predictive Modeling

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE

# Define features (X) and target (y)
X = data_iterative_imputed.drop(columns=['Delinquent_Account'])
y = data_iterative_imputed['Delinquent_Account']

# Split data into training and testing sets (before SMOTE)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Apply SMOTE only to the training set
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Initialize and train the Decision Tree model
dt_model = dt_model = DecisionTreeClassifier(
    class_weight='balanced',  # prioritize minority class
    max_depth=5,              # prevent overfitting
    min_samples_split=10,     # avoid tiny splits
    min_samples_leaf=5,
    criterion='entropy',      # alternative split metric
    random_state=42
)
dt_model.fit(X_train_resampled, y_train_resampled)

# Make predictions on the (original) test set
y_pred = dt_model.predict(X_test)

# Evaluate the model
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Confusion Matrix:
 [[98 28]
 [22  2]]

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.78      0.80       126
           1       0.07      0.08      0.07        24

    accuracy                           0.67       150
   macro avg       0.44      0.43      0.44       150
weighted avg       0.70      0.67      0.68       150



A Decision Tree model was selected for its transparency and ease of interpretation, which are critical for regulatory compliance in financial risk assessment. To address the imbalanced nature of the dataset, the SMOTE technique was applied to balance the minority (delinquent) and majority (non-delinquent) classes. While this approach improved the balance of training data, the model still failed to achieve satisfactory recall for delinquent accounts, indicating it struggles to correctly identify a sufficient proportion of high-risk customers. This suggests that more advanced ensemble methods (e.g., Random Forest, Gradient Boosting) or logistic regression with class weights may be required to improve predictive performance while maintaining interpretability through explainable AI techniques such as SHAP.

In [14]:
#  Define features (X) and target (y)
X = data_iterative_imputed.drop(columns=['Delinquent_Account'])
y = data_iterative_imputed['Delinquent_Account']

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Split your data first (train/test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Apply SMOTE only on training data
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Initialize and train the Logistic Regression model with best parameters
logreg_model = LogisticRegression(
    C=1,
    class_weight=None,
    penalty='l1',
    solver='liblinear',
    random_state=42,
    max_iter=1000
)

logreg_model.fit(X_train_resampled, y_train_resampled)


# Predict probabilities for class 1
y_probs = logreg_model.predict_proba(X_test)[:, 1]

# Lower decision threshold to 0.3
threshold = 0.3
y_pred_thresh = (y_probs >= threshold).astype(int)

# Evaluate
print(f"Confusion Matrix (Threshold = {threshold}):\n", confusion_matrix(y_test, y_pred_thresh))
print("\nClassification Report:\n", classification_report(y_test, y_pred_thresh))

Confusion Matrix (Threshold = 0.3):
 [[62 64]
 [ 8 16]]

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.49      0.63       126
           1       0.20      0.67      0.31        24

    accuracy                           0.52       150
   macro avg       0.54      0.58      0.47       150
weighted avg       0.78      0.52      0.58       150



Logistic Regression outperformed Decision Tree in identifying delinquent accounts, achieving a recall of 0.42 compared to < 0.20 in the tree model. Iterative imputation yielded slightly higher accuracy (0.55) than simple imputation (0.53) while maintaining the same recall, suggesting it better preserved underlying feature relationships. Given the importance of recall in credit risk, and the marginal accuracy gain, Iterative Imputation combined with Logistic Regression is the preferred approach.

In [21]:
!pip install tensorflow-addons



In [24]:
!pip install tensorflow-addons==0.23.0 # Install a compatible version



In [28]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping

# ------------------------
# 1. Prepare data
# ------------------------
X = data_iterative_imputed.drop(columns=['Delinquent_Account', 'Customer_ID'])
y = data_iterative_imputed['Delinquent_Account']

# Split before SMOTE
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

# Scale data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_res)
X_test_scaled = scaler.transform(X_test)

# ------------------------
# 2. Define tuned neural network
# ------------------------
model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_scaled.shape[1],)),
    Dropout(0.4),
    Dense(32, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

# Compile with class weights to prioritize minority class
class_weights = {0: 1, 1: 8}  # penalize false negatives more
optimizer = tf.keras.optimizers.Adam(learning_rate=0.0005)

model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)

# ------------------------
# 3. Train longer with smaller batches
# ------------------------
early_stop = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

history = model.fit(
    X_train_scaled, y_train_res,
    epochs=200,
    batch_size=8,
    validation_split=0.2,
    verbose=1,
    class_weight=class_weights,
    callbacks=[early_stop]
)

# ------------------------
# 4. Lower threshold for recall boost
# ------------------------
y_pred_prob = model.predict(X_test_scaled)
threshold = 0.2  # lowered further
y_pred = (y_pred_prob >= threshold).astype("int32")

print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.5209 - loss: 2.2713 - precision_4: 0.4217 - recall_4: 0.6854 - val_accuracy: 1.0000 - val_loss: 0.3612 - val_precision_4: 1.0000 - val_recall_4: 1.0000
Epoch 2/200
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4457 - loss: 1.9402 - precision_4: 0.4061 - recall_4: 0.9233 - val_accuracy: 1.0000 - val_loss: 0.2504 - val_precision_4: 1.0000 - val_recall_4: 1.0000
Epoch 3/200
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4407 - loss: 1.8254 - precision_4: 0.4074 - recall_4: 0.9565 - val_accuracy: 1.0000 - val_loss: 0.2017 - val_precision_4: 1.0000 - val_recall_4: 1.0000
Epoch 4/200
[1m59/59[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.4376 - loss: 1.6498 - precision_4: 0.4146 - recall_4: 0.9772 - val_accuracy: 1.0000 - val_loss: 0.1639 - val_precision_4: 1.0000 - val_recall_4: 1.0000
Epoch 5/

This SMOTE-balanced neural network predicts delinquent accounts by learning complex patterns from customer features. Class weights heavily prioritize recall to catch more risky customers, while a lowered decision threshold (0.2) further boosts sensitivity. Early stopping prevents overfitting, and scaling ensures stable gradient updates.