In [11]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
import warnings
warnings.filterwarnings('ignore')

In [12]:
# Load the Excel dataset
df = pd.read_csv('data\Delinquency_prediction_dataset-_1_.csv')

# Display first 5 rows to verify data
print('First 5 rows of the dataset:')
print(df.head())

# Check for missing values
print('\nMissing values in each column:')
print(df.isnull().sum())

First 5 rows of the dataset:
  Customer_ID  Age    Income  Credit_Score  Credit_Utilization  \
0    CUST0001   56  165580.0         398.0            0.390502   
1    CUST0002   69  100999.0         493.0            0.312444   
2    CUST0003   46  188416.0         500.0            0.359930   
3    CUST0004   32  101672.0         413.0            0.371400   
4    CUST0005   60   38524.0         487.0            0.234716   

   Missed_Payments  Delinquent_Account  Loan_Balance  Debt_to_Income_Ratio  \
0                3                   0       16310.0              0.317396   
1                6                   1       17401.0              0.196093   
2                0                   0       13761.0              0.301655   
3                3                   0       88778.0              0.264794   
4                2                   0       13316.0              0.510583   

  Employment_Status  Account_Tenure Credit_Card_Type     Location Month_1  \
0               EMP         

In [13]:
# Handle missing values
# Fill missing Income and Loan_Balance with median values
df['Income'].fillna(df['Income'].median(), inplace=True)
df['Loan_Balance'].fillna(df['Loan_Balance'].median(), inplace=True)

# Convert payment history (Month_1 to Month_6) to numerical values
# On-time = 0, Late = 1, Missed = 2
payment_mapping = {'On-time': 0, 'Late': 1, 'Missed': 2}
for month in ['Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6']:
    df[month] = df[month].map(payment_mapping)

# Encode categorical variables (Employment_Status, Credit_Card_Type, Location)
le = LabelEncoder()
df['Employment_Status'] = le.fit_transform(df['Employment_Status'])
df['Credit_Card_Type'] = le.fit_transform(df['Credit_Card_Type'])
df['Location'] = le.fit_transform(df['Location'])

# Create a new feature: Payment_Score (sum of payment statuses)
df['Payment_Score'] = df[['Month_1', 'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6']].sum(axis=1)

# Verify preprocessing
print("Data after preprocessing:")
print(df.head())
print("\nMissing values after handling:")
print(df.isnull().sum())

Data after preprocessing:
  Customer_ID  Age    Income  Credit_Score  Credit_Utilization  \
0    CUST0001   56  165580.0         398.0            0.390502   
1    CUST0002   69  100999.0         493.0            0.312444   
2    CUST0003   46  188416.0         500.0            0.359930   
3    CUST0004   32  101672.0         413.0            0.371400   
4    CUST0005   60   38524.0         487.0            0.234716   

   Missed_Payments  Delinquent_Account  Loan_Balance  Debt_to_Income_Ratio  \
0                3                   0       16310.0              0.317396   
1                6                   1       17401.0              0.196093   
2                0                   0       13761.0              0.301655   
3                3                   0       88778.0              0.264794   
4                2                   0       13316.0              0.510583   

   Employment_Status  Account_Tenure  Credit_Card_Type  Location  Month_1  \
0                  0           

In [14]:
# Features: all columns except Customer_ID and Delinquent_Account
X = df.drop(['Customer_ID', 'Delinquent_Account'], axis=1)
# Target: Delinquent_Account (0 or 1)
y = df['Delinquent_Account']

# Scale numerical features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split data into training (80%) and testing (20%) sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Print shapes to verify
print('Training data shape:', X_train.shape)
print('Testing data shape:', X_test.shape)
print('Training labels shape:', y_train.shape)
print('Testing labels shape:', y_test.shape)

Training data shape: (400, 18)
Testing data shape: (100, 18)
Training labels shape: (400,)
Testing labels shape: (100,)


In [15]:
# Define models
models = {
    'XGBoost': XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'),
    'Random Forest': RandomForestClassifier(random_state=42),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'ANN': Sequential([
        Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
}

# Store results
results = []

# Train and evaluate each model
for name, model in models.items():
    print(f'\nTraining {name}...')
    if name == 'ANN':
        # Compile and train ANN
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        model.fit(X_train, y_train, epochs=50, batch_size=32, verbose=0)
        y_pred = (model.predict(X_test) > 0.5).astype(int).flatten()
    else:
        # Train tree-based models
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, zero_division=0)
    recall = recall_score(y_test, y_pred, zero_division=0)
    f1 = f1_score(y_test, y_pred, zero_division=0)

    # Cross-validation (skip for ANN due to complexity)
    cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='accuracy') if name != 'ANN' else [None]

    # Store results
    results.append({
        'Model': name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1,
        'CV Accuracy': cv_scores.mean() if name != 'ANN' else 'N/A'
    })

    # Print confusion matrix
    print(f'\nConfusion Matrix for {name}:')
    print(confusion_matrix(y_test, y_pred))

    # Print feature importance for tree-based models
    if name != 'ANN':
        feature_importance = pd.DataFrame({
            'Feature': X.columns,
            'Importance': model.feature_importances_
        }).sort_values(by='Importance', ascending=False)
        print(f'\nFeature Importance for {name}:')
        print(feature_importance)

# Display results
print('\nModel Comparison:')
results_df = pd.DataFrame(results)
print(results_df)


Training XGBoost...

Confusion Matrix for XGBoost:
[[85  1]
 [14  0]]

Feature Importance for XGBoost:
                 Feature  Importance
15               Month_5    0.101168
17         Payment_Score    0.073944
0                    Age    0.065643
6   Debt_to_Income_Ratio    0.064999
14               Month_4    0.064046
12               Month_2    0.060586
1                 Income    0.059578
7      Employment_Status    0.059489
3     Credit_Utilization    0.058543
8         Account_Tenure    0.055470
9       Credit_Card_Type    0.053433
2           Credit_Score    0.049559
10              Location    0.045501
16               Month_6    0.043786
11               Month_1    0.039813
5           Loan_Balance    0.037406
4        Missed_Payments    0.036032
13               Month_3    0.031007

Training Random Forest...

Confusion Matrix for Random Forest:
[[86  0]
 [14  0]]

Feature Importance for Random Forest:
                 Feature  Importance
2           Credit_Score    0.1045

In [16]:
import joblib
from tensorflow.keras.models import save_model

# Save tree-based models
joblib.dump(models['XGBoost'], 'xgboost_model.pkl')
joblib.dump(models['Random Forest'], 'random_forest_model.pkl')
joblib.dump(models['Decision Tree'], 'decision_tree_model.pkl')

# Save ANN model
models['ANN'].save('ann_model.h5')

# Save the scaler for preprocessing
joblib.dump(scaler, 'scaler.pkl')

# Save the LabelEncoder for categorical variables (if needed)
joblib.dump(le, 'label_encoder.pkl')



['label_encoder.pkl']