In [1]:
import zipfile
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer
from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import make_pipeline as make_imb_pipeline

# STEP 1: Data Acquisition

In [2]:
file_location = 'data/Loan_default.csv'  # Update with your actual file path

# STEP 2: Data Loading

In [3]:
Data = pd.read_csv(file_location)
print("Initial Data Shape:", Data.shape)
print("\nFirst few rows of data:")
print(Data.head())

Initial Data Shape: (255347, 18)

First few rows of data:
       LoanID  Age  Income  LoanAmount  CreditScore  MonthsEmployed  \
0  I38PQUQS96   56   85994       50587          520              80   
1  HPSK72WA7R   69   50432      124440          458              15   
2  C1OZ6DPJ8Y   46   84208      129188          451              26   
3  V2KKSFM3UN   32   31713       44799          743               0   
4  EY08JDHTZP   60   20437        9139          633               8   

   NumCreditLines  InterestRate  LoanTerm  DTIRatio    Education  \
0               4         15.23        36      0.44   Bachelor's   
1               1          4.81        60      0.68     Master's   
2               3         21.17        24      0.31     Master's   
3               3          7.07        24      0.23  High School   
4               4          6.51        48      0.73   Bachelor's   

  EmploymentType MaritalStatus HasMortgage HasDependents LoanPurpose  \
0      Full-time      Divorced    

# STEP 3: Data Cleaning - Drop Irrelevant Features

In [4]:
Data = Data.drop(columns='LoanID')
Data.shape

(255347, 17)

# STEP 4: Data Preprocessing - Select Numerical Features

In [5]:
numerical_features = Data.select_dtypes(include=['int64', 'float64']).columns.tolist()
numerical_features.remove('Default')  # Remove target variable
categorical_features = Data.select_dtypes(include=['object']).columns.tolist()

print("\nNumerical features:", numerical_features)
print("Categorical features:", categorical_features)


Numerical features: ['Age', 'Income', 'LoanAmount', 'CreditScore', 'MonthsEmployed', 'NumCreditLines', 'InterestRate', 'LoanTerm', 'DTIRatio']
Categorical features: ['Education', 'EmploymentType', 'MaritalStatus', 'HasMortgage', 'HasDependents', 'LoanPurpose', 'HasCoSigner']


# STEP 5: Feature Engineering - Categorical Encoding

In [6]:
def preprocess_data(df):
    # Make a copy to avoid modifying original dataframe
    df_processed = df.copy()

    # Map categorical features to numerical values
    education_map = {
        'High School or Equivalent': 1,
        'Bachelor\'s': 2,
        'Master\'s': 3,
        'PhD': 4
    }

    employment_map = {
        'Unemployed': 0,
        'Self-employed': 1,
        'Employed full-time': 2,
        'Employed part-time': 3
    }

    marital_map = {
        'Single': 0,
        'Married': 1,
        'Divorced': 2,
        'Widowed': 3
    }

    binary_map = {
        'No': 0,
        'Yes': 1
    }

    purpose_map = {
        'Home purchase': 0,
        'Debt consolidation': 1,
        'Education': 2,
        'Medical expenses': 3,
        'Business': 4,
        'Auto': 5,
        'Other': 6
    }

    df_processed['Education'] = df_processed['Education'].map(education_map)
    df_processed['EmploymentType'] = df_processed['EmploymentType'].map(employment_map)
    df_processed['MaritalStatus'] = df_processed['MaritalStatus'].map(marital_map)
    df_processed['HasMortgage'] = df_processed['HasMortgage'].map(binary_map)
    df_processed['HasDependents'] = df_processed['HasDependents'].map(binary_map)
    df_processed['LoanPurpose'] = df_processed['LoanPurpose'].map(purpose_map)
    df_processed['HasCoSigner'] = df_processed['HasCoSigner'].map(binary_map)

    return df_processed

Data_processed = preprocess_data(Data)
print("\nData after preprocessing:")
Data_processed.head()


Data after preprocessing:


Unnamed: 0,Age,Income,LoanAmount,CreditScore,MonthsEmployed,NumCreditLines,InterestRate,LoanTerm,DTIRatio,Education,EmploymentType,MaritalStatus,HasMortgage,HasDependents,LoanPurpose,HasCoSigner,Default
0,56,85994,50587,520,80,4,15.23,36,0.44,2.0,,2,1,1,6.0,1,0
1,69,50432,124440,458,15,1,4.81,60,0.68,3.0,,1,0,0,6.0,1,0
2,46,84208,129188,451,26,3,21.17,24,0.31,3.0,0.0,2,1,1,5.0,0,1
3,32,31713,44799,743,0,3,7.07,24,0.23,,,1,0,0,4.0,0,0
4,60,20437,9139,633,8,4,6.51,48,0.73,2.0,0.0,2,0,1,5.0,0,0


# STEP 6: Data Splitting

In [11]:
X = Data_processed.drop(columns='Default')
y = Data_processed['Default']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print("\nClass distribution in original data:")
print(y.value_counts(normalize=True))
print("\nClass distribution in training set:")
print(y_train.value_counts(normalize=True))
print("\nClass distribution in test set:")
print(y_test.value_counts(normalize=True))



Class distribution in original data:
Default
0    0.883872
1    0.116128
Name: proportion, dtype: float64

Class distribution in training set:
Default
0    0.883873
1    0.116127
Name: proportion, dtype: float64

Class distribution in test set:
Default
0    0.883865
1    0.116135
Name: proportion, dtype: float64


# STEP 7: Handling Class Imbalance
SMOTE (Synthetic Minority Over-sampling Technique) – Smart Upsampling
 What it does:

  Instead of copying, SMOTE creates new, synthetic samples of the minority class.

  It generates new "default" data points by interpolating between real ones (like creating something between two similar default cases).

#Pros:

 - Reduces overfitting by adding new, slightly different samples.

 - More robust than RandomOverSampler.

#Cons:

 - Slightly more complex and slower than random oversampling.


In [15]:
# Define resampling strategies
ros = RandomOverSampler(random_state=42)
smote = SMOTE(random_state=42)
rus = RandomUnderSampler(random_state=42)

# Apply SMOTE to training data only
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)

print("\nClass distribution after SMOTE:")
print(pd.Series(y_train_res).value_counts(normalize=True))


Class distribution after SMOTE:
Default
0    0.5
1    0.5
Name: proportion, dtype: float64


# STEP 8: Model Pipeline

In [17]:
# Define preprocessing for numerical and categorical features
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Create pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1
    ))
])

# STEP 9: Model Training

In [18]:
model.fit(X_train_res, y_train_res)
print("\nModel training complete.")


Model training complete.


# STEP 10: Model Evaluation

In [19]:
# Predictions
y_pred = model.predict(X_test)

# Evaluation metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("\nModel Evaluation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")

# Cross-validation
cv_scores = cross_val_score(model, X_train_res, y_train_res, cv=5, scoring='f1')
print("\nCross-validation F1 scores:", cv_scores)
print("Mean CV F1 score:", cv_scores.mean())


Model Evaluation Metrics:
Accuracy: 0.7503
Precision: 0.1725
Recall: 0.3030
F1 Score: 0.2199

Cross-validation F1 scores: [0.57298008 0.87347219 0.85230602 0.87647611 0.86731297]
Mean CV F1 score: 0.8085094738755838


# STEP 11: Feature Importance

In [21]:
# Get feature names after one-hot encoding
feature_names = numerical_features.copy()
cat_features = model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out(categorical_features)
feature_names.extend(cat_features)

# Get feature importances
importances = model.named_steps['classifier'].feature_importances_

# Create DataFrame of feature importances
feature_importances = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importances
}).sort_values('Importance', ascending=False)

print("\nTop 10 most important features:")
print(feature_importances.head(10))


Top 10 most important features:
                   Feature  Importance
94876        Education_4.0    0.037086
267167       HasCoSigner_0    0.024205
94877   EmploymentType_0.0    0.021364
153754     MaritalStatus_2    0.020432
8                 DTIRatio    0.020272
194126     LoanPurpose_4.0    0.020172
267166     LoanPurpose_6.0    0.019639
0                      Age    0.018952
153753     MaritalStatus_1    0.016718
267168       HasCoSigner_1    0.016634


# STEP 12: Hyperparameter Tuning (Optional)

In [24]:
# Define parameter grid
param_grid = {
    'classifier__n_estimators': [100, 200, 300],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 5, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=3,
    scoring='f1',
    n_jobs=-1,
    verbose=1
)


In [25]:
# Fit GridSearchCV
print("\nStarting grid search...")
grid_search.fit(X_train_res, y_train_res)


Starting grid search...
Fitting 3 folds for each of 27 candidates, totalling 81 fits


In [26]:
# Best parameters and score
print("\nBest parameters found:")
print(grid_search.best_params_)
print("Best F1 score:", grid_search.best_score_)


Best parameters found:
{'classifier__max_depth': 15, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 300}
Best F1 score: 0.8163990953433564


In [27]:
# Get best model
best_model = grid_search.best_estimator_

In [28]:
# Evaluate best model
y_pred_best = best_model.predict(X_test)
best_f1 = f1_score(y_test, y_pred_best)
print("\nBest model F1 score on test set:", best_f1)


Best model F1 score on test set: 0.22200039928129367
