In [103]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

df_copy = pd.read_csv('data\cleaned_dataset.csv')

In [104]:
df_copy.isnull().sum()  

Month                       0
Age                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Interest_Rate               0
Payment_of_Min_Amount       0
Payment_Behaviour           0
Credit_Score                0
Annual_Income_qt            0
Income_Bin                  0
Auto Loan                   0
Credit-Builder Loan         0
Debt Consolidation Loan     0
Home Equity Loan            0
Mortgage Loan               0
Not Specified               0
Payday Loan                 0
Personal Loan               0
Student Loan                0
DTI                         0
EMI_to_Income               0
Invest_to_Income            0
Balance_to_Income           0
Avg_Delay_if_Delayed        0
Has_Delays                  0
High_Utilization            0
Total_Financial_Products    0
Inquiries_per_Year          0
Limit_Decrease_Flag         0
Large_Limit_Change          0
Num_Loan_Types              0
dtype: int64

In [117]:
df_copy.select_dtypes(exclude=['object']).columns

Index(['Month', 'Age', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Interest_Rate', 'Annual_Income_qt', 'Auto Loan', 'Credit-Builder Loan',
       'Debt Consolidation Loan', 'Home Equity Loan', 'Mortgage Loan',
       'Not Specified', 'Payday Loan', 'Personal Loan', 'Student Loan', 'DTI',
       'EMI_to_Income', 'Invest_to_Income', 'Balance_to_Income',
       'Avg_Delay_if_Delayed', 'Has_Delays', 'High_Utilization',
       'Total_Financial_Products', 'Inquiries_per_Year', 'Limit_Decrease_Flag',
       'Large_Limit_Change', 'Num_Loan_Types'],
      dtype='object')

In [106]:
# =====================================================
# 1. Split Data
# =====================================================
from sklearn.model_selection import train_test_split


X = df_copy.drop(columns=['Credit_Score'])
y = df_copy['Credit_Score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# =====================================================
# 2. Build Preprocessing Pipeline
# =====================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('Standard Scaler', StandardScaler(), num_cols),
        ('OneHot Encoder', OneHotEncoder(drop='first'), cat_cols)
    ]
)

# =====================================================
# 3. Transform Data, Reduce Correlation
# =====================================================
preprocessor.fit(X_train)
feature_names = preprocessor.get_feature_names_out()
# Transform train and test sets
X_train_t = preprocessor.transform(X_train)
X_test_t = preprocessor.transform(X_test)
X_train_df = pd.DataFrame(X_train_t, columns=feature_names)
X_test_df = pd.DataFrame(X_test_t, columns=feature_names)

# Correlation-based feature pruning
corr_matrix = X_train_df.corr().abs()
redundant = set()
threshold = 0.9
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if corr_matrix.iloc[i, j] > threshold:
            redundant.add(corr_matrix.columns[j])

X_train_reduced = X_train_df.drop(columns=list(redundant))
X_test_reduced = X_test_df.drop(columns=list(redundant))

In [107]:
redundant


set()

In [108]:
X_train_reduced


Unnamed: 0,Standard Scaler__Month,Standard Scaler__Age,Standard Scaler__Annual_Income,Standard Scaler__Monthly_Inhand_Salary,Standard Scaler__Interest_Rate,Standard Scaler__Annual_Income_qt,Standard Scaler__Auto Loan,Standard Scaler__Credit-Builder Loan,Standard Scaler__Debt Consolidation Loan,Standard Scaler__Home Equity Loan,...,OneHot Encoder__Occupation_Writer,OneHot Encoder__Payment_of_Min_Amount_Yes,OneHot Encoder__Payment_Behaviour_High_spent_Medium_value_payments,OneHot Encoder__Payment_Behaviour_High_spent_Small_value_payments,OneHot Encoder__Payment_Behaviour_Low_spent_Large_value_payments,OneHot Encoder__Payment_Behaviour_Low_spent_Medium_value_payments,OneHot Encoder__Payment_Behaviour_Low_spent_Small_value_payments,OneHot Encoder__Income_Bin_Low,OneHot Encoder__Income_Bin_Mid High,OneHot Encoder__Income_Bin_Mid Low
0,0.216718,1.644986,-0.051439,1.399201,-0.519133,1.111471,-0.663473,-0.679992,1.491304,-0.676869,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
1,1.525404,-0.956594,-0.051197,1.395315,-0.633112,1.116242,-0.663473,-0.679992,-0.670554,-0.676869,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,-1.091967,1.459159,-0.011172,2.753654,-0.747092,1.972923,-0.663473,-0.679992,-0.670554,1.477392,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
3,-0.655739,0.622937,-0.080401,0.192404,0.392706,0.439857,-0.663473,1.470605,-0.670554,-0.676869,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.652947,-1.514076,-0.087660,-0.033247,0.620666,0.261754,-0.663473,1.470605,1.491304,-0.676869,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79995,1.525404,2.016641,-0.042292,1.624776,-1.203012,1.303452,1.507221,-0.679992,-0.670554,-0.676869,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
79996,1.089175,-0.492026,-0.117057,-1.109925,1.874444,-2.002768,1.507221,1.470605,-0.670554,-0.676869,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
79997,1.525404,1.180418,-0.098217,-0.420218,2.102404,-0.094590,-0.663473,1.470605,-0.670554,-0.676869,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
79998,0.216718,0.808764,-0.031690,2.025695,-1.089032,1.569574,-0.663473,1.470605,-0.670554,-0.676869,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


In [109]:
X_test_reduced

Unnamed: 0,Standard Scaler__Month,Standard Scaler__Age,Standard Scaler__Annual_Income,Standard Scaler__Monthly_Inhand_Salary,Standard Scaler__Interest_Rate,Standard Scaler__Annual_Income_qt,Standard Scaler__Auto Loan,Standard Scaler__Credit-Builder Loan,Standard Scaler__Debt Consolidation Loan,Standard Scaler__Home Equity Loan,...,OneHot Encoder__Occupation_Writer,OneHot Encoder__Payment_of_Min_Amount_Yes,OneHot Encoder__Payment_Behaviour_High_spent_Medium_value_payments,OneHot Encoder__Payment_Behaviour_High_spent_Small_value_payments,OneHot Encoder__Payment_Behaviour_Low_spent_Large_value_payments,OneHot Encoder__Payment_Behaviour_Low_spent_Medium_value_payments,OneHot Encoder__Payment_Behaviour_Low_spent_Small_value_payments,OneHot Encoder__Income_Bin_Low,OneHot Encoder__Income_Bin_Mid High,OneHot Encoder__Income_Bin_Mid Low
0,-0.219510,0.622937,-0.091642,-0.153193,1.304545,0.180335,1.507221,1.470605,1.491304,-0.676869,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,1.525404,0.065455,-0.109130,-0.847790,0.848626,-0.671989,1.507221,-0.679992,1.491304,-0.676869,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
2,0.652947,0.530023,-0.094752,-0.326715,0.848626,0.064644,1.507221,-0.679992,-0.670554,-0.676869,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,-1.091967,-0.492026,-0.117585,-1.092203,0.050767,-2.326018,1.507221,-0.679992,-0.670554,1.477392,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
4,-1.528196,0.994591,-0.093480,-0.281906,0.278726,0.110448,1.507221,1.470605,1.491304,1.477392,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19995,-1.528196,0.065455,-0.069183,0.758788,0.506686,0.742111,-0.663473,1.470605,-0.670554,-0.676869,...,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
19996,0.216718,0.437110,-0.100341,-0.513751,-0.633112,-0.220265,-0.663473,1.470605,-0.670554,-0.676869,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
19997,0.216718,-1.421162,-0.047331,1.468288,0.620666,1.198868,-0.663473,-0.679992,-0.670554,-0.676869,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
19998,-0.219510,-0.584940,-0.095248,-0.202667,-0.975052,0.043445,-0.663473,-0.679992,-0.670554,-0.676869,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0


In [110]:

from sklearn.metrics import classification_report, accuracy_score


rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_reduced, y_train)
y_pred = rf.predict(X_test_reduced)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.8238
              precision    recall  f1-score   support

        Good       0.80      0.78      0.79      3566
        Poor       0.81      0.85      0.83      5799
    Standard       0.84      0.82      0.83     10635

    accuracy                           0.82     20000
   macro avg       0.82      0.82      0.82     20000
weighted avg       0.82      0.82      0.82     20000



In [111]:
# Grid Search with reduced features
from sklearn.model_selection import GridSearchCV


param_grid = {
    'n_estimators': [100, 200],  # fewer options
    'max_depth': [8, 15, None],  # smaller set
    'min_samples_split': [2, 10], 
    'max_features': [7]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, verbose=2, n_jobs=4)
grid.fit(X_train_reduced, y_train)
print("Best parameters:", grid.best_params_)
print("Best CV score:", grid.best_score_)
rf_best = grid.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters: {'max_depth': None, 'max_features': 7, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.8032374779069856


In [112]:
best_params = {
    'max_depth': None,
    'max_features': 7,
    'min_samples_split': 2,
    'n_estimators': 200,
    'random_state': 42
}

# Initialize the model with best parameters
rf_best = RandomForestClassifier(**best_params)

# Fit the model on the full training data again
rf_best.fit(X_train_reduced, y_train)
y_pred = rf_best.predict(X_test_reduced)
from sklearn.metrics import accuracy_score, classification_report

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Test Accuracy: 0.8257
              precision    recall  f1-score   support

        Good       0.80      0.78      0.79      3566
        Poor       0.81      0.85      0.83      5799
    Standard       0.85      0.83      0.84     10635

    accuracy                           0.83     20000
   macro avg       0.82      0.82      0.82     20000
weighted avg       0.83      0.83      0.83     20000



In [113]:
# Feature importances
from sklearn.model_selection import cross_val_score


importances = rf.feature_importances_
feat_ranks = sorted(zip(X_train_reduced.columns, importances), key=lambda x: x[1], reverse=True)
print("Top 10 Features:")
for f, v in feat_ranks[:10]:
    print(f"{f}: {v:.4f}")

# Cross-validation
cv_scores = cross_val_score(rf, X_train_reduced, y_train, cv=5, scoring='accuracy')
print("Random Forest Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Top 10 Features:
Standard Scaler__Interest_Rate: 0.1057
Standard Scaler__Inquiries_per_Year: 0.0971
Standard Scaler__EMI_to_Income: 0.0758
Standard Scaler__Total_Financial_Products: 0.0697
Standard Scaler__Avg_Delay_if_Delayed: 0.0633
Standard Scaler__Invest_to_Income: 0.0567
Standard Scaler__DTI: 0.0510
Standard Scaler__Monthly_Inhand_Salary: 0.0499
Standard Scaler__Annual_Income_qt: 0.0470
Standard Scaler__Annual_Income: 0.0468
Random Forest Cross-validation scores: [0.8135625 0.8128125 0.8123125 0.8138125 0.8101875]
Mean CV accuracy: 0.8125375
