In [2]:
# =====================================================
# 1. IMPORTS AND SETUP
# =====================================================
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import calendar
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, OrdinalEncoder, MultiLabelBinarizer, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
from sklearn.pipeline import Pipeline
import joblib

In [3]:
# =====================================================
# 2. LOAD DATA
# =====================================================
df = pd.read_csv('data/dataset.csv')
df_copy = df.copy()

# =====================================================
# 3. CLEANING & PREPROCESSING
# =====================================================
# Clean strings (removes whitespace, leading/trailing underscores)
for col in df_copy.columns:
    if df_copy[col].dtype == 'object':
        df_copy[col] = df_copy[col].str.strip().str.strip('_')

# Specific column fixes
df_copy['Customer_ID'] = df_copy['Customer_ID'].str.lstrip('CUS_0x')
df_copy['Payment_Behaviour'] = df_copy['Payment_Behaviour'].replace('!@9#%8', np.nan)
df_copy['Occupation'] = df_copy['Occupation'].replace('', np.nan)
df_copy['Credit_Mix'] = df_copy['Credit_Mix'].replace('', np.nan)
df_copy['Payment_of_Min_Amount'].replace('NM', 'No', inplace=True)

# Month name to number
month_to_num = {m:i for i,m in enumerate(calendar.month_name) if m}
df_copy['Month'] = df_copy['Month'].map(month_to_num)

# Drop irrelevant privacy columns
df_copy.drop(columns=[c for c in ["ID","SSN","Name"] if c in df_copy.columns], inplace=True)

# =====================================================
# 4. TYPE CONVERSION & RANGE VALIDATION
# =====================================================
float_cols = ['Annual_Income','Interest_Rate', 'Monthly_Inhand_Salary', 'Changed_Credit_Limit',
              'Outstanding_Debt','Credit_Utilization_Ratio','Total_EMI_per_month',
              'Amount_invested_monthly','Monthly_Balance']

int_cols = ['Age','Num_Bank_Accounts', 'Num_Credit_Card', 'Num_of_Loan',
            'Delay_from_due_date','Num_of_Delayed_Payment','Num_Credit_Inquiries']

for col in float_cols:
    df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')

for col in int_cols:
    df_copy[col] = pd.to_numeric(df_copy[col], errors='coerce')

# Drop out-of-range values 
for col in ['Age','Num_Bank_Accounts','Num_Credit_Card','Num_of_Loan',
            'Num_of_Delayed_Payment','Num_Credit_Inquiries','Interest_Rate']:
    df_copy.loc[(df_copy[col]<0)|(df_copy[col]>100), col] = np.nan
df_copy.loc[(df_copy['Num_Bank_Accounts']>60)|(df_copy['Num_Bank_Accounts']<0), 'Num_Bank_Accounts'] = np.nan

In [4]:

median_value = df_copy['Monthly_Balance'].median()
df_median_imputed = df_copy.copy()
df_median_imputed['Monthly_Balance'].fillna(median_value, inplace=True)


In [5]:
# =====================================================
# 5. IMPUTATION (GROUPWISE)
# =====================================================
def impute_by_customer_median(df, columns):
    group_medians = df.groupby('Customer_ID')[columns].median()
    for col in columns:
        df[col] = df[col].fillna(df['Customer_ID'].map(group_medians[col]))
    return df

def impute_by_customer_mode(df, columns):
    for col in columns:
        modes = df.groupby('Customer_ID')[col].agg(lambda x: x.mode().iloc[0] if not x.mode().empty else np.nan)
        df[col] = df[col].fillna(df['Customer_ID'].map(modes))
    return df

df_copy = impute_by_customer_median(df_copy, float_cols + int_cols)
cat_cols = df_copy.select_dtypes(include='object').columns.tolist()
df_copy = impute_by_customer_mode(df_copy, [c for c in cat_cols if c!='Credit_History_Age'])

# Impute global mode for 'Type_of_Loan'
if 'Type_of_Loan' in df_copy.columns:
    mode_value = df_copy['Type_of_Loan'].mode()[0]
    df_copy['Type_of_Loan'] = df_copy['Type_of_Loan'].fillna(mode_value)


In [6]:
df_copy.isnull().sum()

Customer_ID                    0
Month                          0
Age                            0
Occupation                     0
Annual_Income                  0
Monthly_Inhand_Salary          0
Num_Bank_Accounts              0
Num_Credit_Card                0
Interest_Rate                  0
Num_of_Loan                    0
Type_of_Loan                   0
Delay_from_due_date            0
Num_of_Delayed_Payment         0
Changed_Credit_Limit           0
Num_Credit_Inquiries           0
Credit_Mix                     0
Outstanding_Debt               0
Credit_Utilization_Ratio       0
Credit_History_Age          9030
Payment_of_Min_Amount          0
Total_EMI_per_month            0
Amount_invested_monthly        0
Payment_Behaviour              0
Monthly_Balance             1696
Credit_Score                   0
dtype: int64

In [7]:
# =====================================================
# 6. CONVERT CREDIT HISTORY AGE TO MONTHS
# =====================================================
def convert_age_to_months(age_str):
    if pd.isna(age_str) or age_str in ['NA', '_', '']:
        return np.nan
    match = re.match(r"(\d+)\s*Years\s*and\s*(\d+)\s*Months", str(age_str))
    if match:
        years = int(match.group(1))
        months = int(match.group(2))
        return years*12 + months
    return np.nan

if 'Credit_History_Age' in df_copy.columns:
    df_copy['Credit_History_Age_Months'] = df_copy['Credit_History_Age'].apply(convert_age_to_months)
    df_copy['Credit_History_Age_Months'] = df_copy['Credit_History_Age_Months'].fillna(df_copy['Credit_History_Age_Months'].median())
    df_copy.drop(columns=['Credit_History_Age'], inplace=True)

In [8]:
# =====================================================
# 7. ENCODING & FEATURE ENGINEERING
# =====================================================


# MultiLabelBinarizer for loan types
def split_loan_types(val):
    if pd.isna(val): return []
    return [loan.strip() for loan in str(val).replace(" and ", ", ").split(",") if loan.strip()]
if 'Type_of_Loan' in df_copy.columns:
    df_copy["Type_of_Loan_List"] = df_copy["Type_of_Loan"].apply(split_loan_types)
    mlb = MultiLabelBinarizer()
    loan_dummies = pd.DataFrame(mlb.fit_transform(df_copy['Type_of_Loan_List']),
                                columns=mlb.classes_,index=df_copy.index)
    df_copy = pd.concat([df_copy, loan_dummies], axis=1)
df_copy.drop(columns=['Type_of_Loan','Type_of_Loan_List'], inplace=True)

# Outlier capping (choose appropriate quantiles for your data)
# for feature in ['Annual_Income','Age','Monthly_Inhand_Salary']:
#     lower = df_copy[feature].quantile(0.06)
#     upper = df_copy[feature].quantile(0.94)
#     df_copy[feature+'_Capped'] = df_copy[feature].clip(lower=lower, upper=upper)
#     df_copy.drop(columns=feature, inplace=True)

# # Log transformation (for capped features)
# for feature in ['Annual_Income_Capped','Monthly_Inhand_Salary_Capped', 'Age_Capped']:
#     df_copy[feature+'_log'] = np.log1p(df_copy[feature])

# Feature engineering: ratios and flags
df_copy['DTI'] = df_copy['Total_EMI_per_month'] / df_copy['Monthly_Inhand_Salary']
df_copy['EMI_to_Income'] = df_copy['Outstanding_Debt'] / df_copy['Annual_Income']
df_copy['Invest_to_Income'] = df_copy['Amount_invested_monthly'] / df_copy['Monthly_Inhand_Salary']
df_copy['Balance_to_Income'] = df_copy['Monthly_Balance'] / df_copy['Monthly_Inhand_Salary']
df_copy['Avg_Delay_if_Delayed'] = df_copy['Delay_from_due_date'] / df_copy['Num_of_Delayed_Payment'].replace(0, 1)
df_copy['Has_Delays'] = (df_copy['Num_of_Delayed_Payment'] > 0).astype(int)
df_copy['High_Utilization'] = (df_copy['Credit_Utilization_Ratio'] > 0.7).astype(int)
df_copy['Total_Financial_Products'] = df_copy['Num_Bank_Accounts'] + df_copy['Num_Credit_Card'] + df_copy['Num_of_Loan']
df_copy['Inquiries_per_Year'] = df_copy['Num_Credit_Inquiries'] / (df_copy['Credit_History_Age_Months']/12).replace(0, 1)
df_copy['Limit_Decrease_Flag'] = (df_copy['Changed_Credit_Limit'] < 0).astype(int)
df_copy['Large_Limit_Change'] = (df_copy['Changed_Credit_Limit'].abs() > 20).astype(int)
loan_cols = [col for col in df_copy.columns if col.endswith("Loan")]
df_copy['Num_Loan_Types'] = df_copy[loan_cols].sum(axis=1)

# Drop columns that have been fully replaced with engineered features
drop_cols = ['Total_EMI_per_month','Outstanding_Debt','Amount_invested_monthly','Monthly_Balance',
             'Delay_from_due_date','Num_of_Delayed_Payment','Credit_Utilization_Ratio','Num_Bank_Accounts',
             'Num_Credit_Card','Num_of_Loan','Num_Credit_Inquiries','Credit_History_Age_Months',
             'Changed_Credit_Limit','Credit_Mix']
df_copy.drop(columns=[col for col in drop_cols if col in df_copy.columns], inplace=True)

# Drop Customer_ID if present
if 'Customer_ID' in df_copy.columns:
    df_copy.drop(columns=['Customer_ID'], inplace=True)

df_copy.to_csv('data/cleaned_dataset.csv', index=False)
print("Final feature list:", df_copy.columns.tolist())



Final feature list: ['Month', 'Age', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary', 'Interest_Rate', 'Payment_of_Min_Amount', 'Payment_Behaviour', 'Credit_Score', 'Auto Loan', 'Credit-Builder Loan', 'Debt Consolidation Loan', 'Home Equity Loan', 'Mortgage Loan', 'Not Specified', 'Payday Loan', 'Personal Loan', 'Student Loan', 'DTI', 'EMI_to_Income', 'Invest_to_Income', 'Balance_to_Income', 'Avg_Delay_if_Delayed', 'Has_Delays', 'High_Utilization', 'Total_Financial_Products', 'Inquiries_per_Year', 'Limit_Decrease_Flag', 'Large_Limit_Change', 'Num_Loan_Types']


In [9]:
# =====================================================
# 1. Split Data
# =====================================================
X = df_copy.drop(columns=['Credit_Score'])
y = df_copy['Credit_Score']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# =====================================================
# 2. Build Preprocessing Pipeline
# =====================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_cols),
        ('cat', OneHotEncoder(drop='first'), cat_cols)
    ]
)

# =====================================================
# 3. Transform Data, Reduce Correlation
# =====================================================
preprocessor.fit(X_train)
feature_names = preprocessor.get_feature_names_out()
# Transform train and test sets
X_train_t = preprocessor.transform(X_train)
X_test_t = preprocessor.transform(X_test)
X_train_df = pd.DataFrame(X_train_t, columns=feature_names)
X_test_df = pd.DataFrame(X_test_t, columns=feature_names)

# Correlation-based feature pruning
corr_matrix = X_train_df.corr().abs()
redundant = set()
threshold = 0.9
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if corr_matrix.iloc[i, j] > threshold:
            redundant.add(corr_matrix.columns[j])

X_train_reduced = X_train_df.drop(columns=list(redundant))
X_test_reduced = X_test_df.drop(columns=list(redundant))

In [10]:
redundant

set()

In [11]:
from sklearn.model_selection import RandomizedSearchCV


rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_reduced, y_train)
y_pred = rf.predict(X_test_reduced)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Random Forest Accuracy: 0.82345
              precision    recall  f1-score   support

        Good       0.80      0.78      0.79      3566
        Poor       0.81      0.85      0.83      5799
    Standard       0.84      0.83      0.83     10635

    accuracy                           0.82     20000
   macro avg       0.82      0.82      0.82     20000
weighted avg       0.82      0.82      0.82     20000



In [12]:
# Grid Search with reduced features
param_grid = {
    'n_estimators': [100, 200],  # fewer options
    'max_depth': [8, 15, None],  # smaller set
    'min_samples_split': [2, 10], 
    'max_features': ['auto', 7]
}

grid = GridSearchCV(RandomForestClassifier(random_state=42), param_grid, cv=3, verbose=2, n_jobs=4)
grid.fit(X_train_reduced, y_train)
print("Best parameters:", grid.best_params_)
print("Best CV score:", grid.best_score_)
rf_best = grid.best_estimator_

random = RandomizedSearchCV(RandomForestClassifier(random_state=42), param_distributions=param_grid, n_iter=10, cv=3, verbose=2, n_jobs=4)
random.fit(X_train_reduced, y_train)
print("Best parameters from Randomized Search:", random.best_params_)
print("Best CV score from Randomized Search:", random.best_score_)
rf_random_best = random.best_estimator_

Fitting 3 folds for each of 24 candidates, totalling 72 fits
Best parameters: {'max_depth': None, 'max_features': 7, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.8008499716561613
Fitting 3 folds for each of 20 candidates, totalling 60 fits
Best parameters from Randomized Search: {'n_estimators': 200, 'min_samples_split': 2, 'max_features': 7, 'max_depth': None}
Best CV score from Randomized Search: 0.8008499716561613


In [15]:
best_params = {
    'max_depth': None,
    'max_features': 7,
    'min_samples_split': 2,
    'n_estimators': 200,
    'random_state': 42
}

# Initialize the model with best parameters
rf_best = RandomForestClassifier(**best_params)

# Fit the model on the full training data again
rf_best.fit(X_train_reduced, y_train)
y_pred = rf_best.predict(X_test_reduced)
from sklearn.metrics import accuracy_score, classification_report

print("Test Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))



Test Accuracy: 0.82615
              precision    recall  f1-score   support

        Good       0.81      0.78      0.79      3566
        Poor       0.81      0.85      0.83      5799
    Standard       0.84      0.83      0.84     10635

    accuracy                           0.83     20000
   macro avg       0.82      0.82      0.82     20000
weighted avg       0.83      0.83      0.83     20000



In [16]:
# Feature importances
importances = rf.feature_importances_
feat_ranks = sorted(zip(X_train_reduced.columns, importances), key=lambda x: x[1], reverse=True)
print("Top 10 Features:")
for f, v in feat_ranks[:10]:
    print(f"{f}: {v:.4f}")

# Cross-validation
cv_scores = cross_val_score(rf, X_train_reduced, y_train, cv=5, scoring='accuracy')
print("Random Forest Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Top 10 Features:
num__Interest_Rate: 0.1111
num__Inquiries_per_Year: 0.0984
num__EMI_to_Income: 0.0785
num__Total_Financial_Products: 0.0712
num__Avg_Delay_if_Delayed: 0.0657
num__Invest_to_Income: 0.0592
num__Annual_Income: 0.0560
num__Monthly_Inhand_Salary: 0.0554
num__DTI: 0.0550
num__Age: 0.0489
Random Forest Cross-validation scores: [0.81075   0.8103125 0.8134375 0.81      0.8090625]
Mean CV accuracy: 0.8107125


In [None]:
# # =====================================================
# # 5. Feature/Model Interpretation & Visualization
# # =====================================================
# import matplotlib.pyplot as plt
# indices = importances.argsort()[::-1]
# plt.figure(figsize=(10,6))
# plt.bar(range(len(importances)), importances[indices], align="center")
# plt.xticks(range(len(importances)), X_train_reduced.columns[indices], rotation=90)
# plt.title("Feature Importances (Random Forest)")
# plt.show()






In [None]:
from sklearn.inspection import permutation_importance, PartialDependenceDisplay
result = permutation_importance(rf, X_test_reduced, y_test, n_repeats=10, random_state=42)
perm_sorted_idx = result.importances_mean.argsort()[::-1]
plt.figure(figsize=(10,6))
plt.bar(range(len(result.importances_mean)), result.importances_mean[perm_sorted_idx], align="center")
plt.xticks(range(len(result.importances_mean)), X_test_reduced.columns[perm_sorted_idx], rotation=90)
plt.title("Permutation Feature Importance")
plt.show()



In [None]:
# PartialDependenceDisplay.from_estimator(rf, X_test_reduced, [0, 1], feature_names=X_test_reduced.columns, target=0)
# plt.show()

test_results = X_test_reduced.copy()
test_results['True_Score'], test_results['Pred_Score'] = y_test, y_pred
misclassified = test_results[test_results['True_Score'] != test_results['Pred_Score']]
print("Sample misclassified cases:")
misclassified.head()

In [None]:
# =====================================================
# 6. Save Model and Feature List For Deployment
# =====================================================
import joblib

joblib.dump(rf_best, "credit_rf_model.pkl")
joblib.dump(X_train_reduced.columns.tolist(), "feature_columns.pkl")

def predict_new(data_row):
    model = joblib.load("credit_rf_model.pkl")
    feature_list = joblib.load("feature_columns.pkl")
    X_new = pd.DataFrame([data_row])[feature_list]
    pred = model.predict(X_new)[0]
    class_map = {0: 'Poor', 1: 'Standard', 2: 'Good'}
    return class_map.get(int(pred), pred)

print("Credit scoring project: End-to-End code complete.")