In [5]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

# Load the cleaned dataset. A forward slash is used as the separator: the
# previous 'data\cleaned_dataset.csv' relied on the invalid escape sequence
# '\c' and only resolved on Windows.
df_copy = pd.read_csv('data/cleaned_dataset.csv')

In [6]:
# Missing-value count for every column of the loaded frame.
df_copy.isna().sum()

Month                       0
Age                         0
Occupation                  0
Annual_Income               0
Monthly_Inhand_Salary       0
Interest_Rate               0
Payment_of_Min_Amount       0
Payment_Behaviour           0
Credit_Score                0
Annual_Income_qt            0
Income_Bin                  0
Auto Loan                   0
Credit-Builder Loan         0
Debt Consolidation Loan     0
Home Equity Loan            0
Mortgage Loan               0
Not Specified               0
Payday Loan                 0
Personal Loan               0
Student Loan                0
DTI                         0
EMI_to_Income               0
Invest_to_Income            0
Balance_to_Income           0
Avg_Delay_if_Delayed        0
Has_Delays                  0
High_Utilization            0
Total_Financial_Products    0
Inquiries_per_Year          0
Limit_Decrease_Flag         0
Large_Limit_Change          0
Num_Loan_Types              0
dtype: int64

In [7]:
# Columns whose dtype is anything other than object.
df_copy.select_dtypes(exclude='object').columns

Index(['Month', 'Age', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Interest_Rate', 'Annual_Income_qt', 'Auto Loan', 'Credit-Builder Loan',
       'Debt Consolidation Loan', 'Home Equity Loan', 'Mortgage Loan',
       'Not Specified', 'Payday Loan', 'Personal Loan', 'Student Loan', 'DTI',
       'EMI_to_Income', 'Invest_to_Income', 'Balance_to_Income',
       'Avg_Delay_if_Delayed', 'Has_Delays', 'High_Utilization',
       'Total_Financial_Products', 'Inquiries_per_Year', 'Limit_Decrease_Flag',
       'Large_Limit_Change', 'Num_Loan_Types'],
      dtype='object')

In [8]:
# Columns stored with the object dtype.
df_copy.select_dtypes(include='object').columns

Index(['Occupation', 'Payment_of_Min_Amount', 'Payment_Behaviour',
       'Credit_Score', 'Income_Bin'],
      dtype='object')

In [None]:
# =====================================================
# 1. Split Data
# =====================================================
from sklearn.model_selection import train_test_split

# Target is the 'Credit_Score' column; every other column is a predictor.
y = df_copy['Credit_Score']
X = df_copy.drop('Credit_Score', axis=1)

# 80/20 hold-out split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [68]:
# Column labels of the training predictors produced by the split.
X_train.columns

Index(['Month', 'Age', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Interest_Rate', 'Payment_of_Min_Amount', 'Payment_Behaviour',
       'Annual_Income_qt', 'Income_Bin', 'Auto Loan', 'Credit-Builder Loan',
       'Debt Consolidation Loan', 'Home Equity Loan', 'Mortgage Loan',
       'Not Specified', 'Payday Loan', 'Personal Loan', 'Student Loan', 'DTI',
       'EMI_to_Income', 'Invest_to_Income', 'Balance_to_Income',
       'Avg_Delay_if_Delayed', 'Has_Delays', 'High_Utilization',
       'Total_Financial_Products', 'Inquiries_per_Year', 'Limit_Decrease_Flag',
       'Large_Limit_Change', 'Num_Loan_Types'],
      dtype='object')

In [73]:
# Column labels of the test predictors produced by the split.
X_test.columns

Index(['Month', 'Age', 'Occupation', 'Annual_Income', 'Monthly_Inhand_Salary',
       'Interest_Rate', 'Payment_of_Min_Amount', 'Payment_Behaviour',
       'Annual_Income_qt', 'Income_Bin', 'Auto Loan', 'Credit-Builder Loan',
       'Debt Consolidation Loan', 'Home Equity Loan', 'Mortgage Loan',
       'Not Specified', 'Payday Loan', 'Personal Loan', 'Student Loan', 'DTI',
       'EMI_to_Income', 'Invest_to_Income', 'Balance_to_Income',
       'Avg_Delay_if_Delayed', 'Has_Delays', 'High_Utilization',
       'Total_Financial_Products', 'Inquiries_per_Year', 'Limit_Decrease_Flag',
       'Large_Limit_Change', 'Num_Loan_Types'],
      dtype='object')

In [None]:

# =====================================================
# 2. Build Preprocessing Pipeline
# =====================================================
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import pandas as pd

# Column groups are taken from the training split's dtypes.
num_cols = X_train.select_dtypes(include=['int64', 'float64']).columns.tolist()
cat_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        ('Standard Scaler', StandardScaler(), num_cols),
        ('OneHot Encoder', OneHotEncoder(drop='first'), cat_cols)
    ],
    # The one-hot step produces sparse output; sparse_threshold=0 makes the
    # combined result always dense so it can be wrapped in a DataFrame with
    # column labels below, regardless of how many one-hot columns there are.
    sparse_threshold=0,
)

# =====================================================
# 3. Transform Data, Reduce Correlation
# =====================================================
# Fit on the training split only, then apply the same fitted transform to
# both splits.
preprocessor.fit(X_train)
feature_names = preprocessor.get_feature_names_out()

# Transform train and test sets
X_train_t = preprocessor.transform(X_train)
X_test_t = preprocessor.transform(X_test)

# Keep the original row index so the transformed frames stay aligned with
# y_train / y_test in any index-based pandas operation.
X_train_df = pd.DataFrame(X_train_t, columns=feature_names, index=X_train.index)
X_test_df = pd.DataFrame(X_test_t, columns=feature_names, index=X_test.index)

# Correlation-based feature pruning
# For every pair of features whose absolute correlation (computed on the
# training data) exceeds `threshold`, the feature that appears EARLIER in the
# column order is marked redundant and the later one is kept.
corr_matrix = X_train_df.corr().abs()
threshold = 0.9
redundant = set()
for pos in range(len(corr_matrix.columns)):
    # Correlations between column `pos` and all columns positioned before it.
    earlier_corr = corr_matrix.iloc[pos, :pos]
    redundant.update(earlier_corr[earlier_corr > threshold].index)

# Drop the same columns from both splits so they keep identical layouts.
X_train_reduced = X_train_df.drop(columns=list(redundant))
X_test_reduced = X_test_df.drop(columns=list(redundant))

In [10]:
# Feature names flagged by the correlation pruning step (empty set = nothing dropped).
redundant


set()

In [74]:
# Column labels of the transformed training frame after dropping `redundant`.
X_train_reduced.columns


Index(['Standard Scaler__Month', 'Standard Scaler__Age',
       'Standard Scaler__Annual_Income',
       'Standard Scaler__Monthly_Inhand_Salary',
       'Standard Scaler__Interest_Rate', 'Standard Scaler__Annual_Income_qt',
       'Standard Scaler__Auto Loan', 'Standard Scaler__Credit-Builder Loan',
       'Standard Scaler__Debt Consolidation Loan',
       'Standard Scaler__Home Equity Loan', 'Standard Scaler__Mortgage Loan',
       'Standard Scaler__Not Specified', 'Standard Scaler__Payday Loan',
       'Standard Scaler__Personal Loan', 'Standard Scaler__Student Loan',
       'Standard Scaler__DTI', 'Standard Scaler__EMI_to_Income',
       'Standard Scaler__Invest_to_Income',
       'Standard Scaler__Balance_to_Income',
       'Standard Scaler__Avg_Delay_if_Delayed', 'Standard Scaler__Has_Delays',
       'Standard Scaler__High_Utilization',
       'Standard Scaler__Total_Financial_Products',
       'Standard Scaler__Inquiries_per_Year',
       'Standard Scaler__Limit_Decrease_Flag',


In [75]:
# Column labels of the transformed test frame after dropping `redundant`.
X_test_reduced.columns

Index(['Standard Scaler__Month', 'Standard Scaler__Age',
       'Standard Scaler__Annual_Income',
       'Standard Scaler__Monthly_Inhand_Salary',
       'Standard Scaler__Interest_Rate', 'Standard Scaler__Annual_Income_qt',
       'Standard Scaler__Auto Loan', 'Standard Scaler__Credit-Builder Loan',
       'Standard Scaler__Debt Consolidation Loan',
       'Standard Scaler__Home Equity Loan', 'Standard Scaler__Mortgage Loan',
       'Standard Scaler__Not Specified', 'Standard Scaler__Payday Loan',
       'Standard Scaler__Personal Loan', 'Standard Scaler__Student Loan',
       'Standard Scaler__DTI', 'Standard Scaler__EMI_to_Income',
       'Standard Scaler__Invest_to_Income',
       'Standard Scaler__Balance_to_Income',
       'Standard Scaler__Avg_Delay_if_Delayed', 'Standard Scaler__Has_Delays',
       'Standard Scaler__High_Utilization',
       'Standard Scaler__Total_Financial_Products',
       'Standard Scaler__Inquiries_per_Year',
       'Standard Scaler__Limit_Decrease_Flag',


In [49]:

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Baseline: a 100-tree random forest with a fixed seed, trained on the
# reduced training features and scored on the held-out test split.
rf = RandomForestClassifier(random_state=42, n_estimators=100)
rf.fit(X=X_train_reduced, y=y_train)

y_pred = rf.predict(X_test_reduced)

print("Random Forest Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

Random Forest Accuracy: 0.8238
              precision    recall  f1-score   support

        Good       0.80      0.78      0.79      3566
        Poor       0.81      0.85      0.83      5799
    Standard       0.84      0.82      0.83     10635

    accuracy                           0.82     20000
   macro avg       0.82      0.82      0.82     20000
weighted avg       0.82      0.82      0.82     20000

[[2766    1  799]
 [  20 4945  834]
 [ 690 1180 8765]]


In [14]:
# Grid Search with reduced features
from sklearn.model_selection import GridSearchCV

# Deliberately small grid: 2 * 3 * 2 * 1 = 12 candidates.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[8, 15, None],
    min_samples_split=[2, 10],
    max_features=[7],
)

# 3-fold cross-validation over the grid, run on 4 worker processes.
grid = GridSearchCV(
    estimator=RandomForestClassifier(random_state=42),
    param_grid=param_grid,
    cv=3,
    n_jobs=4,
    verbose=2,
)
grid.fit(X_train_reduced, y_train)

print("Best parameters:", grid.best_params_)
print("Best CV score:", grid.best_score_)

# Keep the best estimator found by the search for later cells.
rf_best = grid.best_estimator_

Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters: {'max_depth': None, 'max_features': 7, 'min_samples_split': 2, 'n_estimators': 200}
Best CV score: 0.8034249797820676


In [50]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Hyper-parameters copied from the grid-search result, plus a fixed seed, so
# this cell can be run without repeating the search.
best_params = dict(
    max_depth=None,
    max_features=7,
    min_samples_split=2,
    n_estimators=200,
    random_state=42,
)

# Build the tuned forest and train it on the whole training split.
rf_best = RandomForestClassifier(**best_params)
rf_best.fit(X=X_train_reduced, y=y_train)

# Score on the held-out test split.
y_pred = rf_best.predict(X_test_reduced)

print("Test Accuracy:", accuracy_score(y_true=y_test, y_pred=y_pred))
print(classification_report(y_true=y_test, y_pred=y_pred))
print(confusion_matrix(y_true=y_test, y_pred=y_pred))



Test Accuracy: 0.8258
              precision    recall  f1-score   support

        Good       0.80      0.78      0.79      3566
        Poor       0.81      0.85      0.83      5799
    Standard       0.85      0.83      0.84     10635

    accuracy                           0.83     20000
   macro avg       0.82      0.82      0.82     20000
weighted avg       0.83      0.83      0.83     20000

[[2788    1  777]
 [  20 4948  831]
 [ 677 1178 8780]]


In [16]:
# Feature importances
from sklearn.model_selection import cross_val_score

# Importance values as exposed by the fitted forest, one per input column.
importances = rf_best.feature_importances_

# Pair each column name with its importance and rank from highest to lowest.
feat_ranks = sorted(zip(X_train_reduced.columns, importances), key=lambda x: x[1], reverse=True)
print("Top 10 Features:")
for f, v in feat_ranks[:10]:
    print(f"{f}: {v:.4f}")

# Cross-validation
# 5-fold accuracy of the tuned model, computed on the training split only.
cv_scores = cross_val_score(rf_best, X_train_reduced, y_train, cv=5, scoring='accuracy')
print("Random Forest Cross-validation scores:", cv_scores)
print("Mean CV accuracy:", cv_scores.mean())

Top 10 Features:
Standard Scaler__Interest_Rate: 0.1041
Standard Scaler__Inquiries_per_Year: 0.0966
Standard Scaler__EMI_to_Income: 0.0754
Standard Scaler__Total_Financial_Products: 0.0700
Standard Scaler__Avg_Delay_if_Delayed: 0.0630
Standard Scaler__Invest_to_Income: 0.0569
Standard Scaler__DTI: 0.0512
Standard Scaler__Monthly_Inhand_Salary: 0.0492
Standard Scaler__Annual_Income: 0.0471
Standard Scaler__Annual_Income_qt: 0.0468
Random Forest Cross-validation scores: [0.8130625 0.8143125 0.814     0.81425   0.8135   ]
Mean CV accuracy: 0.8138249999999999


In [None]:
def get_user_input(numeric_cols=None, categorical_cols=None):
    """Interactively collect one record and return it as a one-row DataFrame.

    Parameters
    ----------
    numeric_cols : list of str, optional
        Columns to prompt for as numbers. Defaults to the notebook-level
        ``num_cols`` list.
    categorical_cols : list of str, optional
        Columns to prompt for as free text. Defaults to the notebook-level
        ``cat_cols`` list.

    Returns
    -------
    pandas.DataFrame
        A single row whose columns are the numeric columns followed by the
        categorical columns. Numeric answers are stored as ``float``;
        categorical answers are stored as stripped strings.

    Notes
    -----
    A numeric answer that cannot be parsed, or that is NaN/infinite, is
    rejected and the same prompt is shown again instead of raising.
    """
    if numeric_cols is None:
        numeric_cols = num_cols
    if categorical_cols is None:
        categorical_cols = cat_cols

    # Collected answers, keyed by column name
    user_data = {}

    # Numeric columns input
    for col in numeric_cols:
        while True:
            raw = input(f"Enter value for {col} (numeric): ")
            try:
                value = float(raw)
            except ValueError:
                print(f"'{raw}' is not a valid number, please try again.")
                continue
            # float() accepts 'nan' and 'inf'; those are not usable values.
            if not np.isfinite(value):
                print(f"'{raw}' is not a finite number, please try again.")
                continue
            user_data[col] = value
            break

    # Categorical columns input
    for col in categorical_cols:
        val = input(f"Enter category for {col} (categorical): ")
        # Surrounding whitespace would make an otherwise valid category
        # look like a different value.
        user_data[col] = val.strip()

    # Convert to DataFrame
    user_df = pd.DataFrame([user_data])
    return user_df

# Get user input as DataFrame
# user_input_df = get_user_input()

# # Preprocess user input
# user_input_t = preprocessor.transform(user_input_df)

# # Convert to DataFrame with feature names
# user_input_df_transformed = pd.DataFrame(user_input_t, columns=feature_names)

# # Drop redundant features to match model input
# user_input_df_reduced = user_input_df_transformed.drop(columns=list(redundant))

# # Predict using trained model
# user_pred = rf_best.predict(user_input_df_reduced)
# print("Predicted Credit Score:", user_pred[0])


Predicted Credit Score: Standard


In [143]:
# Hand-written customer record used to smoke-test the prediction path.
# The target ("Credit_Score") is intentionally not part of the record.
customer_profile = {
    "Month": 5,
    "Age": 34,
    "Occupation": "Lawyer",
    "Annual_Income": 8200,
    "Monthly_Inhand_Salary": 620,
    "Interest_Rate": 7.0,
    "Payment_of_Min_Amount": "No",
    "Payment_Behaviour": "Low_spent_Small_value_payments",
    "Annual_Income_qt": 0.01,
    "Income_Bin": "Low",
}

# One 0/1 flag per loan-type column.
loan_flags = {
    "Auto Loan": 1,
    "Credit-Builder Loan": 1,
    "Debt Consolidation Loan": 1,
    "Home Equity Loan": 1,
    "Mortgage Loan": 1,
    "Not Specified": 0,
    "Payday Loan": 0,
    "Personal Loan": 1,
    "Student Loan": 0,
}

income_ratios = {
    "DTI": 0.82,
    "EMI_to_Income": 0.90,
    "Invest_to_Income": 0.08,
    "Balance_to_Income": 0.02,
}

credit_behaviour = {
    "Avg_Delay_if_Delayed": 30,
    "Has_Delays": 1,
    "High_Utilization": 1,
    "Total_Financial_Products": 7,
    "Inquiries_per_Year": 8,
    "Limit_Decrease_Flag": 1,
    "Large_Limit_Change": 1,
    "Num_Loan_Types": 8,
}

# Merge the groups in the same key order as the flat record.
test_customer = {
    **customer_profile,
    **loan_flags,
    **income_ratios,
    **credit_behaviour,
}

# One-row frame in the shape the preprocessor expects.
user_df = pd.DataFrame([test_customer])

In [144]:
# Run the single-row frame through the already-fitted preprocessor.
user_input_t = preprocessor.transform(user_df)

# Re-attach the transformer's output feature names.
user_input_df_transformed = pd.DataFrame(data=user_input_t, columns=feature_names)

# Remove the pruned features so the layout matches what the model was trained on.
user_input_df_reduced = user_input_df_transformed.drop(list(redundant), axis=1)

# Classify the record with the tuned forest and show the single label.
user_pred = rf_best.predict(user_input_df_reduced)
print("Predicted Credit Score:", user_pred[0])

Predicted Credit Score: Standard
