In [73]:
# Import necessary libraries
import itertools
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import chi2_contingency
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [74]:
# Load the modelling dataset, then work on a copy so the frame read from disk stays untouched.
# NOTE(review): this is an absolute, machine-specific path — consider a configurable data directory.
data_path = 'C:/Users/dell/Documents/Data science/campusX/project/credit risk modeling/data_for_model.csv'
df = pd.read_csv(data_path)

df_encoded = df.copy()
df_encoded.shape

(41500, 55)

In [75]:
# Print the column names, non-null counts and dtypes of the working copy.
df_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41500 entries, 0 to 41499
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               41500 non-null  float64
 1   pct_tl_closed_L6M             41500 non-null  float64
 2   Tot_TL_closed_L12M            41500 non-null  int64  
 3   pct_tl_closed_L12M            41500 non-null  float64
 4   Tot_Missed_Pmnt               41500 non-null  int64  
 5   CC_TL                         41500 non-null  int64  
 6   Home_TL                       41500 non-null  int64  
 7   PL_TL                         41500 non-null  int64  
 8   Secured_TL                    41500 non-null  int64  
 9   Unsecured_TL                  41500 non-null  int64  
 10  Other_TL                      41500 non-null  int64  
 11  Age_Oldest_TL                 41500 non-null  int64  
 12  Age_Newest_TL                 41500 non-null  int64  
 13  t

In [76]:
# Machine Learning model fitting

# Data processing

# 1. Random Forest

# Separate the feature matrix (every column except the target) from the target column.
x = df_encoded.drop(columns=['Approved_Flag'])
y = df_encoded['Approved_Flag']

In [77]:
# Hold out 20% of the rows for evaluation; random_state is fixed so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [78]:
# Fit a seeded 200-tree random forest on the training split and predict the held-out split.
rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
rf_classifier.fit(x_train, y_train)
y_pred = rf_classifier.predict(x_test)

# Overall accuracy on the held-out split.
accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy: {accuracy}')
print()

# Per-class precision / recall / F1, one array entry per class.
# NOTE(review): the hard-coded names assume exactly four classes, returned in the
# order p1..p4 — confirm against the label values in 'Approved_Flag'.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(
        f"Class {v}:",
        f"Precision: {precision[i]}",
        f"Recall: {recall[i]}",
        f"F1 Score: {f1_score[i]}",
        "",
        sep="\n",
    )


Accuracy: 0.7646987951807229

Class p1:
Precision: 0.8152046783625732
Recall: 0.7033299697275479
F1 Score: 0.7551462621885157

Class p2:
Precision: 0.7947133539306557
Recall: 0.9265559335601361
F1 Score: 0.855585327543195

Class p3:
Precision: 0.45923460898502494
Recall: 0.21495327102803738
F1 Score: 0.2928381962864722

Class p4:
Precision: 0.730844793713163
Recall: 0.7237354085603113
F1 Score: 0.7272727272727273



In [79]:
# 2. xgboost

import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Target column and feature matrix (every column except the target).
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'], axis=1)

# Integer-encode the target before it is handed to the classifier.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Same 80/20 seeded split as the other models, but on the encoded target.
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

# Four-class softmax classifier with otherwise default settings.
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

# Overall accuracy on the held-out split, rounded to two decimals for display.
accuracy = accuracy_score(y_test, y_pred)
print()
print(f'Accuracy: {accuracy:.2f}')
print()

# Per-class precision / recall / F1, one array entry per encoded class.
# NOTE(review): the hard-coded names assume encoded classes 0..3 correspond to
# p1..p4 — confirm against label_encoder.classes_.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(
        f"Class {v}:",
        f"Precision: {precision[i]}",
        f"Recall: {recall[i]}",
        f"F1 Score: {f1_score[i]}",
        "",
        sep="\n",
    )



Accuracy: 0.77

Class p1:
Precision: 0.8167202572347267
Recall: 0.768920282542886
F1 Score: 0.7920997920997921

Class p2:
Precision: 0.8246471226927253
Recall: 0.9119471683009805
F1 Score: 0.8661028223890526

Class p3:
Precision: 0.43196004993757803
Recall: 0.26947040498442365
F1 Score: 0.33189448441247005

Class p4:
Precision: 0.7269230769230769
Recall: 0.7354085603112841
F1 Score: 0.7311411992263056



In [80]:
# 3. Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Target column and feature matrix (every column except the target).
y = df_encoded['Approved_Flag']
x = df_encoded.drop(['Approved_Flag'], axis=1)

# Same 80/20 seeded split as the other models.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# random_state is pinned so the fitted tree (and the metrics below) are repeatable across
# runs, consistent with the seeded split above and the seeded random forest.
dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10, random_state=42)
dt_model.fit(x_train, y_train)
y_pred = dt_model.predict(x_test)

# Overall accuracy on the held-out split, rounded to two decimals for display.
accuracy = accuracy_score(y_test, y_pred)
print()
print(f"Accuracy: {accuracy:.2f}")
print()

# Per-class precision / recall / F1, one array entry per class.
# NOTE(review): the hard-coded names assume exactly four classes, returned in the
# order p1..p4 — confirm against the label values in 'Approved_Flag'.
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()


Accuracy: 0.71

Class p1:
Precision: 0.7306910569105691
Recall: 0.7255297679112008
F1 Score: 0.7281012658227848

Class p2:
Precision: 0.8097197726827357
Recall: 0.8268961376826096
F1 Score: 0.8182178217821782

Class p3:
Precision: 0.3364406779661017
Recall: 0.309190031152648
F1 Score: 0.3222402597402597

Class p4:
Precision: 0.6321393998063891
Recall: 0.6352140077821011
F1 Score: 0.633672974284328



In [81]:
# Random Forest = 0.76
# Xgboost = 0.77
# Decision Tree = 0.71

In [82]:
# By accuracy score, XGBoost is the best of the three models
# Next we will fine-tune it further:
# - hyperparameter tuning
# - feature engineering - scaling, graph


In [83]:
# Hyperparameter tuning for xgboost
# define the hyperparameter grid

In [84]:
# Candidate values for the XGBoost hyperparameter search.
# Each key is passed as the keyword argument of the same name to xgb.XGBClassifier
# by the search loop; 3 * 3 * 3 * 4 * 3 = 324 combinations in total.
param_grid = {
    'colsample_bytree': [0.7, 0.8, 0.9],
    'learning_rate': [0.1, 0.5, 1],
    'max_depth': [2, 3, 4],
    'n_estimators': [100, 200, 300, 400],
    'alpha': [1, 5, 10],
}

In [85]:
# Accumulators for the hyperparameter search loop.
# They must be initialised before the loop runs.
index = 0  # running number of the combination being evaluated

# One initially-empty list per recorded field; the loop appends one value per combination.
answer_grid = {
    field: []
    for field in (
        'combination',
        'train_accuracy',
        'test_accuracy',
        'colsample_bytree',
        'learning_rate',
        'max_depth',
        'n_estimators',
        'alpha',
    )
}

# Fresh, unfitted encoder for the target labels; it is fitted when the target is encoded below.
label_encoder = LabelEncoder()

In [86]:
# Rebuild the inputs for the tuning run: integer-encoded target and the feature matrix
# (every column except the target).
y = label_encoder.fit_transform(df_encoded['Approved_Flag'])
x = df_encoded.drop(columns=['Approved_Flag'])

# Split the data into training and testing sets (80/20, seeded so the split is repeatable)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [70]:
# Grid search: fit one XGBoost model per hyperparameter combination in `param_grid`,
# record train/test accuracy for every combination in `answer_grid`, and keep the
# combinations that clear both accuracy thresholds in `best_hyperparameter`.
#
# All accumulators are reset here so that re-running this cell starts again at
# combination 1 with empty result lists. Previously only `best_hyperparameter` was
# reset, so a second run continued counting from the old `index` and appended
# duplicate rows to `answer_grid`.
#
# NOTE(review): combinations are filtered here (and ranked later) on x_test accuracy,
# i.e. on the same split that is used to report performance — consider a separate
# validation split or cross-validation for model selection.
MIN_TRAIN_ACCURACY = 0.8
MIN_TEST_ACCURACY = 0.75

index = 0
answer_grid = {key: [] for key in answer_grid}
best_hyperparameter = []

# itertools.product advances its right-most argument fastest, so the combinations are
# visited (and numbered) in the same order as five nested loops with `alpha` innermost.
for colsample_bytree, learning_rate, max_depth, n_estimators, alpha in itertools.product(
    param_grid['colsample_bytree'],
    param_grid['learning_rate'],
    param_grid['max_depth'],
    param_grid['n_estimators'],
    param_grid['alpha'],
):
    index += 1
    model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=4,
        colsample_bytree=colsample_bytree,
        learning_rate=learning_rate,
        max_depth=max_depth,
        n_estimators=n_estimators,
        alpha=alpha,
    )

    model.fit(x_train, y_train)
    y_pred_train = model.predict(x_train)
    y_pred_test = model.predict(x_test)

    train_accuracy = accuracy_score(y_train, y_pred_train)
    test_accuracy = accuracy_score(y_test, y_pred_test)

    # Record every combination, whether or not it passes the filter below.
    answer_grid['combination'].append(index)
    answer_grid['train_accuracy'].append(train_accuracy)
    answer_grid['test_accuracy'].append(test_accuracy)
    answer_grid['colsample_bytree'].append(colsample_bytree)
    answer_grid['learning_rate'].append(learning_rate)
    answer_grid['max_depth'].append(max_depth)
    answer_grid['n_estimators'].append(n_estimators)
    answer_grid['alpha'].append(alpha)

    # Shortlist (and print) only the combinations that clear both thresholds.
    if train_accuracy > MIN_TRAIN_ACCURACY and test_accuracy > MIN_TEST_ACCURACY:
        best_hyperparameter.append({
            'Sr.NO': index,
            'Train accuracy': train_accuracy,
            'Test accuracy': test_accuracy,
            'Colsample bytree': colsample_bytree,
            'Learning rate': learning_rate,
            'Max depth': max_depth,
            'N estimators': n_estimators,
            'Alpha': alpha,
        })

        # Print the result for this combination
        print(f"Combination {index}:")
        print(f"Colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}")
        print(f"Train Accuracy: {train_accuracy:.4f}")
        print(f"Test Accuracy: {test_accuracy:.4f}")
        print('--' * 40)


Combination 22:
Colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 3, alpha: 1, n_estimators: 400
Train Accuracy: 0.8069
Test Accuracy: 0.7792
--------------------------------------------------------------------------------
Combination 23:
Colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 3, alpha: 5, n_estimators: 400
Train Accuracy: 0.8017
Test Accuracy: 0.7788
--------------------------------------------------------------------------------
Combination 28:
Colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 4, alpha: 1, n_estimators: 200
Train Accuracy: 0.8081
Test Accuracy: 0.7778
--------------------------------------------------------------------------------
Combination 29:
Colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 4, alpha: 5, n_estimators: 200
Train Accuracy: 0.8033
Test Accuracy: 0.7772
--------------------------------------------------------------------------------
Combination 31:
Colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 4, alpha: 1, n_estimat

Combination 70:
Colsample_bytree: 0.7, learning_rate: 0.5, max_depth: 4, alpha: 1, n_estimators: 400
Train Accuracy: 0.9363
Test Accuracy: 0.7655
--------------------------------------------------------------------------------
Combination 71:
Colsample_bytree: 0.7, learning_rate: 0.5, max_depth: 4, alpha: 5, n_estimators: 400
Train Accuracy: 0.9000
Test Accuracy: 0.7699
--------------------------------------------------------------------------------
Combination 72:
Colsample_bytree: 0.7, learning_rate: 0.5, max_depth: 4, alpha: 10, n_estimators: 400
Train Accuracy: 0.8395
Test Accuracy: 0.7723
--------------------------------------------------------------------------------
Combination 73:
Colsample_bytree: 0.7, learning_rate: 1, max_depth: 2, alpha: 1, n_estimators: 100
Train Accuracy: 0.8018
Test Accuracy: 0.7757
--------------------------------------------------------------------------------
Combination 76:
Colsample_bytree: 0.7, learning_rate: 1, max_depth: 2, alpha: 1, n_estimators

Combination 127:
Colsample_bytree: 0.8, learning_rate: 0.1, max_depth: 3, alpha: 1, n_estimators: 300
Train Accuracy: 0.8005
Test Accuracy: 0.7775
--------------------------------------------------------------------------------
Combination 130:
Colsample_bytree: 0.8, learning_rate: 0.1, max_depth: 3, alpha: 1, n_estimators: 400
Train Accuracy: 0.8071
Test Accuracy: 0.7775
--------------------------------------------------------------------------------
Combination 131:
Colsample_bytree: 0.8, learning_rate: 0.1, max_depth: 3, alpha: 5, n_estimators: 400
Train Accuracy: 0.8029
Test Accuracy: 0.7784
--------------------------------------------------------------------------------
Combination 136:
Colsample_bytree: 0.8, learning_rate: 0.1, max_depth: 4, alpha: 1, n_estimators: 200
Train Accuracy: 0.8089
Test Accuracy: 0.7789
--------------------------------------------------------------------------------
Combination 137:
Colsample_bytree: 0.8, learning_rate: 0.1, max_depth: 4, alpha: 5, n_es

Combination 174:
Colsample_bytree: 0.8, learning_rate: 0.5, max_depth: 4, alpha: 10, n_estimators: 200
Train Accuracy: 0.8375
Test Accuracy: 0.7751
--------------------------------------------------------------------------------
Combination 175:
Colsample_bytree: 0.8, learning_rate: 0.5, max_depth: 4, alpha: 1, n_estimators: 300
Train Accuracy: 0.9159
Test Accuracy: 0.7695
--------------------------------------------------------------------------------
Combination 176:
Colsample_bytree: 0.8, learning_rate: 0.5, max_depth: 4, alpha: 5, n_estimators: 300
Train Accuracy: 0.8869
Test Accuracy: 0.7698
--------------------------------------------------------------------------------
Combination 177:
Colsample_bytree: 0.8, learning_rate: 0.5, max_depth: 4, alpha: 10, n_estimators: 300
Train Accuracy: 0.8394
Test Accuracy: 0.7755
--------------------------------------------------------------------------------
Combination 178:
Colsample_bytree: 0.8, learning_rate: 0.5, max_depth: 4, alpha: 1, n_

Combination 213:
Colsample_bytree: 0.8, learning_rate: 1, max_depth: 4, alpha: 10, n_estimators: 300
Train Accuracy: 0.8699
Test Accuracy: 0.7681
--------------------------------------------------------------------------------
Combination 214:
Colsample_bytree: 0.8, learning_rate: 1, max_depth: 4, alpha: 1, n_estimators: 400
Train Accuracy: 0.9824
Test Accuracy: 0.7543
--------------------------------------------------------------------------------
Combination 215:
Colsample_bytree: 0.8, learning_rate: 1, max_depth: 4, alpha: 5, n_estimators: 400
Train Accuracy: 0.9434
Test Accuracy: 0.7604
--------------------------------------------------------------------------------
Combination 216:
Colsample_bytree: 0.8, learning_rate: 1, max_depth: 4, alpha: 10, n_estimators: 400
Train Accuracy: 0.8699
Test Accuracy: 0.7681
--------------------------------------------------------------------------------
Combination 235:
Colsample_bytree: 0.9, learning_rate: 0.1, max_depth: 3, alpha: 1, n_estimato

Combination 278:
Colsample_bytree: 0.9, learning_rate: 0.5, max_depth: 4, alpha: 5, n_estimators: 100
Train Accuracy: 0.8309
Test Accuracy: 0.7725
--------------------------------------------------------------------------------
Combination 279:
Colsample_bytree: 0.9, learning_rate: 0.5, max_depth: 4, alpha: 10, n_estimators: 100
Train Accuracy: 0.8202
Test Accuracy: 0.7751
--------------------------------------------------------------------------------
Combination 280:
Colsample_bytree: 0.9, learning_rate: 0.5, max_depth: 4, alpha: 1, n_estimators: 200
Train Accuracy: 0.8862
Test Accuracy: 0.7689
--------------------------------------------------------------------------------
Combination 281:
Colsample_bytree: 0.9, learning_rate: 0.5, max_depth: 4, alpha: 5, n_estimators: 200
Train Accuracy: 0.8640
Test Accuracy: 0.7730
--------------------------------------------------------------------------------
Combination 282:
Colsample_bytree: 0.9, learning_rate: 0.5, max_depth: 4, alpha: 10, n_

Combination 317:
Colsample_bytree: 0.9, learning_rate: 1, max_depth: 4, alpha: 5, n_estimators: 200
Train Accuracy: 0.9057
Test Accuracy: 0.7611
--------------------------------------------------------------------------------
Combination 318:
Colsample_bytree: 0.9, learning_rate: 1, max_depth: 4, alpha: 10, n_estimators: 200
Train Accuracy: 0.8673
Test Accuracy: 0.7664
--------------------------------------------------------------------------------
Combination 319:
Colsample_bytree: 0.9, learning_rate: 1, max_depth: 4, alpha: 1, n_estimators: 300
Train Accuracy: 0.9692
Test Accuracy: 0.7587
--------------------------------------------------------------------------------
Combination 320:
Colsample_bytree: 0.9, learning_rate: 1, max_depth: 4, alpha: 5, n_estimators: 300
Train Accuracy: 0.9278
Test Accuracy: 0.7593
--------------------------------------------------------------------------------
Combination 321:
Colsample_bytree: 0.9, learning_rate: 1, max_depth: 4, alpha: 10, n_estimators

In [71]:
# Turn the shortlisted combinations (a list of dicts) into a DataFrame, one row per combination.
hyper = pd.DataFrame(best_hyperparameter)
hyper.shape

(228, 8)

In [72]:
# Save the shortlist to disk, without the row index, so it can be reloaded later.
hyper.to_csv("data_of_hyperparameter.csv", index=False)


In [87]:
# Reload the saved shortlist of hyperparameter combinations.
best_hyper = pd.read_csv('data_of_hyperparameter.csv')


Unnamed: 0,Sr.NO,Train accuracy,Test accuracy,Colsample bytree,Learning rate,Max depth,N estimators,Alpha
0,22,0.806867,0.779157,0.7,0.1,3,400,1
1,23,0.801747,0.778795,0.7,0.1,3,400,5
2,28,0.808102,0.777831,0.7,0.1,4,200,1
3,29,0.803253,0.777229,0.7,0.1,4,200,5
4,31,0.819699,0.777831,0.7,0.1,4,300,1
...,...,...,...,...,...,...,...,...
223,320,0.927831,0.759277,0.9,1.0,4,300,5
224,321,0.869970,0.765663,0.9,1.0,4,300,10
225,322,0.983343,0.757470,0.9,1.0,4,400,1
226,323,0.942620,0.760964,0.9,1.0,4,400,5


In [90]:
# Rank the shortlisted combinations by test accuracy, best first (the index is the row label in best_hyper).
best_hyper['Test accuracy'].sort_values(ascending=False)

194    0.780602
87     0.779759
89     0.779398
166    0.779398
173    0.779277
         ...   
225    0.757470
145    0.757349
71     0.757229
213    0.756506
148    0.754337
Name: Test accuracy, Length: 228, dtype: float64

In [91]:
# this is our best hyperparameter
# Select the row with the highest test accuracy by lookup instead of a hard-coded
# position, so the selection stays correct if the search is re-run and the row order
# or contents of the saved results change. idxmax() returns the row label of the first
# maximum, which .loc then resolves.
best_hyper.loc[best_hyper['Test accuracy'].idxmax()]

Sr.NO               289.000000
Train accuracy        0.802771
Test accuracy         0.780602
Colsample bytree      0.900000
Learning rate         1.000000
Max depth             2.000000
N estimators        100.000000
Alpha                 1.000000
Name: 194, dtype: float64

In [127]:
# Inputs for the final model: raw target, feature matrix (every column except the target),
# and the integer-encoded target.
y = df_encoded['Approved_Flag']
X = df_encoded.drop(columns=['Approved_Flag'])
Y = label_encoder.fit_transform(y)


In [128]:

# Split the data into training and testing sets
# (80/20, with the same seed as the split used during the hyperparameter search).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [129]:
# Refit XGBoost with the hyperparameter values of the selected combination
# (Sr.NO 289 in the saved search results).
final_params = {
    'objective': 'multi:softmax',
    'num_class': 4,
    'colsample_bytree': 0.9,
    'learning_rate': 1,
    'max_depth': 2,
    'n_estimators': 100,
    'alpha': 1,
}
final_model = xgb.XGBClassifier(**final_params)
final_model.fit(x_train, y_train)

# Accuracy on the split the model was fitted on, then on the held-out split.
y_pred_train = final_model.predict(x_train)
train_accuracy = accuracy_score(y_train, y_pred_train)

y_pred_test = final_model.predict(x_test)
test_accuracy = accuracy_score(y_test, y_pred_test)

print(train_accuracy)
print(test_accuracy)

0.8027710843373494
0.7806024096385542


In [132]:
# Load the separate testing file (read relative to the working directory) and check its shape.
testing_data = pd.read_csv("data_for_testing.csv")
testing_data.shape

(564, 55)

In [133]:
# Split the testing file into the feature matrix and the target column.
x = testing_data.drop(['Approved_Flag'], axis=1)
y = testing_data['Approved_Flag']
# Encode with the encoder already fitted on the training labels: transform() only.
# Re-fitting on this file would derive the integer codes from its labels alone, so a
# missing class would silently shift the codes away from the ones the model was trained
# on. With transform(), a label not seen during fitting raises ValueError instead.
y = label_encoder.transform(y)


In [134]:
# Predict encoded class labels for the testing file with the final model.
y_pred = final_model.predict(x)

In [135]:
# Accuracy of the final model on the testing file.
accuracy = accuracy_score(y, y_pred)
accuracy

0.7890070921985816

In [136]:
# The model gives similar accuracy on the unseen data (0.789) as on the test split (0.781)

In [137]:
import pickle

# Serialise the fitted final model to a pickle file in the working directory.
model_filename = 'Credit_Risk_Predictor.pkl'

with open(model_filename, 'wb') as model_file:
    pickle.dump(final_model, model_file)

print(f"Model saved to {model_filename}")

Model saved to Credit_Risk_Predictor.pkl
