<a href="https://colab.research.google.com/github/Abhishek-Jaiswal-Git/creditRiskModel/blob/main/creditRiskModeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score,accuracy_score,classification_report,precision_recall_fscore_support
from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import warnings
import os

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [None]:
# Bank product data, read straight from the project's GitHub repository.
bank_product_url = 'https://raw.githubusercontent.com/Abhishek-Jaiswal-Git/creditRiskModel/main/bankProductData.csv'
df1 = pd.read_csv(bank_product_url)

In [None]:
# CIBIL data, read straight from the project's GitHub repository.
cibil_url = 'https://raw.githubusercontent.com/Abhishek-Jaiswal-Git/creditRiskModel/main/cibilData.csv'
df2 = pd.read_csv(cibil_url)

In [None]:
# Drop the rows whose 'Age_Oldest_TL' holds the -99999 placeholder value.
has_valid_oldest_tl = df1['Age_Oldest_TL'] != -99999
df1 = df1.loc[has_valid_oldest_tl]

In [None]:
# Collect the df2 columns that contain more than 10000 missing values
# (encoded as -99999), preserving the original column order.
sentinel_counts = (df2 == -99999).sum()
columnsToBeRemoved = sentinel_counts[sentinel_counts > 10000].index.tolist()

In [None]:
# Display the columns flagged for removal (more than 10000 values equal to -99999).
columnsToBeRemoved

['time_since_first_deliquency',
 'time_since_recent_deliquency',
 'max_delinquency_level',
 'max_deliq_6mts',
 'max_deliq_12mts',
 'CC_utilization',
 'PL_utilization',
 'max_unsec_exposure_inPct']

In [None]:
# Remove the sparsely populated columns identified above from df2.
df2 = df2.drop(columns=columnsToBeRemoved)

In [None]:
# Shape of df2 after dropping the flagged columns.
df2.shape

(51336, 54)

In [None]:
# Remove from df2 every row that holds the -99999 placeholder in any column.
# A single row-wise mask keeps exactly the rows that survive filtering each
# column in turn.
row_is_complete = (df2 != -99999).all(axis=1)
df2 = df2.loc[row_is_complete]

In [None]:
# Shape of df2 after removing the rows that contained -99999.
df2.shape

(42066, 54)

In [None]:
# Inner-join the two cleaned tables on the shared 'PROSPECTID' key, so only
# prospects present in both remain.
df = df1.merge(df2, how='inner', on='PROSPECTID')

In [None]:
# Split the features into categorical and numerical: the categorical ones are
# the columns stored with the 'object' dtype.
cat_features = [name for name, dtype in df.dtypes.items() if dtype == 'object']
cat_features

['MARITALSTATUS',
 'EDUCATION',
 'GENDER',
 'last_prod_enq2',
 'first_prod_enq2',
 'Approved_Flag']

In [None]:
# Chi-square test of independence between each categorical feature and the
# target 'Approved_Flag'. A small p-value indicates the feature and the target
# are associated.
for i in cat_features:
    if i == 'Approved_Flag':
        # cat_features also contains the target; testing it against itself is
        # trivially significant and carries no information, so skip it.
        continue
    # chi2_contingency returns (statistic, p-value, dof, expected); only the
    # p-value is reported here.
    _, pval, _, _ = chi2_contingency(pd.crosstab(df[i], df['Approved_Flag']))
    print(i,'---->',pval)

MARITALSTATUS ----> 3.578180861038862e-233
EDUCATION ----> 2.6942265249737532e-30
GENDER ----> 1.907936100186563e-05
last_prod_enq2 ----> 0.0
first_prod_enq2 ----> 7.84997610555419e-287
Approved_Flag ----> 0.0


In [None]:
# All the categorical features have a p-value below 0.05, so each is associated with the target and we keep all of them.

In [None]:
# Numerical features: every non-object column except the identifier and the target.
non_feature_columns = ['PROSPECTID', 'Approved_Flag']
num_features = [
    name
    for name, dtype in df.dtypes.items()
    if dtype != 'object' and name not in non_feature_columns
]

In [None]:
# VIF (Variance Inflation Factor) screening to remove multicollinear features.
# Features are examined one at a time in their original order: a feature whose
# VIF (computed against the columns still present in vif_data) is at most 6 is
# kept, otherwise it is dropped before the next feature is examined.
vif_data = df[num_features]
columns_to_be_kept = []

for feature in num_features:
    # Kept columns occupy the leading positions of vif_data, so the feature
    # under examination always sits immediately after them.
    column_index = len(columns_to_be_kept)
    if variance_inflation_factor(vif_data, column_index) <= 6:
        columns_to_be_kept.append(feature)
    else:
        vif_data = vif_data.drop(columns=[feature])


  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


In [None]:
# Shape of the numerical frame that remains after the VIF screening.
vif_data.shape

(42064, 39)

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA per VIF-screened feature: compare the feature's values across
# the four 'Approved_Flag' groups and keep the feature when the group means
# differ significantly (p-value <= 0.05).
group_masks = [df['Approved_Flag'] == flag for flag in ('P1', 'P2', 'P3', 'P4')]

columns_to_be_kept_numerical = []
for feature in columns_to_be_kept:
    samples = [df.loc[mask, feature].tolist() for mask in group_masks]
    f_statistic, p_value = f_oneway(*samples)
    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(feature)

In [None]:
# Features that passed the VIF screening but were rejected by the ANOVA test.
anova_survivors = set(columns_to_be_kept_numerical)
difference_list = [name for name in columns_to_be_kept if name not in anova_survivors]
print(difference_list)

['num_lss_12mts', 'pct_currentBal_all_TL']


In [None]:
# Keep only the screened numerical features plus the categorical columns
# (cat_features also carries the target, 'Approved_Flag').
# The explicit .copy() makes this an independent frame, so the in-place .loc
# assignments applied to it afterwards operate on real data rather than on a
# slice of the merged frame (avoids SettingWithCopyWarning).
df = df[columns_to_be_kept_numerical + cat_features].copy()

In [None]:
# Ordinal encoding of 'EDUCATION'. Several labels deliberately share a rank:
# SSC/OTHERS -> 1 and GRADUATE/UNDER GRADUATE/PROFESSIONAL -> 3.
education_rank = {
    'SSC': 1,
    '12TH': 2,
    'GRADUATE': 3,
    'UNDER GRADUATE': 3,
    'POST-GRADUATE': 4,
    'OTHERS': 1,
    'PROFESSIONAL': 3,
}
for label, rank in education_rank.items():
    df.loc[df['EDUCATION'] == label, ['EDUCATION']] = rank

In [None]:
# One-hot encode the nominal (unordered) categorical columns.
nominal_columns = ['MARITALSTATUS', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df_encoded = pd.get_dummies(df, columns=nominal_columns)

In [None]:
# Model fitting: separate the target from the predictors, then hold out 20% of
# the rows for testing (fixed seed so the split is repeatable).
y = df_encoded['Approved_Flag']
X = df_encoded.drop(columns=['Approved_Flag'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Random forest baseline. The forest is seeded so that its bootstrap samples
# and per-split feature sub-sampling -- and therefore the metrics reported
# below -- are reproducible across kernel restarts (same seed as the split).
model = RandomForestClassifier(random_state=42)

model.fit(X_train,y_train)

In [None]:
# Predict the approval class of every held-out test row with the fitted random forest.
y_pred = model.predict(X_test)

In [None]:
# Overall accuracy: the fraction of test rows whose predicted class matches the true class.
accuracy_score(y_test,y_pred)

0.7638179008677047

In [None]:
# Overall accuracy followed by per-class precision / recall / F1.
# precision_recall_fscore_support returns one entry per class, in sorted label
# order, which is matched positionally to the display names below.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

class_names = ['p1', 'p2', 'p3', 'p4']
for position in range(len(class_names)):
    print(
        f"Class {class_names[position]}:",
        f"Precision: {precision[position]}",
        f"Recall: {recall[position]}",
        f"F1 Score: {f1_score[position]}",
        "",
        sep="\n",
    )

Accuracy: 0.7638179008677047

Class p1:
Precision: 0.8327526132404182
Recall: 0.7071005917159763
F1 Score: 0.7647999999999999

Class p2:
Precision: 0.7986348122866894
Recall: 0.9276511397423192
F1 Score: 0.8583218707015131

Class p3:
Precision: 0.4396946564885496
Recall: 0.21735849056603773
F1 Score: 0.2909090909090909

Class p4:
Precision: 0.7145612343297975
Recall: 0.7201166180758017
F1 Score: 0.7173281703775412



In [None]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Predictors: everything except the target, with the ordinal 'EDUCATION'
# column cast to a true integer dtype.
x = df_encoded.drop(columns=['Approved_Flag'])
x['EDUCATION'] = x['EDUCATION'].astype('int64')

# Target: the string class labels are converted to integer codes.
y = df_encoded['Approved_Flag']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Same 80/20 split parameters as before; note that y_train / y_test now hold
# the integer-encoded labels.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

# Multi-class XGBoost classifier predicting one of the four classes directly.
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax', num_class=4)


In [None]:
# Train the XGBoost classifier, predict the held-out rows and report the
# overall accuracy plus per-class precision / recall / F1.
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print ()
print(f'Accuracy: {accuracy:.2f}')
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

# Entries come back one per class in sorted label order and are matched
# positionally to the display names.
class_names = ['p1', 'p2', 'p3', 'p4']
for position in range(len(class_names)):
    print(
        f"Class {class_names[position]}:",
        f"Precision: {precision[position]}",
        f"Recall: {recall[position]}",
        f"F1 Score: {f1_score[position]}",
        "",
        sep="\n",
    )


Accuracy: 0.78

Class p1:
Precision: 0.823906083244397
Recall: 0.7613412228796844
F1 Score: 0.7913890312660173

Class p2:
Precision: 0.8255418233924413
Recall: 0.913577799801784
F1 Score: 0.8673315769665036

Class p3:
Precision: 0.4756380510440835
Recall: 0.30943396226415093
F1 Score: 0.3749428440786465

Class p4:
Precision: 0.7342386032977691
Recall: 0.7356656948493683
F1 Score: 0.7349514563106796



In [None]:
# Number of rows per target class, to check how balanced the classes are.
df_encoded['Approved_Flag'].value_counts()

Approved_Flag
P2    25452
P3     6440
P4     5264
P1     4908
Name: count, dtype: int64

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise a hand-picked set of columns in place, each with its own
# freshly fitted StandardScaler.
# NOTE(review): the scalers are fit on the whole of df_encoded, i.e. before the
# train/test split performed afterwards, so test rows contribute to the scaling
# statistics -- consider fitting on the training rows only.
columns_to_be_scaled = [
    'Age_Oldest_TL',
    'Age_Newest_TL',
    'time_since_recent_payment',
    'max_recent_level_of_deliq',
    'recent_level_of_deliq',
    'time_since_recent_enq',
    'NETMONTHLYINCOME',
    'Time_With_Curr_Empr',
]

for column_name in columns_to_be_scaled:
    scaler = StandardScaler()
    # StandardScaler expects a 2-D input, hence the single-column reshape.
    column_data = df_encoded[column_name].values.reshape(-1, 1)
    df_encoded[column_name] = scaler.fit_transform(column_data)


In [None]:
# Rebuild the predictors / target from the (now scaled) df_encoded, re-split
# with the same parameters, refit the same XGBoost classifier and report the
# metrics again for comparison.
x = df_encoded.drop(columns=['Approved_Flag'])
x['EDUCATION'] = x['EDUCATION'].astype('int64')

y = df_encoded['Approved_Flag']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y_encoded,
    test_size=0.2,
    random_state=42,
)

xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

# Entries come back one per class in sorted label order and are matched
# positionally to the display names.
class_names = ['p1', 'p2', 'p3', 'p4']
for position in range(len(class_names)):
    print(
        f"Class {class_names[position]}:",
        f"Precision: {precision[position]}",
        f"Recall: {recall[position]}",
        f"F1 Score: {f1_score[position]}",
        "",
        sep="\n",
    )


Accuracy: 0.78
Class p1:
Precision: 0.823906083244397
Recall: 0.7613412228796844
F1 Score: 0.7913890312660173

Class p2:
Precision: 0.8255418233924413
Recall: 0.913577799801784
F1 Score: 0.8673315769665036

Class p3:
Precision: 0.4756380510440835
Recall: 0.30943396226415093
F1 Score: 0.3749428440786465

Class p4:
Precision: 0.7342386032977691
Recall: 0.7356656948493683
F1 Score: 0.7349514563106796



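No improvement in the model after scaling. This is expected: tree-based models such as XGBoost split on feature thresholds, so rescaling the features does not change the splits they can learn.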

In [None]:
# Second XGBoost model with explicitly chosen hyperparameters.
tuned_params = {
    'objective': 'multi:softmax',
    'num_class': 4,
    'learning_rate': 1,
    'max_depth': 3,
    'n_estimators': 100,
    'colsample_bytree': 0.9,
    'alpha': 10,
}
xgb_model_tuned = xgb.XGBClassifier(**tuned_params)

In [None]:
# Fit xgb_model_tuned on the current training split.
xgb_model_tuned.fit(x_train,y_train)

In [None]:
# Predict the encoded class of every held-out test row with xgb_model_tuned.
y_pred=xgb_model_tuned.predict(x_test)

In [None]:
# Overall accuracy followed by per-class precision / recall / F1 for the
# predictions currently held in y_pred.
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

# Entries come back one per class in sorted label order and are matched
# positionally to the display names.
class_names = ['p1', 'p2', 'p3', 'p4']
for position in range(len(class_names)):
    print(
        f"Class {class_names[position]}:",
        f"Precision: {precision[position]}",
        f"Recall: {recall[position]}",
        f"F1 Score: {f1_score[position]}",
        "",
        sep="\n",
    )

Accuracy: 0.78
Class p1:
Precision: 0.8374070138150903
Recall: 0.777120315581854
F1 Score: 0.8061381074168797

Class p2:
Precision: 0.8240740740740741
Recall: 0.9173439048562934
F1 Score: 0.868211237219773

Class p3:
Precision: 0.4655380894800484
Recall: 0.29056603773584905
F1 Score: 0.3578066914498141

Class p4:
Precision: 0.738581146744412
Recall: 0.738581146744412
F1 Score: 0.7385811467444121

