In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, precision_recall_fscore_support

from scipy.stats import chi2_contingency
from statsmodels.stats.outliers_influence import variance_inflation_factor

import warnings
import os


In [2]:
a1 = pd.read_excel('case_study1.xlsx')
a2 = pd.read_excel('case_study2.xlsx')

In [3]:
df1 = a1.copy()
df2 = a2.copy()

In [4]:
df1.head()

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,CC_TL,Consumer_TL,Gold_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,Other_TL,Age_Oldest_TL,Age_Newest_TL
0,1,5,4,1,0,0,0.0,0.0,0.2,0.8,...,0,0,1,0,4,1,4,0,72,18
1,2,1,0,1,0,0,0.0,0.0,1.0,0.0,...,0,1,0,0,0,0,1,0,7,7
2,3,8,0,8,1,0,0.125,0.0,1.0,0.0,...,0,6,1,0,0,2,6,0,47,2
3,4,1,0,1,1,0,1.0,0.0,1.0,0.0,...,0,0,0,0,0,0,1,1,5,5
4,5,3,2,1,0,0,0.0,0.0,0.333,0.667,...,0,0,0,0,0,3,0,2,131,32


In [5]:
df1.shape

(51336, 26)

In [6]:
df2.head()

Unnamed: 0,PROSPECTID,time_since_recent_payment,time_since_first_deliquency,time_since_recent_deliquency,num_times_delinquent,max_delinquency_level,max_recent_level_of_deliq,num_deliq_6mts,num_deliq_12mts,num_deliq_6_12mts,...,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,max_unsec_exposure_inPct,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,549,35,15,11,29,29,0,0,0,...,0.0,0.0,0.0,13.333,1,0,PL,PL,696,P2
1,2,47,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,0.86,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,302,11,3,9,25,25,1,9,8,...,0.0,0.0,0.0,5741.667,1,0,ConsumerLoan,others,693,P2
3,4,-99999,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,9.9,0,0,others,others,673,P2
4,5,583,-99999,-99999,0,-99999,0,0,0,0,...,0.0,0.0,0.0,-99999.0,0,0,AL,AL,753,P1


In [7]:
df2.shape

(51336, 62)

## Remove Nulls

In this dataset -99999 is considered as NaN values.

#### df1

In [8]:
df1 = df1.loc[df1['Age_Oldest_TL'] != -99999]

In [9]:
df1.shape

(51296, 26)

#### df2

In df2 there are a lot more null values.

If there are more than 10000(20%) Null values, then we are dropping the column.

If null values are less than 10000 we are dropping the Row.

In [10]:
columns_to_be_removed = []

for i in df2.columns:
    if df2.loc[df2[i] == -99999].shape[0] > 10000:
        columns_to_be_removed.append(i)

In [11]:
columns_to_be_removed

['time_since_first_deliquency',
 'time_since_recent_deliquency',
 'max_delinquency_level',
 'max_deliq_6mts',
 'max_deliq_12mts',
 'CC_utilization',
 'PL_utilization',
 'max_unsec_exposure_inPct']

In [12]:
df2 = df2.drop(columns_to_be_removed, axis=1)

In [13]:
df2.shape

(51336, 54)

In [14]:
for i in df2.columns:
    df2 = df2.loc[df2[i] != -99999]

In [15]:
df2.shape

(42066, 54)

## Merging

In [16]:
# Checking common column names
for i in list(df1.columns):
    if i in list(df2.columns):
        print (i)


PROSPECTID


In [17]:
# Merge the two dataframes, inner join so that no nulls are present
df = pd.merge(df1, df2, how='inner', left_on=['PROSPECTID'], right_on=['PROSPECTID'])

In [18]:
df

Unnamed: 0,PROSPECTID,Total_TL,Tot_Closed_TL,Tot_Active_TL,Total_TL_opened_L6M,Tot_TL_closed_L6M,pct_tl_open_L6M,pct_tl_closed_L6M,pct_active_tl,pct_closed_tl,...,pct_PL_enq_L6m_of_L12m,pct_CC_enq_L6m_of_L12m,pct_PL_enq_L6m_of_ever,pct_CC_enq_L6m_of_ever,HL_Flag,GL_Flag,last_prod_enq2,first_prod_enq2,Credit_Score,Approved_Flag
0,1,5,4,1,0,0,0.000,0.00,0.200,0.800,...,0.0,0.0,0.000,0.0,1,0,PL,PL,696,P2
1,2,1,0,1,0,0,0.000,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,ConsumerLoan,685,P2
2,3,8,0,8,1,0,0.125,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,1,0,ConsumerLoan,others,693,P2
3,5,3,2,1,0,0,0.000,0.00,0.333,0.667,...,0.0,0.0,0.000,0.0,0,0,AL,AL,753,P1
4,6,6,5,1,0,0,0.000,0.00,0.167,0.833,...,1.0,0.0,0.429,0.0,1,0,ConsumerLoan,PL,668,P3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42059,51332,3,0,3,1,0,0.333,0.00,1.000,0.000,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,ConsumerLoan,650,P4
42060,51333,4,2,2,0,1,0.000,0.25,0.500,0.500,...,0.0,0.0,0.000,0.0,0,0,others,others,702,P1
42061,51334,2,1,1,1,1,0.500,0.50,0.500,0.500,...,1.0,0.0,1.000,0.0,0,0,ConsumerLoan,others,661,P3
42062,51335,2,1,1,0,0,0.000,0.00,0.500,0.500,...,0.0,0.0,0.000,0.0,0,0,ConsumerLoan,others,686,P2


In [19]:
df.shape

(42064, 79)

In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 79 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   PROSPECTID                  42064 non-null  int64  
 1   Total_TL                    42064 non-null  int64  
 2   Tot_Closed_TL               42064 non-null  int64  
 3   Tot_Active_TL               42064 non-null  int64  
 4   Total_TL_opened_L6M         42064 non-null  int64  
 5   Tot_TL_closed_L6M           42064 non-null  int64  
 6   pct_tl_open_L6M             42064 non-null  float64
 7   pct_tl_closed_L6M           42064 non-null  float64
 8   pct_active_tl               42064 non-null  float64
 9   pct_closed_tl               42064 non-null  float64
 10  Total_TL_opened_L12M        42064 non-null  int64  
 11  Tot_TL_closed_L12M          42064 non-null  int64  
 12  pct_tl_open_L12M            42064 non-null  float64
 13  pct_tl_closed_L12M          420

## Feature Selection / Feature Engineering

We will divide the features into:

Categorical

Numerical

Treat them seperately

In [21]:
# Check how many columns are categorical

for i in df.columns:
    if df[i].dtype == 'object':
        print(i)

MARITALSTATUS
EDUCATION
GENDER
last_prod_enq2
first_prod_enq2
Approved_Flag


In [22]:
# 'Chi-square' test for columns to be kept

for i in ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']:
    chi2, pval, _, _ = chi2_contingency(pd.crosstab(df[i], df['Approved_Flag']))
    print(i, '---', pval)

MARITALSTATUS --- 3.578180861038862e-233
EDUCATION --- 2.6942265249737532e-30
GENDER --- 1.907936100186563e-05
last_prod_enq2 --- 0.0
first_prod_enq2 --- 7.84997610555419e-287


Since all the categorical features have pval <= 0.05, we will accept all

In [23]:
# Check how many columns are numerical

numeric_columns = []
for i in df.columns:
    if df[i].dtype != 'object' and i not in['PROSPECTID', 'Approved_Flag']:
        numeric_columns.append(i)

In [24]:
len(numeric_columns)

72

In [25]:
# VIF sequentially check (Numerical Columns)
# To check if the columns are collinear to each other (multi-collinear)
# Remove the columns which are multi-collinear

vif_data = df[numeric_columns]
total_columns = vif_data.shape[1]
columns_to_be_kept = []
column_index = 0

for i in range (0,total_columns):
    
    vif_value = variance_inflation_factor(vif_data, column_index)
    print (column_index,'---',vif_value)
    
    
    if vif_value <= 6:
        columns_to_be_kept.append( numeric_columns[i] )
        column_index = column_index+1
    
    else:
        vif_data = vif_data.drop([ numeric_columns[i] ] , axis=1)

  vif = 1. / (1. - r_squared_i)
  vif = 1. / (1. - r_squared_i)


0 --- inf
0 --- inf
0 --- 11.320180023967996
0 --- 8.363698035000336
0 --- 6.520647877790928
0 --- 5.149501618212625
1 --- 2.611111040579735
2 --- inf


  vif = 1. / (1. - r_squared_i)


2 --- 1788.7926256209232
2 --- 8.601028256477228
2 --- 3.8328007921530785
3 --- 6.0996533816467355
3 --- 5.581352009642762
4 --- 1.9855843530987785


  vif = 1. / (1. - r_squared_i)


5 --- inf
5 --- 4.809538302819343
6 --- 23.270628983464636
6 --- 30.595522588100053
6 --- 4.384346405965583
7 --- 3.0646584155234238
8 --- 2.898639771299253
9 --- 4.377876915347324
10 --- 2.207853583695844
11 --- 4.916914200506864
12 --- 5.214702030064725
13 --- 3.3861625024231476
14 --- 7.840583309478997
14 --- 5.255034641721438


  vif = 1. / (1. - r_squared_i)


15 --- inf
15 --- 7.380634506427232
15 --- 1.421005001517573
16 --- 8.083255010190323
16 --- 1.6241227524040114
17 --- 7.257811920140003
17 --- 15.59624383268298
17 --- 1.825857047132431
18 --- 1.5080839450032664
19 --- 2.172088834824577
20 --- 2.6233975535272283
21 --- 2.2959970812106167
22 --- 7.360578319196446
22 --- 2.1602387773102554
23 --- 2.8686288267891467
24 --- 6.458218003637277
24 --- 2.8474118865638247
25 --- 4.753198156284083
26 --- 16.22735475594825
26 --- 6.424377256363877
26 --- 8.887080381808687
26 --- 2.3804746142952653
27 --- 8.609513476514548
27 --- 13.067550935476712
27 --- 3.500040056654654
28 --- 1.9087955874813773
29 --- 17.006562234161628
29 --- 10.730485153719197
29 --- 2.3538497522950275
30 --- 22.104855915136433
30 --- 2.7971639638512924
31 --- 3.424171203217696
32 --- 10.175021454450935
32 --- 6.408710354561301
32 --- 1.0011511962625619
33 --- 3.069197305397274
34 --- 2.8091261600643724
35 --- 20.249538381980678
35 --- 15.864576541593745
35 --- 1.8331649740

Inirtially 72 numerical features.

39 remaining after VIF check.

In [26]:
# 'ANOVA' test for columns to be kept

from scipy.stats import f_oneway

columns_to_be_kept_numerical = []

for i in columns_to_be_kept:
    a = list(df[i])  
    b = list(df['Approved_Flag'])  
    
    group_P1 = [value for value, group in zip(a, b) if group == 'P1']
    group_P2 = [value for value, group in zip(a, b) if group == 'P2']
    group_P3 = [value for value, group in zip(a, b) if group == 'P3']
    group_P4 = [value for value, group in zip(a, b) if group == 'P4']


    f_statistic, p_value = f_oneway(group_P1, group_P2, group_P3, group_P4)

    if p_value <= 0.05:
        columns_to_be_kept_numerical.append(i)

In [27]:
columns_to_be_kept_numerical

['pct_tl_open_L6M',
 'pct_tl_closed_L6M',
 'Tot_TL_closed_L12M',
 'pct_tl_closed_L12M',
 'Tot_Missed_Pmnt',
 'CC_TL',
 'Home_TL',
 'PL_TL',
 'Secured_TL',
 'Unsecured_TL',
 'Other_TL',
 'Age_Oldest_TL',
 'Age_Newest_TL',
 'time_since_recent_payment',
 'max_recent_level_of_deliq',
 'num_deliq_6_12mts',
 'num_times_60p_dpd',
 'num_std_12mts',
 'num_sub',
 'num_sub_6mts',
 'num_sub_12mts',
 'num_dbt',
 'num_dbt_12mts',
 'num_lss',
 'recent_level_of_deliq',
 'CC_enq_L12m',
 'PL_enq_L12m',
 'time_since_recent_enq',
 'enq_L3m',
 'NETMONTHLYINCOME',
 'Time_With_Curr_Empr',
 'CC_Flag',
 'PL_Flag',
 'pct_PL_enq_L6m_of_ever',
 'pct_CC_enq_L6m_of_ever',
 'HL_Flag',
 'GL_Flag']

In [28]:
len(columns_to_be_kept_numerical)

37

In [29]:
# listing all the final features
features = columns_to_be_kept_numerical + ['MARITALSTATUS', 'EDUCATION', 'GENDER', 'last_prod_enq2', 'first_prod_enq2']
df = df[features + ['Approved_Flag']]

## Label Encoding

#### Categorical Columns

In [30]:
print(df['MARITALSTATUS'].unique())
print(df['EDUCATION'].unique())
print(df['GENDER'].unique())
print(df['last_prod_enq2'].unique())
print(df['first_prod_enq2'].unique())

['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS'
 'PROFESSIONAL']
['M' 'F']
['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']


In [31]:
# Ordinal feature -- EDUCATION
# SSC            : 1
# 12TH           : 2
# GRADUATE       : 3
# UNDER GRADUATE : 3
# POST-GRADUATE  : 4
# OTHERS         : 1
# PROFESSIONAL   : 3


# Others has to be verified by the business end user 

In [32]:
# Education column
df.loc[df['EDUCATION'] == 'SSC',['EDUCATION']]              = 1
df.loc[df['EDUCATION'] == '12TH',['EDUCATION']]             = 2
df.loc[df['EDUCATION'] == 'GRADUATE',['EDUCATION']]         = 3
df.loc[df['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']]   = 3
df.loc[df['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']]    = 4
df.loc[df['EDUCATION'] == 'OTHERS',['EDUCATION']]           = 1
df.loc[df['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']]     = 3

In [33]:
df['EDUCATION'].value_counts()

3    18931
2    11703
1     9532
4     1898
Name: EDUCATION, dtype: int64

In [34]:
df['EDUCATION'] = df['EDUCATION'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['EDUCATION'] = df['EDUCATION'].astype(int)


In [35]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 43 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            42064 non-null  float64
 1   pct_tl_closed_L6M          42064 non-null  float64
 2   Tot_TL_closed_L12M         42064 non-null  int64  
 3   pct_tl_closed_L12M         42064 non-null  float64
 4   Tot_Missed_Pmnt            42064 non-null  int64  
 5   CC_TL                      42064 non-null  int64  
 6   Home_TL                    42064 non-null  int64  
 7   PL_TL                      42064 non-null  int64  
 8   Secured_TL                 42064 non-null  int64  
 9   Unsecured_TL               42064 non-null  int64  
 10  Other_TL                   42064 non-null  int64  
 11  Age_Oldest_TL              42064 non-null  int64  
 12  Age_Newest_TL              42064 non-null  int64  
 13  time_since_recent_payment  42064 non-null  int

In [36]:
# Other columns

df_encoded = pd.get_dummies(df, columns=['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2'])

In [37]:
df_encoded.info()
k = df_encoded.describe()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 42064 entries, 0 to 42063
Data columns (total 55 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               42064 non-null  float64
 1   pct_tl_closed_L6M             42064 non-null  float64
 2   Tot_TL_closed_L12M            42064 non-null  int64  
 3   pct_tl_closed_L12M            42064 non-null  float64
 4   Tot_Missed_Pmnt               42064 non-null  int64  
 5   CC_TL                         42064 non-null  int64  
 6   Home_TL                       42064 non-null  int64  
 7   PL_TL                         42064 non-null  int64  
 8   Secured_TL                    42064 non-null  int64  
 9   Unsecured_TL                  42064 non-null  int64  
 10  Other_TL                      42064 non-null  int64  
 11  Age_Oldest_TL                 42064 non-null  int64  
 12  Age_Newest_TL                 42064 non-null  int64  
 13  t

In [38]:
df_encoded.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,0,0,1,0,0,0,0,0,1,0
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,1,0,0,0,0,0,0,0,0,1
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,1,0,0,0,0,0,0,0,1,0


## Machine Learing model fitting

### Data processing

### 1. Random Forest

In [39]:
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )
y = df_encoded['Approved_Flag']

In [40]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [41]:
rf_classifier = RandomForestClassifier(n_estimators = 200, random_state=42)

In [42]:
rf_classifier.fit(x_train, y_train)

In [43]:
y_pred = rf_classifier.predict(x_test)

In [44]:
accuracy = accuracy_score(y_test, y_pred)
print ()
print(f'Accuracy: {accuracy}')
print ()
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)


Accuracy: 0.7636990372043266



In [45]:
for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()
    

Class p1:
Precision: 0.8370457209847597
Recall: 0.7041420118343196
F1 Score: 0.7648634172469203

Class p2:
Precision: 0.7957519116397621
Recall: 0.9282457879088206
F1 Score: 0.8569075937785909

Class p3:
Precision: 0.4423380726698262
Recall: 0.21132075471698114
F1 Score: 0.28600612870275793

Class p4:
Precision: 0.7178502879078695
Recall: 0.7269193391642371
F1 Score: 0.7223563495895703



### 2. xgboost

In [46]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [47]:
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)

In [48]:
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )
y = df_encoded['Approved_Flag']

In [49]:
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [50]:
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

In [51]:
xgb_classifier.fit(x_train, y_train)
y_pred = xgb_classifier.predict(x_test)

In [52]:
accuracy = accuracy_score(y_test, y_pred)
print ()
print(f'Accuracy: {accuracy:.2f}')
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()


Accuracy: 0.78

Class p1:
Precision: 0.823906083244397
Recall: 0.7613412228796844
F1 Score: 0.7913890312660173

Class p2:
Precision: 0.8255418233924413
Recall: 0.913577799801784
F1 Score: 0.8673315769665036

Class p3:
Precision: 0.4756380510440835
Recall: 0.30943396226415093
F1 Score: 0.3749428440786465

Class p4:
Precision: 0.7342386032977691
Recall: 0.7356656948493683
F1 Score: 0.7349514563106796



### 3. Decision Tree

In [53]:
from sklearn.tree import DecisionTreeClassifier

In [54]:
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )
y = df_encoded['Approved_Flag']

In [55]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [56]:
dt_model = DecisionTreeClassifier(max_depth=20, min_samples_split=10)

In [57]:
dt_model.fit(x_train, y_train)

In [58]:
y_pred = dt_model.predict(x_test)
y_pred

array(['P3', 'P4', 'P2', ..., 'P3', 'P4', 'P4'], dtype=object)

In [59]:
accuracy = accuracy_score(y_test, y_pred)
print ()
print(f"Accuracy: {accuracy:.2f}")
print ()

precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()



Accuracy: 0.71

Class p1:
Precision: 0.7227138643067846
Recall: 0.7248520710059172
F1 Score: 0.7237813884785819

Class p2:
Precision: 0.8111479243812122
Recall: 0.8249752229930625
F1 Score: 0.8180031446540881

Class p3:
Precision: 0.3484126984126984
Recall: 0.33132075471698114
F1 Score: 0.339651837524178

Class p4:
Precision: 0.6507462686567164
Recall: 0.6355685131195336
F1 Score: 0.6430678466076696



Random Forest = 0.76

Xboost = 0.78

Decision Tree = 0.71

Accuracy value of Xboost is better



### Finetuning

In [60]:
# Apply standard scaler 

from sklearn.preprocessing import StandardScaler

In [61]:
columns_to_be_scaled = ['Age_Oldest_TL','Age_Newest_TL','time_since_recent_payment',
'max_recent_level_of_deliq','recent_level_of_deliq',
'time_since_recent_enq','NETMONTHLYINCOME','Time_With_Curr_Empr']

In [62]:
for i in columns_to_be_scaled:
    column_data = df_encoded[i].values.reshape(-1, 1)
    scaler = StandardScaler()
    scaled_column = scaler.fit_transform(column_data)
    df_encoded[i] = scaled_column

In [63]:
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

In [64]:
xgb_classifier = xgb.XGBClassifier(objective='multi:softmax',  num_class=4)

In [65]:
x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )
y = df_encoded['Approved_Flag']

In [66]:
label_encoder = LabelEncoder()

In [67]:
y_encoded = label_encoder.fit_transform(y)

In [68]:
x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)

In [69]:
xgb_classifier.fit(x_train, y_train)

In [70]:
y_pred = xgb_classifier.predict(x_test)

In [71]:
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

Accuracy: 0.78


In [72]:
precision, recall, f1_score, _ = precision_recall_fscore_support(y_test, y_pred)

for i, v in enumerate(['p1', 'p2', 'p3', 'p4']):
    print(f"Class {v}:")
    print(f"Precision: {precision[i]}")
    print(f"Recall: {recall[i]}")
    print(f"F1 Score: {f1_score[i]}")
    print()

Class p1:
Precision: 0.823906083244397
Recall: 0.7613412228796844
F1 Score: 0.7913890312660173

Class p2:
Precision: 0.8255418233924413
Recall: 0.913577799801784
F1 Score: 0.8673315769665036

Class p3:
Precision: 0.4756380510440835
Recall: 0.30943396226415093
F1 Score: 0.3749428440786465

Class p4:
Precision: 0.7342386032977691
Recall: 0.7356656948493683
F1 Score: 0.7349514563106796



### Hyperparameter tuning for xgboost

In [73]:
# Define the hyperparameter grid
param_grid = {
  'colsample_bytree': [0.1, 0.3, 0.5, 0.7, 0.9],
  'learning_rate'   : [0.001, 0.01, 0.1, 1],
  'max_depth'       : [3, 5, 8, 10],
  'alpha'           : [1, 10, 100],
  'n_estimators'    : [10,50,100]
}

index = 0

answers_grid = {
    'combination'       :[],
    'train_Accuracy'    :[],
    'test_Accuracy'     :[],
    'colsample_bytree'  :[],
    'learning_rate'     :[],
    'max_depth'         :[],
    'alpha'             :[],
    'n_estimators'      :[]

    }

In [74]:
# Loop through each combination of hyperparameters
for colsample_bytree in param_grid['colsample_bytree']:
  for learning_rate in param_grid['learning_rate']:
    for max_depth in param_grid['max_depth']:
      for alpha in param_grid['alpha']:
          for n_estimators in param_grid['n_estimators']:
             
              index = index + 1
             
              # Define and train the XGBoost model
              model = xgb.XGBClassifier(objective='multi:softmax',  
                                       num_class=4,
                                       colsample_bytree = colsample_bytree,
                                       learning_rate = learning_rate,
                                       max_depth = max_depth,
                                       alpha = alpha,
                                       n_estimators = n_estimators)
               
       
                     
              y = df_encoded['Approved_Flag']
              x = df_encoded. drop ( ['Approved_Flag'], axis = 1 )

              label_encoder = LabelEncoder()
              y_encoded = label_encoder.fit_transform(y)


              x_train, x_test, y_train, y_test = train_test_split(x, y_encoded, test_size=0.2, random_state=42)


              model.fit(x_train, y_train)
  

       
              # Predict on training and testing sets
              y_pred_train = model.predict(x_train)
              y_pred_test = model.predict(x_test)
       
       
              # Calculate train and test results
              
              train_accuracy =  accuracy_score (y_train, y_pred_train)
              test_accuracy  =  accuracy_score (y_test , y_pred_test)
              
              
       
              # Include into the lists
              answers_grid ['combination']   .append(index)
              answers_grid ['train_Accuracy']    .append(train_accuracy)
              answers_grid ['test_Accuracy']     .append(test_accuracy)
              answers_grid ['colsample_bytree']   .append(colsample_bytree)
              answers_grid ['learning_rate']      .append(learning_rate)
              answers_grid ['max_depth']          .append(max_depth)
              answers_grid ['alpha']              .append(alpha)
              answers_grid ['n_estimators']       .append(n_estimators)
       
       
              # Print results for this combination
              print(f"Combination {index}")
              print(f"colsample_bytree: {colsample_bytree}, learning_rate: {learning_rate}, max_depth: {max_depth}, alpha: {alpha}, n_estimators: {n_estimators}")
              print(f"Train Accuracy: {train_accuracy:.2f}")
              print(f"Test Accuracy : {test_accuracy :.2f}")
              print("-" * 30)

Combination 1
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 10
Train Accuracy: 0.61
Test Accuracy : 0.61
------------------------------
Combination 2
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 50
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 3
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 100
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 4
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 10
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 5
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 50
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 6
colsample_bytree: 0.1, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 100
Train Accuracy: 0.61
Test Accu

Combination 48
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 100
Train Accuracy: 0.62
Test Accuracy : 0.62
------------------------------
Combination 49
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 10
Train Accuracy: 0.61
Test Accuracy : 0.61
------------------------------
Combination 50
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 50
Train Accuracy: 0.61
Test Accuracy : 0.61
------------------------------
Combination 51
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 100
Train Accuracy: 0.62
Test Accuracy : 0.61
------------------------------
Combination 52
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 5, alpha: 100, n_estimators: 10
Train Accuracy: 0.61
Test Accuracy : 0.60
------------------------------
Combination 53
colsample_bytree: 0.1, learning_rate: 0.01, max_depth: 5, alpha: 100, n_estimators: 50
Train Accuracy: 0.61
Test 

Combination 96
colsample_bytree: 0.1, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 100
Train Accuracy: 0.73
Test Accuracy : 0.72
------------------------------
Combination 97
colsample_bytree: 0.1, learning_rate: 0.1, max_depth: 8, alpha: 100, n_estimators: 10
Train Accuracy: 0.62
Test Accuracy : 0.61
------------------------------
Combination 98
colsample_bytree: 0.1, learning_rate: 0.1, max_depth: 8, alpha: 100, n_estimators: 50
Train Accuracy: 0.67
Test Accuracy : 0.66
------------------------------
Combination 99
colsample_bytree: 0.1, learning_rate: 0.1, max_depth: 8, alpha: 100, n_estimators: 100
Train Accuracy: 0.71
Test Accuracy : 0.70
------------------------------
Combination 100
colsample_bytree: 0.1, learning_rate: 0.1, max_depth: 10, alpha: 1, n_estimators: 10
Train Accuracy: 0.64
Test Accuracy : 0.63
------------------------------
Combination 101
colsample_bytree: 0.1, learning_rate: 0.1, max_depth: 10, alpha: 1, n_estimators: 50
Train Accuracy: 0.72
Test Ac

Combination 144
colsample_bytree: 0.1, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 100
Train Accuracy: 0.77
Test Accuracy : 0.76
------------------------------
Combination 145
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 10
Train Accuracy: 0.67
Test Accuracy : 0.67
------------------------------
Combination 146
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 50
Train Accuracy: 0.67
Test Accuracy : 0.66
------------------------------
Combination 147
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 100
Train Accuracy: 0.66
Test Accuracy : 0.66
------------------------------
Combination 148
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 10
Train Accuracy: 0.67
Test Accuracy : 0.67
------------------------------
Combination 149
colsample_bytree: 0.3, learning_rate: 0.001, max_depth: 3, alpha: 10, n_estimators: 50
Train Accuracy: 0.67

Combination 191
colsample_bytree: 0.3, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 50
Train Accuracy: 0.70
Test Accuracy : 0.69
------------------------------
Combination 192
colsample_bytree: 0.3, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 100
Train Accuracy: 0.71
Test Accuracy : 0.70
------------------------------
Combination 193
colsample_bytree: 0.3, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 10
Train Accuracy: 0.69
Test Accuracy : 0.69
------------------------------
Combination 194
colsample_bytree: 0.3, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 50
Train Accuracy: 0.70
Test Accuracy : 0.69
------------------------------
Combination 195
colsample_bytree: 0.3, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 100
Train Accuracy: 0.70
Test Accuracy : 0.69
------------------------------
Combination 196
colsample_bytree: 0.3, learning_rate: 0.01, max_depth: 5, alpha: 100, n_estimators: 10
Train Accuracy: 0.68
T

Combination 238
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 10
Train Accuracy: 0.73
Test Accuracy : 0.71
------------------------------
Combination 239
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 50
Train Accuracy: 0.78
Test Accuracy : 0.75
------------------------------
Combination 240
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 100
Train Accuracy: 0.80
Test Accuracy : 0.77
------------------------------
Combination 241
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 8, alpha: 100, n_estimators: 10
Train Accuracy: 0.69
Test Accuracy : 0.68
------------------------------
Combination 242
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 8, alpha: 100, n_estimators: 50
Train Accuracy: 0.74
Test Accuracy : 0.73
------------------------------
Combination 243
colsample_bytree: 0.3, learning_rate: 0.1, max_depth: 8, alpha: 100, n_estimators: 100
Train Accuracy: 0.76
Tes

Combination 286
colsample_bytree: 0.3, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 10
Train Accuracy: 0.76
Test Accuracy : 0.75
------------------------------
Combination 287
colsample_bytree: 0.3, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 50
Train Accuracy: 0.78
Test Accuracy : 0.77
------------------------------
Combination 288
colsample_bytree: 0.3, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 100
Train Accuracy: 0.78
Test Accuracy : 0.77
------------------------------
Combination 289
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 10
Train Accuracy: 0.70
Test Accuracy : 0.69
------------------------------
Combination 290
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 50
Train Accuracy: 0.70
Test Accuracy : 0.69
------------------------------
Combination 291
colsample_bytree: 0.5, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 100
Train Accuracy: 0.70
Tes

Combination 333
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 3, alpha: 100, n_estimators: 100
Train Accuracy: 0.70
Test Accuracy : 0.70
------------------------------
Combination 334
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 10
Train Accuracy: 0.74
Test Accuracy : 0.73
------------------------------
Combination 335
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 50
Train Accuracy: 0.74
Test Accuracy : 0.74
------------------------------
Combination 336
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 100
Train Accuracy: 0.75
Test Accuracy : 0.74
------------------------------
Combination 337
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 10
Train Accuracy: 0.73
Test Accuracy : 0.72
------------------------------
Combination 338
colsample_bytree: 0.5, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 50
Train Accuracy: 0.74
Te

Combination 380
colsample_bytree: 0.5, learning_rate: 0.1, max_depth: 8, alpha: 1, n_estimators: 50
Train Accuracy: 0.84
Test Accuracy : 0.77
------------------------------
Combination 381
colsample_bytree: 0.5, learning_rate: 0.1, max_depth: 8, alpha: 1, n_estimators: 100
Train Accuracy: 0.87
Test Accuracy : 0.78
------------------------------
Combination 382
colsample_bytree: 0.5, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 10
Train Accuracy: 0.77
Test Accuracy : 0.75
------------------------------
Combination 383
colsample_bytree: 0.5, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 50
Train Accuracy: 0.80
Test Accuracy : 0.77
------------------------------
Combination 384
colsample_bytree: 0.5, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 100
Train Accuracy: 0.82
Test Accuracy : 0.78
------------------------------
Combination 385
colsample_bytree: 0.5, learning_rate: 0.1, max_depth: 8, alpha: 100, n_estimators: 10
Train Accuracy: 0.73
Test Ac

Combination 428
colsample_bytree: 0.5, learning_rate: 1, max_depth: 10, alpha: 10, n_estimators: 50
Train Accuracy: 0.91
Test Accuracy : 0.77
------------------------------
Combination 429
colsample_bytree: 0.5, learning_rate: 1, max_depth: 10, alpha: 10, n_estimators: 100
Train Accuracy: 0.95
Test Accuracy : 0.77
------------------------------
Combination 430
colsample_bytree: 0.5, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 10
Train Accuracy: 0.77
Test Accuracy : 0.76
------------------------------
Combination 431
colsample_bytree: 0.5, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 50
Train Accuracy: 0.78
Test Accuracy : 0.77
------------------------------
Combination 432
colsample_bytree: 0.5, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 100
Train Accuracy: 0.78
Test Accuracy : 0.77
------------------------------
Combination 433
colsample_bytree: 0.7, learning_rate: 0.001, max_depth: 3, alpha: 1, n_estimators: 10
Train Accuracy: 0.71
Test Ac

Combination 476
colsample_bytree: 0.7, learning_rate: 0.01, max_depth: 3, alpha: 100, n_estimators: 50
Train Accuracy: 0.71
Test Accuracy : 0.71
------------------------------
Combination 477
colsample_bytree: 0.7, learning_rate: 0.01, max_depth: 3, alpha: 100, n_estimators: 100
Train Accuracy: 0.72
Test Accuracy : 0.72
------------------------------
Combination 478
colsample_bytree: 0.7, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 10
Train Accuracy: 0.75
Test Accuracy : 0.74
------------------------------
Combination 479
colsample_bytree: 0.7, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 50
Train Accuracy: 0.75
Test Accuracy : 0.74
------------------------------
Combination 480
colsample_bytree: 0.7, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 100
Train Accuracy: 0.76
Test Accuracy : 0.75
------------------------------
Combination 481
colsample_bytree: 0.7, learning_rate: 0.01, max_depth: 5, alpha: 10, n_estimators: 10
Train Accuracy: 0.75
T

Combination 523
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 8, alpha: 1, n_estimators: 10
Train Accuracy: 0.80
Test Accuracy : 0.77
------------------------------
Combination 524
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 8, alpha: 1, n_estimators: 50
Train Accuracy: 0.84
Test Accuracy : 0.78
------------------------------
Combination 525
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 8, alpha: 1, n_estimators: 100
Train Accuracy: 0.88
Test Accuracy : 0.78
------------------------------
Combination 526
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 10
Train Accuracy: 0.78
Test Accuracy : 0.76
------------------------------
Combination 527
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 50
Train Accuracy: 0.80
Test Accuracy : 0.77
------------------------------
Combination 528
colsample_bytree: 0.7, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 100
Train Accuracy: 0.82
Test Accu

Combination 571
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10, alpha: 10, n_estimators: 10
Train Accuracy: 0.83
Test Accuracy : 0.77
------------------------------
Combination 572
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10, alpha: 10, n_estimators: 50
Train Accuracy: 0.93
Test Accuracy : 0.76
------------------------------
Combination 573
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10, alpha: 10, n_estimators: 100
Train Accuracy: 0.96
Test Accuracy : 0.76
------------------------------
Combination 574
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 10
Train Accuracy: 0.78
Test Accuracy : 0.77
------------------------------
Combination 575
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 50
Train Accuracy: 0.78
Test Accuracy : 0.77
------------------------------
Combination 576
colsample_bytree: 0.7, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 100
Train Accuracy: 0.78
Test Accu

Combination 618
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 3, alpha: 10, n_estimators: 100
Train Accuracy: 0.72
Test Accuracy : 0.72
------------------------------
Combination 619
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 3, alpha: 100, n_estimators: 10
Train Accuracy: 0.70
Test Accuracy : 0.70
------------------------------
Combination 620
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 3, alpha: 100, n_estimators: 50
Train Accuracy: 0.71
Test Accuracy : 0.71
------------------------------
Combination 621
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 3, alpha: 100, n_estimators: 100
Train Accuracy: 0.72
Test Accuracy : 0.71
------------------------------
Combination 622
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 10
Train Accuracy: 0.75
Test Accuracy : 0.75
------------------------------
Combination 623
colsample_bytree: 0.9, learning_rate: 0.01, max_depth: 5, alpha: 1, n_estimators: 50
Train Accuracy: 0.76

Combination 665
colsample_bytree: 0.9, learning_rate: 0.1, max_depth: 5, alpha: 100, n_estimators: 50
Train Accuracy: 0.77
Test Accuracy : 0.76
------------------------------
Combination 666
colsample_bytree: 0.9, learning_rate: 0.1, max_depth: 5, alpha: 100, n_estimators: 100
Train Accuracy: 0.78
Test Accuracy : 0.77
------------------------------
Combination 667
colsample_bytree: 0.9, learning_rate: 0.1, max_depth: 8, alpha: 1, n_estimators: 10
Train Accuracy: 0.80
Test Accuracy : 0.77
------------------------------
Combination 668
colsample_bytree: 0.9, learning_rate: 0.1, max_depth: 8, alpha: 1, n_estimators: 50
Train Accuracy: 0.84
Test Accuracy : 0.78
------------------------------
Combination 669
colsample_bytree: 0.9, learning_rate: 0.1, max_depth: 8, alpha: 1, n_estimators: 100
Train Accuracy: 0.88
Test Accuracy : 0.78
------------------------------
Combination 670
colsample_bytree: 0.9, learning_rate: 0.1, max_depth: 8, alpha: 10, n_estimators: 10
Train Accuracy: 0.79
Test Ac

Combination 713
colsample_bytree: 0.9, learning_rate: 1, max_depth: 10, alpha: 1, n_estimators: 50
Train Accuracy: 1.00
Test Accuracy : 0.76
------------------------------
Combination 714
colsample_bytree: 0.9, learning_rate: 1, max_depth: 10, alpha: 1, n_estimators: 100
Train Accuracy: 1.00
Test Accuracy : 0.76
------------------------------
Combination 715
colsample_bytree: 0.9, learning_rate: 1, max_depth: 10, alpha: 10, n_estimators: 10
Train Accuracy: 0.84
Test Accuracy : 0.77
------------------------------
Combination 716
colsample_bytree: 0.9, learning_rate: 1, max_depth: 10, alpha: 10, n_estimators: 50
Train Accuracy: 0.94
Test Accuracy : 0.77
------------------------------
Combination 717
colsample_bytree: 0.9, learning_rate: 1, max_depth: 10, alpha: 10, n_estimators: 100
Train Accuracy: 0.96
Test Accuracy : 0.77
------------------------------
Combination 718
colsample_bytree: 0.9, learning_rate: 1, max_depth: 10, alpha: 100, n_estimators: 10
Train Accuracy: 0.78
Test Accuracy

In [79]:
# Create a DataFrame from the answers_grid dictionary
df_results = pd.DataFrame(answers_grid)

# Export the DataFrame to an Excel file
df_results.to_excel("hyperparameter_results.xlsx", index=False)


### Predict with unseen data

In [80]:
a3 = pd.read_excel('Unseen_Dataset.xlsx')

In [81]:
cols_in_df = list(df.columns)

In [82]:
cols_in_df.pop(42)

'Approved_Flag'

In [83]:
df_unseen = a3[cols_in_df]

In [87]:
print(df_unseen['MARITALSTATUS'].unique())
print(df_unseen['EDUCATION'].unique())
print(df_unseen['GENDER'].unique())
print(df_unseen['last_prod_enq2'].unique())
print(df_unseen['first_prod_enq2'].unique())

['Married' 'Single']
['12TH' 'GRADUATE' 'SSC' 'POST-GRADUATE' 'UNDER GRADUATE' 'OTHERS']
['M' 'F']
['PL' 'ConsumerLoan' 'AL' 'CC' 'others' 'HL']
['PL' 'ConsumerLoan' 'others' 'AL' 'HL' 'CC']


In [88]:
# Ordinal feature -- EDUCATION
# SSC            : 1
# 12TH           : 2
# GRADUATE       : 3
# UNDER GRADUATE : 3
# POST-GRADUATE  : 4
# OTHERS         : 1
# PROFESSIONAL   : 3


# Others has to be verified by the business end user

In [89]:
df_unseen.loc[df_unseen['EDUCATION'] == 'SSC',['EDUCATION']]              = 1
df_unseen.loc[df_unseen['EDUCATION'] == '12TH',['EDUCATION']]             = 2
df_unseen.loc[df_unseen['EDUCATION'] == 'GRADUATE',['EDUCATION']]         = 3
df_unseen.loc[df_unseen['EDUCATION'] == 'UNDER GRADUATE',['EDUCATION']]   = 3
df_unseen.loc[df_unseen['EDUCATION'] == 'POST-GRADUATE',['EDUCATION']]    = 4
df_unseen.loc[df_unseen['EDUCATION'] == 'OTHERS',['EDUCATION']]           = 1
df_unseen.loc[df_unseen['EDUCATION'] == 'PROFESSIONAL',['EDUCATION']]     = 3

In [90]:
df_unseen['EDUCATION'].value_counts()

3    41
2    28
1    26
4     5
Name: EDUCATION, dtype: int64

In [91]:
df_unseen['EDUCATION'] = df['EDUCATION'].astype(int)

In [92]:
df_unseen.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 42 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   pct_tl_open_L6M            100 non-null    float64
 1   pct_tl_closed_L6M          100 non-null    float64
 2   Tot_TL_closed_L12M         100 non-null    int64  
 3   pct_tl_closed_L12M         100 non-null    float64
 4   Tot_Missed_Pmnt            100 non-null    int64  
 5   CC_TL                      100 non-null    int64  
 6   Home_TL                    100 non-null    int64  
 7   PL_TL                      100 non-null    int64  
 8   Secured_TL                 100 non-null    int64  
 9   Unsecured_TL               100 non-null    int64  
 10  Other_TL                   100 non-null    int64  
 11  Age_Oldest_TL              100 non-null    int64  
 12  Age_Newest_TL              100 non-null    int64  
 13  time_since_recent_payment  100 non-null    int64  


In [93]:
df_encoded_unseen = pd.get_dummies(df_unseen, columns=['MARITALSTATUS','GENDER', 'last_prod_enq2' ,'first_prod_enq2'])

In [94]:
df_encoded_unseen.info()
k = df_encoded.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 54 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   pct_tl_open_L6M               100 non-null    float64
 1   pct_tl_closed_L6M             100 non-null    float64
 2   Tot_TL_closed_L12M            100 non-null    int64  
 3   pct_tl_closed_L12M            100 non-null    float64
 4   Tot_Missed_Pmnt               100 non-null    int64  
 5   CC_TL                         100 non-null    int64  
 6   Home_TL                       100 non-null    int64  
 7   PL_TL                         100 non-null    int64  
 8   Secured_TL                    100 non-null    int64  
 9   Unsecured_TL                  100 non-null    int64  
 10  Other_TL                      100 non-null    int64  
 11  Age_Oldest_TL                 100 non-null    int64  
 12  Age_Newest_TL                 100 non-null    int64  
 13  time_s

In [95]:
df_encoded_unseen.head()

Unnamed: 0,pct_tl_open_L6M,pct_tl_closed_L6M,Tot_TL_closed_L12M,pct_tl_closed_L12M,Tot_Missed_Pmnt,CC_TL,Home_TL,PL_TL,Secured_TL,Unsecured_TL,...,last_prod_enq2_ConsumerLoan,last_prod_enq2_HL,last_prod_enq2_PL,last_prod_enq2_others,first_prod_enq2_AL,first_prod_enq2_CC,first_prod_enq2_ConsumerLoan,first_prod_enq2_HL,first_prod_enq2_PL,first_prod_enq2_others
0,0.0,0.0,0,0.0,0,0,0,4,1,4,...,0,0,1,0,0,0,0,0,1,0
1,0.0,0.0,0,0.0,0,0,0,0,0,1,...,1,0,0,0,0,0,1,0,0,0
2,0.125,0.0,0,0.0,1,0,0,0,2,6,...,1,0,0,0,0,0,0,0,0,1
3,0.0,0.0,0,0.0,0,0,0,0,3,0,...,0,0,0,0,1,0,0,0,0,0
4,0.0,0.0,1,0.167,0,0,0,0,6,0,...,1,0,0,0,0,0,0,0,1,0


In [99]:
model = xgb.XGBClassifier(objective='multi:softmax',  
                                       num_class=4,
                                       colsample_bytree = 0.7,
                                       learning_rate = 1,
                                       max_depth = 3,
                                       alpha = 10,
                                       n_estimators = 100)

In [100]:
model.fit(x_train, y_train)

In [101]:
y_pred_unseen = model.predict(df_encoded_unseen)

In [102]:
a3['Target_variable'] = y_pred_unseen

In [103]:
a3.to_excel('Final_prediction.xlsx', index=False)