In [1]:
# Importing the required modules and sub-modules

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier, StackingClassifier

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

from sklearn.feature_selection import RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold, cross_val_score

In [2]:
# Loading in the data and replicating for redundancy

def import_data():
    """Read the train and test CSV files and return independent copies.

    Returns:
        tuple: ``(train, test)`` DataFrames read from 'Train.csv' and
        'Test.csv' (relative paths), each passed through ``DataFrame.copy()``.
    """
    frames = []
    for csv_name in ('Train.csv', 'Test.csv'):
        # .copy() hands back a frame detached from the one just read.
        frames.append(pd.read_csv(csv_name).copy())

    train, test = frames
    return train, test

In [3]:
# Load the training and test frames via import_data()
train, test = import_data()

In [4]:
# Peeking at the data: first 10 rows of the training frame

train.head(10)

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,...,0.392854,2.02,0.711632,0.0,0.0,charge,,1.129518,0.044335,no
1,Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,...,0.314281,8.08,0.183584,,0.0,charge,349.80573,1.620483,0.322436,no
2,Apcnt_1000008,3276.0,0.53845,3.151,0.0,6.282,,956940.0,,192944.0,...,0.162965,18.18,0.791136,0.0,0.0,charge,,1.51337,0.01164,yes
3,Apcnt_1000012,3372.0,0.17005,0.505,0.0,0.0,192166.0,3044703.0,385499.0,3986472.0,...,0.488884,2.02,0.685168,,0.0,charge,89.9401,0.664452,0.082729,no
4,Apcnt_1000016,3370.0,0.7727,1.101,0.0,0.0,1556.0,214728.0,214728.0,1284089.0,...,0.275,12.12,0.438168,0.0,0.0,charge,97.887502,1.427891,0.04563,no
5,Apcnt_1000020,3724.0,,0.0,0.0,0.0,192944.0,9244585.0,6761209.0,67963357.0,...,0.030558,12.12,0.38296,,0.0,lending,,0.0,,no
6,Apcnt_1000024,3350.0,0.24205,0.3132,0.0,2.8188,,580388.0,,96472.0,...,0.66,1.01,0.513504,,0.0,charge,,2.146512,0.00439,yes
7,Apcnt_1000028,3416.0,0.8702,0.057,0.0,0.0,17505.0,186331.0,186331.0,1484813.0,...,0.1375,3.03,0.63596,,0.0,charge,155.295994,0.946074,0.128389,no
8,Apcnt_1000036,3376.0,0.2918,0.5586,0.0,0.0,18283.0,1105149.0,18283.0,115533.0,...,0.471427,4.04,0.41088,0.0,0.0,charge,,0.641601,0.00999,no
9,Apcnt_1000040,3310.0,0.24395,0.1562,0.0,0.0,,0.0,,,...,0.825,1.01,0.8,,,charge,,0.0,,yes


In [5]:
# First 10 rows of the test frame
test.head(10)

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,...,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50
0,Apcnt_1000032,3236.0,0.34875,10.2006,0.0,0.0,418564.0,418564.0,418564.0,540710.0,...,,0.825,1.01,0.8,,0.0,charge,,0.0,0.011221
1,Apcnt_1000048,3284.0,1.2736,2.9606,9.0198,0.0,0.0,9858816.0,49014.0,1510098.0,...,18.8415,0.507694,4.04,0.623248,1.0,0.0,lending,,0.504974,0.043525
2,Apcnt_1000052,,0.27505,0.06,0.0,0.0,,,,,...,,,0.0,,,,charge,,0.0,
3,Apcnt_1000076,3232.0,0.28505,2.8032,0.0,0.0,0.0,473802.0,473802.0,1724437.0,...,,0.916663,2.02,0.464224,,,charge,90.163742,0.788809,0.104029
4,Apcnt_1000080,3466.0,2.09545,0.8318,2.5182,0.0,19839.0,1150662.0,1150662.0,7860523.0,...,,0.234047,23.23,0.726688,0.0,0.0,lending,1303.587148,1.637733,0.163124
5,Apcnt_1000084,,0.41005,,,,,,,,...,,,,,,,charge,,,
6,Apcnt_1000104,3408.0,0.74605,0.6064,0.0,0.0,0.0,3769410.0,12059.0,1230018.0,...,,0.061116,12.12,0.642296,0.0,0.0,charge,221.04733,1.24502,0.074203
7,Apcnt_1000116,3296.0,0.41745,0.9796,0.0,0.0,,56794.0,,,...,,1.1,1.01,0.532248,,,charge,,0.0,
8,Apcnt_1000128,3424.0,0.1738,0.104,0.0,0.0,0.0,2317662.0,1467308.0,55619609.0,...,,0.0,12.12,0.483792,0.0,0.0,charge,1464.871576,3.623188,1.154236
9,Apcnt_1000156,3408.0,0.20105,0.0,0.0,0.0,0.0,911427.0,389389.0,3613032.0,...,,0.194117,12.12,0.760912,0.0,0.0,charge,444.468343,4.0,0.037489


In [6]:
# Data description: summary statistics of the training frame

train.describe()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field40,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field48,form_field49,form_field50
count,53471.0,52156.0,55645.0,55645.0,55645.0,42640.0,50837.0,42640.0,47992.0,55645.0,...,12271.0,17771.0,54677.0,55432.0,50617.0,24683.0,40096.0,35111.0,55645.0,44944.0
mean,3491.795665,0.550737,1.052225,0.851979,1.956317,624447.9,6865210.0,2626690.0,13160020.0,11855850.0,...,147.797977,108.117363,0.368215,6.634511,0.563377,0.07252,0.095371,305244.9,1.049061,600586.2
std,188.462426,0.820979,2.147768,3.157692,10.512396,1433422.0,19127290.0,3927355.0,19779630.0,26694590.0,...,45.085889,36.765769,0.412858,6.378946,0.196973,0.291146,0.33521,1647757.0,1.80616,5842405.0
min,2990.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021,0.0,0.0,0.0,0.048528,0.0,0.0,0.0,0.0,0.0
25%,3358.0,0.070788,0.0,0.0,0.0,14004.0,686974.0,192944.0,1368502.0,450073.0,...,136.19175,87.0555,0.0,2.02,0.411672,0.0,0.0,77.10079,0.0,0.04560141
50%,3484.0,0.267575,0.062,0.0,0.0,115533.0,2704328.0,963942.0,5506295.0,3707559.0,...,150.0,118.4415,0.22,5.05,0.588648,0.0,0.0,318.1243,0.0,0.1703996
75%,3620.0,0.719512,1.282,0.0,0.0,525928.0,6993831.0,3751516.0,16945520.0,13261010.0,...,167.754,137.49675,0.628573,10.1,0.75384,0.0,0.0,1153.022,1.181754,0.499337
max,3900.0,18.01505,57.3716,91.6722,407.7486,53135460.0,2158794000.0,103739700.0,320053300.0,2191361000.0,...,645.45,255.0,2.2,91.91,0.8,5.0,5.0,53210140.0,28.0,230757100.0


In [7]:
# Summary statistics of the test frame
test.describe()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field40,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field48,form_field49,form_field50
count,22890.0,22291.0,23854.0,23854.0,23854.0,18396.0,21769.0,18396.0,20600.0,23853.0,...,5172.0,7651.0,23422.0,23750.0,21638.0,10462.0,17115.0,15078.0,23854.0,19203.0
mean,3492.284404,0.557676,1.065443,0.859146,2.183538,626303.6,6797033.0,2654142.0,13505930.0,11874780.0,...,147.62328,108.209648,0.369684,6.58048,0.566219,0.066526,0.097926,301544.0,1.064118,674984.3
std,190.502764,0.826543,2.198444,3.403115,11.415706,1457540.0,16260220.0,3968185.0,22891250.0,24771130.0,...,43.580328,36.426276,0.414077,6.363075,0.19606,0.278211,0.370392,1868574.0,1.816837,6561031.0
min,2986.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.075,0.0,0.0,0.0,0.066432,0.0,0.0,0.0,0.0,0.0
25%,3356.0,0.068675,0.0,0.0,0.0,14004.0,672581.0,181663.0,1349441.0,420898.0,...,135.53175,87.4995,0.0,2.02,0.413268,0.0,0.0,72.72364,0.0,0.0450051
50%,3484.0,0.27325,0.0582,0.0,0.0,115533.0,2719888.0,959468.5,5529830.0,3651543.0,...,150.0,117.984,0.22,5.05,0.5921,0.0,0.0,313.1754,0.0,0.168
75%,3624.0,0.72885,1.30425,0.0,0.0,515911.2,7073576.0,3799849.0,17286580.0,13181260.0,...,167.07825,137.20725,0.628573,10.1,0.756544,0.0,0.0,1195.997,1.254155,0.5007093
max,3900.0,22.31505,34.5414,206.4528,297.8856,48187380.0,770988700.0,113514100.0,1443921000.0,774101400.0,...,401.4135,211.6935,2.2,91.91,0.8,5.0,19.0,121399100.0,24.0,252459100.0


In [8]:
# Column dtypes and non-null counts of the training frame
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 52 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Applicant_ID    56000 non-null  object 
 1   form_field1     53471 non-null  float64
 2   form_field2     52156 non-null  float64
 3   form_field3     55645 non-null  float64
 4   form_field4     55645 non-null  float64
 5   form_field5     55645 non-null  float64
 6   form_field6     42640 non-null  float64
 7   form_field7     50837 non-null  float64
 8   form_field8     42640 non-null  float64
 9   form_field9     47992 non-null  float64
 10  form_field10    55645 non-null  float64
 11  form_field11    24579 non-null  float64
 12  form_field12    46105 non-null  float64
 13  form_field13    50111 non-null  float64
 14  form_field14    56000 non-null  int64  
 15  form_field15    33525 non-null  float64
 16  form_field16    42964 non-null  float64
 17  form_field17    44849 non-null 

In [9]:
# Column dtypes and non-null counts of the test frame
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24000 entries, 0 to 23999
Data columns (total 51 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Applicant_ID  24000 non-null  object 
 1   form_field1   22890 non-null  float64
 2   form_field2   22291 non-null  float64
 3   form_field3   23854 non-null  float64
 4   form_field4   23854 non-null  float64
 5   form_field5   23854 non-null  float64
 6   form_field6   18396 non-null  float64
 7   form_field7   21769 non-null  float64
 8   form_field8   18396 non-null  float64
 9   form_field9   20600 non-null  float64
 10  form_field10  23853 non-null  float64
 11  form_field11  10602 non-null  float64
 12  form_field12  19817 non-null  float64
 13  form_field13  21537 non-null  float64
 14  form_field14  24000 non-null  int64  
 15  form_field15  14408 non-null  float64
 16  form_field16  18526 non-null  float64
 17  form_field17  19305 non-null  float64
 18  form_field18  19631 non-nu

In [10]:
# Number of missing values per column in the training frame
train.isnull().sum()

Applicant_ID          0
form_field1        2529
form_field2        3844
form_field3         355
form_field4         355
form_field5         355
form_field6       13360
form_field7        5163
form_field8       13360
form_field9        8008
form_field10        355
form_field11      31421
form_field12       9895
form_field13       5889
form_field14          0
form_field15      22475
form_field16      13036
form_field17      11151
form_field18      10402
form_field19          4
form_field20        355
form_field21      15854
form_field22      20400
form_field23      28123
form_field24      13297
form_field25       5450
form_field26       7438
form_field27       9299
form_field28        355
form_field29        355
form_field30      25509
form_field31      39408
form_field32       5450
form_field33       1256
form_field34        355
form_field35      23148
form_field36       1995
form_field37       5450
form_field38        355
form_field39       4211
form_field40      43729
form_field41    

In [11]:
# Number of missing values per column in the test frame
test.isnull().sum()

Applicant_ID        0
form_field1      1110
form_field2      1709
form_field3       146
form_field4       146
form_field5       146
form_field6      5604
form_field7      2231
form_field8      5604
form_field9      3400
form_field10      147
form_field11    13398
form_field12     4183
form_field13     2463
form_field14        0
form_field15     9592
form_field16     5474
form_field17     4695
form_field18     4369
form_field19        0
form_field20      147
form_field21     6707
form_field22     8724
form_field23    12125
form_field24     5605
form_field25     2256
form_field26     3172
form_field27     3910
form_field28      147
form_field29      147
form_field30    10908
form_field31    16810
form_field32     2256
form_field33      495
form_field34      147
form_field35     9866
form_field36      903
form_field37     2256
form_field38      147
form_field39     1829
form_field40    18828
form_field41    16349
form_field42      578
form_field43      250
form_field44     2362
form_field

In [12]:
# Encoding categorical features
#
# form_field47 is the only text column shown in the previews ('charge' /
# 'lending'). It is turned into a single 0/1 indicator: 1 where the value is
# 'charge', 0 otherwise (including missing values).
#
# pd.get_dummies() returns one column per category; assigning that
# multi-column frame to a single column is rejected by recent pandas, and
# encoding train and test separately ties the result to whichever categories
# occur in each file. An explicit comparison gives both frames the same mapping.

def _encode_form_field47(frame):
    """Return the 0/1 'charge' indicator for ``frame['form_field47']``.

    Args:
        frame: DataFrame holding a 'form_field47' column.

    Returns:
        A uint8 Series (1 = 'charge', 0 = anything else). If the column is
        already numeric (the cell was run before) it is returned unchanged,
        so re-running the cell does not wipe the encoding.
    """
    column = frame['form_field47']
    if column.dtype.kind in 'biuf':  # bool / int / uint / float: already encoded
        return column
    return (column == 'charge').astype('uint8')


train['form_field47'] = _encode_form_field47(train)
test['form_field47'] = _encode_form_field47(test)

In [13]:
# Distinct values present in the training frame's form_field47 column
train['form_field47'].unique()

array([1, 0], dtype=uint8)

In [14]:
# Distinct values present in the test frame's form_field47 column
test['form_field47'].unique()

array([1, 0], dtype=uint8)

In [15]:
# First rows of the training frame (default head size)
train.head()

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,...,0.392854,2.02,0.711632,0.0,0.0,1,,1.129518,0.044335,no
1,Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,...,0.314281,8.08,0.183584,,0.0,1,349.80573,1.620483,0.322436,no
2,Apcnt_1000008,3276.0,0.53845,3.151,0.0,6.282,,956940.0,,192944.0,...,0.162965,18.18,0.791136,0.0,0.0,1,,1.51337,0.01164,yes
3,Apcnt_1000012,3372.0,0.17005,0.505,0.0,0.0,192166.0,3044703.0,385499.0,3986472.0,...,0.488884,2.02,0.685168,,0.0,1,89.9401,0.664452,0.082729,no
4,Apcnt_1000016,3370.0,0.7727,1.101,0.0,0.0,1556.0,214728.0,214728.0,1284089.0,...,0.275,12.12,0.438168,0.0,0.0,1,97.887502,1.427891,0.04563,no


In [16]:
# First rows of the test frame (default head size)
test.head()

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,...,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50
0,Apcnt_1000032,3236.0,0.34875,10.2006,0.0,0.0,418564.0,418564.0,418564.0,540710.0,...,,0.825,1.01,0.8,,0.0,1,,0.0,0.011221
1,Apcnt_1000048,3284.0,1.2736,2.9606,9.0198,0.0,0.0,9858816.0,49014.0,1510098.0,...,18.8415,0.507694,4.04,0.623248,1.0,0.0,0,,0.504974,0.043525
2,Apcnt_1000052,,0.27505,0.06,0.0,0.0,,,,,...,,,0.0,,,,1,,0.0,
3,Apcnt_1000076,3232.0,0.28505,2.8032,0.0,0.0,0.0,473802.0,473802.0,1724437.0,...,,0.916663,2.02,0.464224,,,1,90.163742,0.788809,0.104029
4,Apcnt_1000080,3466.0,2.09545,0.8318,2.5182,0.0,19839.0,1150662.0,1150662.0,7860523.0,...,,0.234047,23.23,0.726688,0.0,0.0,0,1303.587148,1.637733,0.163124


In [17]:
# Initializing required estimators and transformers
# (labelEncoder converts the default_status class labels to integers)

labelEncoder = LabelEncoder()

In [18]:
# Encoding class label (overwrites train['default_status'] with integer codes)

train['default_status'] = labelEncoder.fit_transform(train['default_status'])

# Splitting data into features and targets:
# X = every column except the applicant identifier and the target, as an array
# y = the encoded target, as an array

X = train.drop(columns = ['Applicant_ID', 'default_status']).values
y = train['default_status'].values

In [19]:
# Candidate classifiers, each seeded with random_state = 1
modelA = AdaBoostClassifier(random_state = 1)
modelB = RandomForestClassifier(criterion = 'entropy', random_state = 1)
modelC = ExtraTreesClassifier(criterion = 'entropy', random_state = 1)
modelD = LGBMClassifier(random_state = 1)
modelE = CatBoostClassifier(random_state = 1)

In [20]:
# SimpleImputer strategies compared in the evaluation cells
strategies = ['mean', 'median', 'most_frequent', 'constant']

# Stating the evaluation procedure for model:
# stratified 10-fold cross-validation, repeated 3 times, seeded
cv = RepeatedStratifiedKFold(n_splits = 10, n_repeats = 3, random_state = 1)

In [21]:
# Standardization step used as the 'scaler' stage of the pipelines
scaler = StandardScaler()

# Evaluating different models...

In [22]:
# Creating pipeline and evaluating model for AdaBoost
#
# One pipeline (imputer -> scaler -> model) per imputation strategy, scored
# with cross_val_score on the shared `cv` splitter. The metric is ROC AUC
# (scoring = 'roc_auc'), not accuracy, so the printed label says ROC AUC.
# Mean and standard deviation of the fold scores are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('scaler', scaler),
             ('model', modelA)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelA, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

AdaBoostClassifier(random_state=1) Mean Accuracy: 82.78566299721959, std: 0.6517138203376078
AdaBoostClassifier(random_state=1) Median Accuracy: 82.80458702109976, std: 0.7208002278011406
AdaBoostClassifier(random_state=1) Most_frequent Accuracy: 82.70401522591737, std: 0.6972186681041815
AdaBoostClassifier(random_state=1) Constant Accuracy: 82.8136270816982, std: 0.713673036694503


In [23]:
# Creating pipeline and evaluating model for AdaBoost (unscaled data)
#
# One pipeline (imputer -> model, deliberately without the scaler step) per
# imputation strategy, scored with cross_val_score on the shared `cv`
# splitter. The metric is ROC AUC (scoring = 'roc_auc'), not accuracy, so the
# printed label says ROC AUC. Mean and std are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('model', modelA)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelA, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

AdaBoostClassifier(random_state=1) Mean Accuracy: 82.79256190334728, std: 0.6589669018030947
AdaBoostClassifier(random_state=1) Median Accuracy: 82.80575166310177, std: 0.7203632593175983
AdaBoostClassifier(random_state=1) Most_frequent Accuracy: 82.70140294975944, std: 0.6976275274274626
AdaBoostClassifier(random_state=1) Constant Accuracy: 82.81649494592087, std: 0.713986609963105


In [24]:
# Creating pipeline and evaluating model for RFC
#
# One pipeline (imputer -> scaler -> model) per imputation strategy, scored
# with cross_val_score on the shared `cv` splitter. The metric is ROC AUC
# (scoring = 'roc_auc'), not accuracy, so the printed label says ROC AUC.
# Mean and standard deviation of the fold scores are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('scaler', scaler),
             ('model', modelB)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelB, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

RandomForestClassifier(criterion='entropy', random_state=1) Mean Accuracy: 83.25432894364849, std: 0.6553163154395325
RandomForestClassifier(criterion='entropy', random_state=1) Median Accuracy: 83.24425234142471, std: 0.7482122344567416
RandomForestClassifier(criterion='entropy', random_state=1) Most_frequent Accuracy: 83.2837617078452, std: 0.718847752043315
RandomForestClassifier(criterion='entropy', random_state=1) Constant Accuracy: 83.28146142632392, std: 0.6721365244905828


In [25]:
# Creating pipeline and evaluating model for RFC (unscaled data)
#
# One pipeline (imputer -> model, deliberately without the scaler step) per
# imputation strategy, scored with cross_val_score on the shared `cv`
# splitter. The metric is ROC AUC (scoring = 'roc_auc'), not accuracy, so the
# printed label says ROC AUC. Mean and std are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('model', modelB)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelB, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

RandomForestClassifier(criterion='entropy', random_state=1) Mean Accuracy: 83.26269472497685, std: 0.7205185767848161
RandomForestClassifier(criterion='entropy', random_state=1) Median Accuracy: 83.2125754028374, std: 0.703765569357522
RandomForestClassifier(criterion='entropy', random_state=1) Most_frequent Accuracy: 83.27026034854224, std: 0.7036338386745228
RandomForestClassifier(criterion='entropy', random_state=1) Constant Accuracy: 83.2464887018557, std: 0.6980248015618805


In [26]:
# Creating pipeline and evaluating model for ExtraTreesClassifier
#
# One pipeline (imputer -> scaler -> model) per imputation strategy, scored
# with cross_val_score on the shared `cv` splitter. The metric is ROC AUC
# (scoring = 'roc_auc'), not accuracy, so the printed label says ROC AUC.
# Mean and standard deviation of the fold scores are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('scaler', scaler),
             ('model', modelC)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelC, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

ExtraTreesClassifier(criterion='entropy', random_state=1) Mean Accuracy: 83.00798309585694, std: 0.6872156559516928
ExtraTreesClassifier(criterion='entropy', random_state=1) Median Accuracy: 82.97549702581564, std: 0.7103411424200217
ExtraTreesClassifier(criterion='entropy', random_state=1) Most_frequent Accuracy: 83.01790174267225, std: 0.6980048033594202
ExtraTreesClassifier(criterion='entropy', random_state=1) Constant Accuracy: 83.06211421269555, std: 0.7180927355595889


In [27]:
# Creating pipeline and evaluating model for ExtraTreesClassifier (unscaled data)
#
# One pipeline (imputer -> model, deliberately without the scaler step) per
# imputation strategy, scored with cross_val_score on the shared `cv`
# splitter. The metric is ROC AUC (scoring = 'roc_auc'), not accuracy, so the
# printed label says ROC AUC. Mean and std are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('model', modelC)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelC, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

ExtraTreesClassifier(criterion='entropy', random_state=1) Mean Accuracy: 82.97332134839705, std: 0.6641763423030389
ExtraTreesClassifier(criterion='entropy', random_state=1) Median Accuracy: 83.05615650186559, std: 0.7256379301354137
ExtraTreesClassifier(criterion='entropy', random_state=1) Most_frequent Accuracy: 83.04969151801048, std: 0.7717199403234257
ExtraTreesClassifier(criterion='entropy', random_state=1) Constant Accuracy: 83.09100156149094, std: 0.7202644747379302


In [28]:
# Creating pipeline and evaluating model for LGBM
#
# One pipeline (imputer -> scaler -> model) per imputation strategy, scored
# with cross_val_score on the shared `cv` splitter. The metric is ROC AUC
# (scoring = 'roc_auc'), not accuracy, so the printed label says ROC AUC.
# Mean and standard deviation of the fold scores are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('scaler', scaler),
             ('model', modelD)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelD, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

LGBMClassifier(random_state=1) Mean Accuracy: 83.69614669418311, std: 0.625202418985817
LGBMClassifier(random_state=1) Median Accuracy: 83.63134512795264, std: 0.6794749970880637
LGBMClassifier(random_state=1) Most_frequent Accuracy: 83.65418498995922, std: 0.7039529602422804
LGBMClassifier(random_state=1) Constant Accuracy: 83.74415252711536, std: 0.6640624777991867


In [29]:
# Creating pipeline and evaluating model for LGBM (unscaled data)
#
# One pipeline (imputer -> model, deliberately without the scaler step) per
# imputation strategy, scored with cross_val_score on the shared `cv`
# splitter. The metric is ROC AUC (scoring = 'roc_auc'), not accuracy, so the
# printed label says ROC AUC. Mean and std are printed as percentages.
# (A stray `dx` token that raised NameError inside the loop was removed.)

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('model', modelD)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelD, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

LGBMClassifier(random_state=1) Mean Accuracy: 83.70988264988894, std: 0.6631803592997823
LGBMClassifier(random_state=1) Median Accuracy: 83.64108240485628, std: 0.6595503099646315
LGBMClassifier(random_state=1) Most_frequent Accuracy: 83.62121672178759, std: 0.6745685834477981
LGBMClassifier(random_state=1) Constant Accuracy: 83.71798622199432, std: 0.6594814820455736


Apparently, scaling the data through standardization has very little benefit when training the model.

# Repeating the above steps with Recursive Feature Elimination (RFE)

In [30]:
# Recursive feature elimination keeping 40 features, driven by a decision tree.
# The tree is seeded with random_state = 1, like the other estimators in this
# notebook, so the selected feature subset is reproducible across runs.
rfe = RFE(n_features_to_select = 40, estimator = DecisionTreeClassifier(random_state = 1))

In [31]:
# Creating pipeline and evaluating model for AdaBoost (with RFE)
#
# One pipeline (imputer -> rfe -> scaler -> model) per imputation strategy,
# scored with cross_val_score on the shared `cv` splitter. The metric is
# ROC AUC (scoring = 'roc_auc'), not accuracy, so the printed label says
# ROC AUC. Mean and std of the fold scores are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('rfe', rfe),
             ('scaler', scaler),
             ('model', modelA)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelA, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

LGBMClassifier(random_state=1) Mean Accuracy: 82.59943614570027, std: 0.6914040504982352
LGBMClassifier(random_state=1) Median Accuracy: 82.61000012486052, std: 0.701984678246399
LGBMClassifier(random_state=1) Most_frequent Accuracy: 82.4554295818605, std: 0.7265745458187158
LGBMClassifier(random_state=1) Constant Accuracy: 82.60843646225862, std: 0.6836802900858268


In [32]:
# Creating pipeline and evaluating model for AdaBoost (with RFE, unscaled data)
#
# One pipeline (imputer -> rfe -> model, deliberately without the scaler
# step) per imputation strategy, scored with cross_val_score on the shared
# `cv` splitter. The metric is ROC AUC (scoring = 'roc_auc'), not accuracy,
# so the printed label says ROC AUC. Mean and std are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('rfe', rfe),
             ('model', modelA)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelA, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

AdaBoostClassifier(random_state=1) Mean Accuracy: 82.59621258779194, std: 0.6933219396025673
AdaBoostClassifier(random_state=1) Median Accuracy: 82.61420826552113, std: 0.7014765637498815
AdaBoostClassifier(random_state=1) Most_frequent Accuracy: 82.4202024345345, std: 0.7239993906356754
AdaBoostClassifier(random_state=1) Constant Accuracy: 82.60472606499324, std: 0.6828649991115419


In [33]:
# Creating pipeline and evaluating model for RFC (with RFE)
#
# One pipeline (imputer -> rfe -> scaler -> model) per imputation strategy,
# scored with cross_val_score on the shared `cv` splitter. The metric is
# ROC AUC (scoring = 'roc_auc'), not accuracy, so the printed label says
# ROC AUC. Mean and std of the fold scores are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('rfe', rfe),
             ('scaler', scaler),
             ('model', modelB)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelB, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

RandomForestClassifier(criterion='entropy', random_state=1) Mean Accuracy: 83.10316723697123, std: 0.6919734469270152
RandomForestClassifier(criterion='entropy', random_state=1) Median Accuracy: 83.09321824347069, std: 0.7273141929842617
RandomForestClassifier(criterion='entropy', random_state=1) Most_frequent Accuracy: 83.0593410606333, std: 0.6776298833987879
RandomForestClassifier(criterion='entropy', random_state=1) Constant Accuracy: 83.12558608630061, std: 0.6641532592669509


In [34]:
# Creating pipeline and evaluating model for RFC (with RFE, unscaled data)
#
# One pipeline (imputer -> rfe -> model, deliberately without the scaler
# step) per imputation strategy, scored with cross_val_score on the shared
# `cv` splitter. The metric is ROC AUC (scoring = 'roc_auc'), not accuracy,
# so the printed label says ROC AUC. Mean and std are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('rfe', rfe),
             ('model', modelB)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelB, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

RandomForestClassifier(criterion='entropy', random_state=1) Mean Accuracy: 83.02988565981106, std: 0.7349996927088459
RandomForestClassifier(criterion='entropy', random_state=1) Median Accuracy: 83.10690821497249, std: 0.7062114085617177
RandomForestClassifier(criterion='entropy', random_state=1) Most_frequent Accuracy: 83.08968563729862, std: 0.6910043533505543
RandomForestClassifier(criterion='entropy', random_state=1) Constant Accuracy: 83.10965728869209, std: 0.6776336256864732


In [35]:
# Creating pipeline and evaluating model for ExtraTreesClassifier (with RFE)
#
# One pipeline (imputer -> rfe -> scaler -> model) per imputation strategy,
# scored with cross_val_score on the shared `cv` splitter. The metric is
# ROC AUC (scoring = 'roc_auc'), not accuracy, so the printed label says
# ROC AUC. Mean and std of the fold scores are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('rfe', rfe),
             ('scaler', scaler),
             ('model', modelC)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelC, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

ExtraTreesClassifier(criterion='entropy', random_state=1) Mean Accuracy: 82.76937002213421, std: 0.7193666630258247
ExtraTreesClassifier(criterion='entropy', random_state=1) Median Accuracy: 82.8230470775048, std: 0.7382087265229298
ExtraTreesClassifier(criterion='entropy', random_state=1) Most_frequent Accuracy: 82.8005551932146, std: 0.6646661702862978
ExtraTreesClassifier(criterion='entropy', random_state=1) Constant Accuracy: 82.86825084338219, std: 0.7262650565410453


In [36]:
# Creating pipeline and evaluating model for ExtraTreesClassifier (with RFE, unscaled data)
#
# One pipeline (imputer -> rfe -> model, deliberately without the scaler
# step) per imputation strategy, scored with cross_val_score on the shared
# `cv` splitter. The metric is ROC AUC (scoring = 'roc_auc'), not accuracy,
# so the printed label says ROC AUC. Mean and std are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('rfe', rfe),
             ('model', modelC)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelC, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

ExtraTreesClassifier(criterion='entropy', random_state=1) Mean Accuracy: 82.77847660732203, std: 0.7287202012265646




ExtraTreesClassifier(criterion='entropy', random_state=1) Median Accuracy: 82.84773839388284, std: 0.7132447243938762
ExtraTreesClassifier(criterion='entropy', random_state=1) Most_frequent Accuracy: 82.80714724006583, std: 0.7206968563828627
ExtraTreesClassifier(criterion='entropy', random_state=1) Constant Accuracy: 82.83896722806536, std: 0.7093945111420528


In [37]:
# Creating pipeline and evaluating model for LGBM (with RFE)
#
# One pipeline (imputer -> rfe -> scaler -> model) per imputation strategy,
# scored with cross_val_score on the shared `cv` splitter. The metric is
# ROC AUC (scoring = 'roc_auc'), not accuracy, so the printed label says
# ROC AUC. Mean and std of the fold scores are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('rfe', rfe),
             ('scaler', scaler),
             ('model', modelD)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelD, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

LGBMClassifier(random_state=1) Mean Accuracy: 83.46483577670699, std: 0.6048860518991277
LGBMClassifier(random_state=1) Median Accuracy: 83.40733596974367, std: 0.6538726017204718
LGBMClassifier(random_state=1) Most_frequent Accuracy: 83.49117745151844, std: 0.6863904584109425
LGBMClassifier(random_state=1) Constant Accuracy: 83.55382728756305, std: 0.7097824413913973


In [38]:
# Creating pipeline and evaluating model for LGBM (with RFE, unscaled data)
#
# One pipeline (imputer -> rfe -> model, deliberately without the scaler
# step) per imputation strategy, scored with cross_val_score on the shared
# `cv` splitter. The metric is ROC AUC (scoring = 'roc_auc'), not accuracy,
# so the printed label says ROC AUC. Mean and std are printed as percentages.

for strategy in strategies:
    imputer = SimpleImputer(strategy = strategy)
    steps = [('imputer', imputer),
             ('rfe', rfe),
             ('model', modelD)]

    pipeline = Pipeline(steps = steps)
    scores = cross_val_score(pipeline, X, y, cv = cv, scoring = 'roc_auc', n_jobs = -1)

    print('{} {} ROC AUC: {}, std: {}'.format(modelD, strategy.capitalize(), np.mean(scores)* 100, np.std(scores)* 100))

LGBMClassifier(random_state=1) Mean Accuracy: 83.46504292408564, std: 0.6873194433890958
LGBMClassifier(random_state=1) Median Accuracy: 83.4038496844699, std: 0.6719701542018323
LGBMClassifier(random_state=1) Most_frequent Accuracy: 83.4414901247886, std: 0.7112382863604688
LGBMClassifier(random_state=1) Constant Accuracy: 83.53074580809334, std: 0.658868678434304


Apparently, scaling the data does not change the cross-validated ROC AUC score much for the LGBM algorithm.

Standardization does not change the ROC AUC score much for the ExtraTreesClassifier either. The constant strategy seems to produce the highest score irrespective of standard scaling.

# The RandomForest, AdaBoost, LGBM, ExtraTrees and CatBoost models will be fitted using the scaled data and the constant imputer strategy, after which they will be combined through a soft-voting classifier.

# Preparing the prediction pipelines...

In [22]:
# Building the AdaBoost pipeline with scaled data for the constant strategy
# (imputer -> scaler -> model). The pipeline is only constructed here; it is
# not fitted in this cell.

imputer = SimpleImputer(strategy = 'constant')
steps = [('imputer', imputer),
         ('scaler', scaler),
         ('model', modelA)]

pipelineA = Pipeline(steps = steps)

In [23]:
# Building the RandomForest pipeline with scaled data for the constant strategy
# (imputer -> scaler -> model). The pipeline is only constructed here; it is
# not fitted in this cell.

imputer = SimpleImputer(strategy = 'constant')
steps = [('imputer', imputer),
         ('scaler', scaler),
         ('model', modelB)]

pipelineB = Pipeline(steps = steps)

In [24]:
# Building the ExtraTrees pipeline with scaled data for the constant strategy
# (imputer -> scaler -> model). The pipeline is only constructed here; it is
# not fitted in this cell.

imputer = SimpleImputer(strategy = 'constant')
steps = [('imputer', imputer),
         ('scaler', scaler),
         ('model', modelC)]

pipelineC = Pipeline(steps = steps)

In [25]:
# LGBM pipeline for the constant strategy, on scaled data.
# The pipeline is only assembled here; nothing is fitted in this cell.

imputer = SimpleImputer(strategy='constant')
steps = [
    ('imputer', imputer),
    ('scaler', scaler),
    ('model', modelD),
]

pipelineD = Pipeline(steps)

In [26]:
# CatBoost pipeline for the constant strategy, on scaled data.
# The pipeline is only assembled here; nothing is fitted in this cell.

imputer = SimpleImputer(strategy='constant')
steps = [
    ('imputer', imputer),
    ('scaler', scaler),
    ('model', modelE),
]

pipelineE = Pipeline(steps)

In [27]:
# (name, pipeline) pairs for the ensemble built on the scaled pipelines
model_pipeA = list(zip(
    ('Adaboost', 'RFC', 'XTC', 'LGBM', 'CatBoost'),
    (pipelineA, pipelineB, pipelineC, pipelineD, pipelineE),
))

In [28]:
# Soft-voting ensemble over the scaled pipelines listed in model_pipeA
electionS = VotingClassifier(
    estimators=model_pipeA,
    voting='soft',
    n_jobs=-1,
)

In [29]:
# Fit the soft-voting ensemble on X and y. The call is the cell's last
# expression, so the repr of the returned estimator is displayed as the output.
electionS.fit(X, y)

VotingClassifier(estimators=[('Adaboost',
                              Pipeline(steps=[('imputer',
                                               SimpleImputer(strategy='constant')),
                                              ('scaler', StandardScaler()),
                                              ('model',
                                               AdaBoostClassifier(random_state=1))])),
                             ('RFC',
                              Pipeline(steps=[('imputer',
                                               SimpleImputer(strategy='constant')),
                                              ('scaler', StandardScaler()),
                                              ('model',
                                               RandomForestClassifier(criterion='entropy',
                                                                      random_state=1))])),...
                                                                    random_state=1))])),
            

In [30]:
# Cross-validated ROC AUC of the scaled soft-voting ensemble, one score per split of `cv`
scoresS = cross_val_score(
    estimator=electionS,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)

In [31]:
# Mean and standard deviation of the cross-validated ROC AUC, as percentages
print(
    'electionS roc_auc : {}, electionS std : {}'.format(
        100*np.mean(scoresS),
        100*np.std(scoresS),
    )
)

electionS roc_auc : 84.10001523304291, electionS std : 0.6619475146280962


# For the test set

In [36]:
# Separate the applicant identifiers from the feature columns before predicting.
# The guard makes the cell safe to re-run: once 'Applicant_ID' has been dropped,
# `test` and the previously captured `ID` are left untouched instead of the
# lookup raising a KeyError on the second execution.
if 'Applicant_ID' in test.columns:
    ID = test['Applicant_ID']
    test = test.drop(['Applicant_ID'], axis = 1)

In [33]:
# For soft voting classification: class probabilities predicted by the
# scaled-pipeline ensemble for every row of `test`
yhatS = electionS.predict_proba(test)

In [34]:
# Submission frame: applicant id plus column 1 of the predicted probabilities
# (presumably the positive / default class -- confirm against electionS.classes_)
dfS = pd.DataFrame({
    'Applicant_ID': ID,
    'default_status': yhatS[:, 1],
})

In [35]:
# Write the scaled-ensemble predictions to disk. to_csv returns None when it is
# given a file path, so the result is deliberately not bound to a variable.
dfS.to_csv('Submission12S.csv', index = False)

# The AdaBoost, RandomForest, ExtraTrees, LGBM and CatBoost models will each be wrapped in a pipeline that uses the constant imputer strategy and unscaled data; the pipelines will then be combined through a soft-voting classifier and fitted together.

# Preparing pipelines...

In [29]:
# Pipelines on unscaled data for the constant imputer strategy.
# They are only assembled here (nothing is fitted in this cell), and all five
# reference the same `imputer` object.

imputer = SimpleImputer(strategy='constant')

pipelineF, pipelineG, pipelineH, pipelineI, pipelineJ = (
    Pipeline(steps=[('imputer', imputer), ('model', base_model)])
    for base_model in (modelA, modelB, modelC, modelD, modelE)
)

In [30]:
# (name, pipeline) pairs for the ensemble built on the unscaled pipelines
model_pipeB = list(zip(
    ('Adaboost', 'RFC', 'XTC', 'LGBM', 'CatBoost'),
    (pipelineF, pipelineG, pipelineH, pipelineI, pipelineJ),
))

In [31]:
# Soft-voting ensemble over the unscaled pipelines listed in model_pipeB
electionH = VotingClassifier(
    estimators=model_pipeB,
    voting='soft',
    n_jobs=-1,
)

In [32]:
# Fit the unscaled soft-voting ensemble on X and y. The call is the cell's last
# expression, so the repr of the returned estimator is displayed as the output.
electionH.fit(X, y)

VotingClassifier(estimators=[('Adaboost',
                              Pipeline(steps=[('imputer',
                                               SimpleImputer(strategy='constant')),
                                              ('model',
                                               AdaBoostClassifier(random_state=1))])),
                             ('RFC',
                              Pipeline(steps=[('imputer',
                                               SimpleImputer(strategy='constant')),
                                              ('model',
                                               RandomForestClassifier(criterion='entropy',
                                                                      random_state=1))])),
                             ('XTC',
                              Pipeline(steps=[('imputer',
                                               SimpleImputer(strategy...
                                               ExtraTreesClassifier(criterion='entropy',

In [33]:
# Cross-validated ROC AUC of the unscaled soft-voting ensemble, one score per split of `cv`
scoresH = cross_val_score(
    estimator=electionH,
    X=X,
    y=y,
    scoring='roc_auc',
    cv=cv,
    n_jobs=-1,
)



In [34]:
# Mean and standard deviation of the cross-validated ROC AUC, as percentages
print(
    'electionH roc_auc : {}, electionH std : {}'.format(
        100*np.mean(scoresH),
        100*np.std(scoresH),
    )
)

electionH roc_auc : 84.10094138784115, electionH std : 0.6705525008841788


In [37]:
# For soft voting classification: class probabilities predicted by the
# unscaled-pipeline ensemble for every row of `test`
yhatH = electionH.predict_proba(test)

In [38]:
# Submission frame: applicant id plus column 1 of the predicted probabilities
# (presumably the positive / default class -- confirm against electionH.classes_)
dfH = pd.DataFrame({
    'Applicant_ID': ID,
    'default_status': yhatH[:, 1],
})

In [39]:
# Write the unscaled-ensemble predictions to disk. to_csv returns None when it is
# given a file path, so the result is deliberately not bound to a variable.
dfH.to_csv('Submission12H.csv', index = False)