In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sub = pd.read_csv('SampleSubmission.csv')

In [3]:
train.head()

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,...,0.392854,2.02,0.711632,0.0,0.0,charge,,1.129518,0.044335,no
1,Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,...,0.314281,8.08,0.183584,,0.0,charge,349.80573,1.620483,0.322436,no
2,Apcnt_1000008,3276.0,0.53845,3.151,0.0,6.282,,956940.0,,192944.0,...,0.162965,18.18,0.791136,0.0,0.0,charge,,1.51337,0.01164,yes
3,Apcnt_1000012,3372.0,0.17005,0.505,0.0,0.0,192166.0,3044703.0,385499.0,3986472.0,...,0.488884,2.02,0.685168,,0.0,charge,89.9401,0.664452,0.082729,no
4,Apcnt_1000016,3370.0,0.7727,1.101,0.0,0.0,1556.0,214728.0,214728.0,1284089.0,...,0.275,12.12,0.438168,0.0,0.0,charge,97.887502,1.427891,0.04563,no


In [4]:
train.isnull().sum()/len(train)*100

Applicant_ID       0.000000
form_field1        4.516071
form_field2        6.864286
form_field3        0.633929
form_field4        0.633929
form_field5        0.633929
form_field6       23.857143
form_field7        9.219643
form_field8       23.857143
form_field9       14.300000
form_field10       0.633929
form_field11      56.108929
form_field12      17.669643
form_field13      10.516071
form_field14       0.000000
form_field15      40.133929
form_field16      23.278571
form_field17      19.912500
form_field18      18.575000
form_field19       0.007143
form_field20       0.633929
form_field21      28.310714
form_field22      36.428571
form_field23      50.219643
form_field24      23.744643
form_field25       9.732143
form_field26      13.282143
form_field27      16.605357
form_field28       0.633929
form_field29       0.633929
form_field30      45.551786
form_field31      70.371429
form_field32       9.732143
form_field33       2.242857
form_field34       0.633929
form_field35      41

In [5]:
train.drop(['form_field40', 'form_field31','form_field41','form_field11', 'form_field45', 'form_field23'],axis=1, inplace = True)
test.drop(['form_field40', 'form_field31','form_field41','form_field11', 'form_field45', 'form_field23'],axis=1, inplace = True)            

In [6]:
train.fillna(-99999, inplace=True)
test.fillna(-99999, inplace=True)

In [7]:
train.isnull().sum()

Applicant_ID      0
form_field1       0
form_field2       0
form_field3       0
form_field4       0
form_field5       0
form_field6       0
form_field7       0
form_field8       0
form_field9       0
form_field10      0
form_field12      0
form_field13      0
form_field14      0
form_field15      0
form_field16      0
form_field17      0
form_field18      0
form_field19      0
form_field20      0
form_field21      0
form_field22      0
form_field24      0
form_field25      0
form_field26      0
form_field27      0
form_field28      0
form_field29      0
form_field30      0
form_field32      0
form_field33      0
form_field34      0
form_field35      0
form_field36      0
form_field37      0
form_field38      0
form_field39      0
form_field42      0
form_field43      0
form_field44      0
form_field46      0
form_field47      0
form_field48      0
form_field49      0
form_field50      0
default_status    0
dtype: int64

In [8]:
test.isnull().sum()

Applicant_ID    0
form_field1     0
form_field2     0
form_field3     0
form_field4     0
form_field5     0
form_field6     0
form_field7     0
form_field8     0
form_field9     0
form_field10    0
form_field12    0
form_field13    0
form_field14    0
form_field15    0
form_field16    0
form_field17    0
form_field18    0
form_field19    0
form_field20    0
form_field21    0
form_field22    0
form_field24    0
form_field25    0
form_field26    0
form_field27    0
form_field28    0
form_field29    0
form_field30    0
form_field32    0
form_field33    0
form_field34    0
form_field35    0
form_field36    0
form_field37    0
form_field38    0
form_field39    0
form_field42    0
form_field43    0
form_field44    0
form_field46    0
form_field47    0
form_field48    0
form_field49    0
form_field50    0
dtype: int64

In [9]:
from scipy.stats import skew,norm  # for some statistics

In [10]:
numeric_train = train.select_dtypes(include='number').columns
skew_features = train[numeric_train].apply(lambda x: skew(x)).sort_values(ascending=True)

high_skewtrain = skew_features[skew_features > 0.5]
skew_indextrain = high_skewtrain.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skewtrain.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skewtrain})
high_skewtrain

There are 11 numerical features with Skew > 0.5 :


form_field9       3.699367
form_field8       4.273139
form_field12      5.015246
form_field15      5.212030
form_field6       6.868198
form_field48     13.222211
form_field50     17.224354
form_field10     18.037849
form_field7      37.773409
form_field13     56.209804
form_field14    118.544302
dtype: float64

In [11]:
skew_indextrain

Index(['form_field9', 'form_field8', 'form_field12', 'form_field15',
       'form_field6', 'form_field48', 'form_field50', 'form_field10',
       'form_field7', 'form_field13', 'form_field14'],
      dtype='object')

In [12]:
# Normalize skewed features using log transformation
for column in skew_indextrain:
    train[column] = np.log1p(train[column])

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [13]:
numeric_test = test.select_dtypes(include='number').columns
skew_features = test[numeric_test].apply(lambda x: skew(x)).sort_values(ascending=True)

high_skewtest = skew_features[skew_features > 0.5]
skew_indextest = high_skewtest.index

print("There are {} numerical features with Skew > 0.5 :".format(high_skewtest.shape[0]))
skewness = pd.DataFrame({'Skew' :high_skewtest})
high_skewtest

There are 12 numerical features with Skew > 0.5 :


form_field19     3.148678
form_field8      4.348827
form_field15     6.652310
form_field12     7.572344
form_field10     7.727398
form_field6      8.017278
form_field7     13.474686
form_field9     14.770864
form_field50    17.377388
form_field13    27.749131
form_field48    29.693249
form_field14    87.264067
dtype: float64

In [14]:
# Normalize skewed features using log transformation
for column in skew_indextest:
    test[column] = np.log1p(test[column])

In [15]:
train.drop(['Applicant_ID'], inplace=True, axis=1)
test.drop(['Applicant_ID'], inplace=True, axis=1)

In [16]:
train.head()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,3436.0,0.28505,1.656,0.0,0.0,0.0,16.184793,12.437474,15.267775,16.243241,...,0.0,0.392854,2.02,0.711632,0.0,charge,,1.129518,0.04338,no
1,3456.0,0.674,0.2342,0.0,0.0,0.0,13.709016,13.117415,16.020903,14.744982,...,0.0,0.314281,8.08,0.183584,0.0,charge,5.860233,1.620483,0.279476,no
2,3276.0,0.53845,3.151,0.0,6.282,,13.771497,,12.17016,13.892347,...,0.0,0.162965,18.18,0.791136,0.0,charge,,1.51337,0.011572,yes
3,3372.0,0.17005,0.505,0.0,0.0,12.16612,14.928914,12.862296,15.198417,15.102531,...,1.0,0.488884,2.02,0.685168,0.0,charge,4.510201,0.664452,0.079484,no
4,3370.0,0.7727,1.101,0.0,0.0,7.350516,12.277132,12.277132,14.065561,12.798767,...,0.0,0.275,12.12,0.438168,0.0,charge,4.593983,1.427891,0.044619,no


In [17]:
for column in train:
    if column not in skew_indextrain:
        print(column)

form_field1
form_field2
form_field3
form_field4
form_field5
form_field16
form_field17
form_field18
form_field19
form_field20
form_field21
form_field22
form_field24
form_field25
form_field26
form_field27
form_field28
form_field29
form_field30
form_field32
form_field33
form_field34
form_field35
form_field36
form_field37
form_field38
form_field39
form_field42
form_field43
form_field44
form_field46
form_field47
form_field49
default_status


In [18]:
for i in numeric_train:
    if i not in skew_indextrain:
        print(i)

form_field1
form_field2
form_field3
form_field4
form_field5
form_field16
form_field17
form_field18
form_field19
form_field20
form_field21
form_field22
form_field24
form_field25
form_field26
form_field27
form_field28
form_field29
form_field30
form_field32
form_field33
form_field34
form_field35
form_field36
form_field37
form_field38
form_field39
form_field42
form_field43
form_field44
form_field46
form_field49


In [19]:
sc = StandardScaler()
for i in numeric_train:
    if i not in skew_indextrain:
        sc.fit_transform(train[[i]])

In [20]:
train.head()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,3436.0,0.28505,1.656,0.0,0.0,0.0,16.184793,12.437474,15.267775,16.243241,...,0.0,0.392854,2.02,0.711632,0.0,charge,,1.129518,0.04338,no
1,3456.0,0.674,0.2342,0.0,0.0,0.0,13.709016,13.117415,16.020903,14.744982,...,0.0,0.314281,8.08,0.183584,0.0,charge,5.860233,1.620483,0.279476,no
2,3276.0,0.53845,3.151,0.0,6.282,,13.771497,,12.17016,13.892347,...,0.0,0.162965,18.18,0.791136,0.0,charge,,1.51337,0.011572,yes
3,3372.0,0.17005,0.505,0.0,0.0,12.16612,14.928914,12.862296,15.198417,15.102531,...,1.0,0.488884,2.02,0.685168,0.0,charge,4.510201,0.664452,0.079484,no
4,3370.0,0.7727,1.101,0.0,0.0,7.350516,12.277132,12.277132,14.065561,12.798767,...,0.0,0.275,12.12,0.438168,0.0,charge,4.593983,1.427891,0.044619,no


In [21]:
train = train.replace('NaN', -9999)

In [22]:
train.head()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,3436.0,0.28505,1.656,0.0,0.0,0.0,16.184793,12.437474,15.267775,16.243241,...,0.0,0.392854,2.02,0.711632,0.0,charge,,1.129518,0.04338,no
1,3456.0,0.674,0.2342,0.0,0.0,0.0,13.709016,13.117415,16.020903,14.744982,...,0.0,0.314281,8.08,0.183584,0.0,charge,5.860233,1.620483,0.279476,no
2,3276.0,0.53845,3.151,0.0,6.282,,13.771497,,12.17016,13.892347,...,0.0,0.162965,18.18,0.791136,0.0,charge,,1.51337,0.011572,yes
3,3372.0,0.17005,0.505,0.0,0.0,12.16612,14.928914,12.862296,15.198417,15.102531,...,1.0,0.488884,2.02,0.685168,0.0,charge,4.510201,0.664452,0.079484,no
4,3370.0,0.7727,1.101,0.0,0.0,7.350516,12.277132,12.277132,14.065561,12.798767,...,0.0,0.275,12.12,0.438168,0.0,charge,4.593983,1.427891,0.044619,no


In [23]:
product = {'charge':1, 'lending':0}
default = {'yes':1, 'no':0}
train['default_status'].replace(default, inplace=True)
train['form_field47'].replace(product, inplace=True)

In [24]:
train.describe()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
count,56000.0,56000.0,56000.0,56000.0,56000.0,42640.0,50837.0,42640.0,47992.0,55645.0,...,56000.0,56000.0,56000.0,56000.0,56000.0,56000.0,35111.0,56000.0,44944.0,56000.0
mean,-1181.922589,-6863.704139,-632.876677,-633.075654,-631.978317,10.000443,14.013063,13.05583,15.226505,13.205126,...,-7519.483643,-2362.116859,-1007.708353,-9611.894653,-28399.647714,0.649518,6.205387,-632.879822,0.692438,0.244911
std,21491.51786,25284.695483,7936.761742,7936.746184,7936.840126,4.911272,3.378568,3.304525,2.048421,5.403683,...,26370.798824,15187.817433,10020.625913,29476.393469,45093.674961,0.477125,3.345217,7936.761406,2.570089,0.430038
min,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,0.0,0.0,0.0,0.0,0.0,...,-99999.0,-99999.0,-99999.0,-99999.0,-99999.0,0.0,0.0,-99999.0,0.0,0.0
25%,3336.0,0.0477,0.0,0.0,0.0,9.54717,13.440053,12.17016,14.129228,13.017167,...,0.0,0.0,2.02,0.349148,-99999.0,0.0,4.358,0.0,0.044592,0.0
50%,3474.0,0.2267,0.06,0.0,0.0,11.65732,14.810364,13.778787,15.521403,15.125885,...,0.0,0.20625,5.05,0.54808,0.0,1.0,5.765581,0.0,0.157345,0.0
75%,3610.0,0.6757,1.2708,0.0,0.0,13.172922,15.760539,15.137671,16.645514,16.400339,...,0.0,0.611116,10.1,0.739474,0.0,1.0,7.051008,1.15848,0.405023,0.0
max,3900.0,18.01505,57.3716,91.6722,407.7486,17.788355,21.492816,18.457395,19.583998,21.507789,...,21.0,2.2,91.91,0.8,5.0,1.0,17.78976,28.0,19.256876,1.0


In [25]:
sc = StandardScaler()
for i in numeric_test:
    if i not in skew_indextest:
        sc.fit_transform(test[[i]])

In [26]:
test.head()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field38,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50
0,3236.0,0.34875,10.2006,0.0,0.0,12.944587,12.944587,12.944587,13.20064,0.0,...,14.0,0.0,0.825,1.01,0.8,0.0,charge,,0.0,0.011159
1,3284.0,1.2736,2.9606,9.0198,0.0,0.0,16.103877,10.799882,14.227686,16.722864,...,14.0,2.0,0.507694,4.04,0.623248,0.0,lending,,0.504974,0.042604
2,-99999.0,0.27505,0.06,0.0,0.0,,,,,0.0,...,0.0,-99999.0,-99999.0,0.0,-99999.0,-99999.0,charge,,0.0,
3,3232.0,0.28505,2.8032,0.0,0.0,0.0,13.068547,13.068547,14.360412,13.109566,...,26.0,0.0,0.916663,2.02,0.464224,-99999.0,charge,4.512657,0.788809,0.098966
4,3466.0,2.09545,0.8318,2.5182,0.0,9.895455,13.955849,13.955849,15.877364,15.565218,...,20.0,0.0,0.234047,23.23,0.726688,0.0,lending,7.173642,1.637733,0.15111


In [27]:
test = test.replace('NaN', -9999)

In [28]:
test.head()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field38,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50
0,3236.0,0.34875,10.2006,0.0,0.0,12.944587,12.944587,12.944587,13.20064,0.0,...,14.0,0.0,0.825,1.01,0.8,0.0,charge,,0.0,0.011159
1,3284.0,1.2736,2.9606,9.0198,0.0,0.0,16.103877,10.799882,14.227686,16.722864,...,14.0,2.0,0.507694,4.04,0.623248,0.0,lending,,0.504974,0.042604
2,-99999.0,0.27505,0.06,0.0,0.0,,,,,0.0,...,0.0,-99999.0,-99999.0,0.0,-99999.0,-99999.0,charge,,0.0,
3,3232.0,0.28505,2.8032,0.0,0.0,0.0,13.068547,13.068547,14.360412,13.109566,...,26.0,0.0,0.916663,2.02,0.464224,-99999.0,charge,4.512657,0.788809,0.098966
4,3466.0,2.09545,0.8318,2.5182,0.0,9.895455,13.955849,13.955849,15.877364,15.565218,...,20.0,0.0,0.234047,23.23,0.726688,0.0,lending,7.173642,1.637733,0.15111


In [29]:
test['form_field47'].replace(product, inplace=True)

In [30]:
test.head()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field38,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50
0,3236.0,0.34875,10.2006,0.0,0.0,12.944587,12.944587,12.944587,13.20064,0.0,...,14.0,0.0,0.825,1.01,0.8,0.0,1,,0.0,0.011159
1,3284.0,1.2736,2.9606,9.0198,0.0,0.0,16.103877,10.799882,14.227686,16.722864,...,14.0,2.0,0.507694,4.04,0.623248,0.0,0,,0.504974,0.042604
2,-99999.0,0.27505,0.06,0.0,0.0,,,,,0.0,...,0.0,-99999.0,-99999.0,0.0,-99999.0,-99999.0,1,,0.0,
3,3232.0,0.28505,2.8032,0.0,0.0,0.0,13.068547,13.068547,14.360412,13.109566,...,26.0,0.0,0.916663,2.02,0.464224,-99999.0,1,4.512657,0.788809,0.098966
4,3466.0,2.09545,0.8318,2.5182,0.0,9.895455,13.955849,13.955849,15.877364,15.565218,...,20.0,0.0,0.234047,23.23,0.726688,0.0,0,7.173642,1.637733,0.15111


In [31]:
features = train.select_dtypes(exclude=object).columns.drop(['default_status'])
for col in features:
    train[col].fillna(-999, inplace = True)
    test[col].fillna(-999, inplace = True)

In [32]:
train.head()

Unnamed: 0,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,form_field10,...,form_field39,form_field42,form_field43,form_field44,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,3436.0,0.28505,1.656,0.0,0.0,0.0,16.184793,12.437474,15.267775,16.243241,...,0.0,0.392854,2.02,0.711632,0.0,1,-999.0,1.129518,0.04338,0
1,3456.0,0.674,0.2342,0.0,0.0,0.0,13.709016,13.117415,16.020903,14.744982,...,0.0,0.314281,8.08,0.183584,0.0,1,5.860233,1.620483,0.279476,0
2,3276.0,0.53845,3.151,0.0,6.282,-999.0,13.771497,-999.0,12.17016,13.892347,...,0.0,0.162965,18.18,0.791136,0.0,1,-999.0,1.51337,0.011572,1
3,3372.0,0.17005,0.505,0.0,0.0,12.16612,14.928914,12.862296,15.198417,15.102531,...,1.0,0.488884,2.02,0.685168,0.0,1,4.510201,0.664452,0.079484,0
4,3370.0,0.7727,1.101,0.0,0.0,7.350516,12.277132,12.277132,14.065561,12.798767,...,0.0,0.275,12.12,0.438168,0.0,1,4.593983,1.427891,0.044619,0


In [33]:
X = train[features]
y = train['default_status']

In [34]:
def metric(y, pred):
    return roc_auc_score(y, pred, labels=[0,1]) 

In [35]:
n_skf = 5
kf = StratifiedKFold(n_skf)
seed = 2020
params = {'n_estimators':4000, 'learning_rate':0.01, 
          'objective':'CrossEntropy', 'eval_metric':'AUC', 
          'random_seed':seed, 'early_stopping_rounds':200, 
          'use_best_model':True,}

In [36]:
score_list = []
score=0
test_oofs = []
for i, (tr_idx, vr_idx) in enumerate(kf.split(X,y)):
    X_train, y_train = X.loc[tr_idx, features], y.loc[tr_idx]
    xval, yval = X.loc[vr_idx, features], y.loc[vr_idx]
    
    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(xval, yval)], verbose=100)
    p = model.predict_proba(xval)[:, 1]
    
    sc = metric(yval, p)
    score_list.append(sc)
    score+= sc/n_skf
    
    pred = model.predict_proba(test[features])[:, 1]
    test_oofs.append(pred)
    print('Fold {} : {}'. format(i, sc))

print()
print()
print('Avg log :', score)

0:	test: 0.7975090	best: 0.7975090 (0)	total: 154ms	remaining: 10m 14s
100:	test: 0.8249326	best: 0.8249326 (100)	total: 5.67s	remaining: 3m 38s
200:	test: 0.8287990	best: 0.8287990 (200)	total: 11.1s	remaining: 3m 28s
300:	test: 0.8310877	best: 0.8310877 (300)	total: 16.6s	remaining: 3m 23s
400:	test: 0.8325625	best: 0.8325625 (400)	total: 22s	remaining: 3m 17s
500:	test: 0.8337920	best: 0.8337920 (500)	total: 27s	remaining: 3m 8s
600:	test: 0.8347438	best: 0.8347498 (599)	total: 31.9s	remaining: 3m
700:	test: 0.8354861	best: 0.8354861 (700)	total: 37.5s	remaining: 2m 56s
800:	test: 0.8360833	best: 0.8360846 (799)	total: 43.1s	remaining: 2m 52s
900:	test: 0.8366536	best: 0.8366536 (900)	total: 49s	remaining: 2m 48s
1000:	test: 0.8370111	best: 0.8370111 (1000)	total: 54.1s	remaining: 2m 41s
1100:	test: 0.8374292	best: 0.8374292 (1100)	total: 58.7s	remaining: 2m 34s
1200:	test: 0.8377915	best: 0.8377915 (1200)	total: 1m 4s	remaining: 2m 29s
1300:	test: 0.8381274	best: 0.8381291 (1294)	t

1800:	test: 0.8309883	best: 0.8309935 (1795)	total: 1m 23s	remaining: 1m 42s
1900:	test: 0.8310549	best: 0.8310866 (1879)	total: 1m 28s	remaining: 1m 37s
2000:	test: 0.8311565	best: 0.8311565 (2000)	total: 1m 32s	remaining: 1m 32s
2100:	test: 0.8311921	best: 0.8312030 (2097)	total: 1m 37s	remaining: 1m 27s
2200:	test: 0.8311856	best: 0.8312222 (2147)	total: 1m 41s	remaining: 1m 23s
2300:	test: 0.8312407	best: 0.8312449 (2258)	total: 1m 46s	remaining: 1m 18s
2400:	test: 0.8312368	best: 0.8312637 (2344)	total: 1m 50s	remaining: 1m 13s
2500:	test: 0.8313119	best: 0.8313120 (2490)	total: 1m 55s	remaining: 1m 9s
2600:	test: 0.8313410	best: 0.8313861 (2567)	total: 1m 59s	remaining: 1m 4s
2700:	test: 0.8313720	best: 0.8313861 (2567)	total: 2m 4s	remaining: 59.7s
2800:	test: 0.8314196	best: 0.8314588 (2762)	total: 2m 8s	remaining: 55.1s
2900:	test: 0.8314069	best: 0.8314588 (2762)	total: 2m 13s	remaining: 50.4s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8314588467
bes

In [37]:
f'{n_skf} fold CV, score: {score}'

'5 fold CV, score: 0.8401613084070814'

In [38]:
oof_prediction = pd.DataFrame(test_oofs).T

In [39]:
oof_prediction.columns = ['fold' +str(i) for i in range(1, n_skf + 1)]

In [40]:
sub['default_status'] = np.mean(test_oofs, axis=0)

In [41]:
sub.to_csv('fourth_submission.csv', index=False)