In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder,StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from catboost import CatBoostClassifier
from sklearn.metrics import log_loss, roc_auc_score
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
# Read the three CSV inputs into DataFrames.
train, test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'SampleSubmission.csv')
)

In [3]:
# The same six form fields are removed, in place, from both frames.
dropped_fields = ['form_field40', 'form_field31', 'form_field41',
                  'form_field11', 'form_field45', 'form_field23']
for frame in (train, test):
    frame.drop(columns=dropped_fields, inplace=True)

In [4]:
# Every missing value in both frames is overwritten, in place, with -99999.
missing_sentinel = -99999
for frame in (train, test):
    frame.fillna(missing_sentinel, inplace=True)

In [5]:
from scipy.stats import skew,norm  # for some statistics

In [6]:
# Compute the skewness of every numeric training column (ascending order)
# and keep the columns whose skew exceeds 0.5.
numeric_train = train.select_dtypes(include='number').columns
skew_features = train[numeric_train].apply(skew).sort_values()

is_high_skew = skew_features > 0.5
high_skewtrain = skew_features[is_high_skew]
skew_indextrain = high_skewtrain.index

print("There are {} numerical features with Skew > 0.5 :".format(len(high_skewtrain)))
skewness = pd.DataFrame({'Skew': high_skewtrain})
high_skewtrain

There are 11 numerical features with Skew > 0.5 :


form_field9       3.699367
form_field8       4.273139
form_field12      5.015246
form_field15      5.212030
form_field6       6.868198
form_field48     13.222211
form_field50     17.224354
form_field10     18.037849
form_field7      37.773409
form_field13     56.209804
form_field14    118.544302
dtype: float64

In [7]:
# Normalize skewed features using log transformation.
# log1p is only defined for values > -1, and the -99999 missing-value sentinel
# written by the earlier fillna lies outside that domain. Out-of-domain entries
# are therefore masked to NaN explicitly, instead of relying on log1p to emit
# a RuntimeWarning and produce NaN (or -inf for exactly -1) as a side effect.
for column in skew_indextrain:
    in_domain = train[column].where(train[column] > -1)
    train[column] = np.log1p(in_domain)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
# Compute the skewness of every numeric test column (ascending order)
# and keep the columns whose skew exceeds 0.5.
numeric_test = test.select_dtypes(include='number').columns
skew_features = test[numeric_test].apply(skew).sort_values()

is_high_skew = skew_features > 0.5
high_skewtest = skew_features[is_high_skew]
skew_indextest = high_skewtest.index

print("There are {} numerical features with Skew > 0.5 :".format(len(high_skewtest)))
skewness = pd.DataFrame({'Skew': high_skewtest})
high_skewtest

There are 12 numerical features with Skew > 0.5 :


form_field19     3.148678
form_field8      4.348827
form_field15     6.652310
form_field12     7.572344
form_field10     7.727398
form_field6      8.017278
form_field7     13.474686
form_field9     14.770864
form_field50    17.377388
form_field13    27.749131
form_field48    29.693249
form_field14    87.264067
dtype: float64

In [9]:
# Normalize skewed features using log transformation.
# The columns come from the training-set selection (skew_indextrain), so train
# and test receive exactly the same transformation. Selecting columns from the
# test set's own skew would log-transform a column in test that the model was
# trained on untransformed.
# log1p is only defined for values > -1; out-of-domain entries (such as the
# -99999 missing-value sentinel) are masked to NaN explicitly rather than
# through a RuntimeWarning.
for column in skew_indextrain:
    in_domain = test[column].where(test[column] > -1)
    test[column] = np.log1p(in_domain)

In [10]:
# Remove the Applicant_ID column, in place, from both frames.
for frame in (train, test):
    frame.drop(columns=['Applicant_ID'], inplace=True)

In [11]:
sc = StandardScaler()
# Fit the scaler on each numeric training column that was not log-transformed.
# NOTE(review): the array returned by fit_transform is discarded, so `train`
# itself is left unchanged by this cell.
non_skewed_train = [name for name in numeric_train if name not in skew_indextrain]
for name in non_skewed_train:
    sc.fit_transform(train[[name]])

In [12]:
# Integer codes for the two text-valued columns.
product = {'charge':1, 'lending':0}
default = {'yes':1, 'no':0}
# Assign the replaced column back to the frame. Calling replace(inplace=True)
# on a column selection is chained assignment: it operates on an intermediate
# object and does not update `train` when pandas Copy-on-Write is active.
train['default_status'] = train['default_status'].replace(default)
train['form_field47'] = train['form_field47'].replace(product)

In [13]:
sc = StandardScaler()
# Fit the scaler on each numeric test column that is not in skew_indextest.
# NOTE(review): the array returned by fit_transform is discarded, so `test`
# itself is left unchanged by this cell.
non_skewed_test = [name for name in numeric_test if name not in skew_indextest]
for name in non_skewed_test:
    sc.fit_transform(test[[name]])

In [14]:
# Swap any literal 'NaN' string in the test frame for -9999.
# NOTE(review): this value differs from the -99999 used by the earlier fillna.
test = test.replace(to_replace='NaN', value=-9999)

In [15]:
# Encode form_field47 in the test frame with the same `product` mapping used
# for train. The result is assigned back because replace(inplace=True) on a
# column selection is chained assignment and does not update `test` when
# pandas Copy-on-Write is active.
test['form_field47'] = test['form_field47'].replace(product)

In [16]:
# Model inputs: every non-object column of `train` except the target.
features = train.select_dtypes(exclude=object).columns.drop(['default_status'])
# Fill remaining NaNs (e.g. those produced by the log transform) with -999.
# The filled columns are assigned back in one step; fillna(inplace=True) on a
# single-column selection is chained assignment and does not update the parent
# frame when pandas Copy-on-Write is active.
train[features] = train[features].fillna(-999)
test[features] = test[features].fillna(-999)

In [17]:
# Feature matrix and target column taken from the training frame.
X, y = train[features], train['default_status']

In [18]:
def metric(y, pred):
    """Return the ROC AUC of the scores ``pred`` against the true labels ``y``.

    Thin wrapper around ``roc_auc_score`` called with ``labels=[0, 1]``.
    """
    auc = roc_auc_score(y, pred, labels=[0, 1])
    return auc

In [19]:
# Cross-validation setup and XGBoost hyper-parameters.
# NOTE(review): `seed` is not referenced by any cell visible here. Confirm
# whether it is meant to be passed to the splitter or the model.
seed = 2020
n_skf = 5
kf = StratifiedKFold(n_splits=n_skf)
params = dict(
    n_estimators=4000,
    learning_rate=0.01,
    max_depth=3,
    subsample=0.8,
    gamma=1,
    colsample_bytree=1,
)

In [20]:
# Stratified K-fold training: fit one XGBClassifier per fold, score it on the
# held-out part with `metric` (ROC AUC), and predict the test set each time.
score_list = []   # validation AUC of each fold
score = 0         # running mean of the fold AUCs
test_oofs = []    # one array of test-set probabilities per fold
for i, (tr_idx, vr_idx) in enumerate(kf.split(X, y)):
    # kf.split yields positional row indices, so rows are selected with .iloc.
    # Label-based .loc only gives the same rows while the index is the default
    # RangeIndex. X already holds exactly the `features` columns.
    X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    xval, yval = X.iloc[vr_idx], y.iloc[vr_idx]

    model = XGBClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(xval, yval)], verbose=100)
    p = model.predict_proba(xval)[:, 1]

    sc = metric(yval, p)
    score_list.append(sc)
    score += sc / n_skf

    pred = model.predict_proba(test[features])[:, 1]
    test_oofs.append(pred)
    print('Fold {} : {}'.format(i, sc))

print()
print()
# The averaged quantity is the ROC AUC returned by `metric`, not a log loss.
print('Avg AUC :', score)

[0]	validation_0-error:0.22089
[100]	validation_0-error:0.20973
[200]	validation_0-error:0.20607
[300]	validation_0-error:0.20250
[400]	validation_0-error:0.19875
[500]	validation_0-error:0.19732
[600]	validation_0-error:0.19687
[700]	validation_0-error:0.19634
[800]	validation_0-error:0.19536
[900]	validation_0-error:0.19518
[1000]	validation_0-error:0.19509
[1100]	validation_0-error:0.19473
[1200]	validation_0-error:0.19375
[1300]	validation_0-error:0.19411
[1400]	validation_0-error:0.19375
[1500]	validation_0-error:0.19348
[1600]	validation_0-error:0.19268
[1700]	validation_0-error:0.19187
[1800]	validation_0-error:0.19143
[1900]	validation_0-error:0.19107
[2000]	validation_0-error:0.19071
[2100]	validation_0-error:0.19054
[2200]	validation_0-error:0.19054
[2300]	validation_0-error:0.19018
[2400]	validation_0-error:0.19036
[2500]	validation_0-error:0.19045
[2600]	validation_0-error:0.19107
[2700]	validation_0-error:0.19134
[2800]	validation_0-error:0.19062
[2900]	validation_0-error: