In [1]:
import catboost

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns

%matplotlib inline

from sklearn.preprocessing import LabelEncoder,StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import log_loss, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the training set, the test set and the sample submission file from the
# current working directory.
train, test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'SampleSubmission.csv')
)

In [4]:
# Encode the string-valued target and the `form_field47` column as integers.
# The mapping was previously bound to the name `dict`, which shadowed the
# builtin for the rest of the kernel session.
form_field47_map = {'charge': 1, 'lending': 0}
target_map = {'yes': 1, 'no': 0}

# Assign the replaced column back to the frame rather than calling
# `replace(..., inplace=True)` on a column selection: the in-place form acts on
# an intermediate Series and does not update the frame when pandas
# Copy-on-Write is active.
# `infer_objects()` converts the resulting object column to a numeric dtype on
# pandas versions where `replace` no longer downcasts automatically, so the
# column is still picked up by a later numeric-dtype selection.
train['default_status'] = train['default_status'].replace(target_map).infer_objects()
train['form_field47'] = train['form_field47'].replace(form_field47_map).infer_objects()
test['form_field47'] = test['form_field47'].replace(form_field47_map).infer_objects()

In [5]:
# Every non-object column of `train`, except the target, is used as a feature.
features = train.select_dtypes(exclude=object).columns.drop(['default_status'])

# Fill missing feature values with the constant -999 in both frames.
# The filled block is assigned back to the frame: calling
# `fillna(..., inplace=True)` on `frame[col]` mutates an intermediate Series
# and leaves the frame unchanged when pandas Copy-on-Write is active.
# NOTE(review): assumes `test` contains every column listed in `features`.
train[features] = train[features].fillna(-999)
test[features] = test[features].fillna(-999)

In [6]:
# Target vector and feature matrix taken from the training frame.
y = train['default_status']
X = train[features]

In [7]:
def metric(y, pred):
    """Return the ROC AUC of the scores `pred` against the true labels `y`.

    Thin wrapper around `roc_auc_score` that always passes `labels=[0, 1]`.
    """
    auc = roc_auc_score(y, pred, labels=[0, 1])
    return auc

In [8]:
# Cross-validation setup: number of folds and the stratified splitter.
seed = 2020
n_skf = 10
kf = StratifiedKFold(n_splits=n_skf)

# Keyword arguments passed to every CatBoostClassifier that is trained.
params = {
    'n_estimators': 5000,
    'learning_rate': 0.01,
    'objective': 'CrossEntropy',
    'eval_metric': 'AUC',
    'random_seed': seed,
    'early_stopping_rounds': 200,
    'max_depth': 9,
    'use_best_model': True,
}

In [9]:
# Train one CatBoost model per stratified fold, score it on the held-out part
# of the fold, and collect its predictions for the test set.
score_list = []  # validation AUC of each fold
score = 0        # mean validation AUC, accumulated fold by fold
test_oofs = []   # one array of test-set probabilities per fold
for i, (tr_idx, vr_idx) in enumerate(kf.split(X, y)):
    # `split` yields positional row indices, so rows are selected with
    # `.iloc`; `.loc` would only pick the same rows while the frame keeps a
    # default 0..n-1 index. `X` already holds exactly the feature columns.
    X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    xval, yval = X.iloc[vr_idx], y.iloc[vr_idx]

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(xval, yval)], verbose=100)
    # Probability of the positive class on the validation rows.
    p = model.predict_proba(xval)[:, 1]

    sc = metric(yval, p)
    score_list.append(sc)
    score += sc / n_skf

    # Positive-class probabilities for the test set from this fold's model.
    pred = model.predict_proba(test[features])[:, 1]
    test_oofs.append(pred)
    print('Fold {} : {}'. format(i, sc))

print()
print()
# `metric` returns ROC AUC, so the summary is labelled as AUC, not log loss.
print('Avg AUC :', score)

0:	test: 0.7976080	best: 0.7976080 (0)	total: 258ms	remaining: 21m 31s
100:	test: 0.8212678	best: 0.8212678 (100)	total: 14.8s	remaining: 11m 56s
200:	test: 0.8242382	best: 0.8242382 (200)	total: 32.2s	remaining: 12m 49s
300:	test: 0.8262571	best: 0.8262571 (300)	total: 46.1s	remaining: 12m
400:	test: 0.8276066	best: 0.8276066 (400)	total: 59.8s	remaining: 11m 25s
500:	test: 0.8286078	best: 0.8286102 (499)	total: 1m 15s	remaining: 11m 16s
600:	test: 0.8292577	best: 0.8292937 (590)	total: 1m 28s	remaining: 10m 48s
700:	test: 0.8299353	best: 0.8299514 (699)	total: 1m 42s	remaining: 10m 30s
800:	test: 0.8306242	best: 0.8306618 (793)	total: 1m 56s	remaining: 10m 11s
900:	test: 0.8310521	best: 0.8310642 (898)	total: 2m 11s	remaining: 9m 57s
1000:	test: 0.8314917	best: 0.8315054 (999)	total: 2m 26s	remaining: 9m 46s
1100:	test: 0.8318205	best: 0.8318205 (1100)	total: 2m 39s	remaining: 9m 26s
1200:	test: 0.8321040	best: 0.8321057 (1198)	total: 2m 55s	remaining: 9m 16s
1300:	test: 0.8324662	be

700:	test: 0.8414101	best: 0.8414440 (690)	total: 1m 35s	remaining: 9m 43s
800:	test: 0.8419121	best: 0.8419205 (798)	total: 1m 48s	remaining: 9m 27s
900:	test: 0.8421222	best: 0.8421245 (896)	total: 2m 1s	remaining: 9m 13s
1000:	test: 0.8424018	best: 0.8424023 (999)	total: 2m 15s	remaining: 8m 59s
1100:	test: 0.8425322	best: 0.8425353 (1098)	total: 2m 28s	remaining: 8m 46s
1200:	test: 0.8426587	best: 0.8427104 (1183)	total: 2m 42s	remaining: 8m 32s
1300:	test: 0.8427401	best: 0.8427833 (1244)	total: 2m 55s	remaining: 8m 19s
1400:	test: 0.8428995	best: 0.8429200 (1389)	total: 3m 9s	remaining: 8m 5s
1500:	test: 0.8428199	best: 0.8429200 (1389)	total: 3m 22s	remaining: 7m 52s
1600:	test: 0.8429805	best: 0.8430190 (1566)	total: 3m 35s	remaining: 7m 38s
1700:	test: 0.8430147	best: 0.8430607 (1680)	total: 3m 49s	remaining: 7m 25s
1800:	test: 0.8428573	best: 0.8430607 (1680)	total: 4m 3s	remaining: 7m 11s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.8430607004
bestIte

In [10]:
# Show the mean validation score accumulated over the cross-validation folds.
f'{n_skf} fold CV, score: {score}'

'10 fold CV, score: 0.8415222420385657'

In [11]:
# Build a frame from the per-fold test-set predictions, then transpose it so
# that each fold becomes a column.
fold_rows = pd.DataFrame(test_oofs)
oof_prediction = fold_rows.transpose()

In [12]:
# Label the columns fold1 .. fold<n_skf>.
fold_labels = ['fold' + str(fold_no) for fold_no in range(1, n_skf + 1)]
oof_prediction.columns = fold_labels

In [13]:
# Preview the first five rows of the per-fold test predictions.
oof_prediction.head(n=5)

Unnamed: 0,fold1,fold2,fold3,fold4,fold5,fold6,fold7,fold8,fold9,fold10
0,0.281607,0.334704,0.318998,0.275459,0.289099,0.293496,0.28493,0.279107,0.279472,0.320711
1,0.465998,0.397151,0.379288,0.433036,0.455497,0.385521,0.372149,0.378349,0.381009,0.40963
2,0.343069,0.370492,0.38629,0.384291,0.385248,0.391442,0.374894,0.386745,0.329025,0.365185
3,0.725817,0.75163,0.761503,0.755976,0.71524,0.788022,0.762193,0.73884,0.744081,0.743727
4,0.14949,0.137024,0.14822,0.148248,0.171759,0.14122,0.149301,0.123363,0.163097,0.144054


In [14]:
# Average the per-fold test-set probabilities element-wise and store the
# result as the submission column.
# NOTE(review): assumes the rows of `sub` are in the same order as `test` - confirm.
fold_preds = np.asarray(test_oofs)
sub['default_status'] = fold_preds.mean(axis=0)

In [15]:
# Write the submission frame to disk without the index column.
submission_path = 'tenth_submission.csv'
sub.to_csv(submission_path, index=False)