In [1]:
import catboost

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from IPython.display import display
import seaborn as sns

%matplotlib inline

from sklearn.preprocessing import LabelEncoder,StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression, LogisticRegression
from catboost import CatBoostClassifier
import warnings
warnings.filterwarnings('ignore')

from sklearn.metrics import log_loss, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,AdaBoostRegressor
from sklearn.tree import DecisionTreeClassifier

In [3]:
# Read the three competition CSVs from the working directory:
# the labelled training set, the unlabelled test set and the submission template.
train, test, sub = (
    pd.read_csv(csv_name)
    for csv_name in ('train.csv', 'test.csv', 'SampleSubmission.csv')
)

In [4]:
# Columns removed from BOTH frames before modelling. Keeping the list in one
# place guarantees train and test always lose exactly the same columns.
dropped_columns = ['form_field40', 'form_field31', 'form_field41', 'form_field11', 'form_field45']

# Rebind instead of mutating in place, and ignore columns that are already
# gone, so re-running this cell does not raise KeyError.
train = train.drop(columns=dropped_columns, errors='ignore')
test = test.drop(columns=dropped_columns, errors='ignore')

In [5]:
# Integer encodings for the categorical feature `form_field47` and for the target.
# (The feature mapping used to be called `dict`, which shadowed the builtin.)
field47_map = {'charge': 1, 'lending': 0}
target_map = {'yes': 1, 'no': 0}

# Assign the result back instead of calling `.replace(..., inplace=True)` on a
# column selection: that chained in-place form operates on a temporary under
# pandas Copy-on-Write and leaves the frame unchanged (the warning about it is
# hidden by the notebook's global warnings filter).
# `infer_objects()` turns the resulting object column into a numeric dtype so
# the later `select_dtypes(exclude=object)` still picks these columns up.
train['default_status'] = train['default_status'].replace(target_map).infer_objects()
train['form_field47'] = train['form_field47'].replace(field47_map).infer_objects()
test['form_field47'] = test['form_field47'].replace(field47_map).infer_objects()

In [6]:
# Model features: every non-object column of `train` except the target.
features = train.select_dtypes(exclude=object).columns.drop(['default_status'])

# Replace missing values with the sentinel -999 in both frames.
# Done as one vectorised assignment per frame; the previous per-column
# `df[col].fillna(..., inplace=True)` is a chained in-place call that does not
# write back to the frame under pandas Copy-on-Write.
train[features] = train[features].fillna(-999)
test[features] = test[features].fillna(-999)

In [7]:
# Design matrix (selected feature columns) and target vector for training.
X, y = train[features], train['default_status']

In [8]:
def metric(y, pred):
    """Return the ROC AUC of the scores `pred` against the true labels `y`.

    Parameters
    ----------
    y : array-like
        Ground-truth labels.
    pred : array-like
        Predicted scores / probabilities for the positive class.

    Returns
    -------
    float
        Area under the ROC curve as computed by `roc_auc_score`.
    """
    # NOTE(review): `labels` appears to matter only for multiclass targets in
    # scikit-learn; it is kept so the call stays identical -- confirm before removing.
    return roc_auc_score(y_true=y, y_score=pred, labels=[0, 1])

In [9]:
# --- Cross-validation setup -------------------------------------------------
seed = 2020
n_skf = 5                               # number of stratified folds
kf = StratifiedKFold(n_splits=n_skf)

# --- CatBoost hyper-parameters ----------------------------------------------
# NOTE(review): early_stopping_rounds (200) is larger than n_estimators (100),
# so early stopping looks like it can never trigger; the training logs report
# the last iteration (99) as the best one. Consider raising n_estimators.
params = {
    'n_estimators': 100,
    'learning_rate': 0.01,
    'objective': 'CrossEntropy',
    'eval_metric': 'AUC',
    'random_seed': seed,
    'early_stopping_rounds': 200,
    'use_best_model': True,
}

In [10]:
# Stratified K-fold training loop.
# For every fold: fit a fresh CatBoost model on the training part, score it on
# the held-out part with `metric` (ROC AUC), and predict the test set.
score_list = []   # validation AUC of each fold
test_oofs = []    # positive-class probabilities on the test set, one array per fold

for i, (tr_idx, vr_idx) in enumerate(kf.split(X, y)):
    # `split` yields POSITIONAL row indices, so select with `.iloc`; `.loc`
    # only gave the same rows while the frame had a default RangeIndex.
    # `X` already holds just the feature columns, so no column selection is needed.
    X_train, y_train = X.iloc[tr_idx], y.iloc[tr_idx]
    xval, yval = X.iloc[vr_idx], y.iloc[vr_idx]

    model = CatBoostClassifier(**params)
    model.fit(X_train, y_train, eval_set=[(xval, yval)], verbose=100)

    # Column 1 of predict_proba is the probability of the positive class.
    p = model.predict_proba(xval)[:, 1]
    sc = metric(yval, p)
    score_list.append(sc)

    pred = model.predict_proba(test[features])[:, 1]
    test_oofs.append(pred)
    print('Fold {} : {}'. format(i, sc))

# Mean validation AUC across folds (`score` is read by a later cell).
score = float(np.mean(score_list))

print()
print()
# The metric is ROC AUC, not log loss, so label it accordingly.
print('Avg AUC :', score)

0:	test: 0.7984274	best: 0.7984274 (0)	total: 155ms	remaining: 15.4s
99:	test: 0.8250325	best: 0.8250325 (99)	total: 6.37s	remaining: 0us

bestTest = 0.8250324786
bestIteration = 99

Fold 0 : 0.8250324786439741
0:	test: 0.7863922	best: 0.7863922 (0)	total: 67.4ms	remaining: 6.67s
99:	test: 0.8211769	best: 0.8211769 (99)	total: 6.17s	remaining: 0us

bestTest = 0.8211768561
bestIteration = 99

Fold 1 : 0.8211768561258903
0:	test: 0.7870170	best: 0.7870170 (0)	total: 91.6ms	remaining: 9.07s
99:	test: 0.8269265	best: 0.8269265 (99)	total: 5.29s	remaining: 0us

bestTest = 0.8269265363
bestIteration = 99

Fold 2 : 0.8269265363399781
0:	test: 0.7785155	best: 0.7785155 (0)	total: 93.8ms	remaining: 9.29s
99:	test: 0.8151721	best: 0.8151721 (99)	total: 5.44s	remaining: 0us

bestTest = 0.8151720628
bestIteration = 99

Fold 3 : 0.8151720627750749
0:	test: 0.7894006	best: 0.7894006 (0)	total: 101ms	remaining: 10s
99:	test: 0.8361086	best: 0.8361086 (99)	total: 4.75s	remaining: 0us

bestTest = 0.836

In [11]:
# Display the number of folds together with the fold-averaged validation score.
f'{n_skf} fold CV, score: {score}'

'5 fold CV, score: 0.824883299103427'

In [12]:
# One row per test sample, one column per fold.
# Despite the "oof" name these are per-fold predictions on the TEST set.
fold_rows = pd.DataFrame(test_oofs)
oof_prediction = fold_rows.transpose()

In [13]:
# Label the columns fold1 .. fold<n_skf> (1-based fold numbers).
fold_numbers = range(1, n_skf + 1)
oof_prediction.columns = ['fold' + str(fold_no) for fold_no in fold_numbers]

In [14]:
# Preview the first rows of the per-fold test predictions.
oof_prediction.head()

Unnamed: 0,fold1,fold2,fold3,fold4,fold5
0,0.458768,0.458867,0.480519,0.463238,0.459146
1,0.432599,0.416642,0.467195,0.425918,0.436001
2,0.449155,0.459404,0.459305,0.458119,0.438836
3,0.608059,0.602081,0.595976,0.625535,0.620018
4,0.283683,0.263524,0.27786,0.275748,0.267244


In [15]:
# Final prediction = element-wise average of the per-fold test predictions.
fold_predictions = np.asarray(test_oofs)
sub['default_status'] = fold_predictions.mean(axis=0)

In [16]:
# Write the submission frame to disk without the pandas index column.
sub.to_csv('third_submission.csv', index=False)