In [1]:
import pandas as pd
import numpy as np
import catboost as cb
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from itertools import product,chain
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
train_set = pd.read_csv("train.csv", na_values='?')
test_set = pd.read_csv("test.csv", na_values='?')
category_cols = ['form_field47', 'default_status']

In [3]:
train_set.head()

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,...,0.392854,2.02,0.711632,0.0,0.0,charge,,1.129518,0.044335,no
1,Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,...,0.314281,8.08,0.183584,,0.0,charge,349.80573,1.620483,0.322436,no
2,Apcnt_1000008,3276.0,0.53845,3.151,0.0,6.282,,956940.0,,192944.0,...,0.162965,18.18,0.791136,0.0,0.0,charge,,1.51337,0.01164,yes
3,Apcnt_1000012,3372.0,0.17005,0.505,0.0,0.0,192166.0,3044703.0,385499.0,3986472.0,...,0.488884,2.02,0.685168,,0.0,charge,89.9401,0.664452,0.082729,no
4,Apcnt_1000016,3370.0,0.7727,1.101,0.0,0.0,1556.0,214728.0,214728.0,1284089.0,...,0.275,12.12,0.438168,0.0,0.0,charge,97.887502,1.427891,0.04563,no


In [4]:
test_set.head()

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,...,form_field41,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50
0,Apcnt_1000032,3236.0,0.34875,10.2006,0.0,0.0,418564.0,418564.0,418564.0,540710.0,...,,0.825,1.01,0.8,,0.0,charge,,0.0,0.011221
1,Apcnt_1000048,3284.0,1.2736,2.9606,9.0198,0.0,0.0,9858816.0,49014.0,1510098.0,...,18.8415,0.507694,4.04,0.623248,1.0,0.0,lending,,0.504974,0.043525
2,Apcnt_1000052,,0.27505,0.06,0.0,0.0,,,,,...,,,0.0,,,,charge,,0.0,
3,Apcnt_1000076,3232.0,0.28505,2.8032,0.0,0.0,0.0,473802.0,473802.0,1724437.0,...,,0.916663,2.02,0.464224,,,charge,90.163742,0.788809,0.104029
4,Apcnt_1000080,3466.0,2.09545,0.8318,2.5182,0.0,19839.0,1150662.0,1150662.0,7860523.0,...,,0.234047,23.23,0.726688,0.0,0.0,lending,1303.587148,1.637733,0.163124


In [5]:
encoder = LabelEncoder()
train_set['form_field47'] = encoder.fit_transform(train_set['form_field47'])
test_set['form_field47'] = encoder.transform(test_set['form_field47'])
train_set['default_status'] = encoder.fit_transform(train_set['default_status'])

In [6]:
# split labels out of data sets    
train_label = train_set['default_status']
train_set = train_set.drop(['Applicant_ID','default_status'], axis=1) # remove labels
test_set = test_set.drop('Applicant_ID', axis=1) # dropping the first column because it has unique elements for all entries

In [7]:
x_train, x_test, y_train, y_test = train_test_split(train_set, train_label, test_size = 0.25, random_state = 1,
                                                   stratify = train_label)

In [8]:
clf = cb.CatBoostClassifier(border_count = 100, l2_leaf_reg = 3, eval_metric = 'AUC', verbose = False, random_seed = 0)

In [10]:
cat_dims = [train_set.columns.get_loc(i) for i in category_cols[:-1]] 
clf.fit(x_train, y_train, cat_features=cat_dims, eval_set = (x_test, y_test))

<catboost.core.CatBoostClassifier at 0x1fb8490d308>

In [11]:
y_pred = clf.predict(x_test)

In [12]:
from sklearn.metrics import roc_auc_score
score = roc_auc_score(y_test, y_pred)
score

0.6814785902842143

In [13]:
clf.fit(train_set, train_label, cat_features=cat_dims)

<catboost.core.CatBoostClassifier at 0x1fb8490d308>

In [14]:
pred1 = clf.predict_proba(test_set[:12000])[:,1]

In [15]:
pred2 = clf.predict_proba(test_set[12000:])[:,1]

In [16]:
pred = np.concatenate((pred1, pred2))

In [17]:
test = pd.read_csv("test.csv")
prediction = pd.DataFrame({'Applicant_ID': test['Applicant_ID'], 'default_status': pred})
prediction.head()

Unnamed: 0,Applicant_ID,default_status
0,Apcnt_1000032,0.314407
1,Apcnt_1000048,0.34174
2,Apcnt_1000052,0.41996
3,Apcnt_1000076,0.759467
4,Apcnt_1000080,0.162663


In [18]:
prediction.to_csv('Submit78.csv', index = False)