In [1]:
## CAT Boost ##
# Similarly to LGB, CatB is a gradient boost model 

In [2]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import gc
gc.enable()
import catboost as ctb
import utilities

In [3]:
train = utilities.read_csv('../../datatmp/data/trainjoincollnoisy.csv')
test = utilities.read_csv('../../datatmp/data/testjoincollnoisy.csv')

In [4]:
target = train['TARGET'].astype(int).copy()
train.drop('TARGET',axis=1, inplace=True)
testIds = test['SK_ID_CURR'].astype(int).copy()
train.drop('SK_ID_CURR', axis=1, inplace=True)
test.drop('SK_ID_CURR', axis=1, inplace=True)

In [5]:
# Due to the last version of lgb, we got an error related to special chars in column names. Replacing the blanks in the following fixed it
train.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in train.columns]
test.columns = ["".join (c if c.isalnum() else "_" for c in str(x)) for x in test.columns]

In [6]:
impFeats = []
importances = pd.read_csv('../../datatmp/data/importances.csv')

for feat in importances:
    if importances.loc[0][feat] > 0:
        impFeats.append(feat)

In [7]:
# Final train test
train = train[impFeats]
test = test[impFeats]

In [8]:
# K-Fold catboost train and predict
from sklearn.model_selection import KFold

def trainNpredCatb(train, test, y, nFolds):

    testPreds = np.zeros(test.shape[0])

    # Unbalanced data: use cv
    # Set seed to make the experiment reproducible
    folds = KFold(n_splits=nFolds, shuffle=True, random_state=42)

    for fold, (trainIds, valIds) in enumerate(folds.split(train, y)):

        trainX, trainY = train.iloc[trainIds], y.iloc[trainIds]
        valX, valY = train.iloc[valIds], y.iloc[valIds]

        clf = ctb.CatBoostClassifier(iterations=2000,
                                      learning_rate=0.02,
                                      depth=6,
                                      l2_leaf_reg=40,
                                      bootstrap_type='Bernoulli',
                                      subsample=0.8715623,
                                      scale_pos_weight=5,
                                      eval_metric='AUC',
                                      metric_period=50,
                                      od_type='Iter',
                                      od_wait=45,
                                      random_seed=42,
                                     allow_writing_files=False)

        clf.fit(trainX, trainY,
                     eval_set=(valX, valY),
                     use_best_model=True,
                     verbose=True)

        # Average best iteration preds for test 
        testPreds += clf.predict_proba(test)[:, 1] / folds.n_splits 

        # Free space
        del trainX, trainY, valX, valY
        gc.collect()

    return testPreds

In [9]:
%time predictions = trainNpredCatb(train, test, target, 5)

0:	test: 0.6912652	best: 0.6912652 (0)	total: 252ms	remaining: 8m 23s
50:	test: 0.7383902	best: 0.7383902 (50)	total: 7.51s	remaining: 4m 46s
100:	test: 0.7513409	best: 0.7513409 (100)	total: 14.7s	remaining: 4m 36s
150:	test: 0.7589573	best: 0.7589573 (150)	total: 21.8s	remaining: 4m 26s
200:	test: 0.7637604	best: 0.7637604 (200)	total: 28.8s	remaining: 4m 17s
250:	test: 0.7672185	best: 0.7672185 (250)	total: 35.7s	remaining: 4m 8s
300:	test: 0.7695085	best: 0.7695085 (300)	total: 42.6s	remaining: 4m
350:	test: 0.7713577	best: 0.7713577 (350)	total: 49.2s	remaining: 3m 51s
400:	test: 0.7729726	best: 0.7729726 (400)	total: 55.9s	remaining: 3m 43s
450:	test: 0.7742172	best: 0.7742172 (450)	total: 1m 2s	remaining: 3m 34s
500:	test: 0.7753980	best: 0.7753980 (500)	total: 1m 9s	remaining: 3m 26s
550:	test: 0.7764615	best: 0.7764615 (550)	total: 1m 15s	remaining: 3m 18s
600:	test: 0.7773527	best: 0.7773527 (600)	total: 1m 21s	remaining: 3m 10s
650:	test: 0.7784053	best: 0.7784053 (650)	tota

In [10]:
# Create submission csv 
subdf = pd.DataFrame({'SK_ID_CURR': testIds, 'TARGET': predictions})

In [11]:
subdf.to_csv('../../datatmp/data/submissioncatb.csv', index=False)