In [1]:
from catboost import Pool, CatBoostClassifier, CatBoost
from sklearn.model_selection import StratifiedKFold, train_test_split

import vaex
import pandas as pd

import numpy as np
from scipy.stats import uniform

from amex_metric import amex_metric

In [2]:
def column_selector(df, original_col):
    """Return the columns of ``df`` that were derived from ``original_col``.

    A column is selected when its name equals ``original_col`` or starts with
    ``original_col`` followed by an underscore (the frame's columns follow a
    ``<raw name>_<aggregation>`` pattern such as ``P_2_mean``).

    Requiring the underscore separator avoids prefix collisions: with a bare
    ``startswith(original_col)`` a request for ``'B_3'`` would also return
    ``'B_30_last'`` or ``'B_38_nunique'``.

    Parameters
    ----------
    df : object with a ``columns`` attribute (e.g. ``pandas.DataFrame``)
        Source of the candidate column names; names are expected to be strings.
    original_col : str
        Raw feature name whose derived columns are wanted.

    Returns
    -------
    list of str
        Matching column names, in the order they appear in ``df.columns``.
    """
    derived_prefix = f'{original_col}_'
    return [
        name for name in df.columns
        if name == original_col or name.startswith(derived_prefix)
    ]

In [3]:
# Load the aggregated training table from disk, then show its dimensions
# followed by the per-dtype column summary and memory footprint.
train = pd.read_parquet(path='train_agg_filtered_2.parquet')
print(train.shape)
train.info()

(458913, 883)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 883 entries, P_2_mean to target
dtypes: float32(846), int32(35), object(2)
memory usage: 1.5+ GB


In [4]:
# Downcast numeric columns to 16-bit types to cut the frame's memory use.
#
# A blind ``astype`` is unsafe: float16 cannot represent magnitudes above
# 65504 (they become inf) and int16 silently wraps values outside
# [-32768, 32767]. A column is therefore narrowed only when its observed
# value range fits the target dtype; anything else keeps its current width.
# NOTE: float16 also carries only about three significant decimal digits, so
# the narrowing is lossy even for in-range values.
FLOAT16_MAX = float(np.finfo(np.float16).max)
INT16_MIN = int(np.iinfo(np.int16).min)
INT16_MAX = int(np.iinfo(np.int16).max)

kept_wide = []
for col, coltype in train.dtypes.items():
    dtype_name = str(coltype)
    if dtype_name.startswith('float'):
        target, low, high = 'float16', -FLOAT16_MAX, FLOAT16_MAX
    elif dtype_name.startswith('int'):
        target, low, high = 'int16', INT16_MIN, INT16_MAX
    else:
        # Non-numeric columns (e.g. object dtype) are left untouched.
        continue

    col_min, col_max = train[col].min(), train[col].max()
    # min()/max() skip NaN, so an all-NaN or empty column yields NaN here and
    # has no value that could overflow.
    if pd.isna(col_min) or (low <= col_min and col_max <= high):
        train[col] = train[col].astype(target)
    else:
        kept_wide.append(col)

if kept_wide:
    print(f'{len(kept_wide)} column(s) kept at their original width '
          f'(values exceed the 16-bit range): {kept_wide}')

In [5]:
# Re-inspect the frame to see the column dtypes and memory footprint after
# the numeric downcast.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 458913 entries, 0 to 458912
Columns: 883 entries, P_2_mean to target
dtypes: float16(846), int16(35), object(2)
memory usage: 778.1+ MB


In [6]:
# Column name lists for easier selection later on.
#
# The lists are built in the frame's own column order. Deriving them through
# set arithmetic would make the order depend on string hashing, which is
# randomised per interpreter session, so the feature order fed to the model
# would change from one kernel restart to the next.

# All columns
all_columns = list(train.columns)
# Training features: every column except the label (dict.fromkeys drops any
# repeated name while keeping first-seen order)
training_features = list(dict.fromkeys(col for col in all_columns if col != 'target'))
# Categorical features (as per https://www.kaggle.com/competitions/amex-default-prediction/data)
categorical_features_old = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64',
                        'D_66', 'D_68']
# Expand every raw categorical name into the aggregated columns derived from it
categorical_features = []
for raw_name in categorical_features_old:
    categorical_features += column_selector(train, raw_name)

# Numerical features: the training features that are not categorical
categorical_lookup = set(categorical_features)
numerical_features = [col for col in training_features if col not in categorical_lookup]

# Cross-validation

In [7]:
# Stratified 5-fold cross-validation. Each fold fits a fresh model on the
# training indices and predicts the held-out indices, so after the loop
# `y_pred` holds one out-of-fold prediction per row of `train`.

# Build the feature and label frames once: selecting the columns creates a new
# frame each time, so doing it inside the loop only repeats the same work.
features = train[training_features]
y_true = train[['target']]

# The splitter is given the label column as a 1-D vector.
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=333)\
        .split(X=features, y=y_true['target'])
# NaN marks rows that have not received a prediction yet.
y_pred = np.full(shape=(len(train),), fill_value=np.nan)

for count, (train_id, valid_id) in enumerate(folds):

    print('%==============================================================')
    print(f'                      Fold number {count+1}')
    print('\n%==============================================================')

    # Split training and validation sets and place them into Pool objects
    x_train, x_valid = features.iloc[train_id, :], features.iloc[valid_id, :]
    y_train, y_valid = y_true.iloc[train_id, :], y_true.iloc[valid_id, :]
    train_data = Pool(data=x_train, label=y_train, cat_features=categorical_features)
    valid_data = Pool(data=x_valid, label=y_valid, cat_features=categorical_features)

    # Build model. For bootstrap_type and score_function the trailing index
    # picks which of the listed options is in use.
    model = CatBoostClassifier(
        iterations=2500,
        loss_function='Logloss',
        task_type='GPU',
        devices='0-2',
        verbose=100,
        l2_leaf_reg=0.90,
        learning_rate=0.070,
        depth=5,
        min_data_in_leaf=3,
        bootstrap_type=['Bayesian', 'Bernoulli', 'MVS', 'No'][0],
        score_function=['Cosine', 'L2', 'NewtonCosine', 'NewtonL2'][0]
    )

    # Fit model, evaluating on the held-out fold while training
    model.fit(
        X=train_data,
        eval_set=valid_data,
        plot=True
    )

    # Store the out-of-fold predictions: column 1 of predict_proba for the
    # rows this fold did not train on (presumably P(target == 1); confirm the
    # label encoding).
    y_pred[valid_id] = model.predict_proba(x_valid)[:, 1]

# Every row belongs to exactly one validation fold, so no placeholder NaN
# should be left once all folds have run.
assert not np.isnan(y_pred).any(), 'some rows never received an out-of-fold prediction'

y_pred = pd.DataFrame(data=y_pred, columns=['prediction'])

                      Fold number 1



MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6121590	test: 0.6118526	best: 0.6118526 (0)	total: 258ms	remaining: 10m 45s
100:	learn: 0.2319206	test: 0.2329004	best: 0.2329004 (100)	total: 18.8s	remaining: 7m 26s
200:	learn: 0.2245783	test: 0.2268836	best: 0.2268836 (200)	total: 37.3s	remaining: 7m 6s
300:	learn: 0.2207461	test: 0.2244993	best: 0.2244993 (300)	total: 55.5s	remaining: 6m 45s
400:	learn: 0.2179791	test: 0.2232712	best: 0.2232712 (400)	total: 1m 13s	remaining: 6m 24s
500:	learn: 0.2156039	test: 0.2224669	best: 0.2224669 (500)	total: 1m 31s	remaining: 6m 4s
600:	learn: 0.2134110	test: 0.2218989	best: 0.2218989 (600)	total: 1m 49s	remaining: 5m 44s
700:	learn: 0.2114413	test: 0.2216055	best: 0.2216055 (700)	total: 2m 7s	remaining: 5m 26s
800:	learn: 0.2095189	test: 0.2212755	best: 0.2212755 (800)	total: 2m 26s	remaining: 5m 10s
900:	learn: 0.2076910	test: 0.2210368	best: 0.2210368 (900)	total: 2m 52s	remaining: 5m 6s
1000:	learn: 0.2059589	test: 0.2209027	best: 0.2208923 (982)	total: 3m 18s	remaining: 4m 56

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6148822	test: 0.6148850	best: 0.6148850 (0)	total: 327ms	remaining: 13m 37s
100:	learn: 0.2321101	test: 0.2335542	best: 0.2335542 (100)	total: 18.8s	remaining: 7m 26s
200:	learn: 0.2247857	test: 0.2273834	best: 0.2273834 (200)	total: 38.3s	remaining: 7m 17s
300:	learn: 0.2208365	test: 0.2247666	best: 0.2247666 (300)	total: 1m 20s	remaining: 9m 50s
400:	learn: 0.2180604	test: 0.2235646	best: 0.2235646 (400)	total: 2m 27s	remaining: 12m 54s
500:	learn: 0.2156232	test: 0.2227481	best: 0.2227481 (500)	total: 4m 15s	remaining: 16m 58s
600:	learn: 0.2134808	test: 0.2222250	best: 0.2222219 (599)	total: 5m 57s	remaining: 18m 49s
700:	learn: 0.2114228	test: 0.2217769	best: 0.2217754 (699)	total: 7m 24s	remaining: 19m
800:	learn: 0.2095345	test: 0.2214919	best: 0.2214830 (798)	total: 9m 11s	remaining: 19m 28s
900:	learn: 0.2077171	test: 0.2212819	best: 0.2212819 (900)	total: 10m 51s	remaining: 19m 16s
1000:	learn: 0.2059771	test: 0.2210751	best: 0.2210751 (1000)	total: 12m 23s	remain

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6116431	test: 0.6119281	best: 0.6119281 (0)	total: 253ms	remaining: 10m 31s
100:	learn: 0.2321313	test: 0.2331222	best: 0.2331222 (100)	total: 18.7s	remaining: 7m 23s
200:	learn: 0.2247632	test: 0.2268488	best: 0.2268488 (200)	total: 38.8s	remaining: 7m 24s
300:	learn: 0.2207768	test: 0.2242841	best: 0.2242841 (300)	total: 1m 7s	remaining: 8m 15s
400:	learn: 0.2180584	test: 0.2231170	best: 0.2231170 (400)	total: 2m 31s	remaining: 13m 15s
500:	learn: 0.2156734	test: 0.2223296	best: 0.2223296 (500)	total: 4m 17s	remaining: 17m 7s
600:	learn: 0.2134816	test: 0.2218315	best: 0.2218291 (599)	total: 6m 10s	remaining: 19m 29s
700:	learn: 0.2114212	test: 0.2213875	best: 0.2213875 (700)	total: 7m 48s	remaining: 20m 2s
800:	learn: 0.2095396	test: 0.2210764	best: 0.2210736 (797)	total: 9m 28s	remaining: 20m 5s
900:	learn: 0.2077444	test: 0.2208682	best: 0.2208682 (900)	total: 10m 59s	remaining: 19m 30s
1000:	learn: 0.2059397	test: 0.2206815	best: 0.2206786 (996)	total: 12m 45s	remaini

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6116439	test: 0.6116838	best: 0.6116838 (0)	total: 289ms	remaining: 12m 2s
100:	learn: 0.2317063	test: 0.2340681	best: 0.2340681 (100)	total: 18.9s	remaining: 7m 29s
200:	learn: 0.2241661	test: 0.2278442	best: 0.2278442 (200)	total: 43s	remaining: 8m 11s
300:	learn: 0.2202532	test: 0.2254098	best: 0.2254098 (300)	total: 2m 23s	remaining: 17m 28s
400:	learn: 0.2175224	test: 0.2242272	best: 0.2242272 (400)	total: 4m 10s	remaining: 21m 50s
500:	learn: 0.2151325	test: 0.2235247	best: 0.2235247 (500)	total: 5m 48s	remaining: 23m 10s
600:	learn: 0.2130253	test: 0.2229804	best: 0.2229804 (600)	total: 7m 37s	remaining: 24m 6s
700:	learn: 0.2109796	test: 0.2225336	best: 0.2225336 (700)	total: 9m 26s	remaining: 24m 13s
800:	learn: 0.2090780	test: 0.2222231	best: 0.2222231 (800)	total: 11m 21s	remaining: 24m 5s
900:	learn: 0.2072173	test: 0.2219228	best: 0.2219228 (900)	total: 13m 11s	remaining: 23m 24s
1000:	learn: 0.2054287	test: 0.2218340	best: 0.2218336 (999)	total: 14m 50s	remain

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.6117000	test: 0.6118954	best: 0.6118954 (0)	total: 259ms	remaining: 10m 47s
100:	learn: 0.2320604	test: 0.2336117	best: 0.2336117 (100)	total: 19s	remaining: 7m 31s
200:	learn: 0.2245658	test: 0.2277310	best: 0.2277310 (200)	total: 41.4s	remaining: 7m 53s
300:	learn: 0.2206355	test: 0.2254107	best: 0.2254107 (300)	total: 2m 22s	remaining: 17m 24s
400:	learn: 0.2177587	test: 0.2241514	best: 0.2241514 (400)	total: 4m 13s	remaining: 22m 5s
500:	learn: 0.2153748	test: 0.2234639	best: 0.2234639 (500)	total: 5m 57s	remaining: 23m 47s
600:	learn: 0.2131771	test: 0.2229030	best: 0.2229030 (600)	total: 7m 44s	remaining: 24m 28s
700:	learn: 0.2111403	test: 0.2225220	best: 0.2225220 (700)	total: 9m 40s	remaining: 24m 49s
800:	learn: 0.2092143	test: 0.2223002	best: 0.2222963 (799)	total: 11m 28s	remaining: 24m 20s
900:	learn: 0.2073869	test: 0.2220679	best: 0.2220679 (900)	total: 13m 8s	remaining: 23m 19s
1000:	learn: 0.2055967	test: 0.2218211	best: 0.2218201 (998)	total: 14m 49s	remai

In [8]:
# Report the competition metric over the out-of-fold predictions as a
# percentage rounded to two decimals.
cv_score_pct = round(100 * amex_metric(y_true, y_pred), 2)
print('Competition metric CV score: ' + str(cv_score_pct) + ' %.')

Competition metric CV score: 78.8 %.


# Trying out configs

In [9]:
# Hold out a stratified 20 % of the rows as a validation set for trying out
# model configurations.
x_train, x_valid, y_train, y_valid = train_test_split(
    train[training_features],
    train[['target']],
    test_size=0.20,
    random_state=333,
    stratify=train[['target']],
)

In [10]:
# Wrap the training and validation splits in catboost.Pool objects, marking
# the categorical columns; the training pool is built first, then the
# validation pool.
train_data, valid_data = (
    Pool(data=frame, label=labels, cat_features=categorical_features)
    for frame, labels in ((x_train, y_train), (x_valid, y_valid))
)

In [13]:
# Single train/validation run used to try out hyper-parameter values.
model = CatBoostClassifier(
    iterations=500,
    loss_function='Logloss',
    task_type='GPU',
    devices='0-2',
    verbose=100,
    l2_leaf_reg=1,               # candidates: 0.1, 0.5, 1, 2, 5, 10
    learning_rate=0.068,
    depth=5,
    min_data_in_leaf=1,
    bootstrap_type='Bayesian',   # candidates: 'Bayesian', 'Bernoulli', 'MVS', 'No'
    score_function='Cosine',     # candidates: 'Cosine', 'L2', 'NewtonCosine', 'NewtonL2'
)

# Kept as the cell's last expression: fit() hands back the classifier, whose
# repr the notebook then displays.
model.fit(train_data, eval_set=valid_data)

0:	learn: 0.6159525	test: 0.6161246	best: 0.6161246 (0)	total: 321ms	remaining: 2m 40s
100:	learn: 0.2325325	test: 0.2354872	best: 0.2354872 (100)	total: 1m 38s	remaining: 6m 27s
200:	learn: 0.2252947	test: 0.2295452	best: 0.2295452 (200)	total: 3m 24s	remaining: 5m 3s
300:	learn: 0.2214723	test: 0.2271348	best: 0.2271348 (300)	total: 5m 9s	remaining: 3m 24s
400:	learn: 0.2186941	test: 0.2259550	best: 0.2259550 (400)	total: 6m 52s	remaining: 1m 41s
499:	learn: 0.2163903	test: 0.2251764	best: 0.2251764 (499)	total: 8m 33s	remaining: 0us
bestTest = 0.2251763756
bestIteration = 499


<catboost.core.CatBoostClassifier at 0x1628296c520>