In [1]:
from catboost import Pool, CatBoostClassifier

import vaex
import pandas as pd
import numpy as np

from amex_metric import amex_metric

In [14]:
def column_selector(df, original_col):
    """Return the columns of ``df`` whose names begin with ``original_col``.

    Matching is a plain string-prefix test, so a short name also picks up
    longer names sharing that prefix (``'B_3'`` matches ``'B_30'`` as well).

    Args:
        df: Object exposing a ``columns`` iterable of string names.
        original_col: Prefix to look for at the start of each column name.

    Returns:
        list: Matching column names, in the order they appear in ``df.columns``.
    """
    matches = []
    for name in df.columns:
        if name.startswith(original_col):
            matches.append(name)
    return matches


def cat_datatypes_check(df_in, categorical_features, nan_fill=-32768):
    """Return a copy of ``df_in`` whose categorical columns are 'int' or 'str' typed.

    Float categorical columns are cast to ``int16`` and object columns to
    ``str``; columns of any other dtype are left as they are.

    Args:
        df_in: Input DataFrame. It is not modified.
        categorical_features: Names of the columns to check.
        nan_fill: Integer written in place of missing values in float columns
            before the integer cast. A float column containing NaN cannot be
            cast to an integer dtype directly (pandas raises), so missing
            values need a code of their own. The default is the smallest
            int16 value; pass another value if that collides with a real
            category code.

    Returns:
        pandas.DataFrame: The converted copy.
    """
    # DataFrame.copy() is a deep copy by default, so the input frame stays untouched
    df_out = df_in.copy()

    # Scan for possible conversions to 'int' and 'str', acceptable types for categorical features
    for feature, dtype in zip(categorical_features, df_out[categorical_features].dtypes.values):
        dtype_name = str(dtype)
        if dtype_name.startswith('float'):
            # Replace NaN first: astype('int16') fails on non-finite values
            df_out[feature] = df_out[feature].fillna(nan_fill).astype('int16')
        elif dtype_name.startswith('object'):
            df_out[feature] = df_out[feature].astype('str')

    return df_out

In [3]:
# Load the train and test feature tables from parquet files.
# The paths are relative, so both files must be in the notebook's working directory.
train = pd.read_parquet('train_agg_filtered_2.parquet')
test = pd.read_parquet('test_agg_filtered_2.parquet')

In [4]:
def _downcast_numeric(df):
    """Narrow float columns to float16 and int columns to int16, in place.

    A column is narrowed only when all of its finite values fit in the target
    dtype. Otherwise it keeps its current dtype, because an unconditional cast
    would silently wrap integers or turn large floats into ``inf``.

    Args:
        df: DataFrame to modify in place.

    Returns:
        None, so that calling it as the last statement of a cell shows no output.
    """
    float16_max = np.finfo(np.float16).max
    int16_limits = np.iinfo(np.int16)

    for col, coltype in zip(df.columns, df.dtypes.values):
        dtype_name = str(coltype)
        if dtype_name.startswith('float'):
            values = df[col].to_numpy()
            # NaN and inf survive a float16 cast unchanged, so only finite values matter
            finite = values[np.isfinite(values)]
            if finite.size == 0 or np.abs(finite).max() <= float16_max:
                df[col] = df[col].astype('float16')
        elif dtype_name.startswith('int'):
            values = df[col].to_numpy()
            if values.size == 0 or (
                values.min() >= int16_limits.min and values.max() <= int16_limits.max
            ):
                df[col] = df[col].astype('int16')


# Same memory-saving treatment for both tables
_downcast_numeric(train)
_downcast_numeric(test)

In [5]:
# Column name lists for easier selection later on:
# All columns
all_columns = list(train.columns)
# Training features: every column except the label. A comprehension is used instead of a
# set difference so the order follows train.columns and is identical on every run
# (iteration order of a set of strings changes between interpreter sessions).
training_features = [col for col in all_columns if col != 'target']
# Categorical features (as per https://www.kaggle.com/competitions/amex-default-prediction/data)
categorical_features_old = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64',
                        'D_66', 'D_68']
# Expand each original categorical name to the columns of train that start with it
categorical_features = []
for original_col in categorical_features_old:
    categorical_features += column_selector(train, original_col)

# Numerical features: training features that are not categorical, kept in column order
_categorical_set = set(categorical_features)
numerical_features = [col for col in training_features if col not in _categorical_set]

# Training

In [7]:
# Wrap the training table for CatBoost: features, label column and categorical column names
train_data = Pool(
    data=train[training_features],
    label=train[['target']],
    cat_features=categorical_features,
)

# Gradient-boosted classifier trained on GPU devices 0 through 2
model = CatBoostClassifier(
    loss_function='Logloss',
    task_type='GPU',
    devices='0-2',
    iterations=2500,
    learning_rate=0.07,
    depth=5,
    l2_leaf_reg=0.9,
    min_data_in_leaf=3,
    # Other options tried here: 'Bernoulli', 'MVS', 'No'
    bootstrap_type='Bayesian',
    # Other options tried here: 'L2', 'NewtonCosine', 'NewtonL2'
    score_function='Cosine',
    # Print a progress line every 100 iterations
    verbose=100,
)

model.fit(X=train_data)

0:	learn: 0.6124233	total: 253ms	remaining: 10m 31s
100:	learn: 0.2322530	total: 22.5s	remaining: 8m 54s
200:	learn: 0.2249842	total: 44.6s	remaining: 8m 29s
300:	learn: 0.2213607	total: 1m 6s	remaining: 8m 4s
400:	learn: 0.2187159	total: 1m 27s	remaining: 7m 40s
500:	learn: 0.2166174	total: 1m 49s	remaining: 7m 16s
600:	learn: 0.2147534	total: 2m 10s	remaining: 6m 52s
700:	learn: 0.2130580	total: 2m 32s	remaining: 6m 30s
800:	learn: 0.2114114	total: 2m 56s	remaining: 6m 13s
900:	learn: 0.2098662	total: 3m 24s	remaining: 6m 3s
1000:	learn: 0.2083754	total: 3m 52s	remaining: 5m 48s
1100:	learn: 0.2069091	total: 4m 25s	remaining: 5m 37s
1200:	learn: 0.2054638	total: 5m 10s	remaining: 5m 35s
1300:	learn: 0.2040730	total: 5m 58s	remaining: 5m 30s
1400:	learn: 0.2027213	total: 6m 56s	remaining: 5m 26s
1500:	learn: 0.2013737	total: 8m 7s	remaining: 5m 24s
1600:	learn: 0.2000517	total: 9m 39s	remaining: 5m 25s
1700:	learn: 0.1987791	total: 11m 10s	remaining: 5m 14s
1800:	learn: 0.1974878	tota

<catboost.core.CatBoostClassifier at 0x29261b94fd0>

# Submission

In [21]:
# Wrap the test table for CatBoost, using the same feature list and
# categorical column names that were used to build the training Pool.
test_data = Pool(
    data=test[training_features],
    cat_features=categorical_features
)

In [31]:
# Keep the second column of predict_proba (probability of the second class)
# as a single-column frame named 'prediction'
predictions = pd.DataFrame({'prediction': model.predict_proba(test_data)[:, 1]})

In [27]:
# Load the gzip-compressed pickled test table; in the cells shown, only its index is used.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
test_init = pd.read_pickle('test_agg.pkl', compression='gzip')

In [34]:
# Label each prediction with the corresponding index value of test_init.
# NOTE(review): an Index is assigned by position, not aligned, so this assumes
# test_init has the same rows in the same order as `test` (TODO confirm).
predictions['customer_ID'] = test_init.index

In [37]:
# Move customer_ID from a column to the index.
# This cell is not re-runnable: a second run raises KeyError because the column is gone.
predictions = predictions.set_index('customer_ID')

In [38]:
# Write the predictions, index included, to predictions.csv in the working directory
predictions.to_csv('predictions.csv')