In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

import optuna

from sklearn.model_selection import train_test_split
import sklearn.metrics

from xgboost import XGBClassifier

import cupy, cudf # GPU libraries
import matplotlib.pyplot as plt, gc, os

import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
def read_train_file(path = '', usecols = None):
    # LOAD DATAFRAME
    if usecols is not None: df = cudf.read_parquet(path, columns=usecols)
    else: df = cudf.read_parquet(path)
    # REDUCE DTYPE FOR CUSTOMER AND DATE
    df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
    df.S_2 = cudf.to_datetime( df.S_2 )
    # FILL NAN
    df = df.fillna(0) 
    print('shape of data:', df.shape)
    
    return df

print('Reading train data...')
TRAIN_PATH = '../input/amex-data-integer-dtypes-parquet-format/train.parquet'
train = read_train_file(path = TRAIN_PATH)

Reading train data...
shape of data: (5531451, 190)


In [3]:
train.head()

Unnamed: 0,customer_ID,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,...,D_136,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145
0,-4532153018459703766,2017-03-09,0.938469,0,0.008724,1.006838,0.009228,0.124035,0.0,0.004709,...,-1,-1,-1,0,0,0.0,0.0,0,0.00061,0
1,-4532153018459703766,2017-04-07,0.936665,0,0.004923,1.000653,0.006151,0.12675,0.0,0.002714,...,-1,-1,-1,0,0,0.0,0.0,0,0.005492,0
2,-4532153018459703766,2017-05-28,0.95418,3,0.021655,1.009672,0.006815,0.123977,0.0,0.009423,...,-1,-1,-1,0,0,0.0,0.0,0,0.006986,0
3,-4532153018459703766,2017-06-13,0.960384,0,0.013683,1.0027,0.001373,0.117169,0.0,0.005531,...,-1,-1,-1,0,0,0.0,0.0,0,0.006527,0
4,-4532153018459703766,2017-07-16,0.947248,0,0.015193,1.000727,0.007605,0.117325,0.0,0.009312,...,-1,-1,-1,0,0,0.0,0.0,0,0.008126,0


In [4]:
def process_and_feature_engineer(df):
    # FEATURE ENGINEERING FROM 
    # https://www.kaggle.com/code/huseyincot/amex-agg-data-how-it-created
    all_cols = [c for c in list(df.columns) if c not in ['customer_ID','S_2']]
    cat_features = ["B_30","B_38","D_114","D_116","D_117","D_120","D_126","D_63","D_64","D_66","D_68"]
    num_features = [col for col in all_cols if col not in cat_features]

    test_num_agg = df.groupby("customer_ID")[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    test_num_agg.columns = ['_'.join(x) for x in test_num_agg.columns]

    test_cat_agg = df.groupby("customer_ID")[cat_features].agg(['count', 'last', 'nunique'])
    test_cat_agg.columns = ['_'.join(x) for x in test_cat_agg.columns]

    df = cudf.concat([test_num_agg, test_cat_agg], axis=1)
    del test_num_agg, test_cat_agg
    print('shape after engineering', df.shape )
    
    return df

train = process_and_feature_engineer(train)

shape after engineering (458913, 918)


In [5]:
train.head()

Unnamed: 0_level_0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9223358381327749917,0.415868,0.057145,0.340178,0.498727,0.387708,2.615385,4.628507,0,16,0,...,1,13,2,1,13,-1,1,13,3,2
-9223193039457028513,0.974068,0.013094,0.964483,1.002478,1.001372,0.0,0.0,0,0,0,...,2,13,0,1,13,-1,1,13,6,1
-9223189665817919541,0.802447,0.038025,0.694073,0.828761,0.694073,0.0,0.0,0,0,0,...,1,13,0,1,13,-1,1,13,6,1
-9223188534444851899,0.791203,0.002688,0.786647,0.794826,0.787945,0.0,0.0,0,0,0,...,1,13,3,2,13,-1,1,13,5,1
-9223173911659837606,0.115666,0.078554,0.038207,0.252421,0.040486,4.384615,6.144625,0,17,13,...,1,13,0,2,13,-1,1,13,6,2


In [6]:
# ADD TARGETS
targets = cudf.read_csv('../input/amex-default-prediction/train_labels.csv')
targets['customer_ID'] = targets['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
targets = targets.set_index('customer_ID')
train = train.merge(targets, left_index=True, right_index=True, how='left')
train.target = train.target.astype('int8')
del targets

# NEEDED TO MAKE CV DETERMINISTIC (cudf merge above randomly shuffles rows)
train = train.sort_index().reset_index()

# FEATURES
FEATURES = train.columns[1:-1]
print(f'There are {len(FEATURES)} features!')

There are 918 features!


In [7]:
train_pd = train.to_pandas()
del train
_ = gc.collect()

In [8]:
train_df, test_df = train_test_split(train_pd, test_size=0.25, stratify=train_pd['target'])
del train_pd
_ = gc.collect()

In [9]:
len(train_df),len(test_df)

(344184, 114729)

In [10]:
X_train = train_df.drop(['customer_ID', 'target'], axis=1)
X_test = test_df.drop(['customer_ID', 'target'], axis=1)

In [11]:
X_train

Unnamed: 0,P_2_mean,P_2_std,P_2_min,P_2_max,P_2_last,D_39_mean,D_39_std,D_39_min,D_39_max,D_39_last,...,D_63_nunique,D_64_count,D_64_last,D_64_nunique,D_66_count,D_66_last,D_66_nunique,D_68_count,D_68_last,D_68_nunique
148373,0.598238,0.075242,0.526577,0.734283,0.729374,0.230769,0.832050,0,3,0,...,1,13,0,4,13,-1,1,13,2,3
205794,0.851990,0.027970,0.803955,0.883138,0.867589,8.384615,8.251651,0,21,21,...,1,13,0,1,13,-1,1,13,6,1
372775,0.990787,0.014948,0.967687,1.007936,1.005955,1.461538,3.502746,0,13,0,...,1,13,0,2,13,-1,1,13,5,2
409254,0.656286,0.010752,0.641476,0.667558,0.665471,0.000000,0.000000,0,0,0,...,1,6,3,2,6,-1,1,6,3,2
358753,0.759962,0.040063,0.684041,0.810911,0.775050,5.000000,5.845226,0,14,9,...,1,13,0,1,13,-1,1,13,6,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50635,0.845773,0.028264,0.792391,0.893358,0.859859,0.000000,0.000000,0,0,0,...,1,13,0,2,13,-1,1,13,6,1
413394,0.596441,,0.596441,0.596441,0.596441,0.000000,,0,0,0,...,1,1,-1,1,1,-1,1,1,-1,1
117240,0.863514,0.046102,0.779180,0.944427,0.944427,0.000000,0.000000,0,0,0,...,1,13,3,2,13,-1,1,13,4,3
447295,0.752739,0.043793,0.697787,0.805806,0.701170,0.000000,0.000000,0,0,0,...,1,13,2,2,13,-1,1,13,6,2


In [12]:
y_train = train_df['target']
y_test = test_df['target']

In [13]:
y_train

148373    0
205794    0
372775    0
409254    0
358753    0
         ..
50635     0
413394    0
117240    0
447295    0
456860    1
Name: target, Length: 344184, dtype: int8

In [14]:
del train_df, test_df
_ = gc.collect()

In [15]:
# optuna

def objective(trial):
    
    param = {
        'booster':'gbtree',
        'tree_method':'gpu_hist', 
        "objective": "binary:logistic",
        'lambda': trial.suggest_loguniform(
            'lambda', 0.01, 1.0
        ),
        'alpha': trial.suggest_loguniform(
            'alpha', 5, 20.0
        ),
        'colsample_bytree': trial.suggest_float(
            'colsample_bytree', 0.3,0.9,step=0.1
        ),
        'subsample': trial.suggest_float(
            'subsample', 0.5,1,step=0.1
        ),
        'learning_rate': trial.suggest_float(
            'learning_rate', 0.01,0.1,step=0.001
        ),
        'n_estimators': trial.suggest_int(
            "n_estimators", 800,1200,20
        ),
        'max_depth': trial.suggest_int(
            'max_depth', 4,12,1
        ),
        'random_state': 99,
        'min_child_weight': trial.suggest_int(
            'min_child_weight', 64,256,1
        ),
    }
    
    model = XGBClassifier(**param, enable_categorical = True) 
    
    model.fit(X_train,y_train)
    
    preds = pd.DataFrame(model.predict(X_test))
    
    accuracy = sklearn.metrics.accuracy_score(pd.DataFrame(y_test.reset_index()['target']),preds)
    
    return accuracy

In [16]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials= 200)

[32m[I 2022-06-25 13:23:07,963][0m A new study created in memory with name: no-name-2a047cd5-faf9-4a43-b209-101add0add52[0m
[32m[I 2022-06-25 13:24:26,094][0m Trial 0 finished with value: 0.9023263516634853 and parameters: {'lambda': 0.011524216970142453, 'alpha': 12.999461701652196, 'colsample_bytree': 0.5, 'subsample': 0.9, 'learning_rate': 0.024, 'n_estimators': 800, 'max_depth': 7, 'min_child_weight': 168}. Best is trial 0 with value: 0.9023263516634853.[0m
[32m[I 2022-06-25 13:25:42,009][0m Trial 1 finished with value: 0.903128241333926 and parameters: {'lambda': 0.0751542312191535, 'alpha': 5.4026810304109985, 'colsample_bytree': 0.8, 'subsample': 0.5, 'learning_rate': 0.046000000000000006, 'n_estimators': 860, 'max_depth': 6, 'min_child_weight': 100}. Best is trial 1 with value: 0.903128241333926.[0m
[32m[I 2022-06-25 13:27:05,991][0m Trial 2 finished with value: 0.9015941915296045 and parameters: {'lambda': 0.21652739173263372, 'alpha': 13.976027359153333, 'colsample

CPU times: user 6h 20min 25s, sys: 6min 20s, total: 6h 26min 45s
Wall time: 5h 45min 45s


In [17]:
best_params = study.best_trial.params
best_params['tree_method'] = 'gpu_hist'
best_params['booster'] = 'gbtree'
print(best_params)

{'lambda': 0.995593032891557, 'alpha': 5.450877266021495, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.041, 'n_estimators': 880, 'max_depth': 12, 'min_child_weight': 147, 'tree_method': 'gpu_hist', 'booster': 'gbtree'}


In [18]:
final_model = XGBClassifier(**best_params,enable_categorical = True)

In [19]:
final_model.fit(X_train,y_train)

XGBClassifier(alpha=5.450877266021495, base_score=0.5, booster='gbtree',
              callbacks=None, colsample_bylevel=1, colsample_bynode=1,
              colsample_bytree=0.3, early_stopping_rounds=None,
              enable_categorical=True, eval_metric=None, gamma=0, gpu_id=0,
              grow_policy='depthwise', importance_type=None,
              interaction_constraints='', lambda=0.995593032891557,
              learning_rate=0.041, max_bin=256, max_cat_to_onehot=4,
              max_delta_step=0, max_depth=12, max_leaves=0,
              min_child_weight=147, missing=nan, monotone_constraints='()',
              n_estimators=880, n_jobs=0, num_parallel_tree=1, predictor='auto',
              random_state=0, ...)

In [20]:
del X_train,X_test,y_train,y_test
_ = gc.collect()

In [21]:
import joblib
joblib.dump(final_model, "xgb_classifier_v1.h5")

['xgb_classifier_v1.h5']

In [22]:
# def read_test_file(path = '', usecols = None):
#     # LOAD DATAFRAME
#     if usecols is not None: df = cudf.read_parquet(path, columns=usecols)
#     else: df = cudf.read_parquet(path)
#     # REDUCE DTYPE FOR CUSTOMER AND DATE
#     #df['customer_ID'] = df['customer_ID'].str[-16:].str.hex_to_int().astype('int64')
#     df.S_2 = cudf.to_datetime( df.S_2 )
#     # SORT BY CUSTOMER AND DATE (so agg('last') works correctly)
#     #df = df.sort_values(['customer_ID','S_2'])
#     #df = df.reset_index(drop=True)
#     # FILL NAN
#     df = df.fillna(0) 
#     print('shape of data:', df.shape)
    
#     return df

# print('Reading test data...')
# TEST_PATH = '../input/amex-data-integer-dtypes-parquet-format/test.parquet'
# test = read_test_file(path = TEST_PATH)

In [23]:
# test.head()

In [24]:
# test = process_and_feature_engineer(test)

In [25]:
# test['prediction'] = final_model.predict_proba(test)[:,1]

In [26]:
# final = pd.DataFrame(test['prediction'].to_pandas())

In [27]:
# final.to_csv("submission.csv", index=True)