https://www.kaggle.com/code/ragnar123/amex-lgbm-dart-cv-0-7963<br>
https://www.kaggle.com/code/farhantandia/catboost-optuna-stratified-k-fold-top-6/notebook

In [1]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import shap
# Pick out only a single day's worth of data and run the prediction on it
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, auc, confusion_matrix, roc_auc_score

import gc
from catboost import CatBoostClassifier as clf
import catboost as cat
import optuna

In [2]:
# train_df = pd.read_parquet('train.parquet')
# train_label_df = pd.read_csv('train_labels.csv')
# test_df = pd.read_parquet('test.parquet')

In [6]:
def _aggregate_per_customer(frame, num_features, cat_features):
    """Collapse the rows of ``frame`` into one feature row per ``customer_ID``.

    Numeric columns are summarised with mean/std/min/max/last and categorical
    columns with count/last/nunique. The resulting two-level column index is
    flattened to ``<column>_<statistic>`` names and both summaries are
    inner-joined on ``customer_ID``.
    """
    grouped = frame.groupby('customer_ID')

    num_agg = grouped[num_features].agg(['mean', 'std', 'min', 'max', 'last'])
    num_agg.columns = ['_'.join(x) for x in num_agg.columns]
    num_agg.reset_index(inplace=True)

    cat_agg = grouped[cat_features].agg(['count', 'last', 'nunique'])
    cat_agg.columns = ['_'.join(x) for x in cat_agg.columns]
    cat_agg.reset_index(inplace=True)

    return num_agg.merge(cat_agg, how='inner', on='customer_ID')


def read_preprocess_data():
    """Build per-customer feature tables and save them as parquet files.

    Reads ``train.parquet``, ``train_labels.csv`` and ``test.parquet`` from the
    working directory, aggregates every feature column per ``customer_ID`` and
    writes ``train_fe.parquet`` (features plus the columns of the labels file)
    and ``test_fe.parquet``. Returns ``None``.

    The train table is written and released before the test set is loaded so
    that both raw datasets never have to be held in memory at the same time.
    """
    train = pd.read_parquet('train.parquet')
    features = train.drop(['customer_ID', 'S_2'], axis=1).columns.to_list()
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68",
    ]
    num_features = [col for col in features if col not in cat_features]

    print('Starting training feature engineering...')
    train_labels = pd.read_csv('train_labels.csv')
    train = _aggregate_per_customer(train, num_features, cat_features)\
        .merge(train_labels, how='inner', on='customer_ID')
    train.to_parquet('train_fe.parquet')
    # Free the train frames before the test set is read.
    del train, train_labels
    gc.collect()

    test = pd.read_parquet('test.parquet')
    print('Starting test feature engineering...')
    test = _aggregate_per_customer(test, num_features, cat_features)
    test.to_parquet('test_fe.parquet')
    del test
    gc.collect()

In [7]:
# Build the aggregated feature tables and write train_fe.parquet / test_fe.parquet.
read_preprocess_data()

Starting training feature engineering...
Starting test feature engineering...


In [19]:
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import catboost
from itertools import combinations

class CFG:
    """Run-wide settings read by the seeding and training helpers."""
    # Name of the label column in the training frame.
    target = "target"
    # Number of StratifiedKFold splits; also embedded in the output file names.
    n_folds = 5
    # Seed for the RNGs and for the fold shuffling.
    seed = 42
    # Directory prefix for input files (empty string by default).
    input_dir = ""

def seed_everything(seed):
    """Seed the random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's global generator, and exports
    ``PYTHONHASHSEED`` to the process environment.

    Args:
        seed: Integer seed value.

    Note:
        ``PYTHONHASHSEED`` is read by the interpreter at start-up, so setting
        it here only affects Python processes launched after this call (for
        example worker subprocesses); it does not change string hashing in the
        already-running kernel.
    """
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    
def read_data():
    """Load the engineered feature tables from the working directory.

    Returns:
        A ``(train, test)`` tuple of DataFrames read from ``train_fe.parquet``
        and ``test_fe.parquet`` respectively.
    """
    paths = ('train_fe.parquet', 'test_fe.parquet')
    train, test = [pd.read_parquet(path) for path in paths]
    return train, test

def amex_metric(y_true, y_pred):
    """Score predictions as the mean of a normalized weighted Gini and a top-4% capture rate.

    Rows whose true label is 0 carry weight 20; all other rows carry weight 1.

    Args:
        y_true: Sequence of true labels.
        y_pred: Sequence of predicted scores, aligned with ``y_true``.

    Returns:
        ``0.5 * (G + D)`` where ``G`` is the weighted Gini of the prediction
        ordering divided by the weighted Gini of the label ordering, and ``D``
        is the share of all positives found among the highest-scored rows that
        fit within 4% of the total weight.
    """
    def _ranked_by(column):
        # Stack (label, prediction) pairs and order them descending by `column`.
        pairs = np.transpose(np.array([y_true, y_pred]))
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        ordered = _ranked_by(column)
        row_weight = np.where(ordered[:, 0] == 0, 20, 1)
        random_curve = np.cumsum(row_weight / np.sum(row_weight))
        positives_total = np.sum(ordered[:, 0] * row_weight)
        positives_running = np.cumsum(ordered[:, 0] * row_weight)
        lorentz_curve = positives_running / positives_total
        return np.sum((lorentz_curve - random_curve) * row_weight)

    # Capture rate: positives inside the top 4% of cumulative weight.
    by_prediction = _ranked_by(1)
    capture_weights = np.where(by_prediction[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(capture_weights))
    inside_budget = by_prediction[np.cumsum(capture_weights) <= weight_budget]
    top_four = np.sum(inside_budget[:, 0] / np.sum(by_prediction[:, 0]))

    # Gini of the model ordering (column 1), then of the label ordering (column 0).
    gini_by_prediction = _weighted_gini(1)
    gini_by_label = _weighted_gini(0)
    return 0.5 * (gini_by_prediction / gini_by_label + top_four)

def cat_boost_amex_metric(y_pred, y_true):
    """Adapt ``amex_metric`` to a ``(name, value, flag)`` evaluation-callback result.

    Args:
        y_pred: Predicted scores.
        y_true: Object exposing ``get_label()`` that returns the true labels.

    Returns:
        The tuple ``('amex_metric', score, True)``; the trailing ``True`` is
        presumably the "higher is better" flag expected by the caller
        (confirm against the training API this is passed to).
    """
    true_labels = y_true.get_label()
    metric_value = amex_metric(true_labels, y_pred)
    return ('amex_metric', metric_value, True)

def train_and_evaluate(train, test):
    """Cross-validate a fixed-parameter CatBoost model and save its predictions.

    Steps:
      1. Label-encode the ``*_last`` categorical columns (encoder fitted on
         ``train`` and applied to ``test``).
      2. Add a ``<col>_round2`` copy, rounded to two decimals, of every float
         column whose name contains ``last``.
      3. Train one ``CatBoostClassifier`` per stratified fold, collecting
         out-of-fold probabilities for ``train`` and fold-averaged
         probabilities for ``test``.
      4. Print the per-fold and overall ``amex_metric`` and write
         ``catboost_{n_folds}fold_seed{seed}.csv`` (out-of-fold) and
         ``test_catboost_{n_folds}fold_seed{seed}.csv`` (test).

    Args:
        train: Feature DataFrame with ``customer_ID`` and ``CFG.target`` columns.
        test: Feature DataFrame with ``customer_ID`` and the same feature columns.

    Note:
        ``train`` and ``test`` are modified in place (encoded and derived
        columns are assigned onto them). Out-of-fold and test predictions are
        both positive-class probabilities, so the two output files share the
        same scale.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    cat_features = [f'{cf}_last' for cf in cat_features]
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        test[cat_col] = encoder.transform(test[cat_col])
    num_cols = list(train.dtypes[(train.dtypes == 'float32') |
                                 (train.dtypes == 'float64')].index)
    # Skip columns this function derived earlier, so re-running on the same
    # frames does not keep adding `_round2_round2...` features.
    num_cols = [col for col in num_cols
                if 'last' in col and not col.endswith('_round2')]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        test[col + '_round2'] = test[col].round(2)

    features = [col for col in train.columns if col not in ['customer_ID',
                                                           CFG.target]]
    params = {'iterations': 1333,
              'objective': 'Logloss',
              'colsample_bylevel': 0.015880731489297147,
              'depth': 8,
              'boosting_type': 'Ordered',
              'bootstrap_type': 'Bernoulli',
              'subsample': 0.5758333972262372}
    test_predictions = np.zeros(len(test))

    oof_predictions = np.zeros(len(train))
    kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True,
                            random_state=CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')

        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]

        model = catboost.CatBoostClassifier(**params)
        model.fit(x_train, y_train)
        # Predict validation as probabilities, matching the test predictions
        # below so the saved OOF and test files can be compared or blended.
        val_pred = model.predict_proba(x_val)[:, 1]
        oof_predictions[val_ind] = val_pred
        # Predict the test set
        test_pred = model.predict_proba(test[features])[:, 1]
        test_predictions += test_pred / CFG.n_folds
        # Compute fold metric
        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        del x_train, x_val, y_train, y_val, model
        gc.collect()
    # Compute out of folds metric
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')
    # Create a dataframe to store out of folds predictions
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'],
                           'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(f'catboost_{CFG.n_folds}fold_seed{CFG.seed}.csv',
                  index=False)
    # Create a dataframe to store test prediction
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(f'test_catboost_{CFG.n_folds}fold_seed{CFG.seed}.csv', index=False)

In [17]:
def train_and_evaluate_optuna(train, test):
    """Tune CatBoost hyper-parameters with Optuna using stratified K-fold CV.

    The frames are prepared as follows: the ``*_last`` categorical columns are
    label-encoded (encoder fitted on ``train`` and applied to ``test``), and a
    ``<col>_round2`` copy rounded to two decimals is added for every float
    column whose name contains ``last``. Then 100 Optuna trials are run, each
    scoring a sampled parameter set by its out-of-fold ``amex_metric``
    (maximised). The best trial's value and parameters are printed at the end.

    Args:
        train: Feature DataFrame with ``customer_ID`` and ``CFG.target`` columns.
        test: Feature DataFrame with ``customer_ID`` and the same feature columns.

    Note:
        ``train`` and ``test`` are modified in place. The files
        ``0710_catboost_{n_folds}fold_seed{seed}.csv`` and
        ``0710_test_catboost_{n_folds}fold_seed{seed}.csv`` are rewritten only
        when a trial beats the best score seen so far, so after the study they
        hold the predictions of the best trial. Both files contain
        positive-class probabilities.
    """
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    cat_features = [f'{cf}_last' for cf in cat_features]
    for cat_col in cat_features:
        encoder = LabelEncoder()
        train[cat_col] = encoder.fit_transform(train[cat_col])
        test[cat_col] = encoder.transform(test[cat_col])
    num_cols = list(train.dtypes[(train.dtypes == 'float32') |
                                 (train.dtypes == 'float64')].index)
    # Skip columns this function derived earlier, so re-running on the same
    # frames does not keep adding `_round2_round2...` features.
    num_cols = [col for col in num_cols
                if 'last' in col and not col.endswith('_round2')]
    for col in num_cols:
        train[col + '_round2'] = train[col].round(2)
        test[col + '_round2'] = test[col].round(2)

    features = [col for col in train.columns if col not in ['customer_ID',
                                                           CFG.target]]
    # Best out-of-fold score seen across trials; gates the CSV writes below.
    best_score = float('-inf')

    def objective(trial):
        """Run one CV pass with trial-sampled parameters and return its OOF score."""
        nonlocal best_score

        params = {
            "iterations": trial.suggest_int("iterations", 50, 1500),
            "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
            "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.1),
            "depth": trial.suggest_int("depth", 1, 12),
            "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
            "bootstrap_type": trial.suggest_categorical(
                "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
            )
        }

        # Only sample the bagging parameter that matches the bootstrap type.
        if params["bootstrap_type"] == "Bayesian":
            params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
        elif params["bootstrap_type"] == "Bernoulli":
            params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

        test_predictions = np.zeros(len(test))

        oof_predictions = np.zeros(len(train))
        kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True,
                                random_state=CFG.seed)
        for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
            print(' ')
            print('-'*50)
            print(f'Training fold {fold} with {len(features)} features...')

            x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
            y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]

            model = catboost.CatBoostClassifier(**params)
            model.fit(x_train, y_train)
            # Predict validation as probabilities, matching the test
            # predictions below so both saved files share one scale.
            val_pred = model.predict_proba(x_val)[:, 1]
            oof_predictions[val_ind] = val_pred
            # Predict the test set
            test_pred = model.predict_proba(test[features])[:, 1]
            test_predictions += test_pred / CFG.n_folds
            # Compute fold metric
            score = amex_metric(y_val, val_pred)
            print(f'Our fold {fold} CV score is {score}')
            del x_train, x_val, y_train, y_val, model
            gc.collect()
        # Compute out of folds metric
        score = amex_metric(train[CFG.target], oof_predictions)
        print(f'Our out of folds CV score is {score}')

        # Persist predictions only for a new best trial; otherwise a later,
        # worse trial would overwrite the files of the best one.
        if score > best_score:
            best_score = score
            # Create a dataframe to store out of folds predictions
            oof_df = pd.DataFrame({'customer_ID': train['customer_ID'],
                                   'target': train[CFG.target], 'prediction': oof_predictions})
            oof_df.to_csv(f'0710_catboost_{CFG.n_folds}fold_seed{CFG.seed}.csv',
                          index=False)
            # Create a dataframe to store test prediction
            test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
            test_df.to_csv(f'0710_test_catboost_{CFG.n_folds}fold_seed{CFG.seed}.csv', index=False)

        return score

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=100)

    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [8]:
# Seed the RNGs, then load the engineered train/test feature tables from parquet.
seed_everything(CFG.seed)
train, test = read_data()

In [20]:
# Run the fixed-parameter CatBoost cross-validation; the Optuna search is left disabled.
train_and_evaluate(train, test)
# train_and_evaluate_optuna(train, test)

 
--------------------------------------------------
Training fold 0 with 1383 features...


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

Learning rate set to 0.098547
0:	learn: 0.5794675	total: 107ms	remaining: 2m 22s
1:	learn: 0.5015046	total: 187ms	remaining: 2m 4s
2:	learn: 0.4412207	total: 274ms	remaining: 2m 1s
3:	learn: 0.3996814	total: 363ms	remaining: 2m
4:	learn: 0.3680528	total: 436ms	remaining: 1m 55s
5:	learn: 0.3423278	total: 507ms	remaining: 1m 52s
6:	learn: 0.3246819	total: 576ms	remaining: 1m 49s
7:	learn: 0.3087952	total: 661ms	remaining: 1m 49s
8:	learn: 0.2983483	total: 748ms	remaining: 1m 49s
9:	learn: 0.2885117	total: 829ms	remaining: 1m 49s
10:	learn: 0.2805457	total: 910ms	remaining: 1m 49s
11:	learn: 0.2741628	total: 1s	remaining: 1m 50s
12:	learn: 0.2690535	total: 1.11s	remaining: 1m 53s
13:	learn: 0.2648463	total: 1.2s	remaining: 1m 53s
14:	learn: 0.2614433	total: 1.28s	remaining: 1m 52s
15:	learn: 0.2581895	total: 1.35s	remaining: 1m 51s
16:	learn: 0.2558573	total: 1.44s	remaining: 1m 51s
17:	learn: 0.2535039	total: 1.51s	remaining: 1m 50s
18:	learn: 0.2516036	total: 1.6s	remaining: 1m 50s
19: