<a href="https://colab.research.google.com/github/gauravbrills/kaggle-fiddle/blob/main/amex/amex_catboost.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### CATBOOST AMEX

### Pre-requisites

### Imports and Constants

In [5]:
import os,random 
import tqdm 
import pandas as cudf
import numpy as cupy 
from catboost import CatBoostClassifier
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
import joblib
import pathlib
import tqdm
from sklearn.model_selection import StratifiedKFold
# Widen the pandas display limits so wide feature tables print in full.
# `set_option` accepts alternating (name, value) pairs in a single call.
cudf.set_option(
    'display.max_rows', 500,
    'display.max_columns', 500,
    'display.width', 1000,
)

class CFG:
    """Notebook-wide configuration, read as plain class attributes."""

    # --- Reproducibility and locations ---
    seed = 42
    INPUT = "../input"
    model_dir = ""

    # --- Which stages of the notebook run ---
    TRAIN = True
    OPTIMIZE = False
    INFER = False
    DEBUG = False

    # --- Cross-validation ---
    n_folds = 5
    target = 'target'

    # --- Feature-engineering switches (consumed by cells not shown here) ---
    ADD_CAT = True
    ADD_LAG = True
    COMPUTE_Z = True
    ADD_DIFF_1 = True
    ADD_DIFF = [1, 3, 5]  # previously tried: [3, 6]
    ADD_PCTDIFF = [1, 3, 6]
    KURT = False
    TRIM = True

# Dataset directory, resolved relative to CFG.INPUT.
path = f'{CFG.INPUT}/amex-data-integer-dtypes-parquet-format'   
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and the
            ``PYTHONHASHSEED`` environment variable (stored as a string).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

# Seed the RNGs once, using the seed configured in CFG.
seed_everything(CFG.seed)  

### Libraries for Feature Engineering

### Feature Engineering

#### Utils

In [6]:
def agg_df_num(df):
    """Aggregate every column per ``customer_ID`` and flatten the column labels.

    The aggregation spec is read from the module-level ``f_names`` (defined
    outside this block). Each resulting two-part column label is joined into
    a single ``"<first>_<second>"`` string.

    Args:
        df: DataFrame containing a ``customer_ID`` column.

    Returns:
        The aggregated DataFrame, indexed by ``customer_ID``.
    """
    aggregated = df.groupby('customer_ID').agg(f_names)
    flat_labels = []
    for label in aggregated.columns:
        flat_labels.append(f"{label[0]}_{label[1]}")
    aggregated.columns = flat_labels
    return aggregated

# ====================================================
# Per-customer difference features: capture recent fluctuations via diff(period), e.g. periods 1, 2 and 3
# ====================================================
def get_difference(data, num_features, period=1):
    """Return, per customer, the last-row ``diff(period)`` of each feature.

    For every ``customer_ID`` group the selected columns are differenced with
    ``DataFrame.diff(period)`` and only the final row of the group is kept, so
    a group with ``period`` rows or fewer yields NaN.

    Args:
        data: DataFrame with a ``customer_ID`` column and the feature columns.
        num_features: List of column names to difference.
        period: Row offset passed to ``diff``.

    Returns:
        DataFrame with one row per customer (in groupby key order), float32
        columns named ``"<feature>_diff<period>"`` and a trailing
        ``customer_ID`` column. Empty input yields an empty frame with the
        same columns.
    """
    feature_cols = list(num_features)
    out_cols = [f'{col}_diff{period}' for col in feature_cols]

    last_rows = []
    customer_ids = []
    # Group by the bare column name, not a one-element list: since pandas 2.0
    # a list grouper yields 1-tuples as keys, which would store ('id',)
    # tuples in customer_ID and break later joins on that column.
    for customer_id, group in data.groupby('customer_ID'):
        last_diff = group[feature_cols].diff(period).iloc[[-1]]
        last_rows.append(last_diff.values.astype(np.float32))
        customer_ids.append(customer_id)

    if last_rows:
        values = np.concatenate(last_rows, axis=0)
    else:
        # np.concatenate rejects an empty list; keep the output schema instead.
        values = np.empty((0, len(feature_cols)), dtype=np.float32)

    result = pd.DataFrame(values, columns=out_cols)
    result['customer_ID'] = customer_ids
    return result

def get_pct_change(data, num_features, period=1):
    """Return, per customer, the last-row ``pct_change(period)`` of each feature.

    For every ``customer_ID`` group the selected columns go through
    ``DataFrame.pct_change(period, fill_method=None)`` and only the final row
    of the group is kept, so a group with ``period`` rows or fewer yields NaN.

    Args:
        data: DataFrame with a ``customer_ID`` column and the feature columns.
        num_features: List of column names to process.
        period: Row offset passed to ``pct_change``.

    Returns:
        DataFrame with one row per customer (in groupby key order), float32
        columns named ``"<feature>_pct_chg<period>"`` and a trailing
        ``customer_ID`` column. Empty input yields an empty frame with the
        same columns.
    """
    feature_cols = list(num_features)
    out_cols = [f'{col}_pct_chg{period}' for col in feature_cols]

    last_rows = []
    customer_ids = []
    # Group by the bare column name, not a one-element list: since pandas 2.0
    # a list grouper yields 1-tuples as keys, which would store ('id',)
    # tuples in customer_ID and break later joins on that column.
    for customer_id, group in data.groupby('customer_ID'):
        last_change = group[feature_cols].pct_change(period, fill_method=None).iloc[[-1]]
        last_rows.append(last_change.values.astype(np.float32))
        customer_ids.append(customer_id)

    if last_rows:
        values = np.concatenate(last_rows, axis=0)
    else:
        # np.concatenate rejects an empty list; keep the output schema instead.
        values = np.empty((0, len(feature_cols)), dtype=np.float32)

    result = pd.DataFrame(values, columns=out_cols)
    result['customer_ID'] = customer_ids
    return result


def kurtosis(x):
    """Return the pandas kurtosis of ``x``, wrapping non-Series input first.

    Args:
        x: A ``pd.Series`` or anything ``pd.Series`` can be built from.

    Returns:
        The value of ``pd.Series.kurtosis`` for the data.
    """
    series = x if isinstance(x, pd.Series) else pd.Series(x)
    return pd.Series.kurtosis(series)
 

# Column names shared by the pivoting helper below.
CID = "customer_ID"
TIME = "S_2"
TARGET = "target"


def pivot_data(df, train=True):
    """Pivot per-statement rows into one wide row per customer.

    Each row gets a ``statement`` index equal to the customer's row count
    minus the rank of its ``S_2`` value within that customer, cast to int8
    (so the highest-ranked row is statement 0). Every column other than the
    id, time and target columns is then spread into ``"<column>__TE<statement>"``
    columns.

    Args:
        df: Long-format DataFrame with ``customer_ID`` and ``S_2`` columns.
            It is not modified; the work happens on a copy.
        train: Accepted but not used by this function.

    Returns:
        Wide DataFrame with ``customer_ID`` as a regular column.
    """
    excluded = [CID, TIME, TARGET]
    value_cols = [name for name in df.columns if name not in excluded]

    work = df.copy()
    for helper_col in ('max', 'size', 'rank'):
        # The helper column name doubles as the groupby transform to apply.
        work[helper_col] = work.groupby([CID])[TIME].transform(helper_col)
    statement_offset = work['size'] - work['rank']
    work['statement'] = statement_offset.astype(np.int8)

    wide = work.pivot(index=CID, columns=['statement'], values=value_cols)
    wide.columns = ['{0}__TE{1}'.format(*pair) for pair in wide.columns]
    return wide.reset_index()

def agg_pct_rank_by_cat(df, main_features_last, cat_features_last):
    """Append within-category percentile ranks of the given features.

    For each categorical column, every feature is ranked inside its category
    group and divided by the group's non-null count. The results are stored
    as float16 columns named ``"<feature>_pct_rank_by_<category>"``.

    Args:
        df: Input DataFrame.
        main_features_last: Feature columns to rank.
        cat_features_last: Categorical columns to group by, one at a time.

    Returns:
        ``df`` with the new rank columns concatenated on the right.
    """
    rank_frames = []
    for cat_col in cat_features_last:
        group_rank = df[main_features_last].groupby(df[cat_col]).transform('rank')
        group_count = df[main_features_last].groupby(df[cat_col]).transform('count')
        pct_rank = group_rank / group_count
        pct_rank.columns = [feat + '_pct_rank_by_' + cat_col for feat in pct_rank.columns]
        rank_frames.append(pct_rank.astype('float16'))
    all_ranks = pd.concat(rank_frames, axis=1).astype('float16')
    return pd.concat([df, all_ranks], axis=1)
def agg_global_rank(df, main_features_last):
    """Append each feature's rank over the whole frame, divided by ``len(df)``.

    Args:
        df: Input DataFrame.
        main_features_last: Feature columns to rank.

    Returns:
        ``df`` with float16 ``"<feature>_global_rank"`` columns added on the
        right.
    """
    ranks = df[main_features_last].transform('rank')
    ranks.columns = [name + '_global_rank' for name in ranks.columns]
    scaled = ranks / len(df)
    return pd.concat([df, scaled.astype('float16')], axis=1)

def agg_standardize_by_cat(df, main_features_last, cat_features_last):
    """Append features standardized within each category group.

    For each categorical column, every feature has its group mean subtracted
    and is divided by its group standard deviation. The results are stored as
    float16 columns named ``"<feature>_standardized_by_<category>"``.

    Args:
        df: Input DataFrame.
        main_features_last: Feature columns to standardize.
        cat_features_last: Categorical columns to group by, one at a time.

    Returns:
        ``df`` with the new standardized columns concatenated on the right.
    """
    def _zscore(values):
        return (values - values.mean()) / values.std()

    scaled_frames = []
    for cat_col in cat_features_last:
        by_category = df[main_features_last].groupby(df[cat_col])
        scaled = by_category.transform(_zscore)
        scaled.columns = [feat + '_standardized_by_' + cat_col for feat in scaled.columns]
        scaled_frames.append(scaled.astype('float16'))

    all_scaled = pd.concat(scaled_frames, axis=1).astype('float16')
    return pd.concat([df, all_scaled], axis=1)

#### Main Feature Engineering

### CATBOOST Params and utility functions

In [7]:
class AmexMetric(object):
    """Custom CatBoost evaluation metric wrapping ``amex_metric``.

    Implements the three methods CatBoost expects from a Python metric
    object: ``evaluate`` returns an ``(error_sum, weight_sum)`` pair,
    ``get_final_error`` turns that pair into the reported value, and
    ``is_max_optimal`` states that larger is better.
    """

    def get_final_error(self, error, weight):
        """Return ``error`` normalised by ``weight`` (guarded against zero)."""
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """Return True: a higher metric value means a better model."""
        return True

    def evaluate(self, approxes, target, weight):
        """Score the model outputs against the labels.

        Args:
            approxes: List of list-like objects, one per approx dimension;
                exactly one dimension is supported.
            target: List-like object with the true labels.
            weight: List-like object of sample weights, may be None. It is
                not used by this metric.

        Returns:
            Tuple ``(score, 1)``. The weight sum is 1 so that
            ``get_final_error`` reports the score unchanged; returning 0
            there would divide the score by 1e-38.
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        # Score the model's actual outputs. The metric only depends on their
        # ordering, so raw approxes can be used without a sigmoid.
        pred = np.asarray(approxes[0], dtype=float)
        return amex_metric(np.asarray(target), pred), 1

In [8]:
def cat_train(x, y, xt, yt, cat_features=None, params=None):
    """Fit a GPU CatBoostClassifier and predict on the validation set.

    Args:
        x: Training features (DataFrame).
        y: Training labels.
        xt: Validation features, also used as the early-stopping eval set.
        yt: Validation labels.
        cat_features: Names of categorical columns. When omitted, every
            column of ``x`` whose name contains ``"__TE"`` is used, which is
            what this function always did regardless of the argument.
        params: Extra ``CatBoostClassifier`` keyword arguments. They override
            the built-in defaults (``iterations=10500``, ``depth=9``, ...)
            instead of colliding with them.

    Returns:
        Tuple ``(val_pred, model, 1)`` where ``val_pred`` is the positive-class
        probability for each row of ``xt``; the trailing ``1`` is kept for
        callers that unpack three values.
    """
    if cat_features is None:
        cat_features = [col for col in x.columns if "__TE" in col]
    print(f"cat_features {cat_features}")

    model_kwargs = {
        "random_state": CFG.seed,
        "task_type": "GPU",
        "devices": '0:1',
        "iterations": 10500,
        "depth": 9,
    }
    overrides = dict(params) if params else {}
    # CatBoost rejects a parameter given under two of its aliases, so a
    # default is dropped when the caller supplies any alias of it.
    aliases = {
        "random_state": ("random_seed",),
        "iterations": ("n_estimators", "num_boost_round", "num_trees"),
        "depth": ("max_depth",),
    }
    for default_name, other_names in aliases.items():
        if any(name in overrides for name in other_names):
            model_kwargs.pop(default_name, None)
    # Merging (rather than passing both explicit keywords and **params)
    # avoids "got multiple values for keyword argument" when params carries
    # e.g. "depth" or "iterations".
    model_kwargs.update(overrides)

    model = CatBoostClassifier(**model_kwargs)
    model.fit(x, y, eval_set=[(xt, yt)], cat_features=cat_features,
              verbose=100, early_stopping_rounds=700)
    return model.predict_proba(xt)[:, 1], model, 1

#### Metrics

In [9]:
def xgb_amex(y_pred, y_true):
    """Evaluation callback returning the fast AMEX metric as a named pair.

    Args:
        y_pred: Predicted scores.
        y_true: Object exposing ``get_label()`` for the true labels.

    Returns:
        Tuple ``('amex', score)``.
    """
    labels = y_true.get_label()
    score = amex_metric_np(y_pred, labels)
    return 'amex', score

# Created by https://www.kaggle.com/yunchonggan
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
def amex_metric_np(preds: np.ndarray, target: np.ndarray) -> float:
    """Fast NumPy AMEX metric: mean of normalized Gini and top-4% capture.

    Rows are ordered by descending prediction; negatives carry weight 20 and
    positives weight 1.

    Args:
        preds: Predicted scores.
        target: Binary labels aligned with ``preds``.

    Returns:
        ``0.5 * (G + D)`` where ``G`` is the weighted Gini divided by its
        closed-form maximum and ``D`` is the share of positives found within
        the first 4% of cumulative weight.
    """
    order = np.argsort(preds)[::-1]
    preds, target = preds[order], target[order]

    # Negatives (target == 0) weigh 20, positives weigh 1.
    weight = 20.0 - target * 19.0
    cum_weight_share = np.cumsum(weight / weight.sum())

    # D: positives captured inside the top 4% of cumulative weight.
    in_top_four_pct = cum_weight_share <= 0.04
    captured = np.sum(target[in_top_four_pct]) / np.sum(target)

    # G: weighted Gini normalized by the best achievable value.
    weighted_pos = target * weight
    lorentz_curve = np.cumsum(weighted_pos / weighted_pos.sum())
    gini_raw = ((lorentz_curve - cum_weight_share) * weight).sum()

    positives = np.sum(target)
    negatives = target.shape[0] - positives
    denom = positives + 20 * negatives
    gini_best = 10 * negatives * (denom - 19) / denom

    normalized_gini = gini_raw / gini_best
    return 0.5 * (normalized_gini + captured)

# we still need the official metric since the faster version above is slightly off
import pandas as pd
def amex_metric(y_true, y_pred):
    """Reference AMEX metric: mean of normalized Gini and top-4% capture.

    Negatives carry weight 20 and positives weight 1.

    Args:
        y_true: Binary labels.
        y_pred: Predicted scores aligned with ``y_true``.

    Returns:
        ``0.5 * (gini_model / gini_perfect + top_four)``.
    """
    def _rows_sorted_by(column):
        # Rebuild the (label, prediction) table and sort it descending on the
        # requested column (0 = label, 1 = prediction).
        table = np.array([y_true, y_pred]).T
        return table[table[:, column].argsort()[::-1]]

    def _row_weights(rows):
        return np.where(rows[:, 0] == 0, 20, 1)

    def _weighted_gini(column):
        rows = _rows_sorted_by(column)
        w = _row_weights(rows)
        random_curve = np.cumsum(w / np.sum(w))
        all_pos = np.sum(rows[:, 0] * w)
        found_pos = np.cumsum(rows[:, 0] * w)
        lorentz_curve = found_pos / all_pos
        return np.sum((lorentz_curve - random_curve) * w)

    # Share of positives captured within the top 4% of total weight.
    by_pred = _rows_sorted_by(1)
    pred_weights = _row_weights(by_pred)
    weight_budget = int(0.04 * np.sum(pred_weights))
    top_rows = by_pred[np.cumsum(pred_weights) <= weight_budget]
    top_four = np.sum(top_rows[:, 0]) / np.sum(by_pred[:, 0])

    # Gini of the model ordering relative to the perfect (label) ordering.
    model_gini = _weighted_gini(1)
    perfect_gini = _weighted_gini(0)
    return 0.5 * (model_gini / perfect_gini + top_four)

def lgb_amex_metric(y_pred, y_true):
    """Evaluation callback returning the reference AMEX metric.

    Args:
        y_pred: Predicted scores.
        y_true: Object exposing ``get_label()`` for the true labels.

    Returns:
        Tuple ``('amex_metric', score, True)``; the final ``True`` marks the
        metric as higher-is-better.
    """
    labels = y_true.get_label()
    score = amex_metric(labels, y_pred)
    return 'amex_metric', score, True

### Load data and add feature

### Train CAT_BOOST

In [4]:
# Shared state for the training / inference / tuning cells below.
kfold = StratifiedKFold(
    n_splits=CFG.n_folds,
    shuffle=True,
    random_state=CFG.seed,
)
folds = CFG.n_folds
score = 0   # running sum of per-fold AMEX scores
msgs = {}
not_used = get_not_used()   # columns excluded from the feature list

def train_fn(fold, x, y, xt, yt, _params=None):
    """Train one CatBoost model via ``cat_train`` and return its predictions.

    Args:
        fold: Fold index; accepted for the caller's convenience, not used here.
        x: Training features.
        y: Training labels.
        xt: Validation features.
        yt: Validation labels.
        _params: Optional dict of model parameters forwarded to ``cat_train``.
            ``None`` (the default) forwards an empty dict; a fresh dict is
            built per call instead of sharing one mutable default.

    Returns:
        Tuple ``(val_pred, model)``.
    """
    print("Start training")
    params = {} if _params is None else _params
    val_pred, model, _ = cat_train(x, y, xt, yt, params=params)
    return val_pred, model

if CFG.TRAIN:
    # `gc` is used below but is not imported at the top of this notebook.
    import gc

    # `not_used` was computed once above; reuse it instead of calling
    # get_not_used() again for every column.
    features = [col for col in train.columns if col not in not_used]
    oof_predictions = np.zeros(len(train))
    feature_importances = pd.DataFrame()
    feature_importances["feature"] = features
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        x, y = train[features].iloc[trn_ind], train[CFG.target].iloc[trn_ind]
        xt, yt = train[features].iloc[val_ind], train[CFG.target].iloc[val_ind]
        cat_features = [col for col in x.columns if "__TE" in col]
        if cat_features:
            # Fill missing values BEFORE the integer cast: casting NaN to int
            # yields an undefined value and leaves nothing for fillna to fill.
            x[cat_features] = x[cat_features].fillna(-1).astype(int)
            xt[cat_features] = xt[cat_features].fillna(-1).astype(int)

        # Reuse a previously trained model from CFG.model_dir when one exists.
        pretrained_path = f"{CFG.model_dir}/1cat_fold{fold}_seed{CFG.seed}.pkl"
        if os.path.exists(pretrained_path):
            model = joblib.load(pretrained_path)
            val_pred = model.predict_proba(xt)[:, 1]
        else:
            val_pred, model = train_fn(fold, x, y, xt, yt)
            joblib.dump(model, f'cat_fold{fold}_seed{CFG.seed}.pkl')

        amex_score = amex_metric(yt.values, val_pred)
        msg = f"Fold {fold} amex {amex_score:.4f}"
        oof_predictions[val_ind] = val_pred
        # NOTE: the "+1" is literal text in the column name; the plotting cell
        # looks the columns up under exactly this name.
        feature_importances[f"importance_fold{fold}+1"] = model.feature_importances_
        print(msg)
        score += amex_score
        del x, y, xt, yt
        gc.collect()

    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    display(oof_df.head())
    oof_df.to_csv(f'cat_{CFG.n_folds}fold_seed{CFG.seed}.csv', index=False)
    feature_importances.to_csv(f'feature_importances_{CFG.n_folds}fold_seed{CFG.seed}.csv', index=False)
    score /= folds
    print(f"Average amex score: {score:.4f}")
      
if CFG.INFER:
    # `gc` is used below but is not imported at the top of this notebook.
    import gc

    test_predictions = np.zeros(len(test))
    not_used = [i for i in not_used if i in test.columns]
    x_test = None
    for fold in range(CFG.n_folds):
        model = joblib.load(f'cat_fold{fold}_seed{CFG.seed}.pkl')
        if x_test is None:
            # Take the feature list from the fitted model so inference also
            # works when the training branch (which defines `features`) was
            # skipped in this session.
            x_test = test[model.feature_names_].copy()
            te_cols = [col for col in x_test.columns if "__TE" in col]
            if te_cols:
                # Same categorical preprocessing as the training loop; this
                # is a no-op when the columns are already integers.
                x_test[te_cols] = x_test[te_cols].fillna(-1).astype(int)
        test_pred = model.predict_proba(x_test)[:, 1]
        test_predictions += test_pred / CFG.n_folds
        # The original called torch.cuda.empty_cache() here, but torch is
        # never imported in this notebook; release the per-fold model with
        # the garbage collector instead.
        del model
        gc.collect()
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(f'test_cat_{CFG.n_folds}fold_seed{CFG.seed}.csv', index=False)

NameError: name 'get_not_used' is not defined

In [None]:
import seaborn as sns 
import matplotlib.pyplot as plt
# Plot styling: ggplot base with a light-yellow canvas and a dashed grid.
plt.style.use('ggplot')
plt.rcParams.update({
    'figure.figsize': (16, 9),
    'figure.facecolor': '#FFFACD',
    'axes.facecolor': '#FFFFE0',
    'axes.grid': True,
    'grid.alpha': 0.5,
    'grid.linestyle': '--',
})

# Average the per-fold importances, then draw the 50 highest-ranked features.
feature_importances['mean_importance'] = feature_importances[
    [f'importance_fold{fold_n}+1' for fold_n in range(CFG.n_folds)]
].mean(axis=1)
feature_importances.sort_values(by='mean_importance', ascending=False, inplace=True)
sns.barplot(
    y=feature_importances['feature'].iloc[:50],
    x=feature_importances['mean_importance'].iloc[:50],
    palette='inferno',
)
plt.title('Mean Feature Importance by Folds')
plt.show()

### Optuna

In [12]:
import optuna
from optuna.integration import CatBoostPruningCallback


def optunaOpt(model_name, t_params, n_trials=100, callbacks=(lambda trial: [])):
    """Tune CatBoost hyper-parameters with Optuna on the first CV fold.

    Args:
        model_name: Prefix for the Optuna study name.
        t_params: Dict mapping a model parameter name to a function that
            takes the trial and returns a suggested value.
        n_trials: Number of trials to run.
        callbacks: Accepted for interface compatibility; not used by the
            current implementation.

    Returns:
        The finished ``optuna.Study`` (direction: maximize the AMEX metric).
    """
    def run(trials):
        """Objective: train on the first fold and return its AMEX score."""
        trial_params = {param: param_fn(trials) for param, param_fn in t_params.items()}
        # `.get` keeps search spaces without a bootstrap_type entry working.
        bootstrap_type = trial_params.get("bootstrap_type")
        if bootstrap_type == "Bayesian":
            trial_params["bagging_temperature"] = trials.suggest_float("bagging_temperature", 0, 10)
        elif bootstrap_type == "Bernoulli":
            trial_params["subsample"] = trials.suggest_float("subsample", 0.1, 1)

        # Build the feature list here so tuning does not depend on the
        # training cell having defined a global `features`.
        not_used = get_not_used()
        features = [col for col in train.columns if col not in not_used]

        # Only the first fold is evaluated per trial.
        trn_ind, val_ind = next(iter(kfold.split(train, train[CFG.target])))
        x, y = train[features].iloc[trn_ind], train[CFG.target].iloc[trn_ind]
        xt, yt = train[features].iloc[val_ind], train[CFG.target].iloc[val_ind]

        # Mirror the categorical preprocessing of the main training loop.
        cat_cols = [col for col in features if "__TE" in col]
        if cat_cols:
            x[cat_cols] = x[cat_cols].fillna(-1).astype(int)
            xt[cat_cols] = xt[cat_cols].fillna(-1).astype(int)

        val_pred, _ = train_fn(0, x, y, xt, yt, trial_params)
        return amex_metric(yt.values, val_pred)

    study = optuna.create_study(direction="maximize",
                                study_name=f"{model_name}-study")
    study.optimize(run, n_trials)
    print('\n Best Trial:')
    print(study.best_trial)
    print('\n Best value')
    print(study.best_value)
    print('\n Best hyperparameters:')
    print(study.best_params)
    return study

ModuleNotFoundError: No module named 'optuna'

In [None]:
# 0.7932
# NOTE(review): the number above is presumably the best score reached with
# this search space - confirm.
# Optuna search space for CatBoost: each key is the parameter name handed to
# the model, each value is a function that draws a suggestion from the trial.
catb_params = {
    #"iterations":lambda trial :trial.suggest_int("iterations", 6000, 11000), 
    #"learning_rate":lambda trial :trial.suggest_loguniform("learning_rate", 0.1,1.0), 
    'l2_leaf_reg' : lambda trial :trial.suggest_categorical('l2_leaf_reg',[0.2,0.5,1,3]),
    "boosting_type": lambda trial :trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
    "bootstrap_type": lambda trial:trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli" ]
        ),
    #"colsample_bylevel": lambda trial:trial.suggest_float("colsample_bylevel", 0.01, 0.1, log=True),
    # The model receives this value under the key "depth", while Optuna
    # records it (e.g. in study.best_params) under the name "max_depth".
    "depth":lambda trial :trial.suggest_int("max_depth", 7, 12),   
}

# Run the hyper-parameter search only when explicitly enabled in CFG.
if CFG.OPTIMIZE:
  optunaOpt("Catboost",catb_params,n_trials=100, callbacks=(lambda trial: []))