In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import multiprocessing
multiprocessing.cpu_count()

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from boruta import BorutaPy as boruta

from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances

In [23]:
# Load both competition splits. index_col=False stops pandas from using the
# first CSV column as the index, so that column is kept as ordinary data
# (it shows up as 'Unnamed: 0' in the preview below).
train, test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ('../Details/dataset/train.csv', '../Details/dataset/test.csv')
)

In [24]:
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [25]:
# Columns that get integer label codes (cat0-cat4 and cat11-cat18).
label_encoding_col = [f'cat{k}' for k in (0, 1, 2, 3, 4, 11, 12, 13, 14, 15, 16, 17, 18)]
# Columns that get frequency encoding; the order is kept as originally listed.
freq_encoding_col = [f'cat{k}' for k in (5, 6, 8, 7, 9, 10)]

In [26]:
# Replace every column named in label_encoding_col with integer codes.
# One encoder object is re-fitted per column, so after the loop `le` only
# remembers the classes of the last column processed.
le = LabelEncoder()
for i in label_encoding_col:
    raw_values = train[i].values
    train[i] = le.fit_transform(raw_values)

In [27]:
# Frequency-encode every column in freq_encoding_col: each category is replaced
# by the fraction of training rows that carry it. The column is overwritten,
# so running this cell twice would encode the already-encoded values.
for i in freq_encoding_col:
    enc = train.groupby(i).size() / len(train)
    # Series.map performs the category -> frequency lookup in a single
    # vectorized pass instead of invoking a Python lambda once per row.
    train[i] = train[i].map(enc)

In [28]:
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,0,8,0,1,1,0.79521,0.62632,0.026403,0.013763,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,0,8,0,0,4,0.79521,0.005173,0.005083,0.048877,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,0,10,0,0,4,0.79521,0.62632,0.132003,0.141267,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,0,10,0,2,4,0.79521,0.62632,0.019653,0.048877,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,0,8,6,1,4,0.79521,0.23809,0.018853,0.013763,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [29]:
def cat_transformer(cat_cols, df=None):
    """Convert fraction-valued columns to whole-number percentages, in place.

    Every listed column is multiplied by 100, rounded to the nearest integer
    and stored back as an integer column.

    Parameters
    ----------
    cat_cols : iterable of str
        Names of the columns to convert.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``train`` frame is
        used, which is what the original single-argument call did. Passing a
        frame explicitly lets the same transform be applied to other data
        (for example ``test``).

    Returns
    -------
    None
        The frame is modified in place.
    """
    frame = train if df is None else df
    for col in cat_cols:
        # Vectorized equivalent of int(round(x * 100)) applied per element.
        frame[col] = (frame[col] * 100).round().astype('int64')

In [30]:
# Every encoded column is later handed to CatBoost as a categorical feature.
cat_cols = [*label_encoding_col, *freq_encoding_col]
# Only the frequency-encoded columns hold fractions, so only they are rescaled
# to integer percentages. This overwrites the columns in `train`.
cat_transformer(cat_cols=freq_encoding_col)

In [31]:
train.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,0,8,0,1,1,80,63,3,1,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,0,8,0,0,4,80,1,1,5,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,0,10,0,0,4,80,63,13,14,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,0,10,0,2,4,80,63,2,5,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,0,8,6,1,4,80,24,2,1,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


In [36]:
train['cat5'].nunique()

5

In [32]:
# Feature matrix and target built from the full training frame.
# Besides the identifier and the label, also drop 'Unnamed: 0': judging by the
# previews above it appears to be a saved row counter, not a real feature, and
# it would otherwise be fed to the model. errors='ignore' keeps the cell
# working if the CSV is ever loaded without that column.
x_train = (
    train.drop(columns=['id', 'target'])
         .drop(columns=['Unnamed: 0'], errors='ignore')
)
y_train = train['target']

In [19]:
# from categorical_transform import CategoricalTransform, IntegerCategoricalTransform
# ct = categorical_transform.IntegerCategoricalTransform(cat_cols)
# x_train = ct.fit_transform(train)
# x_test  = ct.transform(test)

In [16]:
from sklearn.metrics import roc_auc_score
import optuna
import numpy as np
from sklearn.model_selection import KFold
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline

In [21]:
def objective(trial):
    """Optuna objective: mean 5-fold ROC AUC of a CatBoost classifier.

    Samples ``depth``, ``l2_leaf_reg``, ``bagging_temperature`` and
    ``auto_class_weights`` from ``trial``; every other setting is fixed.
    Reads the notebook-level ``x_train``, ``y_train`` and ``cat_cols``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial object used to draw the hyper-parameter values.

    Returns
    -------
    float
        Mean of the per-fold ROC AUC scores (higher is better).
    """
    params = {'iterations': 10000,
        'depth': trial.suggest_int("depth", 4, 16),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 0.0001, 25, log=True),
        'bagging_temperature': trial.suggest_float("bagging_temperature", 0, 10),
        'auto_class_weights': trial.suggest_categorical('auto_class_weights', [None,'Balanced','SqrtBalanced']),
        'grow_policy': 'Lossguide',
        'early_stopping_rounds': 200,
        'eval_metric': 'AUC',
        'bootstrap_type': 'Bayesian',
        'use_best_model': True,
        'task_type': 'GPU',
        'cat_features': cat_cols,
        'verbose': False,
        'border_count': 254
    }

    cbc = CatBoostClassifier(**params)
    # A fixed seed makes every trial score on the same folds, so differences
    # between trials come from the hyper-parameters and not from the split.
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    roc_test = []

    for train_index, test_index in kf.split(x_train):
        # KFold yields positional indices, so rows must be selected with
        # .iloc; .loc would only be correct on a default 0..n-1 index.
        x_train_fold, x_test_fold = x_train.iloc[train_index], x_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
        # NOTE(review): the held-out fold also drives early stopping and
        # use_best_model, so the AUC computed on it below is optimistic.
        cbc.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold, y_test_fold))
        proba = cbc.predict_proba(x_test_fold)[:,1]
        roc_test.append(roc_auc_score(y_test_fold, proba))
    return np.mean(roc_test)

In [18]:
# Search for the hyper-parameters that maximize the objective's mean CV AUC.
# The search is bounded by wall-clock time (five hours), not by trial count.
tuning_budget_seconds = 5 * 60 * 60
study = optuna.create_study(direction="maximize")
study.optimize(objective, timeout=tuning_budget_seconds)
print(study.best_trial)

In [None]:
study.best_params

In [None]:
study.best_value

In [None]:
len(study.trials)

In [None]:
from optuna.visualization import plot_optimization_history, plot_param_importances
plot_optimization_history(study)

In [None]:
plot_param_importances(study)

In [33]:
# Hard-coded hyper-parameters for the final model, presumably copied from the
# best Optuna trial - TODO confirm against study.best_params.
# Unlike the tuning objective, no 'iterations' or 'task_type' is set here, so
# CatBoost's defaults apply for both.
params = dict(
    depth=15,
    l2_leaf_reg=0.0046887635296032305,
    bagging_temperature=8.287477579452695,
    auto_class_weights='Balanced',
    grow_policy='Lossguide',
    early_stopping_rounds=200,
    eval_metric='AUC',
    bootstrap_type='Bayesian',
    use_best_model=True,
    cat_features=cat_cols,
    verbose=False,
    border_count=254,
)

cbc = CatBoostClassifier(**params)

In [21]:
# Hold-out evaluation. The validation split is created here because no earlier
# cell defines x_val / y_val (the train_test_split call above is commented
# out), so the cell used to fail with NameError on a fresh kernel.
# The full x_train / y_train are left untouched for the cross-validation below.
x_fit, x_val, y_fit, y_val = train_test_split(x_train, y_train, random_state=42)
cbc.fit(x_fit, y_fit, eval_set=(x_val, y_val))
proba = cbc.predict_proba(x_val)[:,1]
roc_auc_score(y_val, proba)

0.8868523305497493

In [35]:
# 5-fold cross-validated ROC AUC of the final model.
# A fixed seed makes the folds, and therefore the scores, reproducible.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
roc_test = []

for train_index, test_index in kf.split(x_train):
    # KFold yields positional indices, so rows must be selected with .iloc;
    # .loc would only be correct on a default 0..n-1 index.
    x_train_fold, x_test_fold = x_train.iloc[train_index], x_train.iloc[test_index]
    y_train_fold, y_test_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    # NOTE(review): the held-out fold also drives early stopping and
    # use_best_model, so the AUC computed on it below is optimistic.
    cbc.fit(x_train_fold, y_train_fold, eval_set=(x_test_fold, y_test_fold))
    proba = cbc.predict_proba(x_test_fold)[:,1]
    roc_test.append(roc_auc_score(y_test_fold, proba))

print(np.mean(roc_test))
roc_test

0.8851611263616505


[0.8872194974652451,
 0.8846924748133398,
 0.8824341387330947,
 0.8835608550395775,
 0.8878986657569949]