In [1]:
import numpy as np
import pandas as pd

from pathlib import Path

from lightgbm import LGBMClassifier

from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import optuna
from optuna.visualization import plot_optimization_history, plot_param_importances
from optuna.integration import LightGBMPruningCallback

from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import ADASYN
from imblearn.under_sampling import OneSidedSelection, NeighbourhoodCleaningRule, TomekLinks


# Display settings: show up to 1000 columns and at most 20 rows per frame.
pd.options.display.max_columns = 1000
pd.options.display.max_rows = 20

## Load data

In [2]:
# Data locations, relative to the working directory; the sub-folders hang
# off the common input root.
INPUT_PATH = Path("data")
TRAIN_PATH = INPUT_PATH / "train"
TEST_PATH = INPUT_PATH / "test"

In [3]:
# Train and test use their first CSV column as the index; the sample
# submission is read with the default integer index.
train_df, test_df = (
    pd.read_csv(INPUT_PATH / csv_name, index_col=0)
    for csv_name in ("train.csv", "test.csv")
)
sub_df = pd.read_csv(INPUT_PATH / "sample_submission.csv")

In [4]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


In [33]:
# Cast every object-dtype column of both frames to pandas' categorical dtype.
for frame in (train_df, test_df):
    object_cols = frame.columns[frame.dtypes == "object"].tolist()
    for col in object_cols:
        frame[col] = frame[col].astype("category")

# Feature matrix and target.
X = train_df.drop(columns="Survived")
y = train_df["Survived"]

K = 10  # number of cross-validation folds

# LightGBM settings shared by every model; these are not part of the search.
fixed_params = dict(
    random_state=26,
    n_estimators=10000,
    learning_rate=0.03,
    metric="auc",
    verbose=-1,
)

numerical_cols = ["Age", "Fare"]
categorical_cols = ["Pclass", "Name", "Sex", "SibSp", "Parch", "Ticket", "Cabin", "Embarked"]

In [19]:
def model_instance(hyperparams, fixedparams, numerical_cols):
    """Assemble the (unfitted) pipeline: preprocessor -> resampler -> LightGBM.

    Parameters
    ----------
    hyperparams : dict
        Must contain:
        ``'clf'``      - keyword arguments forwarded to ``LGBMClassifier``;
        ``'resample'`` - ``'random'`` enables random undersampling of the
                         majority class, any other value disables resampling;
        ``'power'``    - ``True`` enables a standardized Yeo-Johnson power
                         transform of ``numerical_cols``.
    fixedparams : dict
        Further ``LGBMClassifier`` keyword arguments. If it holds a
        ``'random_state'`` entry, that value also seeds the undersampler so
        the resampled folds are reproducible.
    numerical_cols : list
        Column names handed to the power transformer.

    Returns
    -------
    imblearn.pipeline.Pipeline
        Pipeline with the steps ``'preprocessor'``, ``'resample'`` and
        ``'clf'``; disabled steps are set to ``None``.
    """
    clf = LGBMClassifier(**hyperparams['clf'], **fixedparams)

    if hyperparams['resample'] == 'random':
        # Seed the sampler from the shared settings; without a seed every fit
        # would draw a different subsample of the majority class.
        resample = RandomUnderSampler(sampling_strategy='majority',
                                      random_state=fixedparams.get('random_state'))
    else:
        resample = None

    # `== True` is kept deliberately so this check stays in step with the
    # identical test in fit_with_stop.
    if hyperparams['power'] == True:
        numeric_transformer = PowerTransformer(method='yeo-johnson',
                                               standardize=True)

        # NOTE(review): no `remainder` is given, so ColumnTransformer's default
        # applies; if that default is 'drop', only `numerical_cols` reach the
        # classifier when the power transform is enabled - confirm intent.
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, numerical_cols)])
    else:
        preprocessor = None

    pipe = Pipeline([('preprocessor', preprocessor),
                     ('resample', resample),
                     ('clf', clf)])
    return pipe

In [20]:
def fit_with_stop(pipe, X, y, X_val, y_val, trial, hyperparams, early_stopping_rounds = 50):
    """Fit ``pipe`` on ``(X, y)`` with early stopping against ``(X_val, y_val)``.

    Parameters
    ----------
    pipe : pipeline with a ``'clf'`` step (and a ``'preprocessor'`` step when
        ``hyperparams['power']`` is ``True``).
    X, y : training features and target.
    X_val, y_val : validation features and target used as the classifier's
        evaluation set.
    trial : Optuna trial, or ``0`` / ``None`` for "no trial".
        With a real trial, a ``LightGBMPruningCallback`` watching ``'auc'`` is
        attached; with ``0`` or ``None`` no callback is used.
    hyperparams : dict
        Only ``hyperparams['power']`` is read here.
    early_stopping_rounds : int, default 50
        Forwarded to the classifier's ``fit``.

    Returns
    -------
    The same ``pipe`` object, fitted.
    """
    # 0 is the sentinel the callers in this notebook pass for "no trial";
    # None is accepted too so it cannot end up wrapped in a pruning callback.
    if trial is None or trial == 0:
        pruning_callback = None
    else:
        pruning_callback = [LightGBMPruningCallback(trial, 'auc')]

    if hyperparams['power'] == True:
        # The eval set is handed straight to the classifier and does not pass
        # through the pipeline's earlier steps, so the validation features are
        # transformed here with a preprocessor fitted on the training fold.
        pipe_interim = pipe.named_steps.preprocessor.fit(X)
        X_val = pipe_interim.transform(X_val)

    # NOTE(review): passing early_stopping_rounds / verbose as fit keywords
    # depends on the installed LightGBM release - confirm before upgrading.
    pipe.fit(X, y,
             clf__eval_set=(X_val, y_val),
             clf__early_stopping_rounds=early_stopping_rounds,
             clf__verbose=0,
             clf__eval_metric="auc",
             clf__callbacks=pruning_callback)
    return pipe

In [21]:
def evaluate(model, X, y):
    """Return the ROC AUC of ``model`` on ``(X, y)``.

    The score is computed from the second column (index 1) of
    ``model.predict_proba(X)``.
    """
    class_one_proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, class_one_proba)

In [52]:
def kfold_prediction(X, y, X_test, k, hyperparams, fixedparams, numerical_cols, early_stopping_rounds = 50):
    """Average test-set probabilities over ``k`` stratified CV fits.

    One pipeline is built from ``hyperparams`` / ``fixedparams`` and re-fitted
    on each training fold (early stopping uses the held-out fold). Each fit
    contributes ``1/k`` of its ``predict_proba(X_test)[:, 1]`` to the result.

    Returns
    -------
    numpy.ndarray of length ``len(X_test)`` with the averaged probabilities.
    """
    pipeline = model_instance(hyperparams, fixedparams, numerical_cols)
    splitter = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    averaged = np.zeros(len(X_test))

    for i, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        print(f"\n FOLD {i} ...")
        # 0 is the "no Optuna trial" sentinel understood by fit_with_stop.
        fitted = fit_with_stop(pipeline,
                               X.iloc[fit_rows], y.iloc[fit_rows],
                               X.iloc[holdout_rows], y.iloc[holdout_rows],
                               0, hyperparams,
                               early_stopping_rounds)
        averaged += fitted.predict_proba(X_test)[:, 1] / k

    return averaged

In [51]:
def objective(trial):
    """Optuna objective: mean validation AUC over ``K`` stratified folds.

    Reads the module-level ``X``, ``y``, ``K``, ``fixed_params`` and
    ``numerical_cols``. Samples one hyperparameter set from ``trial``, fits the
    pipeline on each training fold and scores it on the held-out fold.

    Returns
    -------
    float
        ``np.nanmean`` of the per-fold AUC scores.
    """
    # The pipeline-level switches are suggested first, then the classifier
    # parameters, so the order of suggest calls is fixed.
    resample_choice = trial.suggest_categorical("resample", [None])
    power_choice = trial.suggest_categorical("power", [False])

    # Not searched at present: max_bin, min_data_in_leaf, min_data_in_bin,
    # min_split_gain, subsample.
    clf_space = {
        'boosting_type': trial.suggest_categorical("boosting_type", ['gbdt']),
        'num_leaves': trial.suggest_int('num_leaves', 2, 1024),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'max_depth': trial.suggest_int('max_depth', 1, 64),
        'max_delta_step': trial.suggest_int('max_delta_step', 1, 15),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 10),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 0.5),
        'cat_smooth': trial.suggest_float('cat_smooth', 10, 100.0),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
    }

    hyperparams = {
        'resample': resample_choice,
        'power': power_choice,
        'clf': clf_space,
    }

    pipeline = model_instance(hyperparams, fixed_params, numerical_cols)
    splitter = StratifiedKFold(n_splits=K, shuffle=True, random_state=42)

    # NOTE(review): the same trial is handed to the pruning callback in every
    # fold - confirm that reporting from several folds to one trial is intended.
    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X, y):
        X_holdout = X.iloc[holdout_rows]
        y_holdout = y.iloc[holdout_rows]
        fitted = fit_with_stop(pipeline,
                               X.iloc[fit_rows], y.iloc[fit_rows],
                               X_holdout, y_holdout,
                               trial, hyperparams)
        fold_scores.append(evaluate(fitted, X_holdout, y_holdout))

    return np.nanmean(fold_scores)

In [35]:
# Maximize the objective; a Hyperband pruner decides which trials to stop early.
hyperband = optuna.pruners.HyperbandPruner()
study = optuna.create_study(pruner=hyperband, direction='maximize')

[32m[I 2021-04-07 10:48:17,175][0m A new study created in memory with name: no-name-a3fba1a6-b308-44a6-b881-1a7de8bfbd6f[0m


In [36]:
%%time
# NOTE(review): this import sits mid-notebook; the other imports are in the first cell.
import warnings
# Installs a global 'ignore' filter: it stays active for every later cell in
# this kernel session, not just for the search below.
warnings.filterwarnings('ignore')

# Run the search with a 5-minute timeout (given in seconds), no cap on the
# number of trials, and n_jobs=-1 for parallel trials.
# NOTE(review): the recorded run took 16min 39s of wall time with very high
# sys time - presumably running trials are not interrupted by the timeout and
# the parallel trials compete with LightGBM's own threads; verify.
study.optimize(objective, timeout=60*5, n_jobs=-1,
               n_trials=None, gc_after_trial=False)

[32m[I 2021-04-07 11:03:06,706][0m Trial 0 finished with value: 0.8213821769158154 and parameters: {'resample': None, 'power': False, 'boosting_type': 'gbdt', 'num_leaves': 381, 'min_child_samples': 39, 'max_depth': 40, 'max_delta_step': 11, 'reg_alpha': 9.759497194608587, 'reg_lambda': 4.756910568566625, 'colsample_bytree': 0.14888219780095324, 'cat_smooth': 26.8758199052397, 'cat_l2': 20}. Best is trial 0 with value: 0.8213821769158154.[0m
[32m[I 2021-04-07 11:03:06,795][0m Trial 2 pruned. Trial was pruned at iteration 487.[0m
[32m[I 2021-04-07 11:03:06,813][0m Trial 10 pruned. Trial was pruned at iteration 426.[0m
[32m[I 2021-04-07 11:03:06,870][0m Trial 1 pruned. Trial was pruned at iteration 304.[0m
[32m[I 2021-04-07 11:03:06,923][0m Trial 4 pruned. Trial was pruned at iteration 55.[0m
[32m[I 2021-04-07 11:03:06,963][0m Trial 7 pruned. Trial was pruned at iteration 205.[0m
[32m[I 2021-04-07 11:03:06,998][0m Trial 9 pruned. Trial was pruned at iteration 473.[0m

CPU times: user 1h 2min 23s, sys: 2h 2min 15s, total: 3h 4min 39s
Wall time: 16min 39s


In [37]:
# Tabulate every trial: value, timings, sampled parameters and final state.
study.trials_dataframe()

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_boosting_type,params_cat_l2,params_cat_smooth,params_colsample_bytree,params_max_delta_step,params_max_depth,params_min_child_samples,params_num_leaves,params_power,params_reg_alpha,params_reg_lambda,params_resample,system_attrs_completed_rung_0,system_attrs_completed_rung_1,system_attrs_completed_rung_2,system_attrs_completed_rung_3,system_attrs_completed_rung_4,system_attrs_completed_rung_5,state
0,0,0.821382,2021-04-07 10:48:17.490123,2021-04-07 11:03:06.700132,0 days 00:14:49.210009,gbdt,20,26.87582,0.148882,11,40,39,381,False,9.759497,4.756911,,,,,,,,COMPLETE
1,1,0.845913,2021-04-07 10:48:17.494871,2021-04-07 11:03:06.870719,0 days 00:14:49.375848,gbdt,3,83.206547,0.312414,10,7,37,960,False,0.061476,8.641008,,0.845913,,,,,,PRUNED
2,2,0.841146,2021-04-07 10:48:17.498012,2021-04-07 11:03:06.795416,0 days 00:14:49.297404,gbdt,11,46.137119,0.46461,6,5,83,692,False,4.842826,5.763122,,0.841146,,,,,,PRUNED
3,3,0.84367,2021-04-07 10:48:17.501175,2021-04-07 11:04:51.157994,0 days 00:16:33.656819,gbdt,16,74.543171,0.384185,8,62,7,626,False,6.364873,9.600611,,0.846185,0.846185,0.846185,0.846185,0.846185,0.846185,COMPLETE
4,4,0.843166,2021-04-07 10:48:17.503880,2021-04-07 11:03:06.923838,0 days 00:14:49.419958,gbdt,9,12.328906,0.411898,8,20,91,333,False,0.266341,5.707632,,0.843166,,,,,,PRUNED
5,5,0.843058,2021-04-07 10:48:17.508121,2021-04-07 11:04:52.295836,0 days 00:16:34.787715,gbdt,1,70.829947,0.439963,6,53,65,200,False,0.791441,1.364725,,0.845802,0.845802,0.845802,0.845802,0.845802,,COMPLETE
6,6,0.843234,2021-04-07 10:48:17.510874,2021-04-07 11:04:50.191193,0 days 00:16:32.680319,gbdt,9,48.696451,0.392456,4,14,86,821,False,1.580156,3.440895,,0.846032,0.846032,0.846032,0.846032,0.846032,,COMPLETE
7,7,0.838256,2021-04-07 10:48:17.513519,2021-04-07 11:03:06.963824,0 days 00:14:49.450305,gbdt,2,53.365745,0.350764,6,58,27,132,False,2.459645,6.407222,,0.838256,,,,,,PRUNED
8,8,0.843178,2021-04-07 10:48:17.515106,2021-04-07 11:04:57.088367,0 days 00:16:39.573261,gbdt,2,60.685432,0.332041,4,34,17,247,False,4.681176,5.297314,,0.845686,0.845686,0.83825,,,,COMPLETE
9,9,0.843558,2021-04-07 10:48:17.517419,2021-04-07 11:03:06.998744,0 days 00:14:49.481325,gbdt,19,92.124136,0.253606,11,58,98,813,False,5.357019,6.823535,,0.843558,,,,,,PRUNED


In [40]:
# Best objective value (mean cross-validated AUC) found by the study.
study.best_value

0.8436699339123003

In [42]:
# NOTE(review): needs the optional 'plotly' package; the recorded output is an
# ImportError because it was not installed.
plot_optimization_history(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [43]:
# NOTE(review): needs the optional 'plotly' package; the recorded output is an
# ImportError because it was not installed.
optuna.visualization.plot_parallel_coordinate(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [44]:
# NOTE(review): needs the optional 'plotly' package; the recorded output is an
# ImportError because it was not installed.
plot_param_importances(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [45]:
# Parameters of the best trial, as sampled by the objective.
study.best_params

{'resample': None,
 'power': False,
 'boosting_type': 'gbdt',
 'num_leaves': 626,
 'min_child_samples': 7,
 'max_depth': 62,
 'max_delta_step': 8,
 'reg_alpha': 6.36487346445646,
 'reg_lambda': 9.600611241381081,
 'colsample_bytree': 0.3841853801587758,
 'cat_smooth': 74.54317144855379,
 'cat_l2': 16}

In [46]:
# Hard-coded copy of the best parameters printed by the recorded search, so the
# final fit below does not depend on re-running the study.
best_params = dict(
    resample=None,
    power=False,
    boosting_type='gbdt',
    num_leaves=626,
    min_child_samples=7,
    max_depth=62,
    max_delta_step=8,
    reg_alpha=6.36487346445646,
    reg_lambda=9.600611241381081,
    colsample_bytree=0.3841853801587758,
    cat_smooth=74.54317144855379,
    cat_l2=16,
)

In [48]:
# Reshape the flat parameter dict into the nested layout model_instance
# expects: classifier arguments under 'clf', pipeline switches at the top level.
final_params = {'clf': dict(best_params)}
for switch in ('resample', 'power'):
    final_params[switch] = final_params['clf'].pop(switch)

# Use a smaller learning rate for the final fit than during the search.
# This edits the shared fixed_params dict in place.
fixed_params['learning_rate'] = 0.005

In [56]:
%%time

# Average the test probabilities over 10 folds (early stopping patience 500),
# label a passenger 0 when the probability is below 0.5 and 1 otherwise, then
# write the submission file.
test_proba = kfold_prediction(X, y, test_df, 10,
                              final_params, fixed_params, numerical_cols,
                              500)
sub_df.loc[:, 'Survived'] = np.where(test_proba < 0.5, 0, 1).tolist()
sub_df.to_csv('submission.csv', index=False)


 FOLD 0 ...

 FOLD 1 ...

 FOLD 2 ...

 FOLD 3 ...

 FOLD 4 ...

 FOLD 5 ...

 FOLD 6 ...

 FOLD 7 ...

 FOLD 8 ...

 FOLD 9 ...
CPU times: user 28min 24s, sys: 12.7 s, total: 28min 36s
Wall time: 2min 42s
