# HYPERPARAMETER OPTIMIZATION USING OPTUNA

## IMPORTS

In [1]:
##################
# IMPORT MODULES #
##################
# SYS IMPORT
import os, inspect, importlib, argparse
import gc
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm import tqdm
import matplotlib.pyplot as plt

import xgboost as xgb
import lightgbm as lgb 
import optuna 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## CONFIG

In [2]:
config = {
    "DATA_PATH" : "../input/tabular-playground-series-mar-2021/train.csv",
    "TARGET_VAR" : "target"
}

## LOADING DATA

In [3]:
df = pd.read_csv(config["DATA_PATH"])
target = df[config["TARGET_VAR"]]
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont2,cont3,cont4,cont5,cont6,cont7,cont8,cont9,cont10,target
0,0,A,I,A,B,B,BI,A,S,Q,...,0.759439,0.795549,0.681917,0.621672,0.592184,0.791921,0.815254,0.965006,0.665915,0
1,1,A,I,A,A,E,BI,K,W,AD,...,0.386385,0.541366,0.388982,0.357778,0.600044,0.408701,0.399353,0.927406,0.493729,0
2,2,A,K,A,A,E,BI,A,E,BM,...,0.343255,0.616352,0.793687,0.552877,0.352113,0.388835,0.412303,0.292696,0.549452,0
3,3,A,K,A,C,E,BI,A,Y,AD,...,0.831147,0.807807,0.800032,0.619147,0.221789,0.897617,0.633669,0.760318,0.934242,0
4,4,A,I,G,B,E,BI,C,G,Q,...,0.338818,0.277308,0.610578,0.128291,0.578764,0.279167,0.351103,0.357084,0.32896,1


## FEATURE ENGINEERING

In [4]:
def cat_encoding(dataframe):
    cat = dataframe.columns[1:20]
    for feature in cat:
        le = LabelEncoder()
        le.fit(dataframe[feature])
        dataframe[feature] = le.transform(dataframe[feature])
    return dataframe

def feature_engineering(dataframe):
    dataframe = cat_encoding(dataframe)
    features = dataframe.columns[1:31]
    return dataframe, features

In [5]:
df, features = feature_engineering(df)

## OPTIMIZING

In [6]:
def objective(trial, data=df[features], target=target):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=95)
    param = {
        'objective' : "binary",
        'metric': 'auc', 
        'random_state': 95,
        'n_estimators': 100000,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = lgb.LGBMClassifier(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)], early_stopping_rounds=1000, verbose=False)
    
    preds = model.predict_proba(test_x)[:, 1]
    
    auc = roc_auc_score(test_y, preds)
    
    return auc

In [7]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=3600*5)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-03-06 13:20:36,517][0m A new study created in memory with name: no-name-b6fc5307-9fe7-4f5d-b881-1b047231c695[0m
[32m[I 2021-03-06 13:24:40,635][0m Trial 0 finished with value: 0.8941319241264956 and parameters: {'reg_alpha': 0.001316883022340614, 'reg_lambda': 0.528554915646157, 'colsample_bytree': 0.5, 'subsample': 0.6, 'learning_rate': 0.006, 'max_depth': 100, 'num_leaves': 674, 'min_child_samples': 198, 'min_data_per_groups': 62}. Best is trial 0 with value: 0.8941319241264956.[0m
[32m[I 2021-03-06 13:26:44,983][0m Trial 1 finished with value: 0.8923251322896474 and parameters: {'reg_alpha': 0.04204525225695287, 'reg_lambda': 0.11353301018540965, 'colsample_bytree': 0.8, 'subsample': 1.0, 'learning_rate': 0.01, 'max_depth': 10, 'num_leaves': 710, 'min_child_samples': 194, 'min_data_per_groups': 48}. Best is trial 0 with value: 0.8941319241264956.[0m
[32m[I 2021-03-06 13:29:52,384][0m Trial 2 finished with value: 0.892960439279956 and parameters: {'reg_alpha': 

Number of finished trials: 99
Best trial: {'reg_alpha': 0.0031293275223408185, 'reg_lambda': 0.04787145507141445, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.008, 'max_depth': 100, 'num_leaves': 584, 'min_child_samples': 173, 'min_data_per_groups': 30}
CPU times: user 19h 22min 33s, sys: 14min 20s, total: 19h 36min 53s
Wall time: 5h 1min 10s


## VISUALIZATION

In [8]:
optuna.visualization.plot_optimization_history(study)

In [9]:
#Visualize parameter importances.
optuna.visualization.plot_param_importances(study)

In [10]:
params=study.best_params
params

{'reg_alpha': 0.0031293275223408185,
 'reg_lambda': 0.04787145507141445,
 'colsample_bytree': 0.3,
 'subsample': 0.6,
 'learning_rate': 0.008,
 'max_depth': 100,
 'num_leaves': 584,
 'min_child_samples': 173,
 'min_data_per_groups': 30}