# HYPERPARAMETER OPTIMIZATION USING OPTUNA

## IMPORTS

In [1]:
##################
# IMPORT MODULES #
##################
# SYS IMPORT
import os, inspect, importlib, argparse
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import gc
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm import tqdm
import matplotlib.pyplot as plt

import xgboost as xgb
import lightgbm as lgb
import optuna 
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## CONFIG

In [2]:
config = {
    "DATA_PATH" : "D:/Documents/GitHub/gbm_pipeline/data/TPS-FEV2021/train.csv",
    "TARGET_VAR" : "target"
}

## LOADING DATA

In [3]:
df = pd.read_csv(config["DATA_PATH"])
target = df[config["TARGET_VAR"]]
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,A,B,A,A,B,D,A,E,C,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,B,A,A,A,B,B,A,E,A,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,A,A,A,C,B,D,A,B,C,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,A,A,A,C,B,D,A,E,G,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,A,B,A,A,B,B,A,E,C,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


## FEATURE ENGINEERING

In [4]:
def cat_encoding(dataframe):
    cat = dataframe.columns[1:11]
    for feature in cat:
        le = LabelEncoder()
        le.fit(dataframe[feature])
        dataframe[feature] = le.transform(dataframe[feature])
    return dataframe

def feature_engineering(dataframe):
    dataframe = cat_encoding(dataframe)
    features = dataframe.columns[1:25]
    return dataframe, features

In [8]:
df, features = feature_engineering(df)
df.head()

Unnamed: 0,id,cat0,cat1,cat2,cat3,cat4,cat5,cat6,cat7,cat8,...,cont5,cont6,cont7,cont8,cont9,cont10,cont11,cont12,cont13,target
0,1,0,1,0,0,1,3,0,4,2,...,0.881122,0.42165,0.741413,0.895799,0.802461,0.724417,0.701915,0.877618,0.719903,6.994023
1,2,1,0,0,0,1,1,0,4,0,...,0.440011,0.34623,0.278495,0.593413,0.546056,0.613252,0.741289,0.326679,0.808464,8.071256
2,3,0,0,0,2,1,3,0,1,2,...,0.914155,0.369602,0.832564,0.86562,0.825251,0.264104,0.695561,0.869133,0.828352,5.760456
3,4,0,0,0,2,1,3,0,4,6,...,0.934138,0.57893,0.407313,0.868099,0.794402,0.494269,0.698125,0.809799,0.614766,7.806457
4,6,0,1,0,0,1,1,0,4,2,...,0.3826,0.70594,0.325193,0.440967,0.462146,0.724447,0.683073,0.343457,0.297743,6.868974


## OPTIMIZING

In [9]:
def objective(trial, data=df[features], target=target):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=95)
    param = {
        'metric': 'rmse', 
        'random_state': 95,
        'n_estimators': 5000,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = lgb.LGBMRegressor(**param)  
    
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)], early_stopping_rounds=100, verbose=False)
    
    preds = model.predict(test_x)
    
    rmse = mean_squared_error(test_y, preds, squared=False)
    
    return rmse

In [10]:
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-03-05 14:15:00,703][0m A new study created in memory with name: no-name-ad2fa391-15f1-4b41-acf4-8c80290cc267[0m
[32m[I 2021-03-05 14:15:38,907][0m Trial 0 finished with value: 0.8406972463295934 and parameters: {'reg_alpha': 0.0030660879230153142, 'reg_lambda': 0.149397356932498, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.006, 'max_depth': 20, 'num_leaves': 204, 'min_child_samples': 109, 'min_data_per_groups': 57}. Best is trial 0 with value: 0.8406972463295934.[0m
[32m[I 2021-03-05 14:16:08,763][0m Trial 1 finished with value: 0.8428343205850217 and parameters: {'reg_alpha': 0.007667258906183743, 'reg_lambda': 0.028588118417830367, 'colsample_bytree': 0.9, 'subsample': 0.7, 'learning_rate': 0.008, 'max_depth': 20, 'num_leaves': 170, 'min_child_samples': 284, 'min_data_per_groups': 17}. Best is trial 0 with value: 0.8406972463295934.[0m
[32m[I 2021-03-05 14:16:26,265][0m Trial 2 finished with value: 0.8428855286082522 and parameters: {'reg_alph

KeyboardInterrupt: 

## VISUALIZATION

In [None]:
optuna.visualization.plot_optimization_history(study)

In [None]:
#Visualize parameter importances.
optuna.visualization.plot_param_importances(study)

In [None]:
params=study.best_params
params