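# HYPERPARAMETER OPTIMIZATION USING OPTUNA

Tunes the hyperparameters of an XGBoost regressor with Optuna, minimizing the RMSE measured on a 20% hold-out split of the training data.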

## IMPORTS

In [1]:
##################
# IMPORT MODULES #
##################
# LIBRARY IMPORTS
import os

import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tqdm import tqdm

## CONFIG

In [2]:
# Notebook-wide settings.
#   DATA_PATH  : CSV file that is loaded as the training table. Defaults to the
#                original local path; set the TPS_DATA_PATH environment variable
#                to point the notebook at another copy without editing this cell.
#   TARGET_VAR : name of the column that is used as the prediction target.
config = {
    "DATA_PATH": os.environ.get(
        "TPS_DATA_PATH",
        "D:/Documents/GitHub/ml-pipeline/data/TPS-AUG2021/train.csv",
    ),
    "TARGET_VAR": "loss",
}

## LOADING DATA

In [3]:
# Read the training table from the configured CSV, keep the target column
# under its own name, then preview the first five rows.
df = pd.read_csv(filepath_or_buffer=config["DATA_PATH"])
target = df.loc[:, config["TARGET_VAR"]]
df.head(n=5)

Unnamed: 0,id,f0,f1,f2,f3,f4,f5,f6,f7,f8,...,f91,f92,f93,f94,f95,f96,f97,f98,f99,loss
0,0,-0.00235,59,0.766739,-1.35046,42.2727,16.6857,30.3599,1.2673,0.392007,...,-42.4399,26.854,1.45751,0.696161,0.941764,1.82847,0.92409,2.29658,10.4898,15
1,1,0.784462,145,-0.463845,-0.530421,27324.9,3.47545,160.498,0.828007,3.73586,...,-184.132,7.90137,1.70644,-0.494699,-2.0583,0.819184,0.439152,2.3647,1.14383,3
2,2,0.317816,19,-0.432571,-0.382644,1383.26,19.7129,31.1026,-0.515354,34.4308,...,7.43721,37.2181,3.25339,0.337934,0.615037,2.21676,0.745268,1.69679,12.3055,6
3,3,0.210753,17,-0.616454,0.946362,-119.253,4.08235,185.257,1.38331,-47.5214,...,9.66778,0.626942,1.49425,0.517513,-10.2221,2.62731,0.61727,1.45645,10.0288,2
4,4,0.439671,20,0.968126,-0.092546,74.302,12.3065,72.186,-0.233964,24.3991,...,290.657,15.6043,1.73557,-0.476668,1.39019,2.19574,0.826987,1.78485,7.07197,1


## FEATURE ENGINEERING

In [4]:
def feature_engineering(dataframe, feature_slice=slice(1, 101)):
    """Pick the model input columns out of ``dataframe``.

    No column is created or modified: the frame is handed back untouched,
    together with the labels of the columns selected as features.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Table whose column labels are sliced by position.
    feature_slice : slice, optional
        Positional slice applied to ``dataframe.columns``. The default,
        ``slice(1, 101)``, keeps positions 1 through 100, which is the
        selection that used to be hard-coded here.

    Returns
    -------
    tuple
        ``(dataframe, features)`` where ``dataframe`` is the very object that
        was passed in and ``features`` is ``dataframe.columns[feature_slice]``.
    """
    features = dataframe.columns[feature_slice]
    return dataframe, features

In [5]:
# Unpack the (frame, feature column labels) pair returned by the helper.
df, features = feature_engineering(dataframe=df)

## OPTIMIZING

In [8]:
def objective(trial, data=df[features], target=target):
    """Optuna objective: fit one XGBoost regressor and return its hold-out RMSE.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``reg_alpha``, ``colsample_bytree``, ``subsample``,
        ``learning_rate``, ``max_depth`` and ``min_child_weight``.
    data : optional
        Feature table. The default ``df[features]`` is evaluated once, when this
        ``def`` statement runs, so re-run this cell after changing ``df`` or
        ``features``.
    target : optional
        Target values aligned with ``data``. The default is likewise bound to
        the notebook-level ``target`` at definition time.

    Returns
    -------
    float
        Root mean squared error of the fitted model on the 20% hold-out split
        (the study minimises this value).
    """
    seed = 95

    # A fixed random_state means every trial is scored on the same 80/20 split.
    train_x, test_x, train_y, test_y = train_test_split(
        data, target, test_size=0.2, random_state=seed
    )

    param = {
        'objective' : "reg:squarederror",
        'seed': seed,
        # Upper bound on boosting rounds; early stopping (see fit) ends training sooner.
        'n_estimators': 4000,
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform;
        # the parameter name and the sampled range are unchanged.
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [5,10,15,20]),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
        'n_jobs': 2,
        # NOTE(review): 'gpu_hist' / 'gpu_id' / 'gpu_predictor' and passing
        # early_stopping_rounds to fit() are tied to older XGBoost releases --
        # confirm against the installed version before upgrading.
        'tree_method': "gpu_hist",
        "gpu_id": 0,
        'predictor': 'gpu_predictor'
    }
    model = xgb.XGBRegressor(**param)

    # NOTE(review): the hold-out split drives early stopping AND is the split the
    # RMSE is reported on, so the returned score is optimistically biased.
    model.fit(train_x,train_y,eval_set=[(test_x,test_y)], early_stopping_rounds=200, verbose=False)

    preds = model.predict(test_x)

    # sqrt(MSE) instead of mean_squared_error(..., squared=False): the `squared`
    # keyword is not accepted by recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(test_y, preds)))

    return rmse

In [9]:
%%time
# Minimise the value returned by `objective`. The sampler is seeded so the
# stream of suggested hyperparameters does not depend on an implicit random
# state; TPESampler is the sampler Optuna uses by default, so the search
# algorithm itself is unchanged.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=95),
)
# No cap on the trial count: trials keep starting until the 5-hour budget
# (timeout is given in seconds) has elapsed.
study.optimize(objective, timeout=3600*5)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-08-03 14:12:53,098][0m A new study created in memory with name: no-name-a4b014ef-3462-445e-9928-2ccd531ccc26[0m
[32m[I 2021-08-03 14:14:07,496][0m Trial 0 finished with value: 7.805481248181306 and parameters: {'reg_alpha': 3.730683102562492, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.017, 'max_depth': 15, 'min_child_weight': 105}. Best is trial 0 with value: 7.805481248181306.[0m
[32m[I 2021-08-03 14:15:26,199][0m Trial 1 finished with value: 7.802911285028155 and parameters: {'reg_alpha': 0.004429371660341314, 'colsample_bytree': 0.5, 'subsample': 0.7, 'learning_rate': 0.02, 'max_depth': 15, 'min_child_weight': 149}. Best is trial 1 with value: 7.802911285028155.[0m
[32m[I 2021-08-03 14:17:17,299][0m Trial 2 finished with value: 7.791425125430866 and parameters: {'reg_alpha': 0.0441067199361869, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.008, 'max_depth': 10, 'min_child_weight': 115}. Best is trial 2 with value: 7.791425125

## VISUALIZATION

In [None]:
# Objective value of the trials over the course of the study.
optuna.visualization.plot_optimization_history(study=study)

In [None]:
# Rank the tuned hyperparameters by their estimated importance for the objective.
optuna.visualization.plot_param_importances(study=study)

In [None]:
# Hyperparameters of the best trial in the study, displayed as the cell result.
params = study.best_trial.params
params

{'reg_alpha': 0.0031293275223408185,
 'reg_lambda': 0.04787145507141445,
 'colsample_bytree': 0.3,
 'subsample': 0.6,
 'learning_rate': 0.008,
 'max_depth': 100,
 'num_leaves': 584,
 'min_child_samples': 173,
 'min_data_per_groups': 30}

In [None]:
# Literal parameter dictionary (not assigned to any name); its keys are the
# hyperparameters sampled inside `objective`.
{
    'reg_alpha': 0.007915504076304212,
    'colsample_bytree': 0.5,
    'subsample': 0.8,
    'learning_rate': 0.008,
    'max_depth': 10,
    'min_child_weight': 274,
}