# HYPERPARAMETER OPTIMIZATION USING OPTUNA

## IMPORTS

In [8]:
##################
# IMPORT MODULES #
##################
# SYS IMPORT
import os, inspect, importlib, argparse
import gc
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm import tqdm
import matplotlib.pyplot as plt

import xgboost as xgb
import lightgbm as lgb 
import optuna 
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## CONFIG

In [9]:
config = {
    "DATA_PATH" : "D:/Documents/GitHub/ml-pipeline/data/TPS-APR2021/train.csv",
    "TARGET_VAR" : "Survived"
}

## LOADING DATA

In [10]:
df = pd.read_csv(config["DATA_PATH"])
target = df[config["TARGET_VAR"]]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,0,1,1,"Oconnor, Frankie",male,,2,0,209245,27.14,C12239,S
1,1,0,3,"Bryan, Drew",male,,0,0,27323,13.35,,S
2,2,0,3,"Owens, Kenneth",male,0.33,1,2,CA 457703,71.29,,S
3,3,0,3,"Kramer, James",male,19.0,0,0,A. 10866,13.04,,S
4,4,1,3,"Bond, Michael",male,25.0,0,0,427635,7.76,,S


## FEATURE ENGINEERING

In [11]:
import datetime
import pandas as pd 
from sklearn.preprocessing import LabelEncoder
from pandas.api.types import is_integer_dtype, is_float_dtype, is_object_dtype

def missing_values(dataframe):
    # Droping features that have too many MV
    drop_features = ["Cabin", "Name", "Ticket"]
    dataframe = dataframe.drop(drop_features, axis=1)
    # DIVIDE INTO NUMERICAL FEATURES AND CATEGORICAL FEATURES
    integer_features = [col for col in dataframe.columns if is_integer_dtype(dataframe[col])]
    float_features = [col for col in dataframe.columns if is_float_dtype(dataframe[col])]
    object_features = [col for col in dataframe.columns if is_object_dtype(dataframe[col])]

    # WE REPLACE MISSING VALUES IN INTEGER  & FLOAT FEATURES WITH MEAN AND MODE FOR CATEGORICAL FEATURES
    dataframe[integer_features] = dataframe[integer_features].apply(lambda x: x.fillna(value=x.mean().astype(int)))
    dataframe[float_features] = dataframe[float_features].apply(lambda x: x.fillna(value=x.mean()))
    dataframe[object_features] = dataframe[object_features].apply(lambda x: x.fillna(value=x.mode()[0]))

    # ASSERT WE DON'T HAVE ANY MISSING VALUES IN THE DATASET
    assert dataframe.columns[dataframe.isnull().any()].empty, 'We still have some missing values in the dataset!'
    return dataframe

def get_cat_features(dataframe):
    categorical_features = [col for col in dataframe.columns if is_object_dtype(dataframe[col])]
    return categorical_features

def cat_encoding(dataframe, features):
    le = LabelEncoder()
    dataframe[features] = dataframe[features].apply(lambda x: le.fit_transform(x))
    return dataframe

def feature_engineering(dataframe, train=True):
    # FEATURE ENG
    dataframe = missing_values(dataframe)
    features_cat = get_cat_features(dataframe)
    dataframe = cat_encoding(dataframe, features_cat)
    features = dataframe.columns.difference(["Survived", "PassengerId", "kfold"])
    # RETURN DATAFRAME & ALL FEATURES NEEDED FOR TRAINING OR PREDICTION
    return dataframe, features

In [12]:
df, features = feature_engineering(df)

## OPTIMIZING

In [13]:
def objective(trial, data=df[features], target=target):
    
    train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.2,random_state=95)
    param = {
        'objective' : "binary",
        'metric': 'binary_error', 
        'random_state': 95,
        'n_estimators': 1000,
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014,0.017,0.02]),
        'max_depth': trial.suggest_categorical('max_depth', [5,10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
    model = lgb.LGBMClassifier(**param)  
    
    model.fit(train_x,train_y, eval_set=[(test_x,test_y)], verbose=False)
    
    preds = model.predict(test_x)
    
    acc = accuracy_score(test_y, preds)
    
    return acc

In [7]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, timeout=3600)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[32m[I 2021-04-09 14:07:04,807][0m A new study created in memory with name: no-name-1a574f8a-64bf-4787-8fbb-6a0ff19b9095[0m
[33m[W 2021-04-09 14:07:04,823][0m Trial 0 failed because of the following error: AttributeError("module 'xgboost' has no attribute 'LGBMClassifier'")
Traceback (most recent call last):
  File "C:\Anaconda\envs\ML-37\lib\site-packages\optuna\_optimize.py", line 211, in _run_trial
    value_or_values = func(trial)
  File "<ipython-input-6-5504627932d6>", line 19, in objective
    model = xgb.LGBMClassifier(**param)
AttributeError: module 'xgboost' has no attribute 'LGBMClassifier'[0m


AttributeError: module 'xgboost' has no attribute 'LGBMClassifier'

## VISUALIZATION

In [11]:
optuna.visualization.plot_optimization_history(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [12]:
#Visualize parameter importances.
optuna.visualization.plot_param_importances(study)

ImportError: Tried to import 'plotly' but failed. Please make sure that the package is installed correctly to use this feature. Actual error: No module named 'plotly'.

In [14]:
params=study.best_params
params

{'reg_alpha': 0.023013164688329528,
 'reg_lambda': 0.003811720979048805,
 'colsample_bytree': 0.3,
 'subsample': 0.6,
 'learning_rate': 0.02,
 'max_depth': 100,
 'num_leaves': 186,
 'min_child_samples': 225,
 'min_data_per_groups': 74}