### W23P1 STAT 857 - Hyper-Parameter Tuning

In [None]:
pip install optuna lightgbm xgboost

In [19]:
## Importing libraries
import optuna
import numpy as np
import pandas as pd
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

pd.set_option('display.max_columns', 100)
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [9]:
## Reading the data
data = pd.read_csv('Data/W23P1_train_final.csv')

## Defining the input and target variables

variables = ['passenger_count', 'distance', 'duration', 'pickup_day', 'pickup_hour', 'Friday', 'Monday', 'Saturday', 
             'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'weekend', 'rush_hour', 'overnight', 'pickup_LGA', 'dropoff_LGA', 
             'pickup_JFK', 'dropoff_JFK', 'pickup_EWR', 'dropoff_EWR', 'airport', 'change_borough', 'haversine']

X = data[variables]
Y = data['fare_amount']

## Splitting the data into train and validation sets
X_train, X_validation, Y_train, Y_validation = train_test_split(X, Y, test_size = 0.3)

#### Defining optuna objective functions

In [15]:
def rf_reg_objective(trial):

    ## Defining the XGBoost hyper-parameter grid
    rf_param_grid = {'n_estimators': trial.suggest_int('n_estimators', 100, 1000, 100),
                     'max_depth': trial.suggest_int('max_depth', 3, 12), 
                     'min_samples_split': trial.suggest_int('min_samples_split', 2, 10), 
                     'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 10), 
                    }
    
    ## Building the model
    rf_md = RandomForestRegressor(**rf_param_grid, n_jobs = -1).fit(X_train, Y_train)
    
    ## Predicting on the test data-frame
    rf_md_preds = rf_md.predict(X_validation)
    
    ## Evaluating model performance on the test set
    rf_md_mse = mean_squared_error(Y_validation, rf_md_preds, squared = False)
    
    return rf_md_mse

def xgb_reg_objective(trial):

    ## Defining the XGBoost hyper-parameter grid
    xgboost_param_grid = {'tree_method':'hist', 
                          'n_estimators': trial.suggest_int('n_estimators', 100, 500, 100), 
                          'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01), 
                          'max_depth': trial.suggest_int('max_depth', 3, 12), 
                          'gamma': trial.suggest_float('gamma', 0.01, 0.3, step = 0.01), 
                          'min_child_weight': trial.suggest_int('min_child_weight', 5, 15), 
                          'subsample': trial.suggest_float('subsample', 0.7, 1, step = 0.01), 
                          'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1, step = 0.01)
                         }
    
    ## Building the model
    xgb_md = XGBRegressor(**xgboost_param_grid, n_jobs = -1).fit(X_train, Y_train)
    
    ## Predicting on the test data-frame
    xgb_md_preds = xgb_md.predict(X_validation)
    
    ## Evaluating model performance on the test set
    xgb_md_mse = mean_squared_error(Y_validation, xgb_md_preds, squared = False)
    
    return xgb_md_mse

def lgbm_reg_objective(trial):
    
    ## Defining the LGB hyper-parameter grid
    LGB_param_grid = {'boosting_type': 'dart',
                      'n_estimators': trial.suggest_int('n_estimators', 100, 1500, 100),
                      'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, step = 0.01),
                      'num_leaves': trial.suggest_int('num_leaves', 5, 40, step = 1),
                      'max_depth': trial.suggest_int('max_depth', 3, 12),
                      'subsample': trial.suggest_float('subsample', 0.7, 1, step = 0.01), 
                      'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1, step = 0.01),
                      'random_state': trial.suggest_int('random_state', 1, 1000),
                      'reg_alpha': trial.suggest_float('reg_alpha', 0.001, 0.1, step = 0.001),
                      'reg_lambda': trial.suggest_float('reg_lambda', 0.001, 0.1, step = 0.001), 
                      'objective': 'rmse', 
                      'verbosity': -1
                     }
                     
    ## Building the LightGBM model
    model = LGBMRegressor(**LGB_param_grid, n_jobs = -1).fit(X_train, Y_train)
        
    ## Predicting on the test data-frame
    lgbm_md_preds = model.predict(X_validation)
    
    ## Evaluating model performance on the test set
    lgbm_md_mse = mean_squared_error(Y_validation, lgbm_md_preds, squared = False)
    
    return lgbm_md_mse

In [None]:
## Starting RandomForest
## ----
## Creating a study object and to optimize the home objective function
study_rf = optuna.create_study(direction = 'minimize')
study_rf.optimize(rf_reg_objective, n_trials = 100)

In [14]:
## Printing best hyper-parameter set
print(study_rf.best_trial.params)

## Printing model performance
print(study_rf.best_trial.value)

{'n_estimators': 300, 'max_depth': 12, 'min_samples_split': 5, 'min_samples_leaf': 6}
3.36698326576257


In [None]:
## Starting XGBoost
## ----
## Creating a study object and to optimize the home objective function
study_xgb = optuna.create_study(direction = 'minimize')
study_xgb.optimize(xgb_reg_objective, n_trials = 500)

In [18]:
## Printing best hyper-parameter set
print(study_xgb.best_trial.params)

## Printing model performance
print(study_xgb.best_trial.value)

{'n_estimators': 500, 'learning_rate': 0.02, 'max_depth': 5, 'gamma': 0.2, 'min_child_weight': 10, 'subsample': 0.94, 'colsample_bytree': 0.9199999999999999}
3.2904768024142927


In [None]:
## Starting LightGBM
## ----
## Creating a study object and to optimize the home objective function
study_lgbm = optuna.create_study(direction = 'minimize')
study_lgbm.optimize(lgbm_reg_objective, n_trials = 50)

In [22]:
## Printing best hyper-parameter set
print(study_lgbm.best_trial.params)

## Printing model performance
print(study_lgbm.best_trial.value)

{'n_estimators': 600, 'learning_rate': 0.18000000000000002, 'num_leaves': 8, 'max_depth': 8, 'subsample': 0.73, 'colsample_bytree': 0.86, 'random_state': 543, 'reg_alpha': 0.021, 'reg_lambda': 0.027000000000000003}
3.3219108358489633
