# XGBoost and Optuna

In [38]:
## TO DO
# - Update the data import from UCI directly
# - Identify more details regarding each column name and details
# - The dataset is unbalanced resolve this.
# - Research more into how XGBoost works.
# - Research more into how optuna works!!

As part of this notebook I will take the Pima Indian diabetes dataset which is freely available on the UC Irvine dataset repository. I would like to first apply an xgboost model to the dataset and subsequently use the hyperparameter tuning package Optuna to identify the optimal parameters of the xgboost model. It will be interesting to see if there is much of a difference in the overall accuracy of the model. 

## XGboost Installation

If installing xgboost for the first time on MacOS it may be required to install the following package using brew `brew install libomp`. When running the XGBoost model an error may occur: `XGBoostError: sklearn needs to be installed in order to use this module`. In such cases (if sklearn is already installed), simply close and restart the jupyter notebook to resolved this error.

## Pima Indian Diabetes Dataset

The Pima Indian diabetes contains 9 variables each of which is described below:

1. Number of times pregnant
2. Plasma glucose concentration a 2 hours in an oral glucose tolerance test
3. Diastolic blood pressure (mm Hg)
4. Triceps skin fold thickness (mm)
5. 2-Hour serum insulin (mu U/ml)
6. Body mass index (weight in kg/(height in m)^2)
7. Diabetes pedigree function
8. Age (years)
9. Class variable (0 or 1)

In [27]:
# import the maths libraries
import pandas as pd
import numpy as np

# import the ml libraries
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# import the hyperparamter tuning libraries
import optuna 

In [96]:
import pandas as pd
import os

from zipfile import ZipFile
from urllib.request import urlretrieve


class ImportData:
    """
    This class defines the locations of the trainSet.csv and
    candidateTestSet.txt files which are subsequently downloaded and stored
    locally in the folder `../data/`.
    """

    def __init__(self, file_type):
        
        # define the file type
        self.file_type = file_type

        # define the location of the data zip file
        self.web_loc_url_zip = "https://datahub.io/machine-learning/diabetes/r/diabetes_zip.zip"
        
        # define the location of the online csv
        self.web_loc_url_csv = "https://datahub.io/machine-learning/diabetes/r/diabetes.csv"
        
        # define the root path
        self.root_path = os.getcwd()
        
        # define the local storage path
        self.local_storage = os.path.join(self.root_path, "data")
        
        # define the local filename
        self.local_file_path = os.path.join(self.local_storage, "data.zip")
        
    
    @staticmethod
    def create_local_folder(local_storage):
        """
        Identify if there is a local storage folder, if not create one
        """
        
        # identify if the local storage location exists
        try:
            os.stat(local_storage)
        
        # if not create local storage
        except:
            os.mkdir(local_storage)
    
    
    @staticmethod
    def download_data(web_loc_url, local_file_path):
        """
        Download a file from a supplied url address and store it locally at a pre-defined path.
        """
        
        # download a file from the a URL and save it at a local file path
        urlretrieve(web_loc_url, local_file_path)
    
    
    @staticmethod
    def extract_tar_file(local_file_path, local_storage):
        """ 
        The extract_tar_file function simply extracts all files contained within the supplied zipfile located at 
        the zipfile_path.
        """

        # read the zipfile object
        with ZipFile(local_file_path, 'r') as zipObj:

            # extract all the contents of zip file into the data directory
            zipObj.extractall(local_storage)
            
    @staticmethod
    def import_csv_file(file_location):
        """
        Import a CSV file to pandas dataframe.
        """
        
        # import csv from url
        data = pd.read_csv(file_location)
        
        return data
    
    def import_data(self):
        """
        Download the training and holdout files and store locally.
        """
        
        # define the data object
        data = None
        
        # create local storage folder
        self.create_local_folder(self.local_storage)
        
        # if importing a zip file
        if self.file_type == "zip":
            
            # download the data from the web and store locally
            self.download_data(self.web_loc_url_zip, self.local_file_path)
        
            # extract the local zip file
            self.extract_tar_file(self.local_file_path, self.local_storage)
            
            # import the csv from zip file
            data = self.import_csv_file(self.local_storage + "/data/diabetes_csv.csv")
        
        # if importing a csv file
        elif self.file_type == "csv":
            
            # import the dataset from url 
            data = self.import_csv_file(self.web_loc_url_csv)
                        
        return data

In [188]:
class PrepareDataset:
    """
    """
    
    def __init__(self, data):
        """
        """
        self.dataset = data
    
    def prepare_data(self):
        """
        Prepares the pima dataset for use by the xgboost model.
        """
        
        # prepare the class variable
        self.dataset['class'] = self.dataset['class'].replace({"tested_positive": 1, "tested_negative": 0})
        
        # split the dataset into a training and validation dataset in a 70:30 split
        x_train, x_test, y_train, y_test = train_test_split(
            self.dataset.drop(["class"], axis = 1), self.dataset["class"], 
            test_size=0.3, stratify = self.dataset["class"], random_state=7)
        
        # split the test dataset into a validation and test dataset in a 15:15 split
        x_val, x_test, y_val, y_test = train_test_split(
            x_test, y_test, stratify=y_test, test_size=0.50, random_state=42)
        
        return x_train, y_train, x_val, y_val, x_test, y_test

In [210]:
class DataModelling(HyperparameterTuning):
    """
    """
    
    def __init__(self, x_train, y_train, x_val, y_val, x_test, y_test):
        """
        """
        self.x_train = x_train
        self.y_train = y_train
        self.x_val = x_val
        self.y_val = y_val
        self.x_test = x_test
        self.y_test = y_test
        self.dtrain = xgb.DMatrix(self.x_train, label=self.y_train)
        self.dvalid = xgb.DMatrix(self.x_val, label=self.y_val)
        self.dtest = xgb.DMatrix(self.x_test, label=self.y_test)
    
    def fit_xgboost_model(self):
        """
        """

        # fit model to the training data
        model = XGBClassifier(eval_metric = 'logloss')
        model.fit(self.x_train, self.y_train)

        # make predictions for test data
        y_val_pred = model.predict(self.x_val)
        val_predictions = [round(value) for value in y_val_pred]
        
        # evaluate predictions
        accuracy = accuracy_score(self.y_val, val_predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))

        return model
    
    def evaluate_test_data(self, model, test_data):
        """
        """

        optimal_model = xgb.train(model.params, test_data)
        predictions = optimal_model.predict(test_data)
        pred_labels = np.rint(predictions)
        accuracy = sklearn.metrics.accuracy_score(test_data.get_label(), pred_labels)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
    
    def data_modelling(self):
        """
        """
        
        # train an xgboost model
        training_model = self.fit_xgboost_model()
        
        # apply hyperparameter tuning 
        optimal_model = HyperparameterTuning(self.dtrain, self.dvalid).hyperparameter_tuning()
        
        # apply the optimal model to the test dataset
        self.evaluate_test_data(optimal_model, self.dtest)
        

In [211]:
# load the pima indian diabetes dataset
pima = ImportData(file_type = "zip").import_data()
x_train, y_train, x_val, y_val, x_test, y_test = PrepareDataset(data = pima).prepare_data()


In [212]:
DataModelling(x_train, y_train, x_val, y_val, x_test, y_test).data_modelling()

[32m[I 2021-03-12 22:25:37,629][0m A new study created in memory with name: no-name-27df8f7c-8f12-4b10-a141-213c18712683[0m
[32m[I 2021-03-12 22:25:37,646][0m Trial 0 finished with value: 0.7304347826086957 and parameters: {'max_depth': 2, 'eta': 4.025290180140614e-06, 'gamma': 0.726613252089718, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.0043158753894589124, 'skip_drop': 7.377573543369162e-08}. Best is trial 0 with value: 0.7304347826086957.[0m
[32m[I 2021-03-12 22:25:37,665][0m Trial 1 finished with value: 0.7652173913043478 and parameters: {'max_depth': 1, 'eta': 0.0007271226546055024, 'gamma': 2.9171550441251316e-07, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.2013019267540839, 'skip_drop': 0.638548364767231}. Best is trial 0 with value: 0.7304347826086957.[0m
[32m[I 2021-03-12 22:25:37,720][0m Trial 2 finished with value: 0.6869565217391305 and parameters: {'max_de

Accuracy: 72.17%


[32m[I 2021-03-12 22:25:37,842][0m Trial 7 finished with value: 0.7304347826086957 and parameters: {'max_depth': 2, 'eta': 2.30167245557116e-08, 'gamma': 6.028856535308686e-07, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 4.5233500731444785e-05, 'skip_drop': 3.707671547454867e-07}. Best is trial 2 with value: 0.6869565217391305.[0m
[32m[I 2021-03-12 22:25:37,876][0m Trial 8 finished with value: 0.7130434782608696 and parameters: {'max_depth': 5, 'eta': 1.4145992766240532e-05, 'gamma': 5.222584302759911e-08, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 0.49317180704144525, 'skip_drop': 2.0003356133405017e-08}. Best is trial 2 with value: 0.6869565217391305.[0m
[32m[I 2021-03-12 22:25:37,900][0m Trial 9 finished with value: 0.7739130434782608 and parameters: {'max_depth': 3, 'eta': 0.011950504460657443, 'gamma': 2.8320749075240666e-06, 'grow_policy': 'lossguide', 'sample_type': 'we

[32m[I 2021-03-12 22:25:39,436][0m Trial 29 finished with value: 0.6869565217391305 and parameters: {'max_depth': 7, 'eta': 6.468829349551212e-06, 'gamma': 1.7378846630520452e-06, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 1.9534523527714026e-06, 'skip_drop': 0.00634012245601311}. Best is trial 10 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:25:39,499][0m Trial 30 finished with value: 0.6782608695652174 and parameters: {'max_depth': 9, 'eta': 1.860822943691577e-06, 'gamma': 0.0003585376634653668, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 3.9909866043405934e-08, 'skip_drop': 0.03280333910905575}. Best is trial 10 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:25:39,562][0m Trial 31 finished with value: 0.6782608695652174 and parameters: {'max_depth': 8, 'eta': 2.5592924595097766e-07, 'gamma': 3.404857998514248e-05, 'grow_policy': 'lossguide', 'sample_type'

[32m[I 2021-03-12 22:25:40,791][0m Trial 51 finished with value: 0.6695652173913044 and parameters: {'max_depth': 7, 'eta': 3.230248531929368e-08, 'gamma': 0.01580055482289225, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 3.3385829596026e-07, 'skip_drop': 0.0150133130751964}. Best is trial 10 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:25:40,849][0m Trial 52 finished with value: 0.7217391304347827 and parameters: {'max_depth': 6, 'eta': 9.66536772683407e-08, 'gamma': 0.11211983544749166, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 0.00022867395155827056, 'skip_drop': 0.028277325389715634}. Best is trial 10 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:25:40,911][0m Trial 53 finished with value: 0.7217391304347827 and parameters: {'max_depth': 8, 'eta': 0.1934536804183351, 'gamma': 0.0019213652527461924, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'nor

[32m[I 2021-03-12 22:25:42,234][0m Trial 73 finished with value: 0.7043478260869566 and parameters: {'max_depth': 7, 'eta': 1.0982982464583153e-08, 'gamma': 0.03152388139851308, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 2.2389759378426253e-07, 'skip_drop': 6.025758848071337e-05}. Best is trial 10 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:25:42,309][0m Trial 74 finished with value: 0.6695652173913044 and parameters: {'max_depth': 8, 'eta': 4.3195054736472656e-08, 'gamma': 0.07112503583403573, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 9.5349315147205e-05, 'skip_drop': 0.00438555119136257}. Best is trial 10 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:25:42,376][0m Trial 75 finished with value: 0.6956521739130435 and parameters: {'max_depth': 8, 'eta': 1.5075837460730614e-08, 'gamma': 0.001212786820701004, 'grow_policy': 'lossguide', 'sample_type': 'unifor

[32m[I 2021-03-12 22:25:43,802][0m Trial 95 finished with value: 0.6956521739130435 and parameters: {'max_depth': 7, 'eta': 1.5072733998009238e-08, 'gamma': 0.007176516176992803, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 2.425425006800758e-07, 'skip_drop': 2.7931962289606845e-05}. Best is trial 10 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:25:43,870][0m Trial 96 finished with value: 0.6782608695652174 and parameters: {'max_depth': 9, 'eta': 2.5246259269217283e-07, 'gamma': 6.716649189970089e-08, 'grow_policy': 'lossguide', 'sample_type': 'uniform', 'normalize_type': 'forest', 'rate_drop': 5.097786183798579e-07, 'skip_drop': 0.0025965223878891544}. Best is trial 10 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:25:43,930][0m Trial 97 finished with value: 0.7130434782608696 and parameters: {'max_depth': 6, 'eta': 2.6357945474028068e-08, 'gamma': 0.03459548955898063, 'grow_policy': 'lossguide', 'sample_type

Accuracy: 64.66%


## Optuna

You can optimize XGBoost hyperparameters, such as the booster type and alpha, in three steps:

1. Wrap model training with an objective function and return accuracy
2. Suggest hyperparameters using a trial object
3. Create a study object and execute the optimization

In [190]:
import numpy as np
import sklearn.datasets
import sklearn.metrics
from sklearn.model_selection import train_test_split
import xgboost as xgb

import optuna


class HyperparameterTuning:
    """
    """
    
    def __init__(self, dtrain, dvalid):
        """
        """
        
        # define the training and validation datasets
        self.dtrain = dtrain
        self.dvalid = dvalid
        
        # define the parameters to be tested
        self.param = {
            "verbosity": 0,
            "objective": "binary:logistic",
            "booster": trial.suggest_categorical("booster", ["gbtree", "gblinear", "dart"]),
            "lambda": trial.suggest_float("lambda", 1e-8, 1.0, log=True),
            "alpha": trial.suggest_float("alpha", 1e-8, 1.0, log=True),
        }
    
    def objective(self, trial, dtrain, dvalid):
        """
        """
        
        # choose the different hyperparameter values
        if self.param["booster"] == "gbtree" or self.param["booster"] == "dart":
            self.param["max_depth"] = trial.suggest_int("max_depth", 1, 9)
            self.param["eta"] = trial.suggest_float("eta", 1e-8, 1.0, log=True)
            self.param["gamma"] = trial.suggest_float("gamma", 1e-8, 1.0, log=True)
            self.param["grow_policy"] = trial.suggest_categorical("grow_policy", ["depthwise", "lossguide"])

        if self.param["booster"] == "dart":
            self.param["sample_type"] = trial.suggest_categorical("sample_type", ["uniform", "weighted"])
            self.param["normalize_type"] = trial.suggest_categorical("normalize_type", ["tree", "forest"])
            self.param["rate_drop"] = trial.suggest_float("rate_drop", 1e-8, 1.0, log=True)
            self.param["skip_drop"] = trial.suggest_float("skip_drop", 1e-8, 1.0, log=True)

        # fit a model to the training dataset using the defined hyperparameter values
        bst = xgb.train(self.param, self.dtrain)
        valid_preds = bst.predict(self.dvalid)
        valid_pred_labels = np.rint(valid_preds)
        accuracy = sklearn.metrics.accuracy_score(dvalid.get_label(), valid_pred_labels)
    
        return accuracy
    
    def hyperparameter_tuning(self):
        """
        """
    
        # execute an optimization by using the above objective function wrapped by a lambda function
        study = optuna.create_study()
        study.optimize(lambda trial: self.objective(trial, self.dtrain, self.dvalid), n_trials=100, timeout=600) 
        
        # identify the best model
        trial = study.best_trial
        
        return trial
    

In [191]:
cv = HyperparameterTuning(dtrain, dvalid).hyperparameter_tuning()

[32m[I 2021-03-12 22:13:38,442][0m A new study created in memory with name: no-name-73eee060-b7b7-4098-8942-d26b42820b5e[0m
[32m[I 2021-03-12 22:13:38,524][0m Trial 0 finished with value: 0.6956521739130435 and parameters: {'max_depth': 5, 'eta': 0.4441873843697772, 'gamma': 4.075643395094318e-05, 'grow_policy': 'depthwise', 'sample_type': 'uniform', 'normalize_type': 'tree', 'rate_drop': 1.4870234079518694e-06, 'skip_drop': 1.0180012587612135e-08}. Best is trial 0 with value: 0.6956521739130435.[0m
[32m[I 2021-03-12 22:13:38,543][0m Trial 1 finished with value: 0.7217391304347827 and parameters: {'max_depth': 4, 'eta': 2.255096863849266e-06, 'gamma': 0.14007081509090352, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 2.50987333371064e-07, 'skip_drop': 4.65077724148217e-06}. Best is trial 0 with value: 0.6956521739130435.[0m
[32m[I 2021-03-12 22:13:38,604][0m Trial 2 finished with value: 0.6782608695652174 and parameters: {'max_

[32m[I 2021-03-12 22:13:39,758][0m Trial 21 finished with value: 0.6782608695652174 and parameters: {'max_depth': 8, 'eta': 5.417020655852657e-08, 'gamma': 0.0003460816631578895, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 5.759313046522207e-06, 'skip_drop': 1.1943716163462233e-08}. Best is trial 2 with value: 0.6782608695652174.[0m
[32m[I 2021-03-12 22:13:39,822][0m Trial 22 finished with value: 0.6782608695652174 and parameters: {'max_depth': 8, 'eta': 6.031192746951159e-07, 'gamma': 0.02750770328024262, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 4.156371148061181e-06, 'skip_drop': 1.747498902678847e-06}. Best is trial 2 with value: 0.6782608695652174.[0m
[32m[I 2021-03-12 22:13:39,881][0m Trial 23 finished with value: 0.7130434782608696 and parameters: {'max_depth': 6, 'eta': 2.9194051099798662e-08, 'gamma': 0.7667001278568236, 'grow_policy': 'depthwise', 'sample_type': '

[32m[I 2021-03-12 22:13:41,387][0m Trial 43 finished with value: 0.6782608695652174 and parameters: {'max_depth': 9, 'eta': 2.0818169995023147e-07, 'gamma': 0.0004167756756012586, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.013476868799566525, 'skip_drop': 0.0005011551759233662}. Best is trial 2 with value: 0.6782608695652174.[0m
[32m[I 2021-03-12 22:13:41,452][0m Trial 44 finished with value: 0.6782608695652174 and parameters: {'max_depth': 8, 'eta': 9.918594009348404e-08, 'gamma': 1.9858668547388208e-05, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 1.4345474561024494e-05, 'skip_drop': 0.01063972763474372}. Best is trial 2 with value: 0.6782608695652174.[0m
[32m[I 2021-03-12 22:13:41,527][0m Trial 45 finished with value: 0.6782608695652174 and parameters: {'max_depth': 9, 'eta': 1.1787223488691888e-06, 'gamma': 0.0008781870117012118, 'grow_policy': 'lossguide', 'sample_type': '

[32m[I 2021-03-12 22:13:42,829][0m Trial 65 finished with value: 0.6956521739130435 and parameters: {'max_depth': 7, 'eta': 1.9174092426807805e-08, 'gamma': 0.003611076856193145, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.19296021132807248, 'skip_drop': 0.9959402263568947}. Best is trial 47 with value: 0.6695652173913044.[0m
[32m[I 2021-03-12 22:13:42,894][0m Trial 66 finished with value: 0.6782608695652174 and parameters: {'max_depth': 8, 'eta': 6.609497578902583e-08, 'gamma': 0.03462420578913754, 'grow_policy': 'depthwise', 'sample_type': 'weighted', 'normalize_type': 'forest', 'rate_drop': 1.2957552094848323e-06, 'skip_drop': 5.918896715653907e-07}. Best is trial 47 with value: 0.6695652173913044.[0m
[32m[I 2021-03-12 22:13:42,956][0m Trial 67 finished with value: 0.6869565217391305 and parameters: {'max_depth': 7, 'eta': 5.6543593699541025e-08, 'gamma': 8.881105153873032e-05, 'grow_policy': 'depthwise', 'sample_type': 'we

[32m[I 2021-03-12 22:13:44,312][0m Trial 87 finished with value: 0.6695652173913044 and parameters: {'max_depth': 9, 'eta': 3.5605030435719455e-08, 'gamma': 2.5389634603885494e-06, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.12249560256093564, 'skip_drop': 2.679779968141738e-05}. Best is trial 84 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:13:44,380][0m Trial 88 finished with value: 0.6695652173913044 and parameters: {'max_depth': 9, 'eta': 3.9105379728207476e-08, 'gamma': 2.161561377471475e-06, 'grow_policy': 'lossguide', 'sample_type': 'weighted', 'normalize_type': 'tree', 'rate_drop': 0.08678296498754737, 'skip_drop': 9.111370264287022e-05}. Best is trial 84 with value: 0.6608695652173913.[0m
[32m[I 2021-03-12 22:13:44,447][0m Trial 89 finished with value: 0.6782608695652174 and parameters: {'max_depth': 9, 'eta': 1.432606080603563e-08, 'gamma': 2.419679118185788e-06, 'grow_policy': 'lossguide', 'sample_type': '