## XGBoost Predictive Model

## Load the dataset

In [5]:
# Import the pandas, numpy packages and dump from joblib
import pandas as pd
import numpy as np
from joblib import dump

In [6]:
# Load the saved sets from data/processed using numpy
X_train = np.load('../data/processed/X_train.npy')
X_val   = np.load('../data/processed/X_val.npy'  )
y_train = np.load('../data/processed/y_train.npy')
y_val   = np.load('../data/processed/y_val.npy'  )

## Train XGBoost Model

In [7]:
# Import the xgboost package as xgb
import xgboost as xgb

In [8]:
# Instantiate the XGBRegressor class into a variable called xgb_default
xgb_default = xgb.XGBRegressor()

In [9]:
# Fit the XGBoost model
xgb_default.fit(X_train, y_train)

In [10]:
# Import dump from joblib and save the model
from joblib import dump 

dump(xgb_default,  '../models/xgb_default.joblib')

['../models/xgb_default.joblib']

In [11]:
# Calculate the predicted values for the training and validation sets
predicted_values_train = xgb_default.predict(X_train)
predicted_values_val = xgb_default.predict(X_val)


In [12]:
# Import the function print_mse from models.performance and display the MSE score
import sys
sys.path.insert(1, '..')
from src.models.performance import print_mse

print_mse(y_actuals=y_train, y_preds=predicted_values_train,set_name='Training')
print_mse(y_actuals=y_val, y_preds=predicted_values_val,set_name='Validation')

MSE Training: 16859.055652102663
MSE Validation: 17047.774111585564


Our default XGBoost model performs better than our baseline. 

## Hyperparameter Tuning

## Manual Search

In [9]:
# Instantiate the XGBRegressor class into a variable called xgb_manual
xgb_manual = xgb.XGBRegressor(
    n_estimators=100,
    eta=0.02,
    max_depth=3,
    subsample=0.8, 
    scale_pos_weight=0.2,
    min_child_weight=1.5,
    gamma=5)

In [10]:
# Fit the XGBoost model
xgb_manual.fit(X_train, y_train)

In [12]:
# Import dump from joblib and save the model
from joblib import dump 

dump(xgb_manual,  '../models/xgb_manual.joblib')

['../models/xgb_manual.joblib']

In [13]:
# Calculate the predicted values for the training and validation sets
predicted_values_train = xgb_manual.predict(X_train)
predicted_values_val = xgb_manual.predict(X_val)

In [14]:
# Import the function print_mse from models.performance and display the MSE score
import sys
sys.path.insert(1, '..')
from src.models.performance import print_mse

print_mse(y_actuals=y_train, y_preds=predicted_values_train,set_name='Training')
print_mse(y_actuals=y_val, y_preds=predicted_values_val,set_name='Validation')

MSE Training: 26150.84592577848
MSE Validation: 26264.173837713206


We get slightly worse results with xgb_manual than xgb_default. Further iterations were attempted with little benefit. Grid Search was considered but given the sheer size of the dataframe (34785111, 8), this becomes computationally and time inefficient. Greater resources will need to be dedicated by the business to further refine our ML predictive capabilities but we have a strong foundation to work from. Hyperopt will be tested. 

### Grid Search

In [15]:
# Import GridSearchCV and numpy as np
from sklearn.model_selection import GridSearchCV
import numpy as np

In [16]:
# Instantiate the XGBRegressor class into a variable called estimator
estimator = xgb.XGBRegressor(
    objective='reg:squarederror',  # Use 'reg:squarederror' for regression (MSE)
    nthread=4,
    seed=42
)

In [17]:
# Set up the range for several hyperparameters
parameters = {
    'max_depth': range (2, 3, 1),
    'n_estimators': range(50, 100, 25),
    'learning_rate': [0.01, 0.02, 0.03,0.04,0.05],
    'subsample': [0.7,0.8,0.9]
}

In [18]:
# Set up the grid search with cross validation
grid_search=GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring='roc_auc',
    cv=10,
    verbose=True )

In [19]:
# Fit the model with the training set
grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 30 candidates, totalling 300 fits


MemoryError: Unable to allocate 1.60 GiB for an array with shape (9734399, 22) and data type float64

In [None]:
# Import dump from joblib and save the model
from joblib import dump 

dump(grid_search,  '../models/xgb_grid_search.joblib')

['../models/xgb_grid_search.joblib']

In [None]:
# Display the hyperparameters for the best estimator
grid_search.best_estimator_

In [None]:
# Calculate the predicted values for the training and validation sets
predicted_values_train = xgboost_hyperopt.predict(X_train)
predicted_values_val = xgboost_hyperopt.predict(X_val)

In [None]:
# Import the function print_mse from models.performance and display the MSE score
import sys
sys.path.insert(1, '..')
from src.models.performance import print_mse

print_mse(y_actuals=y_train, y_preds=predicted_values_train,set_name='Training')
print_mse(y_actuals=y_val, y_preds=predicted_values_val,set_name='Validation')

MSE Training: 122.57432469986081
MSE Validation: 121.86869114138327


### Hyperopt package

In [23]:
# Import Trials, STATUS_OK, tpe, hp, fmin from hyperopt package
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

In [21]:
# Define the search space for xgboost hyperparameters
space = {
    'learning_rate' : hp.quniform('learning_rate', 0.01, 0.05, 0.005),
    'subsample' : hp.quniform('subsample', 0.7, 0.9, 0.05),
    'scale_pos_weight': hp.quniform('scale_pos_weight', 0.1, 1.0, 0.1),
    'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 0.5),
    'gamma': hp.quniform('gamma', 2, 6, 0.5)
}

In [26]:
# Define a function called objective
def objective(space):
    from sklearn.model_selection import cross_val_score

    xgboost = xgb.XGBRegressor(  # Use XGBRegressor for regression
        max_depth=3,
        learning_rate=space['learning_rate'],
        subsample=space['subsample'],
        #colsample_bytree=space['colsample_bytree'],
        min_child_weight=space['min_child_weight'],
        gamma=space['gamma'],
    )

    mse = -cross_val_score(xgboost, X_train, y_train, cv=10, scoring="neg_mean_squared_error").mean()

    return {'loss': mse, 'status': STATUS_OK}

In [27]:
# Launch Hyperopt search and save the result in a variable called best
best = fmin(
    fn=objective,   
    space=space,       
    algo=tpe.suggest,       
    max_evals=5
)

  0%|          | 0/5 [00:00<?, ?trial/s, best loss=?]

job exception: Unable to allocate 1.60 GiB for an array with shape (9734400, 22) and data type float64



  0%|          | 0/5 [05:26<?, ?trial/s, best loss=?]


MemoryError: Unable to allocate 1.60 GiB for an array with shape (9734400, 22) and data type float64

In [None]:
# Print out the Hyperparameters for the best model
print("Best:", best)

Best: {'gamma': 3.5, 'learning_rate': 0.045, 'min_child_weight': 5.5, 'scale_pos_weight': 0.6000000000000001, 'subsample': 0.75}


In [None]:
# Instantiate the XGBRegressor class into a variable called xgboost_hyperopt
xgb_hyperopt = xgb.XGBRegressor(
    max_depth = 3,
    learning_rate = best['learning_rate'],
    min_child_weight = best['min_child_weight'],
    subsample = best['subsample'],
    # colsample_bytree = best['colsample_bytree'],
    scale_pos_weight=best['scale_pos_weight'],
    gamma=best['gamma']
)

In [None]:
# Fit the model with the training set
xgb_hyperopt.fit(X_train, y_train)

In [24]:
# Import dump from joblib and save the model
from joblib import dump 

dump(xgb_hyperopt,  '../models/xgb_hyperopt.joblib')

NameError: name 'xgb_hyperopt' is not defined

In [25]:
# Calculate the predicted values for the training and validation sets
predicted_values_train = xgboost_hyperopt.predict(X_train)
predicted_values_val = xgboost_hyperopt.predict(X_val)

NameError: name 'xgboost_hyperopt' is not defined

In [None]:
# Import the function print_mse from models.performance and display the MSE score
import sys
sys.path.insert(1, '..')
from src.models.performance import print_mse

print_mse(y_actuals=y_train, y_preds=predicted_values_train,set_name='Training')
print_mse(y_actuals=y_val, y_preds=predicted_values_val,set_name='Validation')

MSE Training: 122.57432469986081
MSE Validation: 121.86869114138327


Hyperopt performs worse than both the default model and xgb_manual. Further tuning is possible but this would be computationally inefficient given the size of the dataframe. We will use the default model for our predictive model.

## XGBoost Predictive Model

### Hyperopt package

In [13]:
# Import Trials, STATUS_OK, tpe, hp, fmin from hyperopt package
from hyperopt import Trials, STATUS_OK, tpe, hp, fmin

In [14]:
# Define the search space for xgboost hyperparameters - use a smaller space to optimise computational efficiency 
space = {
    'learning_rate': hp.choice('learning_rate', [0.01, 0.02, 0.03]),
    'subsample': hp.choice('subsample', [0.7, 0.8, 0.9]),
    'colsample_bytree': hp.choice('colsample_bytree', [0.5, 0.7, 0.9]),
    'min_child_weight': hp.choice('min_child_weight', [1, 3, 5]),
    'gamma': hp.choice('gamma', [2, 3, 4]),
}

In [15]:
def objective(space):
    from sklearn.model_selection import cross_val_score

    xgboost = xgb.XGBRegressor(  # Use XGBRegressor for regression
        max_depth=3,
        learning_rate=space['learning_rate'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
        min_child_weight=space['min_child_weight'],
        gamma=space['gamma'],
    )

    mse = -cross_val_score(xgboost, X_train, y_train, cv=10, scoring="neg_mean_squared_error").mean()

    return {'loss': mse, 'status': STATUS_OK}

In [16]:
# Launch Hyperopt search and save the result in a variable called best
best = fmin(
    fn=objective,   
    space=space,       
    algo=tpe.suggest,       
    max_evals=2
)

100%|██████████| 2/2 [14:55<00:00, 447.58s/trial, best loss: 26664.720728229975]


In [17]:
# Print out the Hyperparameters for the best model
print("Best:", best)

Best: {'colsample_bytree': 1, 'gamma': 1, 'learning_rate': 1, 'min_child_weight': 0, 'subsample': 1}


In [18]:
# Instantiate the XGBRegressor class into a variable called xgboost_hyperopt
xgboost_hyperopt = xgb.XGBRegressor(
    max_depth = 3,
    learning_rate = best['learning_rate'],
    min_child_weight = best['min_child_weight'],
    subsample = best['subsample'],
    # colsample_bytree = best['colsample_bytree'],
    gamma=best['gamma']
)

In [19]:
# Fit the model with the training set
xgboost_hyperopt.fit(X_train, y_train)

In [20]:
# Import dump from joblib and save the model
from joblib import dump 

dump(xgboost_hyperopt,  '../models/xgboost_hyperopt.joblib')

['../models/xgboost_hyperopt.joblib']

In [21]:
# Calculate the predicted values for the training and validation sets
predicted_values_train = xgboost_hyperopt.predict(X_train)
predicted_values_val = xgboost_hyperopt.predict(X_val)

In [22]:
# Import the function print_mse from models.performance and display the MSE score
import sys
sys.path.insert(1, '..')
from src.models.performance import print_mse

print_mse(y_actuals=y_train, y_preds=predicted_values_train,set_name='Training')
print_mse(y_actuals=y_val, y_preds=predicted_values_val,set_name='Validation')

MSE Training: 19464.31581707041
MSE Validation: 19560.71944806938


Hyperopt performs worse than both the default model and xgb_manual. Further tuning is possible but this would be computationally inefficient given the size of the dataframe. We will use the default model for our predictive model.