In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from mpl_toolkits.mplot3d import Axes3D

In [None]:
def train_test_split_nr(X_data, y_data, test_size): 
    """Split data chronologically (no shuffling) into training and testing parts.

    The last $test_size$ rows become the testing data and everything before
    them becomes the training data, so the original row order is preserved.

    Inputs: X_data, y_data: Pandas objects (DataFrame or Series) with the same number of rows
            test_size: an integer between 0 and the number of rows in $X_data$
    Outputs: X_train, X_test, y_train, y_test: positional slices of the inputs
    Raises: ValueError if the inputs have different lengths or $test_size$ is out of range
    """
    n_rows = X_data.shape[0]
    if y_data.shape[0] != n_rows:
        raise ValueError(
            "X_data and y_data must have the same number of rows "
            "(got {} and {})".format(n_rows, y_data.shape[0])
        )
    # A test_size larger than the data would make the split index negative and
    # silently slice from the wrong end, so reject it explicitly.
    if not 0 <= test_size <= n_rows:
        raise ValueError(
            "test_size must be between 0 and {} (got {})".format(n_rows, test_size)
        )

    # The row count comes from X_data itself rather than from a notebook-level variable.
    split_index = n_rows - test_size
    X_train = X_data.iloc[:split_index]
    X_test = X_data.iloc[split_index:]
    y_train = y_data.iloc[:split_index]
    y_test = y_data.iloc[split_index:]
    return X_train, X_test, y_train, y_test

In [39]:
def xgb_walk_forward_validation(X_data, y_data, test_size = 5, fold = 3, params = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.05, 0.1], 'n_estimators' : [100, 200, 300]}, scoring = "mean_absolute_error"): 
    """Grid-search XGBoost parameters with walk-forward validation on consecutive samples.

    $X_data$ and $y_data$ are cut into $fold$ consecutive samples of equal size
    (floor(rows / fold) rows each; leftover rows at the end are not used). In each
    sample the last $test_size$ rows are the testing set. For every parameter
    combination the model is refitted before each test row on all rows of the
    sample that precede it, so already-seen test observations join the training
    window for the next step. The error of the $test_size$ one-step predictions is
    computed per sample and then averaged over the samples.

    Inputs: X_data, y_data: Pandas objects (DataFrame or Series) with the same number of rows
            test_size: an integer that is the size of the testing set within each sample;
                       must leave at least one training row in the sample
            fold: an integer such that the data will be broken evenly into $fold$ pieces
            params: a dictionary of parameters to be tested on. It must have three keys: 'max_depth',
                    'learning_rate', and 'n_estimators', with the corresponding values being lists of
                    integers, floats, and integers.
            scoring: a string that is the type of scoring. It can be "mean_absolute_error" or
                     "root_mean_square_error"
    Output: a dictionary whose keys are tuples (n_estimators, max_depth, learning_rate) and whose
            values are the corresponding errors (mean over the samples) of the XGBoost model
            with those parameters
    Raises: ValueError for an unknown $scoring$, mismatched input lengths or fold < 1;
            AssertionError if $test_size$ does not fit inside one sample
    """
    # Resolve the metric up front so a typo fails before any model is trained.
    if scoring == "mean_absolute_error":
        score = mean_absolute_error
    elif scoring == "root_mean_square_error":
        def score(y_true, y_pred):
            return np.sqrt(mean_squared_error(y_true, y_pred))
    else:
        raise ValueError(
            'scoring must be "mean_absolute_error" or "root_mean_square_error" '
            "(got {!r})".format(scoring)
        )

    n_rows = X_data.shape[0]
    if y_data.shape[0] != n_rows:
        raise ValueError(
            "X_data and y_data must have the same number of rows "
            "(got {} and {})".format(n_rows, y_data.shape[0])
        )
    if fold < 1:
        raise ValueError("fold must be at least 1 (got {})".format(fold))

    # number of data points in each sample
    sample_size = n_rows // fold
    assert 0 < test_size < sample_size, (
        "test_size must be positive and smaller than the sample size "
        "({} rows per sample, got test_size={})".format(sample_size, test_size)
    )

    # Position inside a sample where the testing rows start. The split is done
    # inline by position so this function does not rely on any other notebook cell.
    split_index = sample_size - test_size

    fold_errors = {}
    for f in range(fold): 
        # take the f-th sample from the data set
        X_sample = X_data.iloc[f * sample_size:(f + 1) * sample_size]
        y_sample = y_data.iloc[f * sample_size:(f + 1) * sample_size]
        y_test = y_sample.iloc[split_index:]

        # loop over the parameters
        for n in params['n_estimators']: 
            for depth in params['max_depth']: 
                for rate in params['learning_rate']: 
                    predictions = []
                    # walk forward validation
                    for i in range(test_size):
                        # Training window = initial training rows plus the i test rows
                        # already "observed". Slicing (instead of concatenating in place)
                        # keeps the window from leaking into the next parameter combination.
                        train_end = split_index + i

                        model = XGBRegressor(objective='reg:squarederror', learning_rate=rate, n_estimators=n, max_depth=depth)
                        model.fit(X_sample.iloc[:train_end], y_sample.iloc[:train_end])

                        # iloc[[k]] keeps a one-row frame with the original column dtypes
                        y_hat = model.predict(X_sample.iloc[[train_end]])

                        # store the single forecast (drop the length-1 batch axis)
                        predictions.append(np.asarray(y_hat)[0])

                    # estimate prediction error for this sample and parameter combination
                    error = score(y_test, predictions)
                    # tuples are hashable, lists are not
                    fold_errors.setdefault((n, depth, rate), []).append(error)

    # average the per-sample errors so every combination maps to a single number
    return {key: float(np.mean(values)) for key, values in fold_errors.items()}

In [161]:
def Plot3D_train_test_errors(params, errors, scoring = 'mean_absolute_error'): 
    """Plot training and testing errors for different model complexities as a 3-D scatter.

    Inputs: params: an indexable object; params[0] is drawn on the x axis (labelled
                    'Number of Trees') and params[1] on the y axis (labelled
                    'Maximum Depth of Trees')
            errors: an indexable object; errors[0] is drawn as 'Training Error' and
                    errors[1] as 'Testing Error' on the z axis
            scoring: "mean_absolute_error" or "root_mean_square_error"; selects the z-axis
                     label and the title. Any other value leaves the axes unlabelled.
    Output: None. The figure is shown with plt.show().
    """
    metric_names = {
        'mean_absolute_error': 'Mean Absolute Error',
        'root_mean_square_error': 'Root Mean Square Error',
    }

    fig = plt.figure(1, figsize=(8, 8))
    # Create the 3-D axes through the figure so it is actually attached to it;
    # constructing Axes3D(fig, ...) directly is not added automatically on
    # current Matplotlib releases.
    ax = fig.add_axes([0, 0, 0.95, 1], projection='3d')
    ax.view_init(elev=48, azim=134)

    metric = metric_names.get(scoring)
    if metric is not None:
        # The property is spelled 'zlabel'; a misspelled keyword makes ax.set() raise.
        ax.set(xlabel = 'Number of Trees', ylabel = 'Maximum Depth of Trees', zlabel = metric, title = metric + ' vs Parameters')

    # 'label' is the keyword the legend reads.
    ax.scatter(params[0], params[1], errors[0], label = 'Training Error')
    ax.scatter(params[0], params[1], errors[1], label = 'Testing Error')
    ax.legend()
    plt.show()

def Plot2D_train_test_errors(param, errors, scoring = 'mean_absolute_error'): 
    """Create the figure for a 2-D train/test error plot.

    NOTE(review): in the part of the notebook visible here only the figure is
    created; $param$, $errors$ and $scoring$ are not used, nothing is drawn and
    nothing is returned. Presumably this is meant to be the single-parameter
    counterpart of Plot3D_train_test_errors -- confirm and finish the plotting.
    """
    # Figure number 1 is requested explicitly, so an already-open figure 1 is
    # reused instead of a new one being created.
    fig = plt.figure(1, figsize=(8, 8))

