In [7]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from mpl_toolkits.mplot3d import Axes3D

In [35]:
def train_test_split_nr(data, test_size): 
    split_index = data.shape[0] - test_size
    train = data.iloc[:split_index]
    test = data.iloc[split_index :]
    return train, test



In [37]:
# Applies walk forward validation for XGBoost Regressor
# Inputs: train, test: pandas dataframes of training and testing datasets
#         n_estimators_list: a list of integers where the entries are the number of trees
#         max_depth_list: a list of integers where the entries are the max_depth
#         scoring: a string, this is the type of scoring, can be "mean_absolute_error" or "root_mean_square_error"
# Return value: a list of two rows with each row representing n_estimator and max_depth
#               another list of two rows with each row representing train_error and test_error

def xgb_walk_forward_validation(data, test_size = 10, fold = 3, params = {'max_depth': [3, 5, 7], 'learning_rate': [0.01, 0.05, 0.1], 'n_estimators' : [100, 200, 300]}, scoring = "mean_absolute_error"): 
    assert test_size <= np.floor(data.shape[0] / fold)
    errors = {}
    # number of data points in each sample
    data_size = np.int64(np.floor(data.shape[0] / fold))
    for f in range(fold): 
        #break the f-th sample taken from the data set and split it into training and testing datasets
        sample = data.iloc[f*data_size: (f+1)*data_size]
        train, test = train_test_split_nr(sample, test_size)
        print(test.shape)

        # loop over the parameters
        for n in params['n_estimators']: 
            for depth in params['max_depth']: 
                for rate in params['learning_rate']: 
                    predictions = []
                    # walk forward validation
                    for i in range(test_size):
                        
                        # split train, test rows into input and output columns
                        X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
                        X_test = pd.DataFrame(test.iloc[i, :-1]).T
                        # fit model on X_train and make a prediction
                        XGBoost = XGBRegressor(objective='reg:squarederror', learning_rate=rate, n_estimators=n, max_depth=depth)
                        XGBoost.fit(X_train, y_train)
                        
                        #y_test_hat = XGBoost.predict(np.array(X_test).reshape((1,4)))
                        y_hat = XGBoost.predict(X_test)
                            
                        # store forecast in array of predictions
                        predictions.append(y_hat)

                        #add current observation to the train data for the next loop
                        pd.concat([train, pd.DataFrame(test.iloc[i]).T], axis = 'index')

        # estimate prediction error
        if scoring == "mean_absolute_error": 
            error = mean_absolute_error(test[:, -1], predictions)
        elif scoring == "root_mean_square_error": 
            error = np.sqrt(mean_squared_error(test[:, -1], predictions))

        errors[[n, depth, rate]] = error
    return errors

In [161]:
# Plot test errors for different model complexities
def Plot3D_train_test_errors(params, errors, scoring = 'mean_absolute_error'): 
    fig = plt.figure(1, figsize=(8, 8))
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    plt.axes(ax)
    if scoring == 'mean_absolute_error': 
        ax.set(xlabel = 'Number of Trees', ylabel = 'Maximum Depth of Trees', zlable = 'Mean Absolute Error', title = 'Mean Absolute Error vs Parameters')
    elif scoring == 'root_mean_square_error': 
        ax.set(xlabel = 'Number of Trees', ylabel = 'Maximum Depth of Trees', zlable = 'Root Mean Square Error', title = 'Root Mean Square Error vs Parameters')
    ax.scatter(params[0], params[1], errors[0], lbl = 'Training Error')
    ax.scatter(params[0], params[1], errors[1], lbl = 'Testing Error')
    plt.legend()
    plt.show()

def Plot2D_train_test_errors(param, errors, scoring = 'mean_absolute_error'): 
    fig = plt.figure(1, figsize=(8, 8))



In [10]:
import datetime as dt

# Replace the below file name for your layout if necessary
train_raw = pd.read_csv('data/train.csv')
train = train_raw # Todo:  Add split here

# date_time conversion
dates_dt = pd.to_datetime(train['date'])
dates_dt_min = dates_dt.min()

families = train['family'].unique()
assert families.shape == (33,)
fam_index = pd.DataFrame(data = np.arange(33), index = families)[0]

# Here's the conversion
train_txf = train.copy()
train_txf['date'] = (pd.to_datetime(train['date']) - dates_dt_min).dt.days
train_txf['store_nbr'] = train['store_nbr'] - 1
train_txf['family'] = train['family'].apply(fam_index.get)

# Allocate sales array
sales_shape = (1688, 54, 33)
sales = np.zeros(dtype = np.float64, shape = sales_shape)

# Fill in the sales array
for row in train_txf.itertuples():
    day = row.date
    store = row.store_nbr
    fam = row.family
    sales[day, store, fam] = row.sales

In [11]:
train_txf_sample = train_txf.copy()
train_txf_swap = train_txf.copy()
del train_txf_swap['id']
del train_txf_swap['sales']
train_txf_swap['sales'] = train_txf['sales']

In [38]:
res = xgb_walk_forward_validation(train_txf_swap)

(10, 5)


KeyboardInterrupt: 

In [172]:
print(res)

NameError: name 'res' is not defined