In [6]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from mpl_toolkits.mplot3d import Axes3D

In [55]:
import pandas as pd
import numpy as np
import datetime as dt

# Replace the below file name for your layout if necessary
train_raw = pd.read_csv('data/train.csv')
train = train_raw # Todo:  Add split here

# date_time conversion
dates_dt = pd.to_datetime(train['date'])
dates_dt_min = dates_dt.min()

families = train['family'].unique()
assert families.shape == (33,)
fam_index = pd.DataFrame(data = np.arange(33), index = families)[0]

# Here's the conversion
train_txf = train.copy()
train_txf['date'] = (pd.to_datetime(train['date']) - dates_dt_min).dt.days
train_txf['store_nbr'] = train['store_nbr'] - 1
train_txf['family'] = train['family'].apply(fam_index.get)

# Allocate sales array
sales_shape = (1688, 54, 33)
sales = np.zeros(dtype = np.float64, shape = sales_shape)

# Fill in the sales array
for row in train_txf.itertuples():
    day = row.date
    store = row.store_nbr
    fam = row.family
    sales[day, store, fam] = row.sales

In [None]:
# Applies walk forward validation for XGBoost Regressor
# Inputs: train, test: pandas dataframes of training and testing datasets
#         n_estimators_list: a list of integers where the entries are the number of trees
#         max_depth_list: a list of integers where the entries are the max_depth
#         scoring: a string, this is the type of scoring, can be "mean_absolute_error" or "root_mean_square_error"
# Return value: a list of two rows with each row representing n_estimator and max_depth
#               another list of two rows with each row representing train_error and test_error

def xgb_walk_forward_validation(train, test, n_estimators_list = range(20, 200, 20), max_depth_list = range(3, 11), scoring = "mean_absolute_error"): 
    params, errors = [[], []],  [[], []]
    for n in n_estimators_list: 
        for depth in max_depth_list: 
            params[0].append(n)
            params[1].append(depth)
            for i in range(test.shape[0]):
                train_predictions = np.zeros((1, test.shape[0]))
                test_predictions = np.zeros((1, test.shape[0]))
                # split train, test rows into input and output columns
                X_train, y_train = train.iloc[:, :-1], train.iloc[:, -1]
                X_test, y_test = test.iloc[i, :-1], test.iloc[i, -1]
                
                # fit model on X_train and make a prediction
                XGBoost = XGBRegressor(objective='reg:squarederror', learning_rate=0.1, n_estimators = n, max_depth = depth)
                XGBoost.fit(X_train, y_train)
                y_train_hat = XGBoost.predict(X_train)
                y_test_hat = XGBoost.predict(X_test)
                    
                # store forecast in array of predictions
                train_predictions[i] = y_train_hat
                test_predictions[i] = y_test_hat
                #add current observation to the train data for the next loop
                X_train.append(test.iloc[i], ignore_index = True)

        # estimate prediction error
        if scoring == "mean_absolute_error": 
            train_error = mean_absolute_error(train[:, -1], train_predictions)
            test_error = mean_absolute_error(test[:, -1], test_predictions)
        elif scoring == "root_mean_square_error": 
            train_error = np.sqrt(mean_squared_error(train[:, -1], train_predictions))
            test_error = np.sqrt(mean_squared_error(test[:, -1], test_predictions))
        errors[0].append(train_error)
        errors[1].append(test_error)
    return params, errors

In [65]:
# Plot test errors for different model complexities
def Plot3D_train_test_errors(params, errors, scoring = 'mean_absolute_error'): 
    fig = plt.figure(1, figsize=(8, 8))
    ax = Axes3D(fig, rect=[0, 0, 0.95, 1], elev=48, azim=134)
    plt.axes(ax)
    if scoring == 'mean_absolute_error': 
        ax.set(xlabel = 'Number of Trees', ylabel = 'Maximum Depth of Trees', zlable = 'Mean Absolute Error', title = 'Mean Absolute Error vs Parameters')
    elif scoring == 'root_mean_square_error': 
        ax.set(xlabel = 'Number of Trees', ylabel = 'Maximum Depth of Trees', zlable = 'Root Mean Square Error', title = 'Root Mean Square Error vs Parameters')
    ax.scatter(params[0], params[1], errors[0], lbl = 'Training Error')
    ax.scatter(params[0], params[1], errors[1], lbl = 'Testing Error')
    plt.legend()
    plt.show()

def Plot2D_train_test_errors(param, errors, scoring = 'mean_absolute_error'): 
    fig = plt.figure(1, figsize=(8, 8))



In [66]:
train_txf_swap = train_txf.copy()
del train_txf_swap['id']
del train_txf_swap['sales']
train_txf_swap['sales'] = train_txf['sales']

In [67]:
lim = np.int64((train_txf_swap['date'].max() * 0.80))

In [68]:
train_train = train_txf_swap[train_txf_swap['date'] < lim]
train_test = train_txf_swap[train_txf_swap['date'] >= lim]

In [69]:
train_train

Unnamed: 0,date,store_nbr,family,onpromotion,sales
0,0,0,0,0,0.000
1,0,0,1,0,0.000
2,0,0,2,0,0.000
3,0,0,3,0,0.000
4,0,0,4,0,0.000
...,...,...,...,...,...
2398567,1348,8,28,0,524.739
2398568,1348,8,29,1,147.655
2398569,1348,8,30,1,1696.684
2398570,1348,8,31,7,326.000


In [70]:
train_test

Unnamed: 0,date,store_nbr,family,onpromotion,sales
2398572,1349,0,0,0,0.000
2398573,1349,0,1,0,0.000
2398574,1349,0,2,0,3.000
2398575,1349,0,3,2,864.000
2398576,1349,0,4,0,0.000
...,...,...,...,...,...
3000883,1687,8,28,0,438.133
3000884,1687,8,29,1,154.553
3000885,1687,8,30,148,2419.729
3000886,1687,8,31,8,121.000


In [71]:
res = xgb_walk_forward_validation(train_train, train_test)

ValueError: feature_names mismatch: ['date', 'store_nbr', 'family', 'onpromotion'] ['2398572']
expected store_nbr, family, date, onpromotion in input data
training data did not have the following fields: 2398572