# Training of Model

## Import libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load data

In [2]:
trainD = pd.read_csv('Data/train.csv', parse_dates=['date'])
testD = pd.read_csv('Data/test.csv', parse_dates=['date'])
isHoliday = pd.read_csv('Data/holidays_events.csv', parse_dates=['date'])

In [3]:
trainD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [4]:
isHoliday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


## Filters Earthquake dates

In [7]:
#trainDWOEarthQuake = trainD[(trainD['date'] < '2016-04-16') | (trainD['date'] >= '2016-07-16')]
trainDWOEarthQuake = trainD

## Slicing for each family and store

In [8]:
#Categorical encoding for dates with categories

#date
Xaux = trainDWOEarthQuake['date'].astype('category').cat.codes
trainDWOEarthQuake['date'] = Xaux



fam_unq = trainDWOEarthQuake['family'].unique()
store_unq = trainDWOEarthQuake['store_nbr'].unique()
names_col = []
data = []

data.append(trainDWOEarthQuake['date'].unique())
names_col.append('ds')

for fam in fam_unq:
    trainD_aux = trainDWOEarthQuake.loc[trainDWOEarthQuake['family']==fam]
    for store in store_unq:
        #data.append(trainD_aux['id'].loc[trainDWOEarthQuake['store_nbr']==store].to_list())
        data.append(trainD_aux['sales'].loc[trainDWOEarthQuake['store_nbr']==store].to_list())
        #names_col.append( fam + "," + str(store) + '_id')
        names_col.append( fam + "," + str(store))

data_t = list(map(list, zip(*data)))
dataDF = pd.DataFrame(data_t,columns=names_col)


## Training model for testing

In [9]:
horizon = 100
n_series = len(dataDF)-horizon

dataDF_train = dataDF.head(n_series)
dataDF_test = dataDF.tail(horizon)

def shift_n(df,col,n_steps):
    emp_list = df[col].shift(1, fill_value=0).to_list()
    df_ret=pd.DataFrame(emp_list,columns=['0'])
    for i in range(1,n_steps):
        df_ret[str(i)]=df[col].shift(i+1, fill_value=0).to_list()
    return df_ret


#Gradient boosting regressor
from sklearn.ensemble import GradientBoostingRegressor
#MLP Regressor
from sklearn.neural_network import MLPRegressor

n_shifts = 40
regr_list = []
for column in dataDF_train.iloc[:,1:]:
    data_X = shift_n(dataDF_train,column,n_shifts)
    data_y = dataDF_train[column]
    regr_list.append(GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, 
                                random_state=0,loss='squared_error').fit(data_X, data_y))
    #regr_list.append(MLPRegressor(random_state=1, max_iter=2000, learning_rate = 'adaptive',
    #                              activation='logistic').fit(data_X, data_y))


In [10]:
# # Summarize fitted model
# print(fitted_model_list[0].summary())

# # Forecast 14 steps ahead
# y_forecasted = fitted_model_list[0].forecast(steps=horizon)

# ax = plt.plot(dataDF_test['ds'],dataDF_test['AUTOMOTIVE,1'])
# ax = plt.plot(dataDF_test['ds'],y_forecasted)

#Test
from sklearn.metrics import mean_squared_log_error

y_pred = []
mnle_list = []
for i in range(0,len(regr_list)):
    y_pred.append(regr_list[i].predict(shift_n(dataDF_test,dataDF_test.columns.values[i+1],n_shifts)))
    y_test = dataDF_test[dataDF_test.columns.values[i+1]]
    for j in range(0,len(y_pred[i])):
        if y_pred[i][j] < 0:
            y_pred[i][j]=0
    mnle_list.append(mean_squared_log_error(y_test,y_pred[i]))
#print(regr_list[0].feature_importances_)

print(mnle_list)

from statistics import mean
print(mean(mnle_list))

[0.4143304336130147, 0.5485692542410792, 0.23696089161381764, 0.3171734422096225, 0.36054727119953756, 0.4184173420549699, 0.371870692871368, 0.40019772189173, 0.44539481999322617, 0.37937203156358057, 0.3281755487226013, 0.45009489466350106, 0.3972431523221853, 0.2827051177213141, 0.4054637092994233, 0.39235828373107784, 0.3782134761817356, 0.38837755534562285, 0.3508973092530033, 0.31864517916710383, 0.3266205327640615, 0.3132763376644832, 0.21496168412957498, 0.3978473972867564, 0.23715878797546025, 0.4288826131808856, 0.32327321409816284, 0.33783014892079355, 0.45233410152755305, 0.3287700922423805, 0.4549956280809152, 0.25833784492621675, 0.3028998357824412, 0.36503554543423056, 0.4083788756272829, 0.2916970919969196, 0.434898225595343, 0.43573211016081764, 0.18761834661401644, 0.13203666099772665, 0.15326380100070217, 0.17687944980630202, 0.2413500190338368, 0.30404112128618405, 0.20940613509901126, 0.3990263874578192, 0.22783638932054628, 0.6524534555110155, 0.30137936849896274,