# Training of Model

## Import libraries

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load data

In [50]:
trainD = pd.read_csv('Data/train.csv', parse_dates=['date'])
testD = pd.read_csv('Data/test.csv', parse_dates=['date'])
isHoliday = pd.read_csv('Data/holidays_events.csv', parse_dates=['date'])

In [51]:
trainD.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype         
---  ------       -----         
 0   id           int64         
 1   date         datetime64[ns]
 2   store_nbr    int64         
 3   family       object        
 4   sales        float64       
 5   onpromotion  int64         
dtypes: datetime64[ns](1), float64(1), int64(3), object(1)
memory usage: 137.4+ MB


In [52]:
isHoliday.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype         
---  ------       --------------  -----         
 0   date         350 non-null    datetime64[ns]
 1   type         350 non-null    object        
 2   locale       350 non-null    object        
 3   locale_name  350 non-null    object        
 4   description  350 non-null    object        
 5   transferred  350 non-null    bool          
dtypes: bool(1), datetime64[ns](1), object(4)
memory usage: 14.1+ KB


## Filters Earthquake dates

In [53]:
#trainDWOEarthQuake = trainD[(trainD['date'] < '2016-04-16') | (trainD['date'] >= '2016-07-16')]

## Slicing for each family and store

In [54]:
#Categorical encoding for dates with categories

#date
Xaux = trainDWOEarthQuake['date'].astype('category').cat.codes
trainDWOEarthQuake['date'] = Xaux



fam_unq = trainDWOEarthQuake['family'].unique()
store_unq = trainDWOEarthQuake['store_nbr'].unique()
names_col = []
data = []

data.append(trainDWOEarthQuake['date'].unique())
names_col.append('ds')

for fam in fam_unq:
    trainD_aux = trainDWOEarthQuake.loc[trainDWOEarthQuake['family']==fam]
    for store in store_unq:
        #data.append(trainD_aux['id'].loc[trainDWOEarthQuake['store_nbr']==store].to_list())
        data.append(trainD_aux['sales'].loc[trainDWOEarthQuake['store_nbr']==store].to_list())
        #names_col.append( fam + "," + str(store) + '_id')
        names_col.append( fam + "," + str(store))

data_t = list(map(list, zip(*data)))
dataDF = pd.DataFrame(data_t,columns=names_col)


## Training model

In [61]:
horizon = 100
n_series = len(dataDF)-horizon

dataDF_train = dataDF.head(n_series)
dataDF_test = dataDF.tail(horizon)

# #Auto arima
# from statsforecast.core import StatsForecast
# from statsforecast.models import auto_arima
# from multiprocessing import cpu_count

# models = [(auto_arima, 1)]

# fcst_list = []

# even = False
# for column in dataDF_train.iloc[:,1:]:
#     if even:
#         data_arima[column] = dataDF_train[column]
#         fcst = StatsForecast(
#           data_arima.set_index('unique_id'), #your data
#           models=[auto_arima], 
#           freq='D', # frequency of your data
#           n_jobs=min(cpu_count(), n_series), # you can also define the number of cores used for parallelizing
#         )
#         fcst_list.append(fcst)
#         even = False
#     else: 
#         data_arima = dataDF_train[[column,'ds']]
#         data_arima = data_arima.rename(columns={column: 'unique_id'})
#         even = True

# from tbats import TBATS

# estimator = TBATS(seasonal_periods=[7])

# fitted_model_list = []
# for column in dataDF_train.iloc[:,1:3]:
#     print(column)
#     fitted_model_list.append(estimator.fit(dataDF_train[column]))

  
def shift_n(df,col,n_steps):
    emp_list = df[col].shift(1, fill_value=0).to_list()
    df_ret=pd.DataFrame(emp_list,columns=['0'])
    for i in range(1,n_steps):
        df_ret[str(i)]=df[col].shift(i+1, fill_value=0).to_list()
    return df_ret


#Gradient boosting regressor
from sklearn.ensemble import GradientBoostingRegressor
#MLP Regressor
from sklearn.neural_network import MLPRegressor

n_shifts = 40
regr_list = []
for column in dataDF_train.iloc[:,1:]:
    data_X = shift_n(dataDF_train,column,n_shifts)
    data_y = dataDF_train[column]
    regr_list.append(GradientBoostingRegressor(n_estimators=100, learning_rate=0.2, 
                                random_state=0,loss='squared_error').fit(data_X, data_y))
    #regr_list.append(MLPRegressor(random_state=1, max_iter=2000, learning_rate = 'adaptive',
    #                              activation='logistic').fit(data_X, data_y))


In [63]:
# # Summarize fitted model
# print(fitted_model_list[0].summary())

# # Forecast 14 steps ahead
# y_forecasted = fitted_model_list[0].forecast(steps=horizon)

# ax = plt.plot(dataDF_test['ds'],dataDF_test['AUTOMOTIVE,1'])
# ax = plt.plot(dataDF_test['ds'],y_forecasted)

#Test
from sklearn.metrics import mean_squared_log_error

y_pred = []
mnle_list = []
for i in range(0,len(regr_list)):
    y_pred.append(regr_list[i].predict(shift_n(dataDF_test,dataDF_test.columns.values[i+1],n_shifts)))
    y_test = dataDF_test[dataDF_test.columns.values[i+1]]
    for j in range(0,len(y_pred[i])):
        if y_pred[i][j] < 0:
            y_pred[i][j]=0
    mnle_list.append(mean_squared_log_error(y_test,y_pred[i]))
#print(regr_list[0].feature_importances_)

print(mnle_list)

from statistics import mean
print(mean(mnle_list))

[0.4281507527486644, 0.5699464332552671, 0.25917987554228833, 0.3299122430625697, 0.35416351183319217, 0.43615779659908804, 0.32438766210090697, 0.409674000874457, 0.4123011738167513, 0.3775816570916253, 0.3343940375448758, 0.4267802565134927, 0.44643640436378773, 0.3060120620562829, 0.3692932905800956, 0.4066702112052804, 0.3748940758232731, 0.3965939222035327, 0.3548815490422493, 0.31646616817175827, 0.33523476527637747, 0.29727519269308483, 0.1968511164134809, 0.42798730162466136, 0.2551053633495646, 0.40727411241927064, 0.2947153421629318, 0.34287586602400866, 0.4218723072730453, 0.3336204771554991, 0.46539348630354843, 0.26398777126987943, 0.24211143415635886, 0.34941965133028574, 0.42944170228074396, 0.30368435018763373, 0.4398353573495346, 0.4683662608867183, 0.18216073992136328, 0.1253672867400101, 0.16818441016005423, 0.1726886432962396, 0.239607146019775, 0.31512862604478853, 0.21868225840306416, 0.3661120906994401, 0.25364161134199853, 0.6542500134132646, 0.29691226867292825