# IMPORTS

## Libraries

In [1]:
import pandas as pd
import numpy as np

import datetime

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML

from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso

import xgboost as xgb
from lightgbm import LGBMRegressor

## Helper Functions

In [2]:
def crossValidation(XTraining, kfold, modelName, verbose=False, foldDays=6*7):
    """Time-based k-fold cross validation for one of the supported linear models.

    The last ``kfold * foldDays`` days of ``XTraining`` are cut into ``kfold``
    consecutive validation windows. For each window, a fresh model is fitted on
    every row dated strictly before the window and scored on the window itself.

    Parameters
    ----------
    XTraining : pandas.DataFrame
        Must contain a 'Date' column (used only for slicing) and a 'Sales'
        column (the target); every other column is used as a feature.
    kfold : int
        Number of validation windows.
    modelName : str
        'Linear Regression' or 'Linear Regression Regularized'.
    verbose : bool, default False
        Print the fold number while iterating.
    foldDays : int, default 42
        Length of each validation window in days.

    Returns
    -------
    pandas.DataFrame
        One row with 'Model Name' and the 'MAE CV', 'MAPE CV', 'RMSE CV'
        columns formatted as '<mean> +/- <std>' across folds.

    Raises
    ------
    KeyError
        If ``modelName`` is not a supported model.
    ValueError
        If a fold ends up with no training or no validation rows.
    """
    # Factories (not instances) so that every fold fits a brand-new estimator.
    modelFactories = {
        'Linear Regression': LinearRegression,
        'Linear Regression Regularized': lambda: Lasso(alpha=0.01)
    }
    if modelName not in modelFactories:
        raise KeyError(f'Unknown modelName {modelName!r}; expected one of {list(modelFactories)}')

    maeList = []
    mapeList = []
    rmseList = []

    # Loop invariant: all windows are measured back from the most recent date.
    lastDate = XTraining['Date'].max()

    # Walk from the oldest window (k = kfold) to the most recent one (k = 1).
    for k in reversed(range(1, kfold+1)):
        if verbose:
            print(f'\nKFold Number: {k}')

        # Start and End Date for Validation
        startDateValid = lastDate - datetime.timedelta(days=k*foldDays)
        endDateValid = lastDate - datetime.timedelta(days=(k-1)*foldDays)

        # Filtering Dataset
        # NOTE: both validation bounds are inclusive, so the boundary day of a
        # window is also the first day of the next (more recent) window.
        training = XTraining[XTraining['Date'] < startDateValid]
        validation = XTraining[(XTraining['Date'] >= startDateValid) & (XTraining['Date'] <= endDateValid)]

        if training.empty or validation.empty:
            raise ValueError(
                f'Fold {k} has {len(training)} training and {len(validation)} validation rows; '
                'reduce kfold or foldDays.'
            )

        # Training
        XKFoldTraining = training.drop(['Date', 'Sales'], axis=1)
        yKFoldTraining = training['Sales']

        # Validation
        XKFoldValidation = validation.drop(['Date', 'Sales'], axis=1)
        yKFoldValidation = validation['Sales']

        # Model
        model = modelFactories[modelName]()
        model.fit(XKFoldTraining, yKFoldTraining)

        # Prediction
        yhat = model.predict(XKFoldValidation)

        # Performance, evaluated after np.expm1 on both target and prediction
        # (presumably 'Sales' is stored log1p-transformed; confirm upstream).
        modelResult = mlError(modelName, np.expm1(yKFoldValidation), np.expm1(yhat))

        # Store Performance of each KFold iteration
        maeList.append(modelResult['MAE'].iloc[0])
        mapeList.append(modelResult['MAPE'].iloc[0])
        rmseList.append(modelResult['RMSE'].iloc[0])

    def _meanPlusMinusStd(values):
        # '<mean> +/- <std>' with both numbers rounded to two decimals.
        return np.round(np.mean(values), 2).astype(str) + ' +/- ' + np.round(np.std(values), 2).astype(str)

    dictResult = {
                    'Model Name': [modelName],
                    'MAE CV': [_meanPlusMinusStd(maeList)],
                    'MAPE CV': [_meanPlusMinusStd(mapeList)],
                    'RMSE CV': [_meanPlusMinusStd(rmseList)]
                }

    return pd.DataFrame(dictResult)



def mean_absolute_percentage_error(y, yhat):
    """Return the mean of ``|y - yhat| / |y|`` as a fraction (0.1 means 10%).

    Parameters
    ----------
    y : array-like
        Observed values.
    yhat : array-like
        Predicted values, in the same order as ``y``.

    Returns
    -------
    float
        Mean absolute percentage error computed element by element.
    """
    # Work on plain arrays so the pairing is positional. With pandas Series,
    # `y - yhat` aligns on index labels, so a prediction Series whose index was
    # reset (e.g. after a merge) would be silently mis-paired or turned into NaN.
    y = np.asarray(y, dtype=float)
    yhat = np.asarray(yhat, dtype=float)
    return np.mean(np.abs((y - yhat) / y))



def mlError(modelName, y, yhat):
    """Summarise prediction quality as a one-row DataFrame.

    Columns are 'ModelName', 'MAE', 'MAPE' and 'RMSE'; the single row is
    labelled with index 0.
    """
    metrics = {'ModelName': modelName}
    metrics['MAE'] = mean_absolute_error(y, yhat)
    metrics['MAPE'] = mean_absolute_percentage_error(y, yhat)
    # RMSE is the square root of the mean squared error
    metrics['RMSE'] = np.sqrt(mean_squared_error(y, yhat))

    return pd.DataFrame(metrics, index=[0])



def jupyter_settings():
    """Apply notebook-wide plotting and pandas display settings.

    Takes no arguments and returns nothing; it only changes global state
    (IPython magics, matplotlib rcParams, pandas options, seaborn theme).
    """
    # IPython magics: only valid when this cell runs inside IPython/Jupyter.
    %matplotlib inline
    # Prints "Populating the interactive namespace from numpy and matplotlib".
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 16]
    plt.rcParams['font.size'] = 24
    
    # Inject CSS so the notebook `.container` element uses the full page width.
    # NOTE(review): `display` is not imported in the visible cells; presumably
    # it is provided by the IPython session - confirm.
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    # None removes pandas' limit on displayed columns / rows.
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    # NOTE(review): called last, so seaborn's defaults may override some of the
    # style / rcParams set above - confirm this order is intended.
    sns.set()

In [3]:
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## Loading Data

In [4]:
# Read the data-preparation output (per the file name); 'Date' is parsed into datetimes
# so it can be compared against dates when splitting the data later on.
dfRaw = pd.read_csv('../../01-Data/Results/01-FirstRoundCRISP/dfDataPreparation.csv', low_memory=False, parse_dates=['Date'])

# MACHINE LEARNING MODELLING

In [31]:
dfRaw1 = dfRaw.copy()

## Feature Selection >> Boruta

In [32]:
# Columns kept for modelling. 'Date' and 'Sales' must stay as the LAST two
# entries: the split cell builds the feature list with `toKeepBoruta[:-2]`,
# i.e. it drops them by position, not by name.
# NOTE(review): 'CompetionTimeMonth' presumably matches the (misspelled) column
# name in the CSV - verify before "correcting" the spelling.
toKeepBoruta = [
                'Store',
                'Promo',
                'StoreType',
                'Assortment',
                'CompetitionDistance',
                'CompetitionOpenSinceMonth',
                'CompetitionOpenSinceYear',
                'Promo2',
                'Promo2SinceWeek',
                'Promo2SinceYear',
                'CompetionTimeMonth',
                'PromoTimeWeek',
                'MonthSin',
                'MonthCos',
                'DaySin',
                'DayCos',
                'WeekOfYearSin',
                'WeekOfYearCos',
                'DayOfWeekSin',
                'DayOfWeekCos',
                'Date',
                'Sales']


## Split DataFrame into Training and Validation Dataset

In [81]:
# Feature columns only: the selected columns minus the trailing 'Date' and 'Sales'
toKeep = toKeepBoruta[:-2]

# Rows strictly before the cutoff are used for fitting; rows on or after it are held out
splitDate = '2015-06-19'
isBeforeSplit = dfRaw1['Date'] < splitDate
isFromSplit = dfRaw1['Date'] >= splitDate

# Training Dataset
XTrain = dfRaw1.loc[isBeforeSplit]
yTrain = XTrain['Sales']
XTr = XTrain.loc[:, toKeep]

# Validation Dataset
XTest = dfRaw1.loc[isFromSplit]
yTest = XTest['Sales']
XTst = XTest.loc[:, toKeep]

## Average Model

In [37]:
# Hold-out features with the hold-out target attached
aux1 = XTst.assign(Sales=yTest)

# Prediction: per-store mean of 'Sales' (computed from these same hold-out rows),
# joined back so every row carries its store's mean
aux2 = (
    aux1.groupby('Store', as_index=False)['Sales']
        .mean()
        .rename(columns={'Sales': 'Predictions'})
)
aux1 = aux1.merge(aux2, how='left', on='Store')
yhatBaseline = aux1['Predictions']

# Performance
baselineResult = mlError('AverageModel', np.expm1(yTest), np.expm1(yhatBaseline))
baselineResult

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,AverageModel,1354.800353,0.2064,1835.135542


## Linear Regression Model

In [41]:
# Fit an ordinary least-squares model on the training features
lr = LinearRegression().fit(XTr, yTrain)

# Predict the hold-out rows
yhatLr = lr.predict(XTst)

# Score with np.expm1 applied to both the target and the prediction
lrResult = mlError('Linear Regression', np.expm1(yTest), np.expm1(yhatLr))
lrResult

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,Linear Regression,1867.089774,0.292694,2671.049215


## Linear Regression Regularized Model -> Lasso

In [43]:
# Fit an L1-regularised linear model (alpha = 0.01) on the training features
lrr = Lasso(alpha=0.01).fit(XTr, yTrain)

# Predict the hold-out rows
yhatLrr = lrr.predict(XTst)

# Score with np.expm1 applied to both the target and the prediction
lrrResult = mlError('Linear Regression Regularized', np.expm1(yTest), np.expm1(yhatLrr))
lrrResult

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,Linear Regression Regularized,1890.285928,0.294716,2720.254331


## Random Forest Regressor

In [44]:
# Fit a 100-tree random forest on all cores, seeded for repeatability
rfParams = dict(n_estimators=100, n_jobs=-1, random_state=42)
rf = RandomForestRegressor(**rfParams)
rf.fit(XTr, yTrain)

# Predict the hold-out rows
yhatRf = rf.predict(XTst)

# Score with np.expm1 applied to both the target and the prediction
rfResult = mlError('Random Forest Regressor', np.expm1(yTest), np.expm1(yhatRf))
rfResult

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,Random Forest Regressor,679.288277,0.099894,1010.633903


## XGBoost Regressor

In [59]:
# Hyper-parameters for the gradient-boosted trees
xgbParams = dict(
    objective='reg:squarederror',
    n_estimators=500,
    eta=0.01,
    max_depth=10,
    subsample=0.7,
    colsample_bytree=0.9,
)
modelXGB = xgb.XGBRegressor(**xgbParams)
modelXGB.fit(XTr, yTrain)

# Predict the hold-out rows
yhatXGB = modelXGB.predict(XTst)

# Score with np.expm1 applied to both the target and the prediction
modelXGBResult = mlError('XGBoost Regressor', np.expm1(yTest), np.expm1(yhatXGB))
modelXGBResult

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,XGBoost Regressor,1270.979455,0.171472,1888.918717


## LightGBM Regressor

In [60]:
# Hyper-parameters for the LightGBM regressor
lgbmParams = dict(
    num_leaves=10,
    min_data_in_leaf=50,
    n_jobs=-1,
    random_state=42,
    n_estimators=500,
)
modelLGBM = LGBMRegressor(**lgbmParams)
modelLGBM.fit(XTr, yTrain)

# Predict the hold-out rows
yhatLGBM = modelLGBM.predict(XTst)

# Score with np.expm1 applied to both the target and the prediction
modelLGBMResult = mlError('lightgbm Regressor', np.expm1(yTest), np.expm1(yhatLGBM))
modelLGBMResult



Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,lightgbm Regressor,1154.518817,0.173184,1658.615271


## Comparing Models' Performance

In [61]:
# Stack the one-row result frames of every model, then rank by RMSE (lowest first)
allResults = [
    baselineResult,
    lrResult,
    lrrResult,
    rfResult,
    modelXGBResult,
    modelLGBMResult,
]
modellingResult = pd.concat(allResults)
modellingResult.sort_values(by='RMSE')

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,Random Forest Regressor,679.288277,0.099894,1010.633903
0,lightgbm Regressor,1154.518817,0.173184,1658.615271
0,AverageModel,1354.800353,0.2064,1835.135542
0,XGBoost Regressor,1270.979455,0.171472,1888.918717
0,Linear Regression,1867.089774,0.292694,2671.049215
0,Linear Regression Regularized,1890.285928,0.294716,2720.254331


# VALIDATION

## Split DataFrame into Training and Validation Dataset

In [82]:
#Training Dataset
# Uses toKeepBoruta (not toKeep) on purpose: crossValidation slices folds by
# 'Date' and reads 'Sales' as the target, so both columns must be present.
XTraining = XTrain[toKeepBoruta]

In [None]:
crossValidation(XTraining, 5, 'Linear Regression', verbose=False)