# IMPORTS

## Libraries

In [1]:
import pandas as pd
import numpy as np

import datetime

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML

from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression, Lasso

## Helper Functions

In [2]:
def mean_absolute_percentage_error(y, yhat):
    return np.mean(np.abs((y - yhat) / y))

def mlError(modelName, y, yhat):
    mae = mean_absolute_error(y, yhat)
    mape = mean_absolute_percentage_error(y, yhat)
    rmse = np.sqrt(mean_squared_error(y, yhat))
    
    return pd.DataFrame({
                            'ModelName': modelName,
                            'MAE': mae,
                            'MAPE': mape,
                            'RMSE': rmse,
                        }, index=[0])

def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 16]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## Loading Data

In [4]:
dfRaw = pd.read_csv('../../01-Data/Results/01-FirstRoundCRISP/dfDataPreparation.csv', low_memory=False, parse_dates=['Date'])

# MACHINE LEARNING MODELLING

In [31]:
dfRaw1 = dfRaw.copy()

## Feature Selection >> Boruta

In [32]:
toKeepBoruta = [
                'Store',
                'Promo',
                'StoreType',
                'Assortment',
                'CompetitionDistance',
                'CompetitionOpenSinceMonth',
                'CompetitionOpenSinceYear',
                'Promo2',
                'Promo2SinceWeek',
                'Promo2SinceYear',
                'CompetionTimeMonth',
                'PromoTimeWeek',
                'MonthSin',
                'MonthCos',
                'DaySin',
                'DayCos',
                'WeekOfYearSin',
                'WeekOfYearCos',
                'DayOfWeekSin',
                'DayOfWeekCos',
                'Date',
                'Sales']


## Split DataFrame into Training and Validation Dataset

In [34]:
toKeep = toKeepBoruta[:-2]

#Training Dataset
XTrain = dfRaw1[dfRaw1['Date'] < '2015-06-19']
yTrain = XTrain['Sales']
XTrain = XTrain[toKeep]


#Validation Dataset
XValid = dfRaw1[dfRaw1['Date'] >= '2015-06-19']
yValid = XValid['Sales']
XValid = XValid[toKeep]

## Average Model

In [37]:
aux1 = XValid.copy()
aux1['Sales'] = yValid.copy()

# Prediction
aux2 = aux1[['Store', 'Sales']].groupby('Store').mean().reset_index().rename(columns={'Sales': 'Predictions'})
aux1 = pd.merge(aux1, aux2, how='left', on='Store')
yhatBaseline = aux1['Predictions']

#Performance
baselineResult = mlError('AverageModel', np.expm1(yValid), np.expm1(yhatBaseline))
baselineResult

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,AverageModel,1354.800353,0.2064,1835.135542


## Linear Regression Model

In [41]:
# Model
lr = LinearRegression()
lr.fit(XTrain, yTrain)

# Prediction
yhatLr = lr.predict(XValid)

# Performance
lrResult = mlError('Linear Regression', np.expm1(yValid), np.expm1(yhatLr))
lrResult

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,Linear Regression,1867.089774,0.292694,2671.049215


## Linear Regression Regularized Model -> Lasso

In [43]:
# Model
lrr = Lasso(alpha=0.01)
lrr.fit(XTrain, yTrain)

# Prediction
yhatLrr = lrr.predict(XValid)

# Performance
lrrResult = mlError('Linear Regression Regularized', np.expm1(yValid), np.expm1(yhatLrr))
lrrResult

Unnamed: 0,ModelName,MAE,MAPE,RMSE
0,Linear Regression Regularized,1890.285928,0.294716,2720.254331
