In [3]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [4]:
#Importing dataset
df = pd.read_csv("dataset/train_processed.csv")

We normalize the data so that all values are between 0 and 1. So we first remove 'Date' which is the only non numerical attribute. The we apply range normalisation and at the end we add 'Date'. 

In [5]:
df_date=df['Date']
df_no_date=df.drop(['Date'], axis=1)
df_norm= (df_no_date - df_no_date.min()) / (df_no_date.max() - df_no_date.min())
df=df_norm
df['Date']=df_date

We split the dataset in 4 parts:
* df_train: the training set (all the rows before 2017-09-01). We use it to train our model.
* df_val1: the first validation set (all the rows between 2017-09-01 and 2017-11-01). We use it to tune the hyperparameter of our model.
* df_val2: the second validation set (all the rows between 2017-11-01 and 2018-01-01). We use it to select the best model.
* df_val3: the third validation set (all the rows after 2018-01-01). We use it to check that our final model works properly.

In [6]:
df_val3=df.loc[(df['Date']>='2018-01-01')]
df_val3_rest=df.loc[(df['Date']<'2018-01-01')]
df_val2=df_val3_rest.loc[(df_val3_rest['Date']>='2017-11-01')]
df_val2_rest=df.loc[(df['Date']<'2017-11-01')]
df_val1=df_val2_rest.loc[(df_val2_rest['Date']>='2017-09-01')]
df_train=df.loc[(df['Date']<'2017-09-01')]

We remove 'Date' from our set because it is only used as index

In [7]:
df_train_clear=df_train.drop(['Date'], axis=1)
df_val1_clear=df_val1.drop(['Date'], axis=1)
df_val2_clear=df_val2.drop(['Date'], axis=1)
df_val3_clear=df_val3.drop(['Date'], axis=1)

We prepare the target valuesy for all the sets

In [8]:
y=df_train_clear['NumberOfSales']
y_val1=df_val1_clear['NumberOfSales']
y_val2=df_val2_clear['NumberOfSales']
y_val3=df_val3_clear['NumberOfSales']

We prepare the training input samples for all the sets

In [9]:
X=df_train_clear.drop(['StoreID','NumberOfSales','NumberOfCustomers','Unnamed: 0'], axis=1)
X_val1=df_val1_clear.drop(['StoreID','NumberOfSales','NumberOfCustomers', 'Unnamed: 0'], axis=1)
X_val2=df_val2_clear.drop(['StoreID','NumberOfSales','NumberOfCustomers', 'Unnamed: 0'], axis=1)
X_val3=df_val3_clear.drop(['StoreID','NumberOfSales','NumberOfCustomers', 'Unnamed: 0'], axis=1)

We create the dataframe in which we put the result of the hyperparameter tuning. It is a table with the following colums:
* num_of_trees: the number of trees of the random forest
* depth: the maximum depth of the random forest
* num_of_attr: the maximum number of attributes considered at each split
* mse: the mean squared error

In [10]:
columns=['num_of_trees','depth','num_of_attr','mse','mae']
index=range(1,1125000)
hp=pd.DataFrame(index=index,columns=columns)

We cycle over all the combinations of the hyperparameters. At each iterations we do the following operations:
* computer the model using the training set
* predict the samples in the first validation set using the model
* compute the mean squared error
* add the result in the dataframe for the hyperparameters

In [11]:
for depth in range(21,22,1):
    for nt in range(201,202,1):
        for na in range(12,13,1):
            forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
            forest.fit(X, y)
            y_pred=forest.predict(X_val1)
            mse_val1=mean_squared_error(y_val1,y_pred)
            mae_val1=mean_absolute_error(y_val1,y_pred)
            index = depth * 250 * 42 + nt * 42 + na
            hp.at[index,'num_of_trees']=nt
            hp.at[index,'num_of_attr']=na
            hp.at[index,'depth']=depth
            hp.at[index, 'mae']=mae_val1
            hp.at[index,'mse']=sqrt(mse_val1)
            print(na, sqrt(mse_val1), mae_val1)

12 0.027997338070694683 0.0193014125829


In [15]:
for depth in range(21,22,1):
    for nt in range(176,201,2):
        for na in [12]:
            forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
            forest.fit(X, y)
            y_pred=forest.predict(X_val1)
            mse_val1=mean_squared_error(y_val1,y_pred)
            mae_val1=mean_absolute_error(y_val1,y_pred)
            index = depth * 250 * 42 + nt * 42 + na
            hp.at[index,'num_of_trees']=nt
            hp.at[index,'num_of_attr']=na
            hp.at[index,'depth']=depth
            hp.at[index, 'mae']=mae_val1
            hp.at[index,'mse']=sqrt(mse_val1)
            print(nt, na, sqrt(mse_val1), mae_val1)

176 12 0.02800948796158203 0.0193163116927
178 12 0.02802115974454208 0.019324038642
180 12 0.028018845956474994 0.0193246237388
182 12 0.02801809810887908 0.0193220944042
184 12 0.028014272321774687 0.0193192960455
186 12 0.02801309103280776 0.0193189740376
188 12 0.028010769313112784 0.0193164733655
190 12 0.028007156963204046 0.0193124552313
192 12 0.028006442104933454 0.0193126827693
194 12 0.028002756979256537 0.0193088393605
196 12 0.028000385362644944 0.0193066573033


KeyboardInterrupt: 

In [14]:
for depth in range(1,100,20):
    for nt in [176, 201]:
        for na in [12]:
            forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
            forest.fit(X, y)
            y_pred=forest.predict(X_val1)
            mse_val1=mean_squared_error(y_val1,y_pred)
            mae_val1=mean_absolute_error(y_val1,y_pred)
            index = depth * 250 * 42 + nt * 42 + na
            hp.at[index,'num_of_trees']=nt
            hp.at[index,'num_of_attr']=na
            hp.at[index,'depth']=depth
            hp.at[index, 'mae']=mae_val1
            hp.at[index,'mse']=sqrt(mse_val1)
            print(nt, na, depth, sqrt(mse_val1), mae_val1)

176 12 0.06635467590993827 0.0491102811965
201 12 0.06656747009820121 0.0492854508562
176 12 0.027080232972220227 0.0187110225892
201 12 0.027050311862253797 0.0186987448412
176 12 0.027192424839826833 0.0187773032505
201 12 0.027160908899775137 0.0187654702558
176 12 0.027270196468718477 0.0188467487732
201 12 0.027246287222304554 0.0188340027005
176 12 0.027270196468718477 0.0188467487732


KeyboardInterrupt: 

In [None]:
with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    print(hp.loc[(hp['mse'] < 0.02115)])

In [16]:
with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    print(hp.loc[(hp['mae'] < 0.0187)])

       num_of_trees depth num_of_attr        mse        mae
228954          201    21          12  0.0270503  0.0186987


In [17]:
best_na = 12
best_depth = 21
best_nt = 201
best_forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
best_forest.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=81,
           max_features=12, max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=201, n_jobs=-1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [21]:
feat_labels = list(X.columns.values)
features = zip(feat_labels, best_forest.feature_importances_)
sorted(features, key = lambda t: t[1])

[('Hail', 6.227645103790334e-05),
 ('Thunderstorm', 0.00029765657310432976),
 ('Snow', 0.00054289998066171525),
 ('Fog', 0.00063365419472071697),
 ('Rain', 0.00078691827346538939),
 ('IsHoliday', 0.00087542612476168829),
 ('AssortmentType_With Fish Department', 0.001004806331563014),
 ('Precipitationmm', 0.0019528622052473518),
 ('Max_VisibilityKm', 0.0026327032839793016),
 ('Max_Humidity', 0.0030576337373929138),
 ('Min_VisibilitykM', 0.0033351856993745284),
 ('CloudCover', 0.0033941052945535692),
 ('Max_Wind_SpeedKm_h', 0.0035160587811830357),
 ('Mean_Wind_SpeedKm_h', 0.003889749804441412),
 ('Max_Gust_SpeedKm_h', 0.0039470626683652628),
 ('IsSunday', 0.0041630080061116443),
 ('Min_TemperatureC', 0.0043936603217298205),
 ('Mean_VisibilityKm', 0.0044259196158941356),
 ('Max_Dew_PointC', 0.0045698099674693227),
 ('Mean_TemperatureC', 0.0045999157447391041),
 ('Mean_Dew_PointC', 0.0046467638023455458),
 ('Min_Dew_PointC', 0.0049460439414199158),
 ('Mean_Humidity', 0.005782325736101929),

In [30]:
subset_features = ['MeanStoreSales', 'HasPromotions','NearestCompetitor','WasOpenYesterday','MeanMonthSales',
'MeanMonthSales', 'Region_AreaKM2','StoreType','IsSaturday','Region_GDP','Region_PopulationK',
'Max_Sea_Level_PressurehPa', 'IsOpenTomorrow', 'AssortmentType_With Non-Food Department'
]
reducedX = X.loc[:,subset_features]
reducedX_val1 = X_val1.loc[:,subset_features]

In [33]:
columns=['num_of_trees','depth','num_of_attr','mse','mae']
index=range(1,1125000)
hp_reduced=pd.DataFrame(index=index,columns=columns)

In [37]:
for depth in range(10,11,1):
    for nt in range(150,200,1):
        for na in range(1,15,1):
            forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
            forest.fit(reducedX, y)
            y_pred=forest.predict(reducedX_val1)
            mse_val1=mean_squared_error(y_val1,y_pred)
            mae_val1=mean_absolute_error(y_val1,y_pred)
            index = depth * 250 * 14 + nt * 14 + na
            hp_reduced.at[index,'num_of_trees']=nt
            hp_reduced.at[index,'num_of_attr']=na
            hp_reduced.at[index,'depth']=depth
            hp_reduced.at[index, 'mae']=mae_val1
            hp_reduced.at[index,'mse']=sqrt(mse_val1)
            print(na, nt, depth, sqrt(mse_val1), mae_val1)

1 150 10 0.04510597298383087 0.0334100418863
2 150 10 0.03539640070494816 0.026023585045
3 150 10 0.03175133640719739 0.0230714242029
4 150 10 0.029948783379345506 0.021405071154
5 150 10 0.029629580462081046 0.0209851216117
6 150 10 0.02963270379956382 0.020879818962
7 150 10 0.029644720697368003 0.0208620652953
8 150 10 0.029731976434802666 0.0208920970946
9 150 10 0.029779358143908072 0.0209230902561
10 150 10 0.02987495477641638 0.0209902700687
11 150 10 0.029879857323073845 0.0210055566808
12 150 10 0.029982607516331836 0.0210865963429
13 150 10 0.03006430930870395 0.0211411087126
14 150 10 0.030163281644171945 0.0212075205036
1 151 10 0.045114953855821455 0.0334164893074
2 151 10 0.035419023616248116 0.0260373849254
3 151 10 0.031772518342297634 0.0230890024102
4 151 10 0.029949947551557433 0.0214057407334
5 151 10 0.029623085304476507 0.0209798683351
6 151 10 0.02962523197249009 0.0208738501478
7 151 10 0.02963392476994888 0.0208532902808
8 151 10 0.029731481055590505 0.02089168

KeyboardInterrupt: 