In [4]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [5]:
#Importing dataset
df = pd.read_csv("dataset/train_processed.csv")

In [6]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,StoreID,Date,IsHoliday,IsOpen,HasPromotions,StoreType,NearestCompetitor,NumberOfCustomers,NumberOfSales,...,Hail,Thunderstorm,IsSaturday,IsSunday,WasOpenYesterday,IsOpenTomorrow,YesterdaySales,Previous3DaysSales,PreviousWeekSales,PreviousMonthSales
0,29,1000,2016-03-31,0,1,1,2,326,846,11907,...,0,0,0,0,1,0,9055,8731.333333,6875.714286,6490.517241
1,30,1000,2016-04-01,1,0,1,2,326,0,0,...,0,0,0,0,1,1,11907,9850.333333,7428.0,6705.37931
2,31,1000,2016-04-02,0,1,0,2,326,851,11129,...,0,0,1,0,0,0,0,6987.333333,6300.857143,6425.689655
3,32,1000,2016-04-03,0,0,0,2,326,0,0,...,0,0,0,1,1,0,11129,7678.666667,7032.857143,6582.466667
4,33,1000,2016-04-04,1,0,0,2,326,0,0,...,0,0,0,0,0,1,0,3709.666667,7032.857143,6305.8


In [7]:
df.shape

(501300, 49)

We normalize the data so that all values are between 0 and 1. So we first remove 'Date' which is the only non numerical attribute. The we apply range normalisation and at the end we add 'Date'. 

In [8]:
df_date=df['Date']
df_no_date=df.drop(['Date'], axis=1)
df_norm= (df_no_date - df_no_date.min()) / (df_no_date.max() - df_no_date.min())
df=df_norm
df['Date']=df_date
df.head(5)

Unnamed: 0.1,Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,StoreType,NearestCompetitor,NumberOfCustomers,NumberOfSales,Region_AreaKM2,...,Thunderstorm,IsSaturday,IsSunday,WasOpenYesterday,IsOpenTomorrow,YesterdaySales,Previous3DaysSales,PreviousWeekSales,PreviousMonthSales,Date
0,0.0,0.0,0.0,1.0,1.0,0.666667,0.003281,0.3835,0.446943,0.291715,...,0.0,0.0,0.0,1.0,0.0,0.33989,0.363301,0.355788,0.372606,2016-03-31
1,2e-06,0.0,1.0,0.0,1.0,0.666667,0.003281,0.0,0.0,0.291715,...,0.0,0.0,0.0,1.0,1.0,0.446943,0.409861,0.384367,0.384941,2016-04-01
2,4e-06,0.0,0.0,1.0,0.0,0.666667,0.003281,0.385766,0.41774,0.291715,...,0.0,1.0,0.0,0.0,0.0,0.0,0.290735,0.326042,0.368885,2016-04-02
3,6e-06,0.0,0.0,0.0,0.0,0.666667,0.003281,0.0,0.0,0.291715,...,0.0,0.0,1.0,1.0,0.0,0.41774,0.319501,0.36392,0.377885,2016-04-03
4,8e-06,0.0,1.0,0.0,0.0,0.666667,0.003281,0.0,0.0,0.291715,...,0.0,0.0,0.0,0.0,1.0,0.0,0.154355,0.36392,0.362002,2016-04-04


We split the dataset in 4 parts:
* df_train: the training set (all the rows before 2017-09-01). We use it to train our model.
* df_val1: the first validation set (all the rows between 2017-09-01 and 2017-11-01). We use it to tune the hyperparameter of our model.
* df_val2: the second validation set (all the rows between 2017-11-01 and 2018-01-01). We use it to select the best model.
* df_val3: the third validation set (all the rows after 2018-01-01). We use it to check that our final model works properly.

In [9]:
df_val3=df.loc[(df['Date']>='2018-01-01')]
df_val3_rest=df.loc[(df['Date']<'2018-01-01')]
df_val2=df_val3_rest.loc[(df_val3_rest['Date']>='2017-11-01')]
df_val2_rest=df.loc[(df['Date']<'2017-11-01')]
df_val1=df_val2_rest.loc[(df_val2_rest['Date']>='2017-09-01')]
df_train=df.loc[(df['Date']<'2017-09-01')]

We remove 'Date' from our set because it is only used as index

In [18]:
df_train_clear=df_train.drop(['Date'], axis=1)
df_val1_clear=df_val1.drop(['Date'], axis=1)
df_val2_clear=df_val2.drop(['Date'], axis=1)
df_val3_clear=df_val3.drop(['Date'], axis=1)

Per ora facciamo tutto con un solo store così è più veloce

In [19]:
df_store=df_train_clear.loc[(df_train_clear['StoreID']==0.0)]
df_val1_store=df_val1_clear.loc[(df_val1_clear['StoreID']==0.0)]
df_val2_store=df_val2_clear.loc[(df_val2_clear['StoreID']==0.0)]
df_val3_store=df_val3_clear.loc[(df_val3_clear['StoreID']==0.0)]

We prepare the target valuesy for all the sets

In [20]:
y=df_store['NumberOfSales']
y_val1=df_val1_store['NumberOfSales']
y_val2=df_val2_store['NumberOfSales']
y_val3=df_val3_store['NumberOfSales']

We prepare the training input samples for all the sets

In [21]:
X=df_store.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val1=df_val1_store.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val2=df_val2_store.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val3=df_val3_store.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)

In [65]:
X.to_csv('X.csv')
X_val1.to_csv('X_val1.csv')
X_val2.to_csv('X_val2.csv')
X_val3.to_csv('X_val3.csv')
y_val1.to_csv('y_val1.csv')
y_val2.to_csv('y_val2.csv')
y_val3.to_csv('y_val3.csv')

We create the dataframe in which we put the result of the hyperparameter tuning. It is a table with the following colums:
* num_of_trees: the number of trees of the random forest
* depth: the maximum depth of the random forest
* num_of_attr: the maximum number of attributes considered at each split
* mse: the mean squared error

In [23]:
columns=['num_of_trees','depth','num_of_attr','mse']
index=range(1,200000)
hp=pd.DataFrame(index=index,columns=columns)

We cycle over all the combinations of the hyperparameters. At each iterations we do the following operations:
* computer the model using the training set
* predict the samples in the first validation set using the model
* compute the mean squared error
* add the result in the dataframe for the hyperparameters

In [61]:
count=0
for depth in range(1,250,1):
    print(depth)
    for nt in range(3,250,2):
        for na in range(1:42:1)
            forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
            forest.fit(X, y)
            y_pred=forest.predict(X_val1)
            mse_val1=mean_squared_error(y_val1,y_pred)
            hp.at[count,'num_of_trees']=nt
            hp.at[count,'num_of_attr']=na
            hp.at[count,'depth']=depth
            hp.at[count,'mse']=sqrt(mse_val1)
            count+=1

7
57
107
157
207
257
307
357
407
457


In [53]:
hp.to_csv('randomforest2.csv')

In [54]:
hp['mse'].min()

0.03082127758624276

In [62]:
hp_under_31=hp.loc[(hp['mse']<0.031)]
print(hp_under_31.shape)
print(hp_under_31.head(6))

(5, 4)
     num_of_trees depth num_of_attr        mse
2028            9    21           7  0.0308498
2031           10    21           7  0.0308213
2100           33    21           7  0.0309453
2676           33    25           7  0.0308803
2679           34    25           7  0.0309601


In [38]:
h = pd.read_csv('dataset/randomforest.csv')

In [60]:
good = hp.loc[(h['mse'] < 0.033)]
with pd.option_context('display.max_rows', None, 'display.max_columns', 5):
    print(hp.loc[(hp['mse'] < 0.033)])

      num_of_trees depth num_of_attr        mse
44              19     7           9  0.0329064
470             17    10           9  0.0320995
473             18    10           9  0.0324086
476             19    10           9   0.032235
479             20    10           9  0.0320452
482             21    10           9   0.031586
485             22    10           9   0.031018
488             23    10           9  0.0314287
491             24    10           9  0.0312567
494             25    10           9  0.0312293
497             26    10           9  0.0315364
500             27    10           9   0.032092
503             28    10           9  0.0326623
1812            33    19           7  0.0329538
1848            45    19           7  0.0329951
1851            46    19           7  0.0326568
1854            47    19           7  0.0327689
1884             9    20           7  0.0324715
1887            10    20           7  0.0325074
1959            34    20           7  0.

In [49]:
print(good['num_of_attr'].unique())
print(good['depth'].unique())
print(good['num_of_trees'].unique())

[ 25.   9.  19.   7.  23.]
[  7.   9.  13.  15.  19.  21.  23.  25.  27.  29.  31.  33.  35.  37.  39.
  41.  43.  45.  47.  49.  51.  53.  55.  57.  59.  61.  63.  65.  67.  69.
  71.  73.  75.  77.  79.  81.  83.  85.  87.  89.  91.  93.  95.  97.  99.]
[  5.   7.  19.  37.  21.  23.  27.  29.  31.  33.  35.  39.  41.  43.  47.
  45.   9.  11.  13.  15.  17.  49.  51.  53.]
