In [1]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
# Load the pre-processed training data from its CSV file
dataset_path = "dataset/train_processed.csv"
df = pd.read_csv(dataset_path)

We normalize the data so that all values are between 0 and 1. To do so, we first remove 'Date', which is the only non-numerical attribute. Then we apply range normalisation, and at the end we add 'Date' back.

In [3]:
# Range-normalise every column except 'Date' to [0, 1]; 'Date' is set aside
# first and re-attached once the remaining columns have been rescaled.
# NOTE(review): min/max are computed over the whole frame, i.e. including the
# rows that are later split off as validation sets, and every non-'Date'
# column is rescaled — so any value read back from df afterwards (targets
# included) is in normalised units. Confirm this is intended.
df_date = df['Date']
df_no_date = df.drop(['Date'], axis=1)
col_min = df_no_date.min()
col_range = df_no_date.max() - col_min
# A constant column has a zero range; dividing by it would turn the whole
# column into NaN (0/0). Dividing by 1 instead maps such a column to zeros.
col_range = col_range.replace(0, 1)
df_norm = (df_no_date - col_min) / col_range
# df and df_norm refer to the same object, so 'Date' is added to both.
df = df_norm
df['Date'] = df_date

We split the dataset in 4 parts:
* df_train: the training set (all the rows before 2017-09-01). We use it to train our model.
* df_val1: the first validation set (all the rows between 2017-09-01 and 2017-11-01). We use it to tune the hyperparameters of our model.
* df_val2: the second validation set (all the rows between 2017-11-01 and 2018-01-01). We use it to select the best model.
* df_val3: the third validation set (all the rows after 2018-01-01). We use it to check that our final model works properly.

In [4]:
# Chronological split: rows are selected by comparing the 'Date' column
# against fixed cut-off date strings.
date_col = df['Date']

# Everything strictly before each cut-off date.
df_train = df[date_col < '2017-09-01']
df_val2_rest = df[date_col < '2017-11-01']
df_val3_rest = df[date_col < '2018-01-01']

# Each validation window is the tail of the corresponding "rest" frame.
df_val1 = df_val2_rest[df_val2_rest['Date'] >= '2017-09-01']
df_val2 = df_val3_rest[df_val3_rest['Date'] >= '2017-11-01']
df_val3 = df[date_col >= '2018-01-01']

We remove 'Date' from all our sets because it was only needed to split the rows by date.

In [5]:
# Strip the 'Date' column from each of the four splits.
df_train_clear, df_val1_clear, df_val2_clear, df_val3_clear = (
    split.drop('Date', axis=1)
    for split in (df_train, df_val1, df_val2, df_val3)
)

We prepare the target values for all the sets

In [6]:
# Target column, extracted from the training split and each validation split.
target_column = 'NumberOfSales'
y = df_train_clear[target_column]
y_val1 = df_val1_clear[target_column]
y_val2 = df_val2_clear[target_column]
y_val3 = df_val3_clear[target_column]

We prepare the input samples for all the sets by dropping 'StoreID', 'NumberOfSales' and 'NumberOfCustomers'

In [7]:
# Input features: every column except the store id, the target and the
# customer count, for the training split and each validation split.
non_feature_columns = ['StoreID', 'NumberOfSales', 'NumberOfCustomers']
X, X_val1, X_val2, X_val3 = (
    split.drop(non_feature_columns, axis=1)
    for split in (df_train_clear, df_val1_clear, df_val2_clear, df_val3_clear)
)

We create the dataframe in which we put the results of the hyperparameter tuning. It is a table with the following columns:
* num_of_trees: the number of trees of the random forest
* depth: the maximum depth of the trees in the random forest
* num_of_attr: the maximum number of attributes considered at each split
* mse: the mean squared error
* mae: the mean absolute error

In [8]:
# Pre-allocate the (initially all-NaN) table that collects one row per
# hyperparameter combination together with its error metrics.
columns = ['num_of_trees', 'depth', 'num_of_attr', 'mse', 'mae']
index = range(1, 1125000)  # row labels 1 .. 1124999
hp = pd.DataFrame(columns=columns, index=index)

We cycle over all the combinations of the hyperparameters. At each iteration we do the following operations:
* compute the model using the training set
* predict the samples in the first validation set using the model
* compute the mean squared error
* add the result to the dataframe for the hyperparameters

In [None]:
# Feature-ablation experiment with fixed random-forest hyperparameters:
# fit once on all the features, then refit with one feature removed at a time,
# printing the RMSE and MAE on the first validation set for every run.
nt=201     # passed as n_estimators
na=12      # passed as max_features
depth=21   # passed as max_depth

feature_names = list(X.columns)

# Baseline run on the full feature set ("con tutte le feature" is Italian for
# "with all the features").
forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
forest.fit(X, y)
y_pred=forest.predict(X_val1)
mse_val1=mean_squared_error(y_val1,y_pred)
mae_val1=mean_absolute_error(y_val1,y_pred)
print("con tutte le feature", sqrt(mse_val1), mae_val1)

# NOTE(review): only the features from position 21 onward are ablated;
# presumably the first 21 were evaluated in an earlier run — confirm.
ablation_start = 21

for feature in feature_names[ablation_start:]:
    # Remove the feature into per-iteration frames so that X and X_val1 keep
    # the full feature set once this cell has finished.
    X_ablated = X.drop([feature], axis=1)
    X_val1_ablated = X_val1.drop([feature], axis=1)
    forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
    forest.fit(X_ablated, y)
    y_pred=forest.predict(X_val1_ablated)
    mse_val1=mean_squared_error(y_val1,y_pred)
    mae_val1=mean_absolute_error(y_val1,y_pred)
    print(feature, sqrt(mse_val1), mae_val1)

# After the loop, forest / y_pred / mse_val1 / mae_val1 hold the values of the
# last ablation run, not of the baseline.