In [17]:
# Import libraries

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [18]:
# Load the training set and the first validation set.
# The first CSV column of each file is used as the row index.
train, val = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in ("dataset/processed/train_rand.csv", "dataset/processed/val1_rand.csv")
)

In [19]:
# Fold OrdinalDate into the range [0, 365) so the feature repeats with a
# 365-step period. The vectorized `%` gives the same values as mapping a
# Python lambda over every element, without the per-row call overhead.
# Bracket assignment is used because attribute-style assignment only works
# when the column already exists and its name does not clash with a
# DataFrame attribute.
# NOTE(review): a plain modulo 365 is not the calendar day-of-year (leap
# years, ordinal epoch offset) -- presumably a seasonal-position feature;
# confirm this is intended.
train['OrdinalDate'] = train['OrdinalDate'] % 365
val['OrdinalDate'] = val['OrdinalDate'] % 365

We extract the target values (`NumberOfSales`) for the training and validation sets.

In [20]:
# Target vectors: the sales count column of the training and validation frames.
y, y_val1 = (frame['NumberOfSales'] for frame in (train, val))

We prepare the input features for the training and validation sets by dropping the identifier columns and the target.

In [21]:
# Feature matrices: everything except the store/date/region identifiers and
# the target column itself.
X = train.drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region'])
X_val1 = val.drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region'])

We create a table to store the results of the hyperparameter search.

In [6]:
# Accumulator for the hyperparameter search: one independent, initially empty
# list per reported column.
res = {
    column: []
    for column in ('number of trees', 'number of attributes', 'depth', 'mae')
}

In [7]:
# Grid search over the number of features tried at each split (max_features)
# and the maximum tree depth, with the number of trees held fixed. Every
# candidate is fit on the training set and scored by mean absolute error on
# the first validation set.
nt = 100

# Re-create the results table inside this cell: when it is only created in a
# separate cell, running the search a second time appends a duplicate set of
# rows to the lists left over from the previous run.
res = {'number of trees' : [], 'number of attributes' : [], 'depth' : [], 'mae' : []}

for na in range(20, 30, 4):          # max_features candidates: 20, 24, 28
    for depth in range(31, 50, 8):   # max_depth candidates: 31, 39, 47
        # random_state is fixed so repeated runs build the same forest;
        # n_jobs=-1 uses all available cores.
        forest = RandomForestRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
        forest.fit(X, y)
        y_pred = forest.predict(X_val1)
        mae_val1 = mean_absolute_error(y_val1, y_pred)

        # Record one row per candidate; the four lists stay the same length.
        res['number of trees'].append(nt)
        res['number of attributes'].append(na)
        res['depth'].append(depth)
        res['mae'].append(mae_val1)
        print(mae_val1, na, nt, depth)

# One row per evaluated (max_features, max_depth) combination.
r_df = pd.DataFrame(res)

383.45218146864914 20 100 31
383.1556854659545 20 100 39
383.4923013175628 20 100 47
384.3861635570196 24 100 31
383.2323836652804 24 100 39
383.6505568652767 24 100 47
385.54816395849616 28 100 31
384.5295717747027 28 100 39
384.62050281557356 28 100 47


In [8]:
# Lowest validation MAE found by the grid search.
r_df.loc[:, 'mae'].min()

383.1556854659545

In [9]:
# Persist the grid-search table (index included) for later inspection.
r_df.to_csv(path_or_buf="parameters.csv")

Run the model on the held-out set (`val2.csv`) and save the predictions in the format required for the error evaluation.

In [22]:
# Load the held-out set; the first CSV column is used as the row index.
# Note that the frame is called `test` but is read from val2.csv.
test = pd.read_csv("dataset/processed/val2.csv", index_col=0)

# Fold OrdinalDate into [0, 365) with a vectorized modulo. This gives the same
# values as mapping a Python lambda over each element, and matches the
# transformation applied to the training and validation frames.
test['OrdinalDate'] = test['OrdinalDate'] % 365

In [24]:
# Held-out features: identifiers, the target and Mean_Dew_PointC are removed.
X_test = test.drop(
    columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Mean_Dew_PointC']
)
# Held-out ground truth.
y_test = test['NumberOfSales']

In [25]:
# Rebuild the training feature matrix so its columns match X_test.
# This overwrites the earlier X: Mean_Dew_PointC is now dropped as well, so
# the final model is trained on one feature fewer than the grid search used.
X = train.drop(
    columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Mean_Dew_PointC']
)

In [26]:
# Final model: a fixed-seed random forest trained on the full training set,
# then used to predict sales for the held-out rows.
forest = RandomForestRegressor(
    n_estimators=201,
    max_features=21,
    max_depth=40,
    n_jobs=-1,
    random_state=0,
)
# fit() returns the estimator itself, so predict() can be chained on it.
y_pred_test = forest.fit(X, y).predict(X_test)

In [27]:
# Replace the loaded row labels with a fresh 0..n-1 index (old labels are
# discarded) on both the held-out frame and its target series.
test, y_test = test.reset_index(drop=True), y_test.reset_index(drop=True)

In [28]:
# Assemble the evaluation table: one row per held-out sample, with its date,
# store, region, the model's prediction and the true sales value.
#
# The frame is built in a single constructor call from positional values
# (`.values`) rather than created empty and filled column by column.
# Predictions come back from the model as a plain positional array, so taking
# the other columns positionally keeps every row consistent even when the
# indices of `test` / `y_test` have not been reset to 0..n-1. Index-aligned
# assignment would fill mismatched labels with NaN instead.
columns = ['Date', 'StoreID', 'RegionID', 'SalesPredicted', 'SalesReal']
index = range(y_test.shape[0])
result = pd.DataFrame(
    {
        'Date': test['Date'].values,
        'StoreID': test['StoreID'].values,
        'RegionID': test['Region'].values,   # exported under the name RegionID
        'SalesPredicted': y_pred_test,
        'SalesReal': y_test.values,
    },
    index=index,
    columns=columns,
)

In [29]:
# Preview the first five rows of the evaluation table (head() defaults to 5).
result.head()

Unnamed: 0,Date,StoreID,RegionID,SalesPredicted,SalesReal
0,2018-01-01,1000,7,9398.631841,8540.0
1,2018-01-02,1000,7,6781.164179,10364.0
2,2018-01-03,1000,7,6190.099502,4676.0
3,2018-01-05,1000,7,8987.298507,6267.0
4,2018-01-06,1000,7,6291.850746,5953.0


In [31]:
# Write the evaluation table (index included) for the external error evaluation.
result.to_csv(path_or_buf="results/results_rf.csv")

In [32]:
# Mean absolute error of the final model on the held-out set.
# Arguments follow the documented (y_true, y_pred) order, consistent with the
# grid-search cell. MAE is symmetric, so the reported value is unchanged.
mean_absolute_error(y_test, y_pred_test)

586.153249529348