In [1]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

We load the training set and the two validation sets, which have already been preprocessed.

In [2]:
# Load the preprocessed splits; the first CSV column becomes the row index.
# `test` is read from val2.csv and is used further down as the second
# validation set.
train, val, test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (
        "dataset/processed/train_rand.csv",
        "dataset/processed/val1_rand.csv",
        "dataset/processed/val2.csv",
    )
)

In [3]:
# Reduce OrdinalDate modulo 365 in every split, so the feature lies in
# [0, 365) instead of growing with the calendar.
# NOTE(review): presumably meant as a day-of-year proxy; leap days are not
# accounted for - confirm this is acceptable.
for frame in (train, val, test):
    frame["OrdinalDate"] = frame["OrdinalDate"] % 365

We prepare the input features and the labels for all three sets.

In [4]:
# Labels: the NumberOfSales column of each split.
y, y_val1, y_val2 = (split["NumberOfSales"] for split in (train, val, test))

# Columns removed from the model inputs: the label itself plus StoreID, Date
# and Region.
non_feature_cols = ["StoreID", "Date", "NumberOfSales", "Region"]

X = train.drop(columns=non_feature_cols)
X_val1 = val.drop(columns=non_feature_cols)
X_val2 = test.drop(columns=non_feature_cols)

We try to find good hyperparameters for the Extremely Randomized Trees regressor (`ExtraTreesRegressor`). First of all, we search for good values of `max_depth` and `max_features`.

In [5]:
# Exhaustive search over max_depth x max_features with a fixed number of trees.
# Each candidate is fitted on the training split and scored by MAE on the
# first validation split; the best pair is kept in best_depth / best_na.
nt = 100

# Start from +inf rather than an arbitrary large number, so that the first
# candidate is always accepted whatever the scale of the target is.
best_ma = float("inf")
best_depth = None
best_na = None

for depth in range(35, 45, 2):
    for na in range(15, 25, 2):

        forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
        forest.fit(X, y)

        y_pred = forest.predict(X_val1)
        mae_val1 = mean_absolute_error(y_val1, y_pred)

        # Strict comparison: on ties the earliest (smallest) setting wins.
        if mae_val1 < best_ma:
            best_ma = mae_val1
            best_depth = depth
            best_na = na

        print("na= ", na, " depth= ", depth, " mae= ", mae_val1)

print("best_na= ", best_na, " best_depth= ", best_depth, " best_ma= ", best_ma)

na=  15  depth=  35  mae=  392.9575211698765
na=  17  depth=  35  mae=  393.4805472584526
na=  19  depth=  35  mae=  395.2070347529328
na=  21  depth=  35  mae=  394.87881238365276
na=  23  depth=  35  mae=  394.5399960661282
na=  15  depth=  37  mae=  392.7001126232367
na=  17  depth=  37  mae=  392.9004105457858
na=  19  depth=  37  mae=  394.0109684714265
na=  21  depth=  37  mae=  393.9910319242007
na=  23  depth=  37  mae=  395.24315332547053
na=  15  depth=  39  mae=  392.2170056738749
na=  17  depth=  39  mae=  393.1242348756863
na=  19  depth=  39  mae=  393.495005715599
na=  21  depth=  39  mae=  393.96618393774804
na=  23  depth=  39  mae=  394.1184020831948
na=  15  depth=  41  mae=  391.875723497066
na=  17  depth=  41  mae=  393.5910562051734
na=  19  depth=  41  mae=  394.38242491745166
na=  21  depth=  41  mae=  393.8198681241312
na=  23  depth=  41  mae=  394.72285930969673
na=  15  depth=  43  mae=  393.49224503441854
na=  17  depth=  43  mae=  393.0707207825673
na=  1

Now we check whether removing any single feature yields a lower validation error.

In [6]:
# Display the column index of the training frame (identifier columns and the
# label included) as a reference for the feature-removal experiment below.
train.columns

Index(['StoreID', 'Date', 'IsHoliday', 'HasPromotions', 'StoreType',
       'NearestCompetitor', 'Region', 'NumberOfSales', 'Region_AreaKM2',
       'Region_GDP', 'Region_PopulationK', 'CloudCover', 'Max_Dew_PointC',
       'Max_Gust_SpeedKm_h', 'Max_Humidity', 'Max_Sea_Level_PressurehPa',
       'Max_TemperatureC', 'Max_VisibilityKm', 'Max_Wind_SpeedKm_h',
       'Mean_Dew_PointC', 'Mean_Humidity', 'Mean_Sea_Level_PressurehPa',
       'Mean_TemperatureC', 'Mean_VisibilityKm', 'Mean_Wind_SpeedKm_h',
       'Min_Dew_PointC', 'Min_Humidity', 'Min_Sea_Level_PressurehPa',
       'Min_TemperatureC', 'Min_VisibilitykM', 'Precipitationmm',
       'WindDirDegrees', 'AssortmentType_With Fish Department',
       'AssortmentType_With Non-Food Department', 'Rain', 'Snow', 'Fog',
       'Hail', 'Thunderstorm', 'IsSaturday', 'IsSunday', 'OrdinalDate',
       'WasOpenYesterday', 'IsOpenTomorrow', 'MeanMonthSales',
       'MeanStoreSales', 'StdStoreSales'],
      dtype='object')

In [7]:
# Leave-one-feature-out probe: for every input column, refit the model without
# that column (using the best max_depth / max_features found above) and record
# the MAE on the first validation split. The removal with the lowest MAE is
# reported at the end.
nt = 100
na = best_na
depth = best_depth

feature_names = list(X.columns)

feature_best = ''
mae_best = float("inf")  # +inf so the first candidate is always accepted

for feature in feature_names:
    # DataFrame.drop returns a new frame, so X and X_val1 are never modified
    # here and there is nothing to save or restore between iterations.
    X_f = X.drop([feature], axis=1)
    X_val_f = X_val1.drop([feature], axis=1)

    forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
    forest.fit(X_f, y)

    y_pred = forest.predict(X_val_f)
    mae = mean_absolute_error(y_val1, y_pred)

    print(feature, mae)

    if mae < mae_best:
        feature_best = feature
        mae_best = mae

print(feature_best, mae_best)

IsHoliday 394.04614160148236
HasPromotions 466.71342997348034
StoreType 397.6493085726903
NearestCompetitor 397.98747313305046
Region_AreaKM2 393.1670051081153
Region_GDP 393.4981575283256
Region_PopulationK 393.8225351874293
CloudCover 392.8573209373665
Max_Dew_PointC 394.4272645211077
Max_Gust_SpeedKm_h 393.4433704298983
Max_Humidity 392.8910008340378
Max_Sea_Level_PressurehPa 393.76407495518555
Max_TemperatureC 393.79952635121833
Max_VisibilityKm 393.8495152559683
Max_Wind_SpeedKm_h 393.02884426213876
Mean_Dew_PointC 393.6829461482509
Mean_Humidity 393.723647210618
Mean_Sea_Level_PressurehPa 393.3813708424867
Mean_TemperatureC 393.30082365847176
Mean_VisibilityKm 392.4378063724292
Mean_Wind_SpeedKm_h 393.8634596997554
Min_Dew_PointC 392.75327675812696
Min_Humidity 392.8219941450898
Min_Sea_Level_PressurehPa 393.5966943074798
Min_TemperatureC 393.2227286994512
Min_VisibilitykM 393.53479979055135
Precipitationmm 393.65978668800585
WindDirDegrees 393.8310924524156
AssortmentType_With F

No feature has to be removed, since the MAE does not decrease when any single feature is dropped.

Now we optimize the number of estimators

In [9]:
# Search over the number of trees with max_depth / max_features fixed to the
# best values found earlier; candidates are scored by MAE on the first
# validation split.
best_ma = float("inf")  # +inf so the first candidate is always accepted
best_nt = None
na = best_na
depth = best_depth

# Rebuild the feature matrices from the source frames so this cell does not
# depend on whatever the previous experiment left in X / X_val1.
X = train.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)
X_val1 = val.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)

for nt in range(50, 201, 25):

    forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
    forest.fit(X, y)

    y_pred = forest.predict(X_val1)
    mae_val1 = mean_absolute_error(y_val1, y_pred)

    # Strict comparison: on ties the smaller forest wins.
    if mae_val1 < best_ma:
        best_ma = mae_val1
        best_nt = nt

    print("nt= ", nt, " mae= ", mae_val1)

print("best_nt= ", best_nt, " best_ma= ", best_ma)

nt=  50  mae=  394.6453441452038
nt=  75  mae=  392.9002934538834
nt=  100  mae=  391.875723497066
nt=  125  mae=  391.70320399632544
nt=  150  mae=  391.34359623808
nt=  175  mae=  390.94026819713184
nt=  200  mae=  390.7971728092944
best_nt=  200  best_ma=  390.7971728092944


Now we predict the second validation set and compute the mean absolute error.

In [10]:
# Refit with the selected hyperparameters and evaluate on the second
# validation split, which took no part in the hyperparameter search.
# Use the value found by the n_estimators search instead of repeating it as a
# literal, so this cell stays in sync if the search is re-run.
nt = best_nt
forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
forest.fit(X, y)

y_pred_val2 = forest.predict(X_val2)
mae_val2 = mean_absolute_error(y_val2, y_pred_val2)

In [11]:
# Display the mean absolute error obtained on the second validation split.
mae_val2

585.8973912216871

Finally, we collect the predicted and the real values in a DataFrame and save it to a CSV file.

In [14]:
# Collect predictions next to the ground truth and write them to disk.
#
# Every column is taken positionally (.values). `test` and `y_val2` carry the
# index read from the CSV, whereas the predictions are a plain array in row
# order; assigning the Series into a frame with a fresh 0..n-1 index would
# align on index labels and could yield NaNs or rows shifted against the
# predictions whenever the CSV index is not exactly 0..n-1.
columns = ['Date', 'StoreID', 'RegionID', 'SalesPredicted', 'SalesReal']

result = pd.DataFrame(
    {
        'Date': test['Date'].values,
        'StoreID': test['StoreID'].values,
        'RegionID': test['Region'].values,
        'SalesPredicted': y_pred_val2,
        'SalesReal': y_val2.values,
    },
    columns=columns,
)

result.to_csv("results_et.csv")