In [1]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

We load the training set and the two validation sets, which have already been preprocessed.

In [2]:
# Read the training split and both validation splits from disk in one pass.
train, val1, val2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/rand/train_rand_noout3.csv",
        "dataset/rand/train_val1_rand.csv",
        "dataset/rand/train_val2_rand.csv",
    )
)

We prepare the input samples and the labels for each of the three sets.

In [3]:
# Columns excluded from the model inputs: identifiers, the date, the target itself,
# and 'Unnamed: 0' (presumably a leftover index column from an earlier CSV export - TODO confirm).
non_feature_cols = ['StoreID', 'Date', 'NumberOfSales', 'Region', 'Unnamed: 0']

# Feature matrices; the training file carries one extra column ('Unnamed: 0.1') to discard.
X = train.drop(columns=non_feature_cols + ['Unnamed: 0.1'])
X_val1 = val1.drop(columns=non_feature_cols)
X_val2 = val2.drop(columns=non_feature_cols)

# Regression targets for each split.
y, y_val1, y_val2 = (split['NumberOfSales'] for split in (train, val1, val2))

We look for good hyperparameters for the Extremely Randomized Trees regressor (`ExtraTreesRegressor`). First, we search for good values of `max_depth` and `max_features`.

In [4]:
# Grid search over max_depth and max_features for ExtraTreesRegressor.
# Each candidate is fitted on (X, y) and scored by mean absolute error on the
# first validation set; the winning pair is left in best_depth / best_na.
nt = 100

# Start from +inf instead of an arbitrary large constant: the first candidate
# always becomes the incumbent, so best_depth / best_na are guaranteed to be
# bound when the summary line below is printed.
best_ma = float("inf")

for depth in range(35, 45, 2):
    for na in range(15, 25, 2):

        forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
        forest.fit(X, y)

        y_pred = forest.predict(X_val1)
        mae_val1 = mean_absolute_error(y_val1, y_pred)

        # Strict '<' keeps the earliest candidate when two scores tie.
        if mae_val1 < best_ma:
            best_ma = mae_val1
            best_depth = depth
            best_na = na

        print("na= ", na, " depth= ", depth, " mae= ", mae_val1)

print("best_na= ", best_na, " best_depth= ", best_depth, " best_ma= ", best_ma)

na=  15  depth=  35  mae=  382.74758148414526
na=  17  depth=  35  mae=  382.3938838421324
na=  19  depth=  35  mae=  382.71543965437496
na=  21  depth=  35  mae=  382.45078025818196
na=  23  depth=  35  mae=  382.27087371263565
na=  15  depth=  37  mae=  383.7711026686291
na=  17  depth=  37  mae=  382.12676173537875
na=  19  depth=  37  mae=  382.5523383456669
na=  21  depth=  37  mae=  382.21789490984906
na=  23  depth=  37  mae=  381.9982617692045
na=  15  depth=  39  mae=  382.234124666574
na=  17  depth=  39  mae=  382.23289550399204
na=  19  depth=  39  mae=  381.3113760725615
na=  21  depth=  39  mae=  382.1435051457042
na=  23  depth=  39  mae=  382.5049578511858
na=  15  depth=  41  mae=  382.98362671602666
na=  17  depth=  41  mae=  382.9317984771984
na=  19  depth=  41  mae=  382.092979433625
na=  21  depth=  41  mae=  382.0612857317109
na=  23  depth=  41  mae=  381.7750098555154
na=  15  depth=  43  mae=  382.3472753026699
na=  17  depth=  43  mae=  382.39098832803325
na=

Now we check whether removing any single feature yields a lower validation error.

In [5]:
# Show all column names of the training frame.
train.columns

Index(['Unnamed: 0', 'Unnamed: 0.1', 'StoreID', 'Date', 'IsHoliday',
       'HasPromotions', 'StoreType', 'NearestCompetitor', 'Region',
       'NumberOfSales', 'Region_AreaKM2', 'Region_GDP', 'Region_PopulationK',
       'CloudCover', 'Max_Dew_PointC', 'Max_Gust_SpeedKm_h', 'Max_Humidity',
       'Max_Sea_Level_PressurehPa', 'Max_TemperatureC', 'Max_VisibilityKm',
       'Max_Wind_SpeedKm_h', 'Mean_Dew_PointC', 'Mean_Humidity',
       'Mean_Sea_Level_PressurehPa', 'Mean_TemperatureC', 'Mean_VisibilityKm',
       'Mean_Wind_SpeedKm_h', 'Min_Dew_PointC', 'Min_Humidity',
       'Min_Sea_Level_PressurehPa', 'Min_TemperatureC', 'Min_VisibilitykM',
       'Precipitationmm', 'WindDirDegrees',
       'AssortmentType_With Fish Department',
       'AssortmentType_With Non-Food Department', 'Rain', 'Snow', 'Fog',
       'Hail', 'Thunderstorm', 'IsSaturday', 'IsSunday', 'OrdinalDate',
       'WasOpenYesterday', 'IsOpenTomorrow', 'MeanMonthSales',
       'MeanStoreSales', 'StdStoreSales'],
      d

In [9]:
# Leave-one-feature-out check: for every input column, refit the forest without
# that column and score it by MAE on the first validation set. The removal that
# gives the lowest MAE is kept in feature_best / mae_best.
nt = 100
na = best_na
depth = best_depth

feature_names = list(X.columns)

feature_best = ''
# +inf instead of an arbitrary large constant, so the first score always wins.
mae_best = float("inf")

for feature in feature_names:
    # drop() returns new frames, so X and X_val1 themselves are never modified
    # and there is nothing to save or restore around each iteration.
    X_f = X.drop(columns=[feature])
    X_val_f = X_val1.drop(columns=[feature])

    forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
    forest.fit(X_f, y)

    y_pred = forest.predict(X_val_f)
    mae = mean_absolute_error(y_val1, y_pred)

    print(feature, mae)

    if mae < mae_best:
        feature_best = feature
        mae_best = mae

# NOTE(review): mae_best is only the best score among the reduced feature sets;
# to decide whether a removal actually helps, compare it with the MAE obtained
# with all features at the same depth / max_features / n_estimators.
print(feature_best, mae_best)

IsHoliday 383.2106390048167
HasPromotions 441.421560992092
StoreType 386.50050645499346
NearestCompetitor 386.35500230610506
Region_AreaKM2 383.07445813521247
Region_GDP 382.3052043723652
Region_PopulationK 382.21952977049017
CloudCover 381.4321742952659
Max_Dew_PointC 381.6206864966225
Max_Gust_SpeedKm_h 382.95921967431866
Max_Humidity 381.3096846727071
Max_Sea_Level_PressurehPa 382.3474022228836
Max_TemperatureC 382.10538181906304
Max_VisibilityKm 381.73256768653243
Mean_Dew_PointC 381.7931124676994
Mean_Humidity 381.51640427151796
Mean_Sea_Level_PressurehPa 381.53636759749406
Mean_TemperatureC 382.3282120240661
Mean_VisibilityKm 381.53245735088785
Mean_Wind_SpeedKm_h 381.83862469617264
Min_Dew_PointC 381.7677101128991
Min_Humidity 382.30797464586044
Min_Sea_Level_PressurehPa 381.9452656171194
Min_TemperatureC 382.078137095699
Min_VisibilitykM 381.58014700419915
Precipitationmm 382.1030211246323
WindDirDegrees 382.30336044467623
AssortmentType_With Fish Department 381.59739299990116


No feature has to be removed, since the MAE does not decrease appreciably.

Now we optimize the number of estimators

In [11]:
# Search over the number of trees, holding max_depth and max_features at the
# best values found earlier. Each candidate is scored by MAE on the first
# validation set; the winner is left in best_nt / best_ma.

# +inf instead of an arbitrary large constant, so the first candidate always
# becomes the incumbent and best_nt is guaranteed to be bound afterwards.
best_ma = float("inf")
na = best_na
depth = best_depth

# Rebuild the feature matrices from the loaded frames so this cell starts from
# the full feature set regardless of what earlier cells did to X / X_val1.
X = train.drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Unnamed: 0', 'Unnamed: 0.1'])
X_val1 = val1.drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Unnamed: 0'])

# range() excludes its stop value, so the candidates are 50, 75, ..., 175.
# NOTE(review): if the best value lands on 175 the optimum may lie beyond the
# searched range - consider widening it.
for nt in range(50, 200, 25):

    forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
    forest.fit(X, y)

    y_pred = forest.predict(X_val1)
    mae_val1 = mean_absolute_error(y_val1, y_pred)

    if mae_val1 < best_ma:
        best_ma = mae_val1
        best_nt = nt

    print("nt= ", nt, " mae= ", mae_val1)

print("best_nt= ", best_nt, " best_ma= ", best_ma)

nt=  50  mae=  383.87710742530186
nt=  75  mae=  382.28112826067644
nt=  100  mae=  381.3113760725615
nt=  125  mae=  380.8787501284567
nt=  150  mae=  380.4206270358552
nt=  175  mae=  380.1149443087992
best_nt=  175  best_ma=  380.1149443087992


Now we predict the second validation set and compute the mean squared error and the mean absolute error.

In [12]:
# Fit the final forest on the training set and evaluate it on the second
# validation set, which was not used for any of the hyperparameter searches.
# NOTE(review): n_estimators is fixed at 200 here rather than taken from
# best_nt; 200 was not among the values tried in the n_estimators search.
nt = 200
forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
forest.fit(X, y)

y_pred_val2 = forest.predict(X_val2)

# Both error metrics announced in the accompanying text.
mae_val2 = mean_absolute_error(y_val2, y_pred_val2)
mse_val2 = mean_squared_error(y_val2, y_pred_val2)

In [13]:
# Display the mean absolute error obtained on the second validation set.
mae_val2

585.1358403156978

Finally, we save the predicted values in a DataFrame and write it to `results.csv`.

In [14]:
# Assemble one row per second-validation-set sample: identifying columns taken
# from val2, the model's prediction, and the true sales figure.
columns = ['Date', 'StoreID', 'RegionID', 'SalesPredicted', 'SalesReal']
index = range(y_val2.shape[0])

result = pd.DataFrame(
    {
        'Date': val2['Date'],
        'StoreID': val2['StoreID'],
        'RegionID': val2['Region'],
        'SalesPredicted': y_pred_val2,
        'SalesReal': y_val2,
    },
    index=index,
    columns=columns,
)

# Persist the comparison table next to the notebook.
result.to_csv("results.csv")