In [3]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

We load the training set and the two validation sets, all of which have already been preprocessed.

In [4]:
# Read the preprocessed training split and the two validation splits from disk.
train, val1, val2 = (
    pd.read_csv(csv_path)
    for csv_path in (
        "dataset/rand/train_rand_noout3.csv",
        "dataset/rand/train_val1_rand.csv",
        "dataset/rand/train_val2_rand.csv",
    )
)

We prepare the input samples (features) and the labels for the training set and both validation sets.

In [None]:
# Feature matrices: remove the store identifier, the date, the target, the
# region and the 'Unnamed: *' columns (presumably index columns left over
# from earlier CSV exports -- confirm) from each split.
X = train.drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Unnamed: 0', 'Unnamed: 0.1'])
X_val1 = val1.drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Unnamed: 0'])
X_val2 = val2.drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Unnamed: 0'])

# Targets: the 'NumberOfSales' column of each split.
y, y_val1, y_val2 = (split['NumberOfSales'] for split in (train, val1, val2))

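We look for good hyperparameters for the Extremely Randomized Trees regressor (`ExtraTreesRegressor`). First, we search for good values of `max_depth` and `max_features`, keeping the configuration with the lowest mean absolute error on the first validation set.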

In [16]:
# Exhaustive search over (max_depth, max_features) with a fixed forest size.
# The configuration with the lowest MAE on the first validation set wins.
nt = 100
best_ma = 1000000  # sentinel: any real MAE is expected to be smaller

for depth in range(18, 28, 2):
    for na in range(10, 17, 1):
        # fit() returns the estimator itself, so construction and training chain.
        forest = ExtraTreesRegressor(
            n_estimators=nt,
            max_depth=depth,
            max_features=na,
            random_state=0,
            n_jobs=-1,
        ).fit(X, y)

        y_pred = forest.predict(X_val1)
        mse_val1 = mean_squared_error(y_val1, y_pred)
        mae_val1 = mean_absolute_error(y_val1, y_pred)

        # Remember the best configuration seen so far (strict improvement only).
        if mae_val1 < best_ma:
            best_ma, best_depth, best_na = mae_val1, depth, na

        print("na= ", na, " depth= ", depth, " rmse= ", sqrt(mse_val1)," mae= ", mae_val1)

print("best_na= ", best_na, " best_depth= ", best_depth, " best_ma= ", best_ma)

12 0.027840053245100645 0.0195790631428


Now we check whether removing a single feature at a time yields a lower validation error.

In [13]:
# Leave-one-feature-out ablation: for every feature, retrain the forest with
# the best hyperparameters found so far on all remaining features and report
# the RMSE / MAE on the first validation set.
nt = 100
na = best_na
depth = best_depth

feature_names = list(X.columns)

for feature in feature_names:
    # Derive the reduced matrices from the prepared X / X_val1 under new names.
    # Overwriting X / X_val1 here would leave them missing the last feature
    # after the loop, and rebuilding them from frames that are not loaded in
    # this notebook would fail on a fresh kernel.
    # NOTE(review): if `train` / `val1` contain a 'NumberOfCustomers' column it
    # is currently part of X; presumably it should be excluded -- confirm.
    X_ablated = X.drop(columns=[feature])
    X_val1_ablated = X_val1.drop(columns=[feature])

    forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
    forest.fit(X_ablated, y)

    y_pred = forest.predict(X_val1_ablated)
    mse_val1 = mean_squared_error(y_val1, y_pred)
    mae_val1 = mean_absolute_error(y_val1, y_pred)
    print(feature, sqrt(mse_val1), mae_val1)

IsHoliday 0.027461465475176503 0.0192611161096
HasPromotions 0.04003863317247128 0.029766546966
StoreType 0.027824418374292902 0.0196920401725
NearestCompetitor 0.027584527895542416 0.0195318982325
Region_AreaKM2 0.027287162569294057 0.0192372364336
Region_GDP 0.027523689189593494 0.0194295830634
Region_PopulationK 0.027428748102158052 0.0193894669588
CloudCover 0.02733722499481307 0.0192063738651
Max_Dew_PointC 0.02734089359584064 0.0192392218557
Max_Gust_SpeedKm_h 0.02725703009834444 0.0191719622117
Max_Humidity 0.027307794024024164 0.0191720375717
Max_Sea_Level_PressurehPa 0.027238982352806453 0.0191068415829
Max_TemperatureC 0.027197144724387872 0.0190890521565
Max_VisibilityKm 0.027483600412522943 0.0193079902133
Max_Wind_SpeedKm_h 0.027304641339577194 0.0192076982416
Mean_Dew_PointC 0.027335722822490284 0.0192455729875
Mean_Humidity 0.027260259474371412 0.0191488758657
Mean_Sea_Level_PressurehPa 0.027213049351888646 0.0191415247333
Mean_TemperatureC 0.027269236731764512 0.0191657

We remove the worst features (those whose removal lowered the validation error) and recompute the validation error using the best hyperparameters.

In [None]:
# Features to discard, chosen from the leave-one-out results; empty means
# "keep every feature".
worst_features = []

# Rebuild the matrices from the loaded frames so the cell is idempotent and
# does not depend on frames that are never loaded in this notebook. The first
# drop removes the non-feature columns, the second the selected features.
X = (
    train
    .drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Unnamed: 0', 'Unnamed: 0.1'])
    .drop(columns=worst_features)
)
X_val1 = (
    val1
    .drop(columns=['StoreID', 'Date', 'NumberOfSales', 'Region', 'Unnamed: 0'])
    .drop(columns=worst_features)
)

# Retrain with the current hyperparameters and score on the first validation set.
forest = ExtraTreesRegressor(max_depth=depth, random_state=0, n_estimators=nt, max_features=na, n_jobs=-1)
forest.fit(X, y)

y_pred = forest.predict(X_val1)
mse_val1 = mean_squared_error(y_val1, y_pred)
mae_val1 = mean_absolute_error(y_val1, y_pred)
print("rmse= ",sqrt(mse_val1), " mae= ", mae_val1)

Now we optimize the number of estimators

In [None]:
# Tune the forest size with depth and max_features fixed at their best values;
# the size with the lowest MAE on the first validation set is kept.
best_ma = 1000000  # sentinel: any real MAE is expected to be smaller
na = best_na
depth = best_depth

for nt in range(50, 200, 25):
    # fit() returns the estimator itself, so construction and training chain.
    forest = ExtraTreesRegressor(
        n_estimators=nt,
        max_depth=depth,
        max_features=na,
        random_state=0,
        n_jobs=-1,
    ).fit(X, y)

    y_pred = forest.predict(X_val1)
    mse_val1 = mean_squared_error(y_val1, y_pred)
    mae_val1 = mean_absolute_error(y_val1, y_pred)

    # Strict improvement only: ties keep the smaller forest found earlier.
    if mae_val1 < best_ma:
        best_ma, best_nt = mae_val1, nt

    print("nt= ", nt, sqrt(mse_val1)," mae= ", mae_val1)

print("best_nt= ", best_nt, " best_ma= ", best_ma)

Now we predict the second validation set and compute the mean squared error and the mean absolute error.

In [None]:
# Final evaluation on the second validation set, which was not used for any
# tuning decision. best_depth, best_na and best_nt are the values selected by
# the hyperparameter search cells; to pin them by hand, assign them here
# before the model is built.
forest = ExtraTreesRegressor(
    max_depth=best_depth,
    random_state=0,
    n_estimators=best_nt,
    max_features=best_na,
    n_jobs=-1,
)
forest.fit(X, y)

# Select exactly the columns the model was trained on (in the same order), so
# the prediction still works when features have been removed from X.
X_val2_aligned = X_val2[X.columns]
y_pred_val2 = forest.predict(X_val2_aligned)

mse_val2 = mean_squared_error(y_val2, y_pred_val2)
mae_val2 = mean_absolute_error(y_val2, y_pred_val2)
print("rmse= ", sqrt(mse_val2), " mae= ", mae_val2)

Finally, we collect the predicted and the real sales in a dataframe and write it to a CSV file.

In [None]:
# Assemble one row per sample of the second validation set: identifying
# columns from val2, the model prediction and the true value side by side.
columns = ['Date', 'StoreID', 'RegionID', 'SalesPredicted', 'SalesReal']
index = range(len(y_val2))

# Start from an empty frame with the desired column order, then fill every
# column in one go (Series are aligned on the index, the array by position).
result = pd.DataFrame(index=index, columns=columns).assign(
    Date=val2['Date'],
    StoreID=val2['StoreID'],
    RegionID=val2['Region'],
    SalesPredicted=y_pred_val2,
    SalesReal=y_val2,
)

result.to_csv("results.csv")