In [9]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
#import xgboost as xgb

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.datasets import make_regression
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

We load the training set and the validation set which have been already preprocessed

In [27]:
#Importing dataset
train = pd.read_csv("dataset/rand/train_rand_noout3.csv")
val1 = pd.read_csv("dataset/rand/train_val1_rand.csv")
val2 = pd.read_csv("dataset/rand/train_val2_rand.csv")

We prepare the training input samples and the labels for all the sets

In [28]:
X=train.drop(['StoreID','Date', 'NumberOfSales', 'Region','Unnamed: 0', 'Unnamed: 0.1'], axis=1)
X_val1=val1.drop(['StoreID','Date', 'NumberOfSales', 'Region','Unnamed: 0'], axis=1)
X_val2=val2.drop(['StoreID','Date', 'NumberOfSales', 'Region','Unnamed: 0'], axis=1)

y=train['NumberOfSales']
y_val1=val1['NumberOfSales']
y_val2=val2['NumberOfSales']

We try to find good hyperparameter in xGBoost. First of all we try to find good values for max_depth and colsample_bytree 

In [15]:
index = 0

eta=0.3
min_child_weight=5
gamma=0
subsample=0.6
n_estimators=100

best_mae = 10000000

for depth in range(17,24,3):
    for col in {0.7,0.65,0.675}:
        xclas = XGBRegressor(subsample=subsample, eta=eta, max_depth=depth, 
                             colsample_bytree=col, nthread=4,
                             min_child_weight=min_child_weight, gamma=gamma,
                             n_estimators=n_estimators)  
        xclas.fit(X, y)   
        y_pred = xclas.predict(X_val1) 
        mae_val1=mean_absolute_error(y_val1,y_pred)

        if mae_val1 < best_mae:
            best_mae = mae_val1
            best_max_depth = max_depth
            best_colsample_by_tree = colsample_by_tree

        print("max_depth= ", depth, " colsample_bytree= ", col, " mae= ", mae_val1)
        
print("best_mae=", best_mae, " best_max_depth=", best_max_depth, " best_colsample_by_tree=", best_colsample_by_tree  )

max_depth=  17  colsample_bytree=  0.675  rmse=  514.633774552173  mae=  335.672759794
max_depth=  17  colsample_bytree=  0.65  rmse=  513.7299575563651  mae=  335.632454715
max_depth=  17  colsample_bytree=  0.7  rmse=  514.6278905577572  mae=  335.107118267
best_mae= 335.107118267  best_max_depth= 12  best_colsample_by_tree= 0.9


Now we check if removing some features we obtain a lower validation error

In [17]:
nt=100
col = 0.7
depth=17

feature_names = list(X.columns)

for feature in feature_names:
    
    X=train.drop(['StoreID','Date', 'NumberOfSales', 'Region','Unnamed: 0', 'Unnamed: 0.1'], axis=1)
    X_val1=val.drop(['StoreID','Date', 'NumberOfSales', 'Region','Unnamed: 0'], axis=1)
    X=X.drop([feature], axis=1)
    X_val1=X_val1.drop([feature], axis=1)
    
    xclas = XGBRegressor(subsample=subsample, eta=eta, max_depth=depth, 
                         colsample_bytree=col, nthread=4,
                         min_child_weight=min_child_weight, gamma=gamma,
                         n_estimators=n_estimators)  
    xclas.fit(X, y)   
    y_pred = xclas.predict(X_val1) 
    mae_val1=mean_absolute_error(y_val1,y_pred)
    print(feature, mae_val1)

IsHoliday 520.2090209326052 336.502975056
HasPromotions 556.9820967945254 365.496848259
StoreType 519.6220065073798 341.574411116
NearestCompetitor 517.8847470841648 339.763980609
Region_AreaKM2 515.2034300479282 336.61274969
Region_GDP 513.5311009343626 336.925736655
Region_PopulationK 514.061612857366 336.667751727
CloudCover 513.3580694252995 336.021242876
Max_Dew_PointC 512.6506558231049 336.009807515
Max_Gust_SpeedKm_h 513.1313184914924 335.91962265
Max_Humidity 513.453585679261 336.200499994
Max_Sea_Level_PressurehPa 513.6683648970707 335.954085276
Max_TemperatureC 514.8713276014958 336.760992984
Max_VisibilityKm 513.3149456386818 335.456215553
Max_Wind_SpeedKm_h 512.9964863530482 335.536146759
Mean_Dew_PointC 513.864458185659 336.253077594
Mean_Humidity 515.1181959377101 336.625094597
Mean_Sea_Level_PressurehPa 512.8867227279862 336.315754529
Mean_TemperatureC 512.3679006390973 336.141213256
Mean_VisibilityKm 513.506044481011 336.350260857
Mean_Wind_SpeedKm_h 513.0731087676451 3

Now we optimize the number of estimators

In [24]:
eta=0.3
min_child_weight=5
gamma=0
subsample=0.6
best_max_depth=17
best_colsample_bytree=0.7
best_mae = 10000000

X=train.drop(['StoreID','Date', 'NumberOfSales', 'Region','Unnamed: 0', 'Unnamed: 0.1'], axis=1)
X_val1=val.drop(['StoreID','Date', 'NumberOfSales', 'Region','Unnamed: 0'], axis=1)

for n_estimators in range(175, 226, 10):
    
    xclas = XGBRegressor(subsample=subsample, eta=eta, max_depth=best_max_depth, 
                         colsample_bytree=best_colsample_bytree, nthread=4,
                         min_child_weight=min_child_weight, gamma=gamma,
                         n_estimators=n_estimators)  
    
    xclas.fit(X, y)   
    y_pred = xclas.predict(X_val1) 
    mae_val1=mean_absolute_error(y_val1,y_pred)

    if mae_val1 < best_mae:
        best_mae = mae_val1
        best_n_estimators = n_estimators

    print("n_estimators= ", n_estimators, " rmse=", mae_val1)
        
print("best_mae =", best_mae, " best_n_estimators = ", n_estimators)

n_estimators=  175  rmse= 509.0978732254196 331.274821861
n_estimators=  185  rmse= 508.7408707211783 331.297567501
n_estimators=  195  rmse= 508.63967319446175 331.22868695
n_estimators=  205  rmse= 508.71005515048637 331.314007034


KeyboardInterrupt: 

Now we predict the second validation set and we compute the mean square error and the mean absolute error

In [29]:
best_depth = 17
best_colsample = 0.7
best_n_estimators = 200
eta=0.3
min_child_weight=5
gamma=0
subsample=0.6

xclas = XGBRegressor(subsample=subsample, eta=eta, max_depth=best_depth, 
                         colsample_bytree=best_colsample, nthread=4,
                         min_child_weight=min_child_weight, gamma=gamma,
                         n_estimators=best_n_estimators)  

xclas.fit(X, y)   
y_pred_val2 = xclas.predict(X_val2) 
mae_val2=mean_absolute_error(y_val2,y_pred_val2)
print("mae_val2=",mae_val2)

mae_val2= 558.791235742


At the end we save the predicted values in a dataframe

In [30]:
columns=['Date', 'StoreID', 'RegionID', 'SalesPredicted', 'SalesReal']
index=range(y_val2.shape[0])
result=pd.DataFrame(index=index,columns=columns)

result['Date']=val2['Date']
result['StoreID']=val2['StoreID']
result['RegionID']=val2['Region']
result['SalesPredicted']=y_pred_val2
result['SalesReal']=y_val2

result.to_csv("results.csv")