In [9]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import datetime
#import xgboost as xgb

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.datasets import make_regression
from xgboost.sklearn import XGBRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from math import sqrt


%matplotlib inline
%config InlineBackend.figure_format = 'retina'

We load the training set and the validation set which have been already preprocessed

In [15]:
#Importing dataset
train = pd.read_csv("../dataset/train_rand.csv", index_col=0)
val1 = pd.read_csv("../dataset/val1_rand.csv", index_col=0)
val2 = pd.read_csv("../dataset/val2.csv", index_col=0)

In [16]:
train.OrdinalDate = train.OrdinalDate.map(lambda x: x % 365)
val1.OrdinalDate = val1.OrdinalDate.map(lambda x: x % 365)
val2.OrdinalDate = val2.OrdinalDate.map(lambda x: x % 365)

We prepare the training input samples and the labels for all the sets

In [17]:
X=train.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)
X_val1=val1.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)
X_val2=val2.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)

y=train['NumberOfSales']
y_val1=val1['NumberOfSales']
y_val2=val2['NumberOfSales']

We try to find good hyperparameter in xGBoost. First of all we try to find good values for max_depth and colsample_bytree 

In [22]:
index = 0

eta=0.3
min_child_weight=5
gamma=0
subsample=0.6
n_estimators=100

best_mae = 10000000

for depth in range(17,24,3):
    for col in {0.7,0.65,0.675}:
        xclas = XGBRegressor(subsample=subsample, eta=eta, max_depth=depth, 
                             colsample_bytree=col, nthread=4,
                             min_child_weight=min_child_weight, gamma=gamma,
                             n_estimators=n_estimators)  
        xclas.fit(X, y)   
        y_pred = xclas.predict(X_val1) 
        mae_val1=mean_absolute_error(y_val1,y_pred)

        if mae_val1 < best_mae:
            best_mae = mae_val1
            best_max_depth = depth
            best_colsample_by_tree = col

        print("max_depth= ", depth, " colsample_bytree= ", col, " mae= ", mae_val1)
        
print("best_mae=", best_mae, " best_max_depth=", best_max_depth, " best_colsample_by_tree=", best_colsample_by_tree  )

max_depth=  17  colsample_bytree=  0.675  mae=  348.195179779


KeyboardInterrupt: 

Now we check if removing some features we obtain a lower validation error

In [None]:
nt=100
col = 0.7
depth=17

feature_names = list(X.columns)

for feature in feature_names:
    
    X=train.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)
    X_val1=val1.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)
    X=X.drop([feature], axis=1)
    X_val1=X_val1.drop([feature], axis=1)
    
    xclas = XGBRegressor(subsample=subsample, eta=eta, max_depth=depth, 
                         colsample_bytree=col, nthread=4,
                         min_child_weight=min_child_weight, gamma=gamma,
                         n_estimators=n_estimators)  
    xclas.fit(X, y)   
    y_pred = xclas.predict(X_val1) 
    mae_val1=mean_absolute_error(y_val1,y_pred)
    print(feature, mae_val1)

Now we optimize the number of estimators

In [None]:
eta=0.3
min_child_weight=5
gamma=0
subsample=0.6
best_max_depth=17
best_colsample_bytree=0.7
best_mae = 10000000

X=train.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)
X_val1=val1.drop(['StoreID','Date', 'NumberOfSales', 'Region'], axis=1)

for n_estimators in range(175, 226, 10):
    
    xclas = XGBRegressor(subsample=subsample, eta=eta, max_depth=best_max_depth, 
                         colsample_bytree=best_colsample_bytree, nthread=4,
                         min_child_weight=min_child_weight, gamma=gamma,
                         n_estimators=n_estimators)  
    
    xclas.fit(X, y)   
    y_pred = xclas.predict(X_val1) 
    mae_val1=mean_absolute_error(y_val1,y_pred)

    if mae_val1 < best_mae:
        best_mae = mae_val1
        best_n_estimators = n_estimators

    print("n_estimators= ", n_estimators, " rmse=", mae_val1)
        
print("best_mae =", best_mae, " best_n_estimators = ", n_estimators)

Now we predict the second validation set and we compute the mean square error and the mean absolute error

In [29]:
xclas = XGBRegressor(subsample=0.6, eta=0.3, max_depth=17, 
                         colsample_bytree=0.7, nthread=4,
                         min_child_weight=5, gamma=0,
                         n_estimators=200)  

xclas.fit(X, y)  

y_pred_val2 = xclas.predict(X_val2) 
mae_val2=mean_absolute_error(y_val2,y_pred_val2)

print("mae_val2=",mae_val2)

mae_val2= 600.470504139


In [31]:
train.MeanMonthSales.unique()

array([ 4769.82013426,  4747.76400521,  5020.29140456,  4855.92867655,
        4748.95402713,  4841.38001729,  4568.84885641,  4706.29391511,
        4937.15531304,  6132.80332181,  4717.35655563,  4716.17788595])

In [None]:
y_pred_val2=y_pred_val2.reset_index(drop=True)
val2=val2.reset_index(drop=True)

At the end we save the predicted values in a dataframe

In [None]:
columns=['Date', 'StoreID', 'RegionID', 'SalesPredicted', 'SalesReal']
index=range(y_val2.shape[0])
result=pd.DataFrame(index=index,columns=columns)

result['Date']=val2['Date']
result['StoreID']=val2['StoreID']
result['RegionID']=val2['Region']
result['SalesPredicted']=y_pred_val2
result['SalesReal']=y_val2

result.to_csv("results.csv")