In [None]:
#Importing library

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import datetime

from scipy import stats
from scipy.stats import skew
from scipy.stats import norm
from scipy.stats.stats import pearsonr
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from math import sqrt
from sklearn.feature_selection import SelectFromModel

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

Importing all the preprocessed datasets

In [None]:
# Load the preprocessed datasets (paths are relative to the notebook's location).
train_no_val2 = pd.read_csv("../dataset/processed1/train_no_val2.csv")
train_complete = pd.read_csv("../dataset/processed1/train.csv")
val2 = pd.read_csv("../dataset/processed1/val2.csv")
test = pd.read_csv("../dataset/processed1/test.csv")

# Split train_no_val2 into an earlier training part and a first validation
# window covering November-December 2017.
# 'Date' is compared as text, which matches chronological order only for
# ISO 'YYYY-MM-DD' strings -- presumably the stored format; verify against the CSV.
# .copy() turns both pieces into independent frames, so the column assignments
# performed on them later do not raise SettingWithCopyWarning.
train_no_val1 = train_no_val2.loc[train_no_val2['Date'] < '2017-11-01'].copy()
val1 = train_no_val2.loc[
    (train_no_val2['Date'] >= '2017-11-01') & (train_no_val2['Date'] <= '2017-12-31')
].copy()

Modifying the date: `OrdinalDate` is reduced modulo 365 in every split

In [None]:
# Reduce OrdinalDate modulo 365 on every split.
# NOTE(review): modulo 365 ignores leap days, so the result is only an
# approximate position within the year -- confirm that this is the intent.
# The vectorised `%` replaces a per-element Python lambda, and item assignment
# replaces attribute assignment (which only works for already-existing columns).
for frame in (train_no_val1, train_no_val2, train_complete, val1, val2, test):
    frame['OrdinalDate'] = frame['OrdinalDate'] % 365

Preparing the train, validation and test sets

In [None]:
# Column holding the regression target.
TARGET_COLUMN = 'NumberOfSales'
# Columns removed from every feature matrix: identifiers, the raw date and the target.
NON_FEATURE_COLUMNS = ['StoreID', 'Date', 'NumberOfSales', 'Region']

# Targets for the three training variants and the two validation windows.
y_train_no_val1 = train_no_val1[TARGET_COLUMN]
y_train_no_val2 = train_no_val2[TARGET_COLUMN]
y_train_complete = train_complete[TARGET_COLUMN]
y_val1 = val1[TARGET_COLUMN]
y_val2 = val2[TARGET_COLUMN]

# Feature matrices: the same columns are dropped from every split so that all
# of them expose an identical set of features.
X_train_no_val1 = train_no_val1.drop(columns=NON_FEATURE_COLUMNS)
X_train_no_val2 = train_no_val2.drop(columns=NON_FEATURE_COLUMNS)
X_train_complete = train_complete.drop(columns=NON_FEATURE_COLUMNS)
X_val1 = val1.drop(columns=NON_FEATURE_COLUMNS)
X_val2 = val2.drop(columns=NON_FEATURE_COLUMNS)
X_test = test.drop(columns=NON_FEATURE_COLUMNS)

Selecting the best features

In [None]:
# Fit a forest on the earliest training split; it is used here to rank features.
forest = RandomForestRegressor(
    n_estimators=100,
    max_depth=40,
    max_features=21,
    random_state=0,
    n_jobs=-1,
)
forest.fit(X_train_no_val1, y_train_no_val1)

importances = forest.feature_importances_
# Spread of each feature's importance across the individual trees (plotted as error bars).
per_tree_importances = np.array([tree.feature_importances_ for tree in forest.estimators_])
std = per_tree_importances.std(axis=0)
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

n_features = X_train_no_val1.shape[1]
feature_names = X_train_no_val1.columns

# Print the feature ranking
print("Feature ranking:")
for rank, idx in enumerate(indices, start=1):
    print("%d. feature %d %s (%f)" % (rank, idx, feature_names[idx], importances[idx]))

# Plot the feature importances of the forest
positions = range(n_features)
plt.figure(num=None, figsize=(14, 10), dpi=80, facecolor='w', edgecolor='k')
plt.title("Feature importances")
plt.bar(positions, importances[indices], color="r", yerr=std[indices], align="center")
plt.xticks(positions, feature_names[indices], rotation=90)
plt.xlim([-1, n_features])
plt.show()

In [None]:
# Keep only the features whose importance in the already-fitted forest reaches
# this threshold (prefit=True: the selector reuses the forest without refitting).
IMPORTANCE_THRESHOLD = 0.0065
feature_selection_model = SelectFromModel(forest, threshold=IMPORTANCE_THRESHOLD, prefit=True)

# Apply the same column selection to every split.
# NOTE(review): the inputs are overwritten with their reduced versions, so this
# cell is not meant to be executed twice on the same kernel state.
(X_train_no_val1, X_train_no_val2, X_train_complete,
 X_val1, X_val2, X_test) = [
    feature_selection_model.transform(features)
    for features in (X_train_no_val1, X_train_no_val2, X_train_complete,
                     X_val1, X_val2, X_test)
]

Compute the model for val2

Function used to compute the error

In [None]:
def calcola_errore(ypred, y, val):
    """Compute the mean region-level relative error of monthly sales predictions.

    Predicted and real sales are summed per (store, region, month); the absolute
    difference between those two monthly totals is the store's monthly error.
    For every region the errors are summed and divided by the region's total
    real sales, and the plain mean of those per-region ratios is returned.

    Parameters
    ----------
    ypred : array-like
        Predicted sales, one value per row of ``val``, in the same row order.
    y : pandas.Series
        Real sales for the same rows.
    val : pandas.DataFrame
        Frame providing the 'Date' ('%Y-%m-%d' strings), 'StoreID' and
        'Region' columns for those rows.

    Returns
    -------
    float
        Mean over regions of sum(|predicted - real|) / sum(real).

    Raises
    ------
    ValueError
        If ``ypred``, ``y`` and ``val`` do not have the same number of rows.
    """
    val = val.reset_index(drop=True)
    y = y.reset_index(drop=True)
    # Predictions are matched to rows by position; converting to a bare array
    # prevents index alignment from scrambling them when a Series is passed.
    predicted = np.asarray(ypred).ravel()
    if not (len(predicted) == len(y) == len(val)):
        raise ValueError("ypred, y and val must have the same number of rows")

    result = pd.DataFrame({
        'Month': pd.to_datetime(val['Date'], format='%Y-%m-%d').dt.strftime('%Y-%m'),
        'StoreID': val['StoreID'],
        'RegionID': val['Region'],
        'SalesPredicted': predicted,
        'SalesReal': y,
    })

    # Aggregate only the two sales columns: summing every column (as a bare
    # .sum() would) fails on datetime data in pandas >= 2.0.
    monthly = (
        result.groupby(['StoreID', 'RegionID', 'Month'])[['SalesPredicted', 'SalesReal']]
        .sum()
        .reset_index()
    )
    monthly['SalesError'] = (monthly['SalesPredicted'] - monthly['SalesReal']).abs()

    regional = monthly.groupby('RegionID')[['SalesError', 'SalesReal']].sum()
    region_error = regional['SalesError'] / regional['SalesReal']
    return region_error.mean()

In [None]:
# Best hyper-parameters for the final random forest.
# NOTE(review): presumably selected by a search that is not part of this
# notebook section -- confirm where the values come from.
nt = 250     # number of trees (passed as n_estimators)
depth = 15   # maximum tree depth (passed as max_depth)
na = 4       # features considered per split (passed as max_features)

In [None]:
# Train on the data that excludes the val2 window, using the chosen hyper-parameters.
forest = RandomForestRegressor(
    n_estimators=nt,
    max_depth=depth,
    max_features=na,
    random_state=0,
    n_jobs=-1,
)
forest.fit(X_train_no_val2, y_train_no_val2)
y_pred_val2 = forest.predict(X_val2)

# Two views of the validation quality: the region-level monthly error and the
# plain per-row mean absolute error.
e = calcola_errore(y_pred_val2, y_val2, val2)
mae = mean_absolute_error(y_val2, y_pred_val2)

print(mae, e)

In [None]:
# Put val2 and its target on a clean 0..n-1 index so that they line up row by
# row with the positional prediction array.
val2 = val2.reset_index(drop=True)
y_val2 = y_val2.reset_index(drop=True)

columns = ['Date', 'StoreID', 'RegionID', 'SalesPredicted', 'SalesReal']
index = range(y_val2.shape[0])
result_val2 = pd.DataFrame(index=index, columns=columns)

# Identifying columns come straight from val2 ('Region' is exposed as 'RegionID').
for target_name, source_name in (('Date', 'Date'), ('StoreID', 'StoreID'), ('RegionID', 'Region')):
    result_val2[target_name] = val2[source_name]

# Predictions next to the ground truth.
result_val2['SalesPredicted'] = y_pred_val2
result_val2['SalesReal'] = y_val2

def transform_date(x):
    """Parse a '%Y-%m-%d' date string into a ``datetime.datetime`` object."""
    return datetime.datetime.strptime(x, '%Y-%m-%d')

# Replace the textual dates with parsed datetime values, one element at a time.
result_val2['Date'] = result_val2['Date'].apply(transform_date)

In [None]:
# Running totals of predicted sales, keyed by (StoreID, month number).
d = {}

def estimate_sales(x):
    """Add one row's predicted sales to the module-level totals in ``d``.

    ``x`` must provide 'StoreID', 'Date' (an object with a ``month``
    attribute) and 'SalesPredicted'. Nothing is returned; the function works
    purely through its side effect on ``d``.
    """
    key = (x['StoreID'], x['Date'].month)
    amount = x['SalesPredicted']
    if key not in d.keys():
        # First row seen for this store and month: start the total.
        d[key] = amount
        return
    d[key] += amount
        
# Total predicted sales per (store, month number) for the val2 window.
# A grouped sum replaces the earlier row-wise apply() whose callback mutated a
# global dict: DataFrame.apply does not promise to call the function exactly
# once per row (older pandas releases evaluated the first row twice), which
# would double-count that row's sales.
# sort=False keeps the groups in order of first appearance, as the dict did.
# NOTE(review): the key is the month number only, so the same month of two
# different years would be merged -- fine while the window spans under a year.
final_result_val2 = (
    result_val2
    .assign(Month=result_val2['Date'].dt.month)
    .groupby(['StoreID', 'Month'], sort=False)['SalesPredicted']
    .sum()
    .rename('NumberOfSales')
    .reset_index()
)

Compute the model for the test set

In [None]:
# Refit with the same hyper-parameters on the complete training data, then
# predict the test rows.
forest = RandomForestRegressor(
    n_estimators=nt,
    max_depth=depth,
    max_features=na,
    random_state=0,
    n_jobs=-1,
)
y_pred_test = forest.fit(X_train_complete, y_train_complete).predict(X_test)

In [None]:
# Put the test frame on a clean 0..n-1 index so that it lines up row by row
# with the positional prediction array.
test = test.reset_index(drop=True)

columns = ['Date', 'StoreID', 'RegionID', 'SalesPredicted', 'SalesReal']
index = range(test.shape[0])
result_test = pd.DataFrame(index=index, columns=columns)

# Identifying columns come straight from test ('Region' is exposed as 'RegionID').
# 'SalesReal' is never assigned here and keeps the empty values it was created with.
for target_name, source_name in (('Date', 'Date'), ('StoreID', 'StoreID'), ('RegionID', 'Region')):
    result_test[target_name] = test[source_name]
result_test['SalesPredicted'] = y_pred_test

# Replace the textual dates with parsed datetime values.
parsed_dates = result_test['Date'].map(transform_date)
result_test['Date'] = parsed_dates

In [None]:
# Total predicted sales per (store, month number) for the test period.
# A grouped sum replaces the earlier row-wise apply() whose callback mutated a
# global dict: DataFrame.apply does not promise to call the function exactly
# once per row (older pandas releases evaluated the first row twice), which
# would double-count that row's sales.
# sort=False keeps the groups in order of first appearance, as the dict did.
# NOTE(review): the key is the month number only, so the same month of two
# different years would be merged -- fine while the period spans under a year.
final_result_test = (
    result_test
    .assign(Month=result_test['Date'].dt.month)
    .groupby(['StoreID', 'Month'], sort=False)['SalesPredicted']
    .sum()
    .rename('NumberOfSales')
    .reset_index()
)

In [None]:
# Write the per-store monthly predictions for the test period to disk.
# index=True is the pandas default, stated explicitly here: the row index is
# written as the first, unnamed column.
# NOTE(review): confirm that whoever consumes the file expects that extra column.
OUTPUT_PATH = "final_result.csv"
final_result_test.to_csv(OUTPUT_PATH, index=True)