In [None]:
import warnings
# Silence every warning for the whole notebook.
# NOTE(review): this also hides deprecation notices from pandas / scikit-learn;
# consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import time
import datetime

# Import the estimator from the public package path. The former private module
# path (`sklearn.linear_model.stochastic_gradient`) is not part of the supported
# API and is unavailable in newer scikit-learn releases; the public path works
# on old and new versions alike.
from sklearn.linear_model import SGDRegressor

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

from math import sqrt

# Show every column when a wide DataFrame is displayed.
pd.options.display.max_columns = None

In [None]:
# Load the training data (the rows reserved for the second validation set are
# stored in a separate file).
# NOTE(review): the comparisons below treat 'Date' as 'YYYY-MM-DD' strings, whose
# lexicographic order matches chronological order — confirm against the CSV.
train_no_val2 = pd.read_csv("../dataset/train_no_val2.csv", index_col=0)

# Everything before November 2017 is the training set for the first validation round.
# .copy() yields an independent frame: later cells assign to its columns, which
# must not act on (or warn about) a slice of train_no_val2.
train_no_val1 = train_no_val2.loc[(train_no_val2['Date']<'2017-11-01')].copy()

# First validation set: November and December 2017, carved out of the training data.
val1 = train_no_val2.loc[((train_no_val2['Date']>='2017-11-01') & (train_no_val2['Date']<='2017-12-31'))].copy()
# Second validation set, read from its own file.
val2 = pd.read_csv("../dataset/val2.csv", index_col=0)

We save 'Date', 'StoreID' and 'Region' for the final csv with all the results.

In [None]:
# Reduce the ordinal date modulo 365 on every split, so the column becomes a
# value that repeats with a 365-day period.
train_no_val1['OrdinalDate'] = train_no_val1['OrdinalDate'].mod(365)
train_no_val2['OrdinalDate'] = train_no_val2['OrdinalDate'].mod(365)

val1['OrdinalDate'] = val1['OrdinalDate'].mod(365)
val2['OrdinalDate'] = val2['OrdinalDate'].mod(365)

In [None]:
# For each split: the target is 'NumberOfSales'; the feature matrix is every
# column except the identifiers ('StoreID', 'Date', 'Region') and the target.

# Training data for the first validation round.
y_train_no_val1 = train_no_val1.NumberOfSales
X_train_no_val1 = train_no_val1.drop(labels=['StoreID','Date', 'NumberOfSales', 'Region'], axis='columns')

# Training data for the second validation round.
y_train_no_val2 = train_no_val2.NumberOfSales
X_train_no_val2 = train_no_val2.drop(labels=['StoreID','Date', 'NumberOfSales', 'Region'], axis='columns')

# First validation set.
y_val1 = val1.NumberOfSales
X_val1 = val1.drop(labels=['StoreID','Date', 'NumberOfSales', 'Region'], axis='columns')

# Second validation set.
y_val2 = val2.NumberOfSales
X_val2 = val2.drop(labels=['StoreID','Date', 'NumberOfSales', 'Region'], axis='columns')

We min-max normalize the feature values ('OrdinalDate' was reduced modulo 365 above, approximating the day of the year).

In [None]:
def _min_max_scale(frame, reference):
    """Min-max scale ``frame`` column by column using the statistics of ``reference``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric features to rescale.
    reference : pandas.DataFrame
        Frame whose per-column minimum and maximum define the scaling.

    Returns
    -------
    pandas.DataFrame
        ``(frame - reference.min()) / (reference.max() - reference.min())``.
        A column that is constant in ``reference`` is divided by 1 instead of 0,
        so it yields finite values rather than NaN.
    """
    lowest = reference.min()
    value_range = (reference.max() - lowest).replace(0, 1)
    return (frame - lowest) / value_range


# A validation set must be scaled with the statistics of the training set the
# model is fitted on; scaling it with its own min/max would put the same raw
# value at different positions in train and validation.
# The validation frames are scaled first, while the training frames still hold
# their raw values.
X_val1 = _min_max_scale(X_val1, X_train_no_val1)
X_val2 = _min_max_scale(X_val2, X_train_no_val2)

X_train_no_val1 = _min_max_scale(X_train_no_val1, X_train_no_val1)
X_train_no_val2 = _min_max_scale(X_train_no_val2, X_train_no_val2)

In [None]:
# Show the first five rows of the training feature matrix.
X_train_no_val1.head(n=5)

We search for the best hyperparameters: one SGDRegressor is fitted per parameter combination and scored on the first validation set.

In [None]:
# One (initially empty) list per recorded quantity; the grid search appends one
# entry to every list for each parameter combination it evaluates.
dict_params = {
    column: []
    for column in ('loss_function', 'alpha', 'epsilon', 'learning_rate', 'error', 'mse', 'mae')
}

In [None]:
def errore(ypred, y, val):
    """Compute the region-averaged relative error of monthly sales predictions.

    Predictions and actual sales are summed per (store, region, month); the
    absolute difference of those monthly sums is the error of a store-month.
    For every region the summed errors are divided by the summed actual sales,
    and the function returns the mean of these ratios over all regions.

    Parameters
    ----------
    ypred : array-like
        Predicted sales, one value per row of ``val`` (taken positionally).
    y : pandas.Series
        Actual sales, one value per row of ``val``.
    val : pandas.DataFrame
        Frame providing the 'Date' ('%Y-%m-%d' strings), 'StoreID' and
        'Region' columns for the same rows.

    Returns
    -------
    float
        Mean over regions of ``sum(|predicted - real|) / sum(real)``.
    """
    # Drop the original indices so that all inputs line up positionally.
    val = val.reset_index(drop=True)
    y = y.reset_index(drop=True)

    result = pd.DataFrame({
        # Only the year-month of each date is needed for the aggregation.
        'Month': pd.to_datetime(val['Date'], format='%Y-%m-%d').dt.strftime('%Y-%m'),
        'StoreID': val['StoreID'],
        'RegionID': val['Region'],
        'SalesPredicted': np.asarray(ypred),
        'SalesReal': y,
    })

    # Sum only the two sales columns: asking groupby to sum a frame that still
    # contains a datetime column raises on recent pandas versions.
    monthly = (
        result
        .groupby(['StoreID', 'RegionID', 'Month'])[['SalesPredicted', 'SalesReal']]
        .sum()
        .reset_index()
    )
    monthly['SalesError'] = (monthly['SalesPredicted'] - monthly['SalesReal']).abs()

    per_region = monthly.groupby('RegionID')[['SalesError', 'SalesReal']].sum()
    region_error = per_region['SalesError'] / per_region['SalesReal']
    total_error = np.mean(region_error)
    return total_error

In [None]:
# Candidate loss functions and learning-rate schedules for the search.
loss_fun = ['squared_loss', 'epsilon_insensitive']
learning_r = ['constant', 'optimal', 'invscaling']

# Start from empty result lists so that re-running this cell does not append a
# second copy of the whole grid to dict_params.
for recorded in dict_params.values():
    del recorded[:]

start = time.time()

# Exhaustive search: 2 losses x 3 schedules x 100 alphas x 5 epsilons.
# NOTE(review): per the scikit-learn documentation `epsilon` only influences the
# epsilon-insensitive (and huber) losses, so for 'squared_loss' the five epsilon
# values presumably produce identical models — confirm before trimming the grid.
for l_f in loss_fun:
    for l_r in learning_r:
        for a in np.linspace(.000001, .00001 , 100):
            for e in np.linspace(.1, .01, 5):
                sgd = SGDRegressor(loss=l_f, alpha=a, epsilon=e, learning_rate=l_r, shuffle=True, random_state=1234567890)
                sgd.fit(X_train_no_val1, y_train_no_val1)
                y_pred = sgd.predict(X_val1)
                # 'mse' holds the square root of the mean squared error.
                mse = sqrt(mean_squared_error(y_val1, y_pred))
                mae = mean_absolute_error(y_val1, y_pred)
                err = errore(y_pred, y_val1, val1)

                # Record the grid point only after every metric was computed, so
                # all lists keep the same length even if an iteration fails.
                dict_params['loss_function'].append(l_f)
                dict_params['alpha'].append(a)
                dict_params['epsilon'].append(e)
                dict_params['learning_rate'].append(l_r)
                dict_params['mse'].append(mse)
                dict_params['mae'].append(mae)
                dict_params['error'].append(err)

                print(l_f, l_r, a, e, mse, mae, err)

# Elapsed wall-clock time of the search, in seconds (displayed as the cell output).
end = time.time() - start
end

In [None]:
# Tabulate the search results: one row per evaluated parameter combination.
params = pd.DataFrame.from_dict(dict_params)

We extract the best parameters, i.e. the combination with the lowest error.

In [None]:
# Take the first parameter combination that reaches the lowest error.
min_err = params.error.min()
best_params = params[(params['error'] == min_err)].iloc[0]

print("Minimum error: %f" %min_err)

loss_f = best_params['loss_function']
alp = best_params['alpha']
eps = best_params['epsilon']
learn_rate = best_params['learning_rate']

# %g keeps the significant digits of alpha: the searched values lie between
# 1e-6 and 1e-5, which %f would round to six decimals.
print("Loss function: %s; alpha: %g; epsilon: %g; learning rate: %s" %(loss_f, alp, eps, learn_rate))

We validate our model on the last two months of the dataset

In [None]:
# Build a fresh regressor from the selected hyperparameters.
sgd = SGDRegressor(
    loss=loss_f,
    alpha=alp,
    epsilon=eps,
    learning_rate=learn_rate,
    shuffle=True,
    random_state=1234567890,
)

# Time the fit on the larger training set together with the prediction on val2.
start = time.time()
sgd.fit(X_train_no_val2, y_train_no_val2)
y_pred2 = sgd.predict(X_val2)
end = time.time() - start

# Score the predictions; 'mse2' holds the square root of the mean squared error.
mae2 = mean_absolute_error(y_val2, y_pred2)
mse2 = sqrt(mean_squared_error(y_val2, y_pred2))
err = errore(y_pred2, y_val2, val2)

print("mse: %f, mae: %f, error: %f, time: %f" %(mse2, mae2, err, end))

In [None]:
# Skeleton of the result table. The identifier columns are read directly from
# val2, so this cell does not depend on variables that are defined nowhere in
# the notebook; the two sales lists are filled in by the next cell.
dict_val2 = {
    'Date' : val2['Date'].tolist(),
    'StoreID' : val2['StoreID'].tolist(),
    'RegionID' : val2['Region'].tolist(),
    'SalesPredicted': [],
    'SalesReal': [],
}

In [None]:
# Store the actual and the predicted sales of the second validation set.
dict_val2.update(
    SalesReal=list(y_val2.values),
    SalesPredicted=list(y_pred2),
)

In [None]:
# Turn the collected columns into the final result table.
res = pd.DataFrame.from_dict(dict_val2)

In [None]:
# Show the first 50 rows of the result table.
res.head(n=50)

In [None]:
# Write the result table to disk; the row index is written as the first column.
res.to_csv("../results/result_linear_regression.csv", index=True)