In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import time

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.linear_model.stochastic_gradient import SGDRegressor
from sklearn.metrics import mean_squared_error

from math import sqrt

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [2]:
#Importing dataset
df = pd.read_csv("dataset/train_processed.csv")

In [3]:
df.shape

(501300, 49)

We normalize the data so that all values are between 0 and 1. So we first remove 'Date' which is the only non numerical attribute. Then we apply range normalization and at the end we add 'Date'. 

In [4]:
df_date=df['Date']
df_no_date=df.drop(['Date'], axis=1)
df_norm= (df_no_date - df_no_date.min()) / (df_no_date.max() - df_no_date.min())
df=df_norm
df['Date']=df_date
df.head(5)

Unnamed: 0.1,Unnamed: 0,StoreID,IsHoliday,IsOpen,HasPromotions,StoreType,NearestCompetitor,NumberOfCustomers,NumberOfSales,Region_AreaKM2,...,Thunderstorm,IsSaturday,IsSunday,WasOpenYesterday,IsOpenTomorrow,YesterdaySales,Previous3DaysSales,PreviousWeekSales,PreviousMonthSales,Date
0,0.0,0.0,0.0,1.0,1.0,0.666667,0.003281,0.3835,0.446943,0.291715,...,0.0,0.0,0.0,1.0,0.0,0.33989,0.363301,0.355788,0.372606,2016-03-31
1,2e-06,0.0,1.0,0.0,1.0,0.666667,0.003281,0.0,0.0,0.291715,...,0.0,0.0,0.0,1.0,1.0,0.446943,0.409861,0.384367,0.384941,2016-04-01
2,4e-06,0.0,0.0,1.0,0.0,0.666667,0.003281,0.385766,0.41774,0.291715,...,0.0,1.0,0.0,0.0,0.0,0.0,0.290735,0.326042,0.368885,2016-04-02
3,6e-06,0.0,0.0,0.0,0.0,0.666667,0.003281,0.0,0.0,0.291715,...,0.0,0.0,1.0,1.0,0.0,0.41774,0.319501,0.36392,0.377885,2016-04-03
4,8e-06,0.0,1.0,0.0,0.0,0.666667,0.003281,0.0,0.0,0.291715,...,0.0,0.0,0.0,0.0,1.0,0.0,0.154355,0.36392,0.362002,2016-04-04


We split the dataset in 4 parts:
* df_train: the training set (all the rows before 2017-09-01). We use it to train our model.
* df_val1: the first validation set (all the rows between 2017-09-01 and 2017-11-01). We use it to tune the hyperparameter of our model.
* df_val2: the second validation set (all the rows between 2017-11-01 and 2018-01-01). We use it to select the best model.
* df_val3: the third validation set (all the rows after 2018-01-01). We use it to check that our final model works properly.

In [5]:
df_val3=df.loc[(df['Date']>='2018-01-01')]
df_val3_rest=df.loc[(df['Date']<'2018-01-01')]
df_val2=df_val3_rest.loc[(df_val3_rest['Date']>='2017-11-01')]
df_val2_rest=df.loc[(df['Date']<'2017-11-01')]
df_val1=df_val2_rest.loc[(df_val2_rest['Date']>='2017-09-01')]
df_train=df.loc[(df['Date']<'2017-09-01')]

We remove 'Date' from our set because it is only used as index

In [6]:
df_train_clear=df_train.drop(['Date'], axis=1)
df_val1_clear=df_val1.drop(['Date'], axis=1)
df_val2_clear=df_val2.drop(['Date'], axis=1)
df_val3_clear=df_val3.drop(['Date'], axis=1)

We try to analyse only the first store for a faster overview

In [7]:
df_store=df_train_clear.loc[(df_train_clear['StoreID']==0.0)]
df_val1_store=df_val1_clear.loc[(df_val1_clear['StoreID']==0.0)]
df_val2_store=df_val2_clear.loc[(df_val2_clear['StoreID']==0.0)]
df_val3_store=df_val3_clear.loc[(df_val3_clear['StoreID']==0.0)]

In [8]:
y=df_store['NumberOfSales']
y_val1=df_val1_store['NumberOfSales']
y_val2=df_val2_store['NumberOfSales']
y_val3=df_val3_store['NumberOfSales']

In [9]:
X=df_store.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val1=df_val1_store.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val2=df_val2_store.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val3=df_val3_store.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)

In [13]:
columns=['var_threshold','num_feat','mse']
index=range(1,200000)
hp=pd.DataFrame(index=index,columns=columns)

In [14]:
lasso = LassoCV(n_jobs=-1, random_state=12345)

In [15]:
i = 1
start = time.time()

for v in np.arange(0.00001, 0.003, 0.0001):
    sfm = SelectFromModel(lasso, threshold=v)
    sfm.fit(X, y)
    n_features = sfm.transform(X).shape[1]
    if n_features == 0:
        hp.at[i,'var_threshold']= v
        hp.at[i,'num_feat'] = n_features
        hp.at[i, 'mse'] = 99999999999999999
        i+=1
    else:
        mask = sfm.get_support()
        X_mask = X.loc[:, mask]
        X_val1_mask = X_val1.loc[:, mask]
        lasso.fit(X_mask, y)
        y_pred=lasso.predict(X_val1_mask)
        hp.at[i,'var_threshold']= v
        hp.at[i,'num_feat'] = n_features
        hp.at[i,'mse'] = sqrt(mean_squared_error(y_val1,y_pred))
        i+=1
            
end = time.time() - start
end

6.755202054977417

In [16]:
hp.dropna()

Unnamed: 0,var_threshold,num_feat,mse
1,1e-05,16,0.0391194
2,0.00011,16,0.0391194
3,0.00021,16,0.0391194
4,0.00031,16,0.0391194
5,0.00041,16,0.0391194
6,0.00051,15,0.039182
7,0.00061,15,0.039182
8,0.00071,15,0.039182
9,0.00081,15,0.039182
10,0.00091,15,0.039182


Now, we can try to work on the whole dataset.
    

In [27]:
X=df_train_clear.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val1=df_val1.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val2=df_val2.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)
X_val3=df_val3.drop(['StoreID','NumberOfSales','NumberOfCustomers','YesterdaySales','Previous3DaysSales','PreviousWeekSales','PreviousMonthSales'], axis=1)

In [28]:
y=df_train_clear['NumberOfSales']
y_val1=df_val1_clear['NumberOfSales']
y_val2=df_val2_clear['NumberOfSales']
y_val3=df_val3_clear['NumberOfSales']

In [35]:
dict_df = {'var_threshold' : [],'num_feat' : [],'mse' : []}

In [45]:
start = time.time()

for v in np.arange(0.00001, 0.03, 0.0001):
    sfm = SelectFromModel(lasso, threshold=v)
    sfm.fit(X, y)
    n_features = sfm.transform(X).shape[1]
    if n_features == 0:
        dict_df['var_threshold'].append(v)
        dict_df['num_feat'].append(n_features)
        dict_df['mse'].append(99999999999999999)
    else:
        mask = sfm.get_support()
        X_mask = X.loc[:, mask]
        X_val1_mask = X_val1.loc[:, mask]
        lasso.fit(X_mask, y)
        y_pred=lasso.predict(X_val1_mask)
        dict_df['var_threshold'].append(v)
        dict_df['num_feat'].append(n_features)
        dict_df['mse'].append(sqrt(mean_squared_error(y_val1,y_pred)))
            
end = time.time() - start
end

3221.8681728839874

In [46]:
hp_tot = pd.DataFrame(dict_df)

In [47]:
hp_tot.describe()

Unnamed: 0,var_threshold,num_feat,mse
count,330.0,330.0,330.0
mean,0.013733,17.112121,0.063919
std,0.009141,8.149599,0.000342
min,1e-05,9.0,0.063566
25%,0.005235,9.0,0.063635
50%,0.01346,14.0,0.063778
75%,0.021685,22.0,0.064345
max,0.02991,34.0,0.064504


In [48]:
hp_tot.to_csv("hp_tot.csv")