In [1]:
import glob
from os import getcwd
import os
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt


In [2]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression, f_regression
from scipy.stats import spearmanr
from sklearn.linear_model import Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV


In [3]:
# Make the project root the working directory so the relative "data/..."
# paths used throughout the notebook resolve. This assumes the notebook is
# started one level below that root. The guard keeps the cell idempotent: an
# unconditional chdir to the parent would climb one directory higher on every
# re-run and break all later relative paths.
train_pkl_path = "data/full_features/traindt_not_tf.pkl"
test_pkl_path = "data/full_features/testdt_not_tf.pkl"
if os.path.exists(train_pkl_path):
    # Already at the root (e.g. this cell was run before in this kernel).
    path = getcwd()
else:
    path = os.path.dirname(getcwd())
    os.chdir(path)

# NOTE: pickle.load can execute arbitrary code while deserializing; only load
# files that were produced by this project's own pipeline.
with open(train_pkl_path, 'rb') as f:
    xtrain, ytrain = pickle.load(file=f)
with open(test_pkl_path, 'rb') as f:
    xtest, ytest = pickle.load(file=f)


In [4]:
def normalize(train, test):
    """Standardize every column except the sin/cos ones, for train and test.

    Columns whose name matches the regex ``'sin|cos'`` are passed through
    unchanged. All remaining columns are transformed by a ``StandardScaler``
    that is fitted on ``train`` only and then applied to ``test``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        ``test`` must contain every column of ``train``; the column split is
        derived from ``train`` alone.

    Returns
    -------
    tuple of numpy.ndarray
        ``(xtrain_tf, xtest_tf)``. In both arrays the scaled columns come
        first and the untouched sin/cos columns are appended after them, so
        the column order can differ from the input frames.
    """
    is_sincos = train.columns.str.contains('sin|cos')
    passthrough_cols = train.columns[is_sincos]
    to_scale_cols = list(train.columns[~is_sincos])

    scaler = StandardScaler()
    # Fit on the training split only; the test split reuses those statistics.
    train_scaled = scaler.fit_transform(train.loc[:, to_scale_cols])
    test_scaled = scaler.transform(test.loc[:, to_scale_cols])

    xtrain_tf = np.hstack((train_scaled, np.array(train.loc[:, passthrough_cols])))
    xtest_tf = np.hstack((test_scaled, np.array(test.loc[:, passthrough_cols])))
    return xtrain_tf, xtest_tf




### Full features


In [5]:
# Scale the complete feature set and pair each matrix with its flattened target.
full_xtrain_tf, full_xtest_tf = normalize(xtrain, xtest)
full_traindt = (full_xtrain_tf, np.ravel(ytrain, order='C'))
full_testdt = (full_xtest_tf, np.ravel(ytest, order='C'))

# Persist both (X, y) tuples.
for pkl_path, payload in (
    ("data/full_features/full-feature-tf/traindt.pkl", full_traindt),
    ("data/full_features/full-feature-tf/testdt.pkl", full_testdt),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

### Feature selection

In [6]:
# Columns whose names match sin|cos|holidays|weekend are set aside as cat_var;
# only the remaining columns are candidates for feature selection.
is_cat = xtrain.columns.str.contains('sin|cos|holidays|weekend')
cat_var = xtrain.columns[is_cat]
num_var = [col for col in xtrain.columns if col not in cat_var]
xtrain_for_fs = xtrain[num_var].copy()
xtest_for_fs = xtest[num_var].copy()

In [7]:
# Number of columns left as feature-selection candidates once cat_var is excluded.
len(num_var)

166


#### Univariate feature selection – Pearson’s correlation coefficient (`f_regression`)

In [8]:
# Keep the 44 numeric candidates with the highest f_regression scores, then
# add back the cat_var columns, which bypass selection.
fs = SelectKBest(f_regression, k=44).fit(xtrain_for_fs, np.ravel(ytrain, order='C'))
fs_cols = np.concatenate([fs.get_feature_names_out(), cat_var])
fs_xtrain, fs_xest = (split.loc[:, fs_cols] for split in (xtrain, xtest))

In [9]:
# Scaled copies of the selected features, paired with flattened targets.
fs_xtrain_tf, fs_xtest_tf = normalize(fs_xtrain, fs_xest)
fs_traindt_tf = (fs_xtrain_tf, np.ravel(ytrain, order='C'))
fs_testdt_tf = (fs_xtest_tf, np.ravel(ytest, order='C'))

# Write the scaled ("-tf") tuples first, then the unscaled frames with the
# targets exactly as loaded.
for pkl_path, payload in (
    ("data/full_features/f-regression/traindt-tf.pkl", fs_traindt_tf),
    ("data/full_features/f-regression/testdt-tf.pkl", fs_testdt_tf),
    ("data/full_features/f-regression/traindt.pkl", (fs_xtrain, ytrain)),
    ("data/full_features/f-regression/testdt.pkl", (fs_xest, ytest)),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)

In [10]:
# Names of the columns retained by the fitted SelectKBest selector.
fs.get_feature_names_out()

array(['sales_lag_1', 'sales_lag_6', 'sales_lag_7', 'sales_lag_21',
       'sales_lag_28', 'sales_lag_14', 'sales_mean_1_8', 'sales_mean_1_9',
       'sales_mean_1_7', 'sales_mean_1_14', 'sales_mean_1_15',
       'sales_mean_1_22', 'sales_min_1_5', 'sales_std_1_9',
       'sales_std_1_12', 'sales_std_1_11', 'sales_std_1_7',
       'sales_std_1_10', 'sales_std_1_19', 'sales_std_1_13',
       'sales_std_1_6', 'sales_std_1_4', 'sales_std_1_5',
       'sales_weighted_std5_1_5', 'sales_weighted_std6_1_6',
       'sales_weighted_std7_1_7', 'sales_weighted_std8_1_8',
       'sales_weighted_std9_1_9', 'sales_weighted_std10_1_10',
       'sales_weighted_std11_1_11', 'sales_weighted_std12_1_12',
       'sales_weighted_std13_1_13', 'sales_weighted_std14_1_14',
       'sales_weighted_std15_1_15', 'sales_weighted_std16_1_16',
       'sales_weighted_std17_1_17', 'sales_weighted_std18_1_18',
       'sales_weighted_std19_1_19', 'sales_weighted_std21_1_21',
       'sales_weighted_std22_1_22', 'sales_we

### Embedded feature selection with Lasso

In [11]:

# Standardize inside the pipeline so that every CV fold fits its own scaler
# before the Lasso step.
pipeline = Pipeline(steps=[
    ('sc', StandardScaler()),
    ('model', Lasso()),
])

# 4-fold grid search over alpha from 0.1 up to (but excluding) 10 in steps of
# 0.2, scored by negative mean squared error.
lassocv = GridSearchCV(
    estimator=pipeline,
    param_grid={'model__alpha': np.arange(0.1, 10, 0.2)},
    scoring="neg_mean_squared_error",
    cv=4,
    n_jobs=-1,
)

lassocv.fit(xtrain_for_fs, ytrain)

In [12]:
# Hyper-parameter setting chosen by the grid search.
lassocv.best_params_

{'model__alpha': 0.30000000000000004}

In [13]:

# One row per candidate feature with its Lasso coefficient from the best
# pipeline, ordered from largest to smallest magnitude.
lasso_coef = lassocv.best_estimator_.named_steps['model'].coef_
coefs = pd.DataFrame({
    'variable': xtrain_for_fs.columns,
    'coef': lasso_coef,
    'abs_coef': np.abs(lasso_coef),
}).sort_values('abs_coef', ascending=False)

In [14]:

# A coefficient of exactly zero means Lasso dropped that feature.
n_eliminated = (coefs['abs_coef'] == 0).sum()
print('{} features are eliminated!'.format(n_eliminated))

140 features are eliminated!


In [15]:
# Training columns whose Lasso coefficient is non-zero, in the row order of coefs.
kept_by_lasso = coefs.loc[coefs['abs_coef'] != 0, 'variable'].tolist()
df_lasso = xtrain[kept_by_lasso]

In [16]:
# Number of features with a non-zero Lasso coefficient.
len(df_lasso.columns)

26

In [17]:
# Names of the features with a non-zero Lasso coefficient.
df_lasso.columns

Index(['trend', 'sales_lag_7', 'sales_std_1_5', 'sales_lag_21', 'sales_lag_28',
       'sales_lag_16', 'sales_max_1_7', 'sales_weighted_std3_1_3',
       'sales_lag_5', 'sales_lag_1', 'sales_lag_4', 'sales_lag_10',
       'sales_std_1_15', 'sales_mean_1_5', 'sales_min_1_5', 'sales_lag_14',
       'sales_lag_23', 'resid', 'sales_lag_9', 'sales_std_1_8', 'sales_lag_26',
       'sales_lag_24', 'sales_lag_13', 'sales_lag_19', 'sales_lag_3',
       'sales_min_1_6'],
      dtype='object')

In [18]:
# Final Lasso feature set: the surviving columns plus cat_var, taken from both splits.
lasso_cols = np.concatenate([df_lasso.columns, cat_var])
lasso_xtrain, lasso_xest = (split.loc[:, lasso_cols] for split in (xtrain, xtest))

In [19]:
# Sanity check: both splits should have the same number of columns.
lasso_xtrain.shape, lasso_xest.shape


((738, 34), (104, 34))

In [20]:
# Scaled copies of the Lasso-selected features, paired with flattened targets.
lasso_xtrain_tf, lasso_xtest_tf = normalize(lasso_xtrain, lasso_xest)
lasso_traindt_tf = (lasso_xtrain_tf, np.ravel(ytrain, order='C'))
lasso_testdt_tf = (lasso_xtest_tf, np.ravel(ytest, order='C'))

# Write the scaled ("-tf") tuples first, then the unscaled frames with the
# targets exactly as loaded.
for pkl_path, payload in (
    ("data/full_features/lasso/traindt-tf.pkl", lasso_traindt_tf),
    ("data/full_features/lasso/testdt-tf.pkl", lasso_testdt_tf),
    ("data/full_features/lasso/traindt.pkl", (lasso_xtrain, ytrain)),
    ("data/full_features/lasso/testdt.pkl", (lasso_xest, ytest)),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(payload, f)