# IMPORTS

## Libraries

In [1]:
import pandas as pd
import numpy as np

import datetime

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML

from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.ensemble import RandomForestRegressor

from boruta import BorutaPy

## Helper Functions

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 16]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
# Apply the plotting and pandas display settings for the whole notebook.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## Loading Data

In [4]:
# Read the prepared dataset; the 'Date' column is parsed into datetimes.
dfRaw = pd.read_csv(
    '../../01-Data/Results/01-FirstRoundCRISP/dfDataPreparation.csv',
    parse_dates=['Date'],
    low_memory=False,
)

# FEATURE SELECTION

In [5]:
# Work on an independent (deep) copy so the loaded frame stays untouched.
dfRaw1 = dfRaw.copy(deep=True)

## Split DataFrame into Training and Validation Dataset

In [6]:
# Columns excluded from the frame used for feature selection.
toDrop = ['WeekOfYear', 'Day', 'Month', 'DayOfWeek', 'PromoSince', 'CompetionSinse', 'YearWeek']

# Derive from dfRaw instead of dfRaw1 itself: `drop` returns a new frame, so
# the result is the same on the first run, but the cell can now be re-run
# without a KeyError for columns that were already removed.
dfRaw1 = dfRaw.drop(columns=toDrop)

In [7]:
# Six weeks before the last date recorded for the first store (in sorted
# store order); the displayed value is the train/validation cut-off.
lastDateFirstStore = dfRaw1.groupby('Store')['Date'].max().iloc[0]
lastDateFirstStore - datetime.timedelta(weeks=6)

Timestamp('2015-06-19 00:00:00')

In [8]:
# Cut-off date separating training rows (before) from validation rows (on/after).
splitDate = '2015-06-19'

# Training Dataset: everything strictly before the cut-off
XTrain = dfRaw1.loc[dfRaw1['Date'] < splitDate]
yTrain = XTrain['Sales']

# Validation Dataset: the cut-off date and everything after it
XTest = dfRaw1.loc[dfRaw1['Date'] >= splitDate]
yTest = XTest['Sales']

# Sanity check: print the date range covered by each partition
trainDates = XTrain['Date']
testDates = XTest['Date']

print('Training Min Date: {}'.format(trainDates.min()))
print('Training Max Date: {}'.format(trainDates.max()))

print('\nTest Min Date: {}'.format(testDates.min()))
print('Test Max Date: {}'.format(testDates.max()))

Training Min Date: 2013-01-01 00:00:00
Training Max Date: 2015-06-18 00:00:00

Test Min Date: 2015-06-19 00:00:00
Test Max Date: 2015-07-31 00:00:00


## Boruta as Feature Selector

In [9]:
# Boruta works on plain numpy arrays: feature matrix without the date and
# the target, and the target as a flat 1-D vector.
XTrainN = XTrain.drop(columns=['Date', 'Sales']).to_numpy()
yTrainN = yTrain.values.ravel()

# Base estimator used by Boruta; n_jobs=-1 uses every available core.
rf = RandomForestRegressor(n_jobs=-1)

# Build the selector, then fit it (fit returns the selector itself).
boruta = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=42)
boruta = boruta.fit(XTrainN, yTrainN)

Iteration: 	1 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	2 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	3 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	4 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	5 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	6 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	7 / 100
Confirmed: 	0
Tentative: 	27
Rejected: 	0
Iteration: 	8 / 100
Confirmed: 	18
Tentative: 	0
Rejected: 	9


BorutaPy finished running.

Iteration: 	9 / 100
Confirmed: 	18
Tentative: 	0
Rejected: 	9


### Best Features Boruta

In [10]:
# Same feature frame (and column order) that was handed to Boruta.
XTrainFS = XTrain.drop(columns=['Date', 'Sales'])

# Boolean mask of the features Boruta confirmed, as a plain list.
colsSelected = boruta.support_.tolist()

# Best Features: names of the confirmed columns
colsSelectedBoruta = XTrainFS.columns[boruta.support_].to_list()

# Everything Boruta did not confirm (sorted by np.setdiff1d)
rejectedCols = np.setdiff1d(XTrainFS.columns, colsSelectedBoruta)
colsNotSelectBoruta = list(rejectedCols)

In [11]:
# Display the feature names confirmed by Boruta.
colsSelectedBoruta

['Store',
 'Promo',
 'StoreType',
 'Assortment',
 'CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'CompetionTimeMonth',
 'PromoTimeWeek',
 'MonthCos',
 'DaySin',
 'DayCos',
 'WeekOfYearCos',
 'DayOfWeekSin',
 'DayOfWeekCos']

In [12]:
# Manually curated feature list: the Boruta selection plus 'MonthSin' and
# 'WeekOfYearSin', which Boruta did not confirm (presumably added so each
# cyclical feature keeps both its sine and cosine component).

# Columns to Add
featToAdd = ['Date', 'Sales']

colsSelectedBoruta = [
    'Store',
    'Promo',
    'StoreType',
    'Assortment',
    'CompetitionDistance',
    'CompetitionOpenSinceMonth',
    'CompetitionOpenSinceYear',
    'Promo2',
    'Promo2SinceWeek',
    'Promo2SinceYear',
    'CompetionTimeMonth',
    'PromoTimeWeek',
    'MonthSin',
    'MonthCos',
    'DaySin',
    'DayCos',
    'WeekOfYearSin',
    'WeekOfYearCos',
    'DayOfWeekSin',
    'DayOfWeekCos',
] + featToAdd

In [13]:
# Display the final column list (selected features plus 'Date' and 'Sales').
colsSelectedBoruta

['Store',
 'Promo',
 'StoreType',
 'Assortment',
 'CompetitionDistance',
 'CompetitionOpenSinceMonth',
 'CompetitionOpenSinceYear',
 'Promo2',
 'Promo2SinceWeek',
 'Promo2SinceYear',
 'CompetionTimeMonth',
 'PromoTimeWeek',
 'MonthSin',
 'MonthCos',
 'DaySin',
 'DayCos',
 'WeekOfYearSin',
 'WeekOfYearCos',
 'DayOfWeekSin',
 'DayOfWeekCos',
 'Date',
 'Sales']