# Feature Selection

In [1]:
import pandas                    as pd
import seaborn                   as sns
sns.set_style("darkgrid")
import matplotlib.pyplot         as plt
import numpy                     as np
import datetime                  as datetime

import warnings
warnings.filterwarnings("ignore")

from boruta                     import BorutaPy
from sklearn.ensemble           import RandomForestRegressor
from sklearn                    import ensemble        as en
from sklearn.tree               import DecisionTreeClassifier 

In [2]:
apple_prep = pd.read_pickle('data/apple_prep.pkl')
boeing_prep = pd.read_pickle('data/boeing_prep.pkl')
nike_prep = pd.read_pickle('data/nike_prep.pkl')

In [3]:
apple_prep.tail()

Unnamed: 0,date,open,high,low,close,adj_close,volume,year,month,day,week_of_year,day_of_week,day_of_week_number,close_open_variation,intraday_variation,daily_change,moving_average_10,moving_average_15,moving_average_50
10184,2021-05-05,129.199997,130.449997,127.970001,128.100006,127.88279,84000900,2021,5,5,18,Wednesday,3,-0.851386,1.937951,0.001955,132.238001,132.832667,126.544
10185,2021-05-06,127.889999,129.75,127.129997,129.740005,129.520004,78128300,2021,5,6,18,Thursday,4,1.44656,2.060885,0.012802,132.018001,132.515334,126.6318
10186,2021-05-07,130.850006,131.259995,129.479996,130.210007,130.210007,78892700,2021,5,7,18,Friday,5,-0.489109,1.374729,0.003623,131.607001,132.252001,126.8162
10187,2021-05-10,129.410004,129.539993,126.809998,126.849998,126.849998,88071200,2021,5,10,19,Monday,1,-1.978213,2.152823,-0.025805,130.820001,131.719335,126.928
10188,2021-05-11,123.5,126.269997,122.769997,125.910004,125.910004,126053700,2021,5,11,19,Tuesday,2,1.95142,2.850859,-0.00741,129.972002,131.239335,126.8904


## Split dataframe into training and test dataset


### Apple

In [4]:
print( 'Min Date: {}'.format(apple_prep['date'].min()))
print( 'Max Date: {}'.format(apple_prep['date'].max()))

Min Date: 1980-12-12 00:00:00
Max Date: 2021-05-11 00:00:00


#### One day split

In [5]:
apple_prep['date'].max() - datetime.timedelta(days=1)

Timestamp('2021-05-10 00:00:00')

In [6]:
# training dataset
XA1_train = apple_prep[apple_prep['date'] < '2021-05-10']
yA1_train = XA1_train['close']
# test dataset
XA1_test = apple_prep[apple_prep['date'] >= '2021-05-10']
yA1_test = XA1_test['close']

print( 'Training Min Date: {}'.format( XA1_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XA1_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XA1_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XA1_test['date'].max() ) )

Training Min Date: 1980-12-12 00:00:00
Training Max Date: 2021-05-07 00:00:00

Test Min Date: 2021-05-10 00:00:00
Test Max Date: 2021-05-11 00:00:00


#### 7 days split

In [7]:
apple_prep['date'].max() - datetime.timedelta(days=7)

Timestamp('2021-05-04 00:00:00')

In [8]:
# training dataset
XA7_train = apple_prep[apple_prep['date'] < '2021-05-04']
yA7_train = XA7_train['close']
# test dataset
XA7_test = apple_prep[apple_prep['date'] >= '2021-05-04']
yA7_test = XA7_test['close']

print( 'Training Min Date: {}'.format( XA7_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XA7_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XA7_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XA7_test['date'].max() ) )

Training Min Date: 1980-12-12 00:00:00
Training Max Date: 2021-05-03 00:00:00

Test Min Date: 2021-05-04 00:00:00
Test Max Date: 2021-05-11 00:00:00


In [9]:
%store XA7_train
%store XA7_test

Stored 'XA7_train' (DataFrame)
Stored 'XA7_test' (DataFrame)


#### 30 days split

In [10]:
apple_prep['date'].max() - datetime.timedelta(days=30)

Timestamp('2021-04-11 00:00:00')

In [11]:
# training dataset
XA30_train = apple_prep[apple_prep['date'] < '2021-05-04']
yA30_train = XA30_train['close']
# test dataset
XA30_test = apple_prep[apple_prep['date'] >= '2021-05-04']
yA30_test = XA30_test['close']

print( 'Training Min Date: {}'.format( XA30_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XA30_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XA30_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XA30_test['date'].max() ) )

Training Min Date: 1980-12-12 00:00:00
Training Max Date: 2021-05-03 00:00:00

Test Min Date: 2021-05-04 00:00:00
Test Max Date: 2021-05-11 00:00:00


#### 90 days split

In [12]:
apple_prep['date'].max() - datetime.timedelta(days=90)

Timestamp('2021-02-10 00:00:00')

In [13]:
# training dataset
XA90_train = apple_prep[apple_prep['date'] < '2021-02-10']
yA90_train = XA90_train['close']
# test dataset
XA90_test = apple_prep[apple_prep['date'] >= '2021-02-10']
yA90_test = XA90_test['close']

print( 'Training Min Date: {}'.format( XA90_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XA90_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XA90_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XA90_test['date'].max() ) )

Training Min Date: 1980-12-12 00:00:00
Training Max Date: 2021-02-09 00:00:00

Test Min Date: 2021-02-10 00:00:00
Test Max Date: 2021-05-11 00:00:00


### Boeing

In [14]:
print( 'Min Date: {}'.format(boeing_prep['date'].min()))
print( 'Max Date: {}'.format(boeing_prep['date'].max()))

Min Date: 1962-01-02 00:00:00
Max Date: 2021-05-11 00:00:00


#### One day split

In [15]:
boeing_prep['date'].max() - datetime.timedelta(days=1)

Timestamp('2021-05-10 00:00:00')

In [16]:
# training dataset
XB1_train = boeing_prep[boeing_prep['date'] < '2021-05-10']
yB1_train = XB1_train['close']
# test dataset
XB1_test = boeing_prep[boeing_prep['date'] >= '2021-05-10']
yB1_test = XB1_test['close']

print( 'Training Min Date: {}'.format( XB1_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XB1_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XB1_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XB1_test['date'].max() ) )

Training Min Date: 1962-01-02 00:00:00
Training Max Date: 2021-05-07 00:00:00

Test Min Date: 2021-05-10 00:00:00
Test Max Date: 2021-05-11 00:00:00


#### 7 days split

In [17]:
boeing_prep['date'].max() - datetime.timedelta(days=7)

Timestamp('2021-05-04 00:00:00')

In [18]:
# training dataset
XB7_train = boeing_prep[boeing_prep['date'] < '2021-05-04']
yB7_train = XB7_train['close']
# test dataset
XB7_test = boeing_prep[boeing_prep['date'] >= '2021-05-04']
yB7_test = XB7_test['close']

print( 'Training Min Date: {}'.format( XB7_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XB7_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XB7_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XB7_test['date'].max() ) )

Training Min Date: 1962-01-02 00:00:00
Training Max Date: 2021-05-03 00:00:00

Test Min Date: 2021-05-04 00:00:00
Test Max Date: 2021-05-11 00:00:00


#### 30 days split

In [19]:
boeing_prep['date'].max() - datetime.timedelta(days=30)

Timestamp('2021-04-11 00:00:00')

In [20]:
# training dataset
XB30_train = boeing_prep[boeing_prep['date'] < '2021-04-11']
yB30_train = XB30_train['close']
# test dataset
XB30_test = boeing_prep[boeing_prep['date'] >= '2021-04-11']
yB30_test = XB30_test['close']

print( 'Training Min Date: {}'.format( XB30_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XB30_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XB30_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XB30_test['date'].max() ) )

Training Min Date: 1962-01-02 00:00:00
Training Max Date: 2021-04-09 00:00:00

Test Min Date: 2021-04-12 00:00:00
Test Max Date: 2021-05-11 00:00:00


#### 90 days split

In [21]:
boeing_prep['date'].max() - datetime.timedelta(days=90)

Timestamp('2021-02-10 00:00:00')

In [22]:
# training dataset
XB90_train = boeing_prep[boeing_prep['date'] < '2021-02-10']
yB90_train = XB90_train['close']
# test dataset
XB90_test = boeing_prep[boeing_prep['date'] >= '2021-02-10']
yB90_test = XB90_test['close']

print( 'Training Min Date: {}'.format( XB90_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XB90_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XB90_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XB90_test['date'].max() ) )

Training Min Date: 1962-01-02 00:00:00
Training Max Date: 2021-02-09 00:00:00

Test Min Date: 2021-02-10 00:00:00
Test Max Date: 2021-05-11 00:00:00


### Nike

In [23]:
print( 'Min Date: {}'.format(nike_prep['date'].min()))
print( 'Max Date: {}'.format(nike_prep['date'].max()))

Min Date: 1980-12-02 00:00:00
Max Date: 2021-05-11 00:00:00


#### One day split

In [24]:
nike_prep['date'].max() - datetime.timedelta(days=1)

Timestamp('2021-05-10 00:00:00')

In [25]:
# training dataset
XN1_train = nike_prep[nike_prep['date'] < '2021-05-10']
yN1_train = XN1_train['close']
# test dataset
XN1_test = nike_prep[nike_prep['date'] >= '2021-05-10']
yN1_test = XN1_test['close']

print( 'Training Min Date: {}'.format( XN1_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XN1_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XN1_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XN1_test['date'].max() ) )

Training Min Date: 1980-12-02 00:00:00
Training Max Date: 2021-05-07 00:00:00

Test Min Date: 2021-05-10 00:00:00
Test Max Date: 2021-05-11 00:00:00


#### 7 days split

In [26]:
nike_prep['date'].max() - datetime.timedelta(days=7)

Timestamp('2021-05-04 00:00:00')

In [27]:
# training dataset
XN7_train = nike_prep[nike_prep['date'] < '2021-05-04']
yN7_train = XN7_train['close']
# test dataset
XN7_test = nike_prep[nike_prep['date'] >= '2021-05-04']
yN7_test = XN7_test['close']

print( 'Training Min Date: {}'.format( XN7_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XN7_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XN7_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XN7_test['date'].max() ) )

Training Min Date: 1980-12-02 00:00:00
Training Max Date: 2021-05-03 00:00:00

Test Min Date: 2021-05-04 00:00:00
Test Max Date: 2021-05-11 00:00:00


#### 30 days split

In [28]:
nike_prep['date'].max() - datetime.timedelta(days=30)

Timestamp('2021-04-11 00:00:00')

In [29]:
# training dataset
XN30_train = nike_prep[nike_prep['date'] < '2021-04-11']
yN30_train = XN30_train['close']
# test dataset
XN30_test = nike_prep[nike_prep['date'] >= '2021-04-11']
yN30_test = XN30_test['close']

print( 'Training Min Date: {}'.format( XN30_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XN30_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XN30_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XN30_test['date'].max() ) )

Training Min Date: 1980-12-02 00:00:00
Training Max Date: 2021-04-09 00:00:00

Test Min Date: 2021-04-12 00:00:00
Test Max Date: 2021-05-11 00:00:00


#### 90 days split

In [30]:
nike_prep['date'].max() - datetime.timedelta(days=30)

Timestamp('2021-04-11 00:00:00')

In [31]:
# training dataset
XN90_train = nike_prep[nike_prep['date'] < '2021-02-10']
yN90_train = XN90_train['close']
# test dataset
XN90_test = nike_prep[nike_prep['date'] >= '2021-02-10']
yN90_test = XN90_test['close']

print( 'Training Min Date: {}'.format( XN90_train['date'].min() ) )
print( 'Training Max Date: {}'.format( XN90_train['date'].max() ) )
print( '\nTest Min Date: {}'.format( XN90_test['date'].min() ) )
print( 'Test Max Date: {}'.format( XN90_test['date'].max() ) )

Training Min Date: 1980-12-02 00:00:00
Training Max Date: 2021-02-09 00:00:00

Test Min Date: 2021-02-10 00:00:00
Test Max Date: 2021-05-11 00:00:00


## Feature Selector


### Boruta

#### Apple

In [32]:
## training and test dataset for Boruta
XA1_train_n = XA1_train.drop( ['date', 'close'], axis=1 ).values
yA1_train_n = yA1_train.values.ravel()

## define RandomForestRegressor
rf = RandomForestRegressor( n_jobs=-1 )

## define Boruta
boruta = BorutaPy( rf, n_estimators='auto', verbose=2, random_state=42 ).fit(XA1_train_n, yA1_train_n )


ValueError: could not convert string to float: 'Friday'

 ##### Best Features

In [None]:
cols_selected = boruta.support_.tolist()

## best features
XA1_train_fs = XA1_train.drop( ['date', 'close'], axis=1 )

cols_selected_boruta = XA1_train_fs.iloc[:, cols_selected].columns.to_list()
cols_selected_boruta

In [None]:
## not selected boruta
cols_not_selected_boruta = list( np.setdiff1d( XA1_train_fs.columns,cols_selected_boruta ) )
cols_not_selected_boruta

#### Manual Feature Selection

In [None]:
cols_selected_boruta = [
'open',
 'high',
 'low',
 'adj_close',
 'close_open_variation',
 'moving_average_10',
 'moving_average_15',
 'moving_average_50',
 'month_cos',
 'week_of_year_sin',
 'week_of_year_cos']


### Boeing

### Nike