# Import packages

In [None]:
import numpy as np
import pandas as pd
from window_generator import WindowGenerator

# Import data

Import the dataset and create a train/test split using a *70/30* % split. Only the training data will be used for feature selection.

In [None]:
# Path of the CSV file holding the stationary dataset
directory = "stationary_dataset.csv"

# The first CSV column is used as the index and parsed as dates
df = pd.read_csv(
    directory,
    index_col=0,
    parse_dates=[0],
)

Create split and save the index of each feature

In [None]:
# Row count and the position of the 70/30 boundary
n = df.shape[0]
train_ind = int(n * 0.7)

# The first 70 % of the rows form the training set, the remaining rows the test set
train_df = df.iloc[:train_ind]
test_df = df.iloc[train_ind:]

# Lookup table from feature name to its column position
column_indices = dict(zip(df.columns, range(len(df.columns))))

num_features = len(df.columns)

# Perform Boruta feature selection

The *Boruta* feature selection procedure is applied to four windows of the training data to avoid data leakage. Then, features with strong or weak support in three or more folds are saved to the set of selected features.

In [None]:
# Import packages
from sklearn.ensemble import RandomForestRegressor
from boruta import BorutaPy

Iterate over the folds and perform feature selection within each fold. Save the selected features. The selection is done using 1-day-ahead data in this demonstration. To use a different horizon, change the `shift` parameter in the following cell.

In [None]:
# Width of the input window; also used below to name the lagged columns
input_width = 5
# Build the project's windowing helper with 'VIX' as the only label column.
# NOTE(review): `val_df` is not defined in any cell visible above (only `train_df`
# and `test_df` are created), so as written this raises NameError unless `val_df`
# is defined elsewhere — confirm what WindowGenerator expects for this argument.
window = WindowGenerator(
    input_width=input_width, label_width=1, shift=1, label_columns=['VIX'], 
    train_df=train_df, val_df=val_df, test_df=test_df, scale=False)
# Bare expression so the notebook shows the object's repr
window

Create dictionaries to store the results for each window. The features with *weak support* are those which have been neither confirmed nor rejected by the time the maximum number of iterations is reached.

In [None]:
# One empty (None) slot per fold, keyed by the 0-based fold number;
# the slots are filled in by the Boruta loop
n_splits = window.n_splits
strong_support = dict.fromkeys(range(n_splits))
weak_support = dict.fromkeys(range(n_splits))
ranking = dict.fromkeys(range(n_splits))

In [None]:
# Per-fold arrays from the window generator. The generator instance is bound to
# the name `window`; no variable called `w` exists in this notebook.
folds = window.np_folds

for i, fold in enumerate(folds):
    # Each fold unpacks into training and validation arrays; only the training
    # part is used for feature selection
    train_x, train_y, val_x, val_y = fold
    # Flatten every sample into a single row so the inputs form a 2D matrix,
    # and flatten the targets into a 1D vector
    train_x = train_x.reshape(len(train_x), -1)
    train_y = train_y.reshape(-1)

    # Create a fresh model and Boruta instance for this fold.
    # NOTE(review): per BorutaPy's documentation, n_estimators='auto' lets Boruta pick
    # the number of trees itself, so the 250 set on the forest is presumably
    # overridden — confirm.
    model = RandomForestRegressor(n_estimators=250, max_depth=5, random_state=42)
    feat_selector = BorutaPy(
        verbose=0,
        estimator=model,
        n_estimators='auto',
        max_iter=250,  # number of iterations to perform
    )

    # Run feature selection
    print(f"Started running Boruta on fold {i}")
    feat_selector.fit(train_x, train_y)
    print(f"Finished running Boruta on fold {i}")
    print("-"*20)

    # Save copies of the results so they survive deleting the selector below
    strong_support[i] = feat_selector.support_.copy()
    weak_support[i] = feat_selector.support_weak_.copy()
    ranking[i] = feat_selector.ranking_.copy()

    # Delete selector and model to free memory before the next fold
    del feat_selector
    del model
    

# Save the results

Store the features and their indices

In [None]:
# Names of the flattened input columns, one per (lag, feature) pair: lags run from
# the input width down to 1, and every feature is listed within each lag.
# The width is read from `window`, the WindowGenerator instance created earlier
# (no variable called `w` exists in this notebook).
# NOTE(review): this ordering assumes the flattened fold arrays list the oldest
# time step first with features varying fastest — confirm against WindowGenerator.
cols = df.columns
flat_cols = [
    f'{col}_Lag_{lag}'
    for lag in range(window.input_width, 0, -1)
    for col in cols
]

# Lookup table from flattened column name to its position
flat_column_indices = {name: i for i, name in enumerate(flat_cols)}

In [None]:
# Get all features without lag suffix, in the original column order.
# The first len(df.columns) flattened names all share the same lag and cover every
# feature exactly once, so the count is taken from the data instead of being fixed
# at 47. The suffix is split off at '_Lag_' rather than by a fixed character count,
# so it also works for lags with more than one digit.
ordered_cols = [name.rsplit('_Lag_', 1)[0] for name in flat_cols[:len(df.columns)]]

Save the support and ranking for each feature for each window using a Pandas dataframe

In [None]:
# Two-level column labels: every feature paired with each of the three result kinds
iterables = [ordered_cols, ["Strong", "Weak", "Ranking"]]

multi = pd.MultiIndex.from_product(
    iterables,
    names=["feature", "results"],
)

In [None]:
# One row per fold (1-based), one column per (feature, result) pair.
# The number of rows follows n_splits so that it always matches the folds written
# by the fill loop, instead of being fixed at five.
# dtype=object is stated explicitly because cells will hold lists of lags.
support_df = pd.DataFrame(index=range(1, n_splits + 1), columns=multi, dtype=object)
support_df.index.name = "Folds"

Iterate over all features and their corresponding lags, and save their support to the dataframe

In [None]:
def _append_lag(frame, fold, column, lag):
    """Add *lag* to the list stored in ``frame.at[fold, column]``.

    An empty (NaN) cell is initialised with a one-element list. A new list is
    always built, because ``list.append`` returns ``None`` and assigning its
    result would wipe out the cell.
    """
    current = frame.at[fold, column]
    updated = current + [lag] if isinstance(current, list) else [lag]
    # `.at` stores the list object itself in a single cell of the object-dtype frame
    frame.at[fold, column] = updated


for key, item in strong_support.items():
    weak = weak_support[key]
    rank = ranking[key]
    fold = key + 1  # rows of support_df are labelled from 1
    for i, flat_name in enumerate(flat_cols):
        # Split '<feature>_Lag_<k>' into the feature name and the lag (kept as a string)
        name, _, lags = flat_name.rpartition('_Lag_')
        if item[i]:
            _append_lag(support_df, fold, (name, "Strong"), lags)
        if weak[i]:
            _append_lag(support_df, fold, (name, "Weak"), lags)

        # Overwritten for every lag of the same feature, so the value kept is the
        # ranking of the last lag visited
        support_df.loc[fold, (name, "Ranking")] = rank[i]

# Create dataset

## Filter 

Create masks marking, for every fold, which features have a non-NaN entry, i.e. were selected with either strong or weak support.
* Weak support means that the feature was tentative, i.e. neither confirmed nor rejected at the final iteration of Boruta

In [None]:
# Boolean grids (folds x features): True where the cell holds a value, i.e. is not NaN
m1 = support_df.loc[:, pd.IndexSlice[:, "Strong"]].notna().to_numpy()
m2 = support_df.loc[:, pd.IndexSlice[:, "Weak"]].notna().to_numpy()

Select all columns which were selected in three or more folds

In [None]:
# Count, per feature, the folds with strong or weak support and keep those
# reaching at least three folds
t = np.logical_or(m1, m2).sum(axis=0) >= 3
_feature_names = support_df.columns.get_level_values("feature").unique()
cols = _feature_names[t]

Display selected features

In [None]:
# Strong/Weak columns of the retained features, transposed so that each
# (feature, result) pair becomes a row
_selected_view = support_df.loc[:, (cols, ("Strong", "Weak"))].T

with pd.option_context("display.max_rows", 85):
    # Each feature contributes two rows (Strong and Weak), so half the row count
    # is the number of retained features
    print(_selected_view.shape[0]/2)
    display(_selected_view)

## Filter original data

Save the selected set of features as a new dataframe, then save as a `csv`

In [None]:
# Make sure the label column is carried over into the filtered dataset.
# Index.insert needs a position and an item and returns a new Index (it does not
# modify in place), so the result is bound back to `cols`.
# NOTE(review): the original call had no arguments and raised TypeError; adding the
# label column 'VIX' is the presumed intent — confirm.
if "VIX" not in cols:
    cols = cols.insert(0, "VIX")

In [None]:
# Keep only the selected features, in the column order of the original data
new_df = df.loc[:, df.columns.isin(cols)].copy()

In [None]:
# Write the filtered dataset to disk
new_df.to_csv(path_or_buf="feature_selected_data_1_day.csv")