# Import packages

In [None]:
import pandas as pd
import numpy as np
from windowing_and_cv.tscv_sliding import TimeSeriesSplitSliding
from window_generator import WindowGenerator
from tensorflow.keras.losses import Huber
from sklearn.linear_model import ElasticNet
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
from sklearn.preprocessing import MinMaxScaler

# Import data

## Import and split data

Import the dataset and create a train/val/test split using a *60/10/30* % split. Only the training data will be used for feature selection.

In [None]:
# Path of the CSV file holding the dataset. The first column is used as the
# index and parsed as dates.
directory = "stationary_dataset.csv"
df = pd.read_csv(
    directory,
    index_col=0,
    parse_dates=[0],
)

In [None]:
# Positional cut points for a 60/10/30 split taken in row order (no shuffling)
n = len(df)
train_ind, val_ind = int(n*0.6), int(n*0.7)

# Three contiguous, non-overlapping slices of the dataframe
train_df = df.iloc[:train_ind]
val_df = df.iloc[train_ind:val_ind]
test_df = df.iloc[val_ind:]

## Create folds for parameter tuning

Create the window using the `WindowGenerator` class of the `window_generator.py` module with an input width of 5 days. Values are scaled using the `MinMaxScaler` of `scikit-learn`.

In [None]:
# Window over the three splits: 5 input steps, 1 label step, shift of 1,
# 'VIX' as the only label column, scaling enabled with the MinMaxScaler class.
window = WindowGenerator(
    input_width=5,
    label_width=1,
    shift=1,
    label_columns=['VIX'],
    train_df=train_df,
    val_df=val_df,
    test_df=test_df,
    scale=True,
    scaler=MinMaxScaler,
)

Create four sets of windows for cross-validation purposes on the training data.

In [None]:
# Cross-validation folds exposed by the window generator; the grid search
# unpacks each fold as a (train, val) pair.
# NOTE(review): presumably built from the training split only -- confirm in WindowGenerator.
folds = window.folds

# Perform cross-validation for hyperparameter tuning

To find the best set of parameters on the training set, a `grid search` cross-validation procedure is implemented. First, specify the parameter grid

In [None]:
# Hyperparameter grid for the elastic net grid search.
# NOTE(review): alpha=0.0 removes the penalty entirely; scikit-learn advises
# against using ElasticNet that way (LinearRegression solves it better).
param_grid = dict(
    elnet_alpha=[1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 0.0, 1.0, 10.0, 100.0],
    # np.arange accumulates floating-point error (e.g. 0.30000000000000004);
    # rounding gives the intended 0.0, 0.1, ..., 0.9 and clean result labels.
    elnet_l1_ratio=np.round(np.arange(0, 1, 0.1), 1),
)

Create and save the metrics to be utilized for evaluation as a Python *dictionary*

In [None]:
# Huber loss object, instantiated once and used as a callable metric
huber = Huber()


def rmse(y_actual, y_predicted):
    """Return the root mean squared error between targets and predictions.

    The square root is taken explicitly instead of passing ``squared=False``
    to ``mean_squared_error``: that argument was deprecated in scikit-learn
    1.4 and removed in 1.6, so the explicit form works on old and new versions.
    """
    return np.sqrt(mean_squared_error(y_actual, y_predicted))


# Metric name -> callable(y_actual, y_predicted) used to score every fold
metrics = {
    "mse": mean_squared_error,
    "mae": mean_absolute_error,
    "mape": mean_absolute_percentage_error,
    "rmse": rmse,
    "huber": huber,
    "r2": r2_score,
}

Next, perform the grid search on the training data 

In [None]:
def _dataset_to_arrays(dataset):
    """Collect a dataset of (inputs, labels) batches into 2D X and 1D y arrays.

    Inputs and labels are gathered in a single pass so that rows of X and
    entries of y stay paired even if the dataset yields its batches in a
    different order on each iteration (e.g. when it shuffles).
    """
    batches = [(np.asarray(x), np.asarray(y)) for x, y in dataset]
    X = np.concatenate([x for x, _ in batches], axis=0)
    y = np.concatenate([y for _, y in batches], axis=0)
    # Flatten each window into one feature row; labels become a flat vector
    return X.reshape(len(X), -1), y.reshape(-1)


# The fold arrays do not depend on the hyperparameters, so build them once
# instead of re-reading every dataset for each parameter combination.
fold_arrays = [
    (_dataset_to_arrays(train), _dataset_to_arrays(val))
    for train, val in folds
]

# Create dictionary for saving hyperparameter evaluation
param_results = dict()

# Iterate over parameters
for alpha in param_grid["elnet_alpha"]:
    for l1_ratio in param_grid["elnet_l1_ratio"]:
        print(f"Params: {alpha}, {l1_ratio}")
        results = {key: [] for key in metrics}
        # Iterate over windows in training data
        for (train_X, train_y), (val_X, val_y) in fold_arrays:
            # Create and fit the model for this parameter combination
            model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, max_iter=10000)
            model.fit(train_X, train_y)

            # Predict on the held-out part of the fold
            prediction = model.predict(val_X).flatten()
            # Save evaluation for given fold
            for key, metric in metrics.items():
                results[key].append(metric(val_y, prediction))

        # Compute mean for each metric over all folds
        mean_results = {key: np.mean(scores) for key, scores in results.items()}
        name = f"alpha: {alpha}, l1_ratio: {l1_ratio}"
        param_results[name] = mean_results

Save evaluation metrics as a Pandas dataframe

In [None]:
# Transpose so each row is one parameter combination and each column one metric
resdf = pd.DataFrame(param_results).T

Display the results to find the best overall set of parameters with respect to all metrics

In [None]:
# Metrics where a larger value is better; every other column is an error/loss
higher_is_better = {"r2"}

for col in resdf.columns:
    print(col)
    # Sort so that the best parameter sets come first for every metric:
    # ascending for errors/losses, descending for scores such as r2.
    display(resdf.sort_values(col, ascending=col not in higher_is_better).head())

# Perform the feature selection on training data

Create 2D NumPy arrays of the features of the training data to use for feature selection. To do so, get the training TensorFlow `Dataset` from the window generator object and convert it into NumPy arrays

In [None]:
tr = window.train

# Gather inputs and labels in ONE pass over the dataset so that rows of trx
# and entries of ytr stay paired even if the dataset yields its batches in a
# different order on each iteration (e.g. when it shuffles).
tr_batches = [(np.asarray(x), np.asarray(y)) for x, y in tr]

# Flatten each window into a single feature row
trx = np.concatenate([x for x, _ in tr_batches], axis=0)
trx = trx.reshape(len(trx), -1)
# Labels as a flat 1D vector
ytr = np.concatenate([y for _, y in tr_batches], axis=0)
ytr = ytr.ravel()

Create and fit the elastic net model using the best set of hyperparameters

In [None]:
# Hyperparameters selected from the grid-search results
best_params = {"alpha": 0.0001, "l1_ratio": 0.9, "max_iter": 10000}

model = ElasticNet(**best_params)
model.fit(trx, ytr)

Get all coefficients and corresponding lags as a Pandas `Series` object

In [None]:
# Build one label per flattened input feature: one group per time step, from
# lag `input_width` down to lag 1, each group listing every dataframe column.
# NOTE(review): assumes the window feeds all df columns, in df order, and that
# the first time step of a window is the oldest one -- confirm in WindowGenerator.
cols = df.columns
flat_cols = [
    f'{col}_Lag_{i}'
    for i in range(window.input_width, 0, -1)
    for col in cols
]

# Fail loudly instead of silently truncating or mislabelling coefficients
if len(flat_cols) != len(model.coef_):
    raise ValueError(
        f"Expected {len(flat_cols)} coefficients "
        f"({window.input_width} lags x {len(cols)} columns), "
        f"but the model has {len(model.coef_)}"
    )

coeff_value = pd.Series(model.coef_, index=flat_cols)

Display all coefficients different from zero

In [None]:
# Keep only the lagged features whose fitted coefficient is not exactly zero
coeff_value[coeff_value != 0]

Print all features with non-zero coefficients

In [None]:
# Strip the '_Lag_<n>' suffix by splitting on the marker rather than slicing a
# fixed 6 characters, which is only correct for single-digit lags.
print([name.rsplit('_Lag_', 1)[0] for name in coeff_value[coeff_value != 0].index])