# Experiments 203

***Changelog:***
- Validation metrics added
- New network architectures added. 

In [1]:
# Change the kernel's working directory (bare `cd` relies on IPython automagic).
# NOTE(review): presumably this is what makes the local `environments` and
# `dynamic_threshold` modules importable in the next cell — confirm.
# The path is absolute and specific to one machine.
cd /Users/camilacusicanqui/Documents/Pedro-Pineapple/forecasting

/Users/camilacusicanqui/Documents/Pedro-Pineapple/forecasting


In [2]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import environments
from dynamic_threshold import define_threshold

import numpy as np
from stable_baselines3 import A2C
from dataclasses import dataclass
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    classification_report,
    confusion_matrix
)
from json import dumps
from typing import Tuple, Union, List

In [3]:
def data_splitter(
    raw_data: pd.DataFrame,
    proportion: float = 0.7,
    init: Union[int, Tuple[int, int], None] = None,
    end: Union[int, Tuple[int, int], None] = None
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Split the raw time series data set into a train and a test partition.

    All slicing is positional (``iloc``). Three modes are supported:

    * ``init`` and ``end`` both ``None`` (default): the first
      ``round(len(raw_data) * proportion)`` rows are the train set and the
      remaining rows are the test set.
    * ``init`` and ``end`` both ``int``: train is ``raw_data.iloc[:init]`` and
      test is ``raw_data.iloc[end:]``.
    * ``init`` and ``end`` both tuples: train is
      ``raw_data.iloc[init[0]:init[1]]`` and test is
      ``raw_data.iloc[end[0]:end[1]]``.

    Parameters
    ----------
    raw_data : pd.DataFrame
        Data to split; row order is preserved.
    proportion : float
        Fraction of rows assigned to train in the default mode.
    init, end : int, tuple of two ints, or None
        Boundaries of the train (``init``) and test (``end``) partitions.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        ``(train, test)``.

    Raises
    ------
    TypeError
        If ``init`` and ``end`` are not both ``None``, both ``int`` or both
        tuples.
    """
    # Compare against None explicitly: a boundary of 0 is a valid position and
    # must not be mistaken for "argument not given".
    if init is None and end is None:
        splitter = round(raw_data.shape[0] * proportion)
        return raw_data.iloc[:splitter], raw_data.iloc[splitter:]

    if isinstance(init, int) and isinstance(end, int):
        return raw_data.iloc[:init], raw_data.iloc[end:]

    if isinstance(init, tuple) and isinstance(end, tuple):
        return raw_data.iloc[init[0]:init[1]], raw_data.iloc[end[0]:end[1]]

    # Mixed or partially specified boundaries have no defined meaning; fail
    # with a clear message instead of an unbound local variable.
    raise TypeError(
        "init and end must both be None, both be int, or both be tuples; "
        f"got init={init!r}, end={end!r}"
    )

def evaluation_metrics(
    y_true: pd.Series,
    y_pred: pd.Series,
    target_names: List[Union[int,str,float]]
) -> Union[str, dict]:
    """
    Plot the confusion matrix and build the classification report.

    The confusion matrix of ``y_true`` against ``y_pred`` is drawn with
    ``target_names`` as the displayed class labels and shown immediately.
    The Scikit-learn classification report for the same inputs is then
    returned as a dictionary (``output_dict=True``).
    """
    # Figure first: the matrix is computed, wrapped in a display and shown
    # before the report is built.
    ConfusionMatrixDisplay(
        confusion_matrix=confusion_matrix(y_true=y_true, y_pred=y_pred),
        display_labels=target_names
    ).plot()
    plt.show()

    return classification_report(
        y_true=y_true,
        y_pred=y_pred,
        target_names=target_names,
        output_dict=True
    )

########### LABELS ############

def create_labels(
    x: pd.Series,
    labels: List[Union[str, float, int]],
    perc_bounds: List[float],
    override_plot: bool = False
) -> Tuple[pd.Series, pd.Series]:
    """
    Create the labels based on a given pd.Series.

    The series is converted to its percentage change with respect to the
    previous element (the first element, which has no predecessor, is set to
    0). ``define_threshold`` then supplies an upper and a lower threshold, and
    every percentage change is assigned to one of three left-closed bins:
    ``[-inf, low)``, ``[low, up)`` and ``[up, inf)``.

    Parameters
    ----------
    x : pd.Series
        Values from which the relative differences are computed.
    labels : list
        Labels for the three bins, in ascending bin order.
    perc_bounds : list of float
        ``perc_bounds[0]`` is forwarded as ``lower_bound`` and
        ``perc_bounds[1]`` as ``upper_bound`` to ``define_threshold``.
        NOTE(review): their exact meaning is defined by ``define_threshold``,
        which is not visible here.
    override_plot : bool
        Forwarded unchanged to ``define_threshold``.

    Returns
    -------
    Tuple[pd.Series, pd.Series]
        ``(all_labels, perc_relative_diff)``: the bin label of every element
        and the percentage changes the labels were derived from.
    """
    # Relative differences.
    relative_diff = x.pct_change(periods=1).fillna(value=0)

    # Percentual.
    perc_relative_diff = relative_diff * 100

    # Limits for the bins.
    threshold_up, threshold_low = define_threshold(
        df = perc_relative_diff,
        lower_bound = perc_bounds[0],
        upper_bound = perc_bounds[1],
        override_plot = override_plot
    )
    print(threshold_low, threshold_up)
    bins=[-float('inf'), threshold_low, threshold_up, float('inf')]

    # Cut labels. right=False makes each bin include its left edge only.
    all_labels = pd.cut(
        x = perc_relative_diff,
        bins = bins,
        labels = labels,
        right = False
    )

    return all_labels, perc_relative_diff



In [4]:
# Time series parameters.
@dataclass
class PARAMETERS:
    """
    Notebook-wide configuration.

    The attributes are annotated so that ``@dataclass`` registers them as
    fields; un-annotated class attributes are ignored by the decorator. The
    values remain readable directly on the class (``PARAMETERS.TIMESTEPS``).
    """
    # Number of timesteps passed to `model.learn`.
    TIMESTEPS: int = 200
    # Window size passed to `environments.Forecasting`; also used to offset
    # the label arrays when building `y_true`.
    WINDOW_SIZE: int = 5
    # CSV file read with `pd.read_csv`. Absolute, machine-specific path.
    DATA_PATH: str = '/Users/camilacusicanqui/Documents/Pedro-Pineapple/data/SPY_20172023.csv'

In [5]:
# Load the CSV and lower-case every column name in one chained expression.
data = pd.read_csv(PARAMETERS.DATA_PATH).rename(columns=str.lower)

In [6]:
# Derive, from the close price, a three-class label (0, 1, 2 in ascending bin
# order) and the row-over-row percentage change it is based on, and store both
# as new columns of `data`.
# NOTE(review): the meaning of perc_bounds=[0.5, 0.5] is defined by
# `define_threshold`, which is not visible in this notebook — confirm.
data['labels'], data['perc_relative_diff'] = create_labels(
    x = data['close'],
    labels = [0, 1, 2],
    perc_bounds = [0.5, 0.5],
    override_plot  = False
)

-0.0863631393033738 0.0863631393033738


In [7]:
# Display the frame with the two new columns (`labels`, `perc_relative_diff`).
data

Unnamed: 0,date,open,high,low,close,adj close,volume,labels,perc_relative_diff
0,2017-01-03,225.039993,225.830002,223.880005,225.240005,200.629639,91366500,1,0.000000
1,2017-01-04,225.619995,226.750000,225.610001,226.580002,201.823273,78744400,2,0.594919
2,2017-01-05,226.270004,226.580002,225.479996,226.399994,201.662949,78379000,1,-0.079446
3,2017-01-06,226.529999,227.750000,225.899994,227.210007,202.384445,71559900,2,0.357780
4,2017-01-09,226.910004,227.070007,226.419998,226.460007,201.716431,46939700,0,-0.330091
...,...,...,...,...,...,...,...,...,...
1713,2023-10-24,422.649994,424.820007,420.739990,423.630005,423.630005,78564200,2,0.753939
1714,2023-10-25,421.890015,421.920013,417.019989,417.549988,417.549988,94223200,0,-1.435219
1715,2023-10-26,416.450012,417.329987,411.600006,412.549988,412.549988,115156800,0,-1.197461
1716,2023-10-27,414.190002,414.600006,409.209991,410.679993,410.679993,107228400,0,-0.453277


In [None]:
# Default split: no `init`/`end` given, so the first 70% of the rows
# (proportion=0.7) become the train set and the remaining rows the test set.
df_train, df_test = data_splitter(raw_data=data)

In [None]:
# Visualize the relative percentage changes, one panel per partition.
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, sharey=True,  figsize=(15, 8))

# Both panels are built the same way; only the frame and the title differ.
panels = (
    (ax1, df_train, 'Train dataset'),
    (ax2, df_test, 'Test dataset'),
)
for axis, frame, title in panels:
    sns.histplot(data=frame, binwidth=0.1, x="perc_relative_diff", hue="labels", ax=axis)
    axis.set_ylabel('Count')
    axis.set_xlabel('% Relative difference')
    axis.set_title(title)
    axis.grid()

In [None]:
# Create the environment on the training partition, using the configured
# window size.
env = environments.Forecasting(
    df=df_train, window_size=PARAMETERS.WINDOW_SIZE
)

In [None]:
# Random walk: step the training environment with randomly sampled actions
# until the episode ends.
# `reset` returns an (observation, info) pair, as in the evaluation cell.
state, info = env.reset(seed=2008)

# The action space samples from its own random generator, which the seed
# passed to `reset` does not touch; seed it as well so the walk is repeatable.
env.action_space.seed(2008)

while True:
    # Sample action from space.
    action = env.action_space.sample()
    n_state, reward, done, truncated, info = env.step(
        action=action
    )

    if done or truncated:
        print('info', info, '\n')
        break

In [None]:
# Open a fresh 15x6 figure, clear its current axes, and let the environment
# draw the finished random-walk episode into it.
# NOTE(review): what `render_all` actually plots is defined in `environments`,
# which is not visible here.
plt.figure(figsize=(15, 6))
plt.cla()
env.render_all(title='Random walk.')
plt.show()

In [None]:
# Ground truth: the training labels without their first WINDOW_SIZE + 1 entries.
y_true = df_train['labels'].to_numpy()[PARAMETERS.WINDOW_SIZE + 1:]
# Predictions: the action history stored on the environment.
y_pred = env.actions_history

In [None]:
# Plot the confusion matrix of the random walk against the training labels and
# keep the classification report (a dict) for printing below.
# The names 'down', 'no', 'up' are the display names for the classes, in order.
performance = evaluation_metrics(
    y_true=y_true,
    y_pred=y_pred,
    target_names=['down', 'no', 'up']
)

In [None]:
# Pretty-print the classification report dictionary as indented JSON.
print(dumps(performance, indent=4))

In [None]:
# Train an A2C agent with the 'MlpPolicy' policy on the current `env` for
# PARAMETERS.TIMESTEPS timesteps (verbose=1 enables training log output).
model = A2C('MlpPolicy', env, verbose=1) 
model.learn(total_timesteps=PARAMETERS.TIMESTEPS)

In [None]:
# Evaluate the trained model on the test partition.
# NOTE: this rebinds `env`; from here on `env` is the test environment, and the
# cells below read its rendering and action history.
env = environments.Forecasting(
    df=df_test, window_size=PARAMETERS.WINDOW_SIZE
)

observation, info = env.reset()

while True: 
    # Add a leading axis to the observation before handing it to the model.
    observation = observation[np.newaxis, ...]
    # NOTE(review): `predict` is called with its default arguments; confirm
    # whether a deterministic prediction is wanted for evaluation.
    action, _states = model.predict(observation)
    observation, rewards, done, truncated, info = env.step(action)
    if done or truncated:
        print('info', info, '\n')
        break

In [None]:
# Open a fresh 15x6 figure, clear its current axes, and let the environment
# draw the finished evaluation episode into it.
# NOTE(review): what `render_all` actually plots is defined in `environments`,
# which is not visible here.
plt.figure(figsize=(15, 6))
plt.cla()
env.render_all(title='Experiment')
plt.show()

In [None]:
# Ground truth: the test labels without their first WINDOW_SIZE + 1 entries.
y_true = df_test['labels'].to_numpy()[PARAMETERS.WINDOW_SIZE + 1:]
# Predictions: the environment's stored actions, joined into a single array.
y_pred = np.concatenate(env.actions_history)

In [None]:
# Plot the confusion matrix of the model's actions against the test labels and
# keep the classification report (a dict) for printing below.
# The names 'down', 'no', 'up' are the display names for the classes, in order.
performance = evaluation_metrics(
    y_true=y_true,
    y_pred=y_pred,
    target_names=['down', 'no', 'up']
)

In [None]:
# Pretty-print the classification report dictionary as indented JSON.
print(dumps(performance, indent=4))

## Performance explanation

Precision = $\frac{TP}{TP + FP}$

Recall =  $\frac{TP}{TP + FN}$

F1 = $\frac{2}{Precision^{-1} + Recall^{-1}}$