In [1]:
# Package import
from datetime import datetime
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import os
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import talib
from plotly.subplots import make_subplots
import pickle

from functions_TA_indicator import *


### Train, Val, Test splits 

Train set will be from 2003-01-01 to 2017-12-31

Validation set will be from 2018-01-01 to 2019-12-31

Test set will be from 2020-01-01 to the end of the available data (2023-08-30; the split code applies no upper date bound)

In [2]:
# Make the dark theme the session-wide default, then pin a fixed figure size on it.
# The final call is left as the cell's last expression so the updated layout is echoed.
pio.templates.default = "plotly_dark"
default_template = pio.templates["plotly_dark"]
default_template.layout.update(width=900, height=800)


Layout({
    'annotationdefaults': {'arrowcolor': '#f2f5fa', 'arrowhead': 0, 'arrowwidth': 1},
    'autotypenumbers': 'strict',
    'coloraxis': {'colorbar': {'outlinewidth': 0, 'ticks': ''}},
    'colorscale': {'diverging': [[0, '#8e0152'], [0.1, '#c51b7d'], [0.2,
                                 '#de77ae'], [0.3, '#f1b6da'], [0.4, '#fde0ef'],
                                 [0.5, '#f7f7f7'], [0.6, '#e6f5d0'], [0.7,
                                 '#b8e186'], [0.8, '#7fbc41'], [0.9, '#4d9221'],
                                 [1, '#276419']],
                   'sequential': [[0.0, '#0d0887'], [0.1111111111111111,
                                  '#46039f'], [0.2222222222222222, '#7201a8'],
                                  [0.3333333333333333, '#9c179e'],
                                  [0.4444444444444444, '#bd3786'],
                                  [0.5555555555555556, '#d8576b'],
                                  [0.6666666666666666, '#ed7953'],
                           

In [3]:
# Read the raw VIX open/high/low/close history and echo it as the cell output.
OHLC_df = pd.read_csv("Data/Original_Data/VIX_OHLC_2003_2023.csv")
OHLC_df

Unnamed: 0,Date,Open,High,Low,Close
0,2002-05-31,20.260000,20.260000,19.430000,19.980000
1,2002-06-03,20.980000,23.370001,20.620001,23.370001
2,2002-06-04,23.600000,24.820000,23.309999,23.889999
3,2002-06-05,23.520000,23.770000,22.459999,22.610001
4,2002-06-06,22.969999,24.639999,22.940001,24.160000
...,...,...,...,...,...
5346,2023-08-25,17.209999,17.360001,15.450000,15.680000
5347,2023-08-28,16.240000,16.280001,15.000000,15.080000
5348,2023-08-29,15.080000,15.300000,14.340000,14.450000
5349,2023-08-30,14.530000,14.700000,13.830000,13.880000


### Technical indicators creation and display

Most of the technical indicators have NaN for their first N rows, because they need previous data to be calculated.
To mitigate this, I use the pre-2003 rows of the raw file (which starts on 2002-05-31) as a warm-up period.

The actual dataset will consist of data from 2003 to 2023.

In [4]:
# Registry of indicator DataFrames, keyed by indicator family name.
ta_indicator_dfs = dict()

In [5]:
# Aroon columns for each window passed to `aroon`, left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30]
aroon_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_aroon_indicators_df = aroon(OHLC_df, timeframe)
    aroon_indicators_df = aroon_indicators_df.merge(tmp_aroon_indicators_df, on='Date', how='left')

# Keep only rows dated 2003-01-01 or later, then register the frame.
aroon_indicators_df = aroon_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['AROON'] = aroon_indicators_df

  0%|          | 0/7 [00:00<?, ?it/s]

In [6]:

# Bollinger-band columns for each window passed to `bollinger_bands`,
# left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30]
bb_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_bb_indicators_df = bollinger_bands(OHLC_df, timeframe)
    bb_indicators_df = bb_indicators_df.merge(tmp_bb_indicators_df, on='Date', how='left')

# Keep only rows dated 2003-01-01 or later, then register the frame.
bb_indicators_df = bb_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['BB'] = bb_indicators_df

  0%|          | 0/7 [00:00<?, ?it/s]

In [7]:

# EMA columns for each window passed to `ema`, left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30, 36, 42, 48, 54, 60]
ema_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_ema_indicators_df = ema(OHLC_df, timeframe)
    ema_indicators_df = ema_indicators_df.merge(tmp_ema_indicators_df, on='Date', how='left')

# Keep only rows dated 2003-01-01 or later, then register the frame.
ema_indicators_df = ema_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['EMA'] = ema_indicators_df


  0%|          | 0/12 [00:00<?, ?it/s]

In [8]:
# MACD columns for each window passed to `macd`, left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30]
macd_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_macd_indicators_df = macd(OHLC_df, timeframe)
    macd_indicators_df = macd_indicators_df.merge(tmp_macd_indicators_df, on='Date', how='left')

# Keep only rows dated 2003-01-01 or later, then register the frame.
macd_indicators_df = macd_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['MACD'] = macd_indicators_df

  0%|          | 0/7 [00:00<?, ?it/s]

In [9]:
# ROC columns for each window passed to `roc`, left-joined onto the full Date column.
timeframes = [1, 2, 3, 6, 9, 12, 18, 24, 30]
roc_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_roc_indicators_df = roc(OHLC_df, timeframe)
    roc_indicators_df = roc_indicators_df.merge(tmp_roc_indicators_df, on='Date', how='left')

# Keep only rows dated 2003-01-01 or later, then register the frame.
roc_indicators_df = roc_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['ROC'] = roc_indicators_df


  0%|          | 0/9 [00:00<?, ?it/s]

In [10]:

# RSI columns for each window passed to `rsi`, left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30]
rsi_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_rsi_indicators_df = rsi(OHLC_df, timeframe)
    rsi_indicators_df = rsi_indicators_df.merge(tmp_rsi_indicators_df, on='Date', how='left')

# Keep only rows dated 2003-01-01 or later, then register the frame.
rsi_indicators_df = rsi_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['RSI'] = rsi_indicators_df
    

  0%|          | 0/7 [00:00<?, ?it/s]

In [11]:
# Max_High columns for each window passed to `max_high`, left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30, 36, 42, 48, 54, 60]
high_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_high_indicators_df = max_high(OHLC_df, timeframe)
    high_indicators_df = high_indicators_df.merge(tmp_high_indicators_df, on='Date', how='left')

# Keep only rows dated 2003-01-01 or later, then register the frame.
high_indicators_df = high_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['Max_High'] = high_indicators_df

  0%|          | 0/12 [00:00<?, ?it/s]

In [12]:
# Min_Low columns for each window passed to `min_low`, left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30, 36, 42, 48, 54, 60]
low_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_low_indicators_df = min_low(OHLC_df, timeframe)
    low_indicators_df = low_indicators_df.merge(tmp_low_indicators_df, on='Date', how='left')

# Keep only rows dated 2003-01-01 or later, then register the frame.
low_indicators_df = low_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['Min_Low'] = low_indicators_df

  0%|          | 0/12 [00:00<?, ?it/s]

In [13]:

# High_Low_Diff columns for each window passed to `high_low_diff`,
# left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30, 36, 42, 48, 54, 60]
high_low_diff_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_high_low_diff_indicators_df = high_low_diff(OHLC_df, timeframe)
    high_low_diff_indicators_df = high_low_diff_indicators_df.merge(
        tmp_high_low_diff_indicators_df, on='Date', how='left'
    )

# Keep only rows dated 2003-01-01 or later, then register the frame.
high_low_diff_indicators_df = high_low_diff_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['High_Low_Diff'] = high_low_diff_indicators_df

  0%|          | 0/12 [00:00<?, ?it/s]

In [14]:

# High_Low_Mean columns for each window passed to `high_low_mean`,
# left-joined onto the full Date column.
timeframes = [3, 6, 9, 12, 18, 24, 30, 36, 42, 48, 54, 60]
high_low_mean_indicators_df = OHLC_df[['Date']]
for timeframe in tqdm(timeframes):
    tmp_high_low_mean_indicators_df = high_low_mean(OHLC_df, timeframe)
    high_low_mean_indicators_df = high_low_mean_indicators_df.merge(
        tmp_high_low_mean_indicators_df, on='Date', how='left'
    )

# Keep only rows dated 2003-01-01 or later, then register the frame.
high_low_mean_indicators_df = high_low_mean_indicators_df.loc[lambda frame: frame['Date'] >= '2003-01-01']
ta_indicator_dfs['High_Low_Mean'] = high_low_mean_indicators_df

  0%|          | 0/12 [00:00<?, ?it/s]

### Storing for later use 

In [15]:
# Write every registered indicator frame to its own CSV (no index column),
# named after its registry key.
for key, indicator_df in ta_indicator_dfs.items():
    indicator_df.to_csv(f"Data/Original_Data/{key}.csv", index=False)

### Visualizing the data

In [16]:

# Candlestick chart of the rows dated 2022-01-01 or later, overlaid with every EMA series.
viz_df_1 = OHLC_df.copy(deep=True).merge(ema_indicators_df, on='Date', how='left')
viz_df_1 = viz_df_1.loc[viz_df_1['Date'] >= '2022-01-01']

# (column, line colour) for each overlay; the legend label equals the column name.
ema_line_styles = [
    ('EMA_3',  '#ADD8E6'),
    ('EMA_6',  '#9CC4DF'),
    ('EMA_9',  '#8CB1D8'),
    ('EMA_12', '#7B9DD0'),
    ('EMA_18', '#6B89C9'),
    ('EMA_24', '#5B75C1'),
    ('EMA_30', '#4B61BA'),
    ('EMA_36', '#3B4DB2'),
    ('EMA_42', '#2B39AB'),
    ('EMA_48', '#1C25A3'),
    ('EMA_54', '#1C179C'),
    ('EMA_60', '#1611A8'),
]

candles = go.Candlestick(
    x=viz_df_1['Date'],
    open=viz_df_1['Open'],
    high=viz_df_1['High'],
    low=viz_df_1['Low'],
    close=viz_df_1['Close'],
)
ema_lines = [
    go.Scatter(x=viz_df_1['Date'], y=viz_df_1[column], line=dict(color=colour, width=2), name=column)
    for column, colour in ema_line_styles
]

fig = go.Figure(data=[candles, *ema_lines])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()


In [17]:

# Candlestick chart of the rows dated 2022-01-01 or later, with the 9/18/30
# Bollinger band columns and the closing price drawn on top.
viz_df_2 = OHLC_df.copy(deep=True).merge(bb_indicators_df, on='Date', how='left')
viz_df_2 = viz_df_2.loc[viz_df_2['Date'] >= '2022-01-01']

# (column, line colour, line width); the legend label equals the column name.
# Colour strings are reproduced exactly as originally written, padding included.
bb_line_styles = [
    ('BB_upper_9',  '#FFEDA0 ',    1),
    ('BB_lower_9',  '#FFEDA0 ',    1),
    ('BB_upper_18', '#FFD700  ',   1),
    ('BB_lower_18', '#FFD700  ',   1),
    ('BB_upper_30', '#FFA500    ', 1),
    ('BB_lower_30', '#FFA500    ', 1),
    ('Close',       '#1966e3    ', 2),
]

candles = go.Candlestick(
    x=viz_df_2['Date'],
    open=viz_df_2['Open'],
    high=viz_df_2['High'],
    low=viz_df_2['Low'],
    close=viz_df_2['Close'],
)
bb_lines = [
    go.Scatter(x=viz_df_2['Date'], y=viz_df_2[column], line=dict(color=colour, width=width), name=column)
    for column, colour, width in bb_line_styles
]

fig = go.Figure(data=[candles, *bb_lines])
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()


In [18]:

# Line chart of the 18/24/30 Aroon up and down columns for rows dated 2022-01-01 or later.
viz_df_3 = OHLC_df.copy(deep=True).merge(aroon_indicators_df, on='Date', how='left')
viz_df_3 = viz_df_3.loc[viz_df_3['Date'] >= '2022-01-01']

# (direction, window, line colour). Columns are named AROON_<direction>_<window>,
# while the legend uses the mixed-case Aroon_<direction>_<window> spelling.
# Colour strings are reproduced exactly as originally written, padding included.
aroon_line_styles = [
    ('up',   18, '#93FF77'),
    ('up',   24, '#47FF13 '),
    ('up',   30, '#157F00 '),
    ('down', 18, '#FF7F77'),
    ('down', 24, '#FF3D13 '),
    ('down', 30, '#7F1600 '),
]

traces = []
for direction, window, colour in aroon_line_styles:
    traces.append(
        go.Scatter(
            x=viz_df_3['Date'],
            y=viz_df_3[f'AROON_{direction}_{window}'],
            line=dict(color=colour, width=2),
            name=f'Aroon_{direction}_{window}',
        )
    )

fig = go.Figure(data=traces)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()


In [19]:
timeperiod = 9

# EMA windows derived from the MACD period; for timeperiod=9 they round to 26 and 12.
slow_period = round(timeperiod*2.888)  # 26 by default
fast_period = round(timeperiod*1.333)  # 12 by default

# The MACD series are multiplied by this factor so they are readable on the price axis.
macd_display_scale = 6

viz_df_4 = OHLC_df.copy(deep=True)
viz_df_4 = viz_df_4.merge(macd_indicators_df, on='Date', how='left')
tmp_ema_df_1 = ema(OHLC_df, timeperiod=slow_period)
tmp_ema_df_2 = ema(OHLC_df, timeperiod=fast_period)
viz_df_4 = viz_df_4.merge(tmp_ema_df_1, on='Date', how='left')
viz_df_4 = viz_df_4.merge(tmp_ema_df_2, on='Date', how='left')

viz_df_4 = viz_df_4[viz_df_4['Date'] >= '2022-01-01']

# Column names follow the computed periods instead of being hard-coded, so the
# cell keeps working when `timeperiod` changes.
# Assumes `ema` names its output column EMA_<timeperiod>, as the previous
# hard-coded 'EMA_26' / 'EMA_12' lookups imply.
slow_ema_column = f'EMA_{slow_period}'
fast_ema_column = f'EMA_{fast_period}'

fig = go.Figure(data=[go.Candlestick(x=viz_df_4['Date'],
                                     open=viz_df_4['Open'],
                                     high=viz_df_4['High'],
                                     low=viz_df_4['Low'],
                                     close=viz_df_4['Close'])])

fig.add_trace(go.Scatter(x=viz_df_4['Date'], y=viz_df_4[f'MACD_{timeperiod}']*macd_display_scale,
                         mode='lines', name='MACD', line=dict(color='blue')))
fig.add_trace(go.Scatter(x=viz_df_4['Date'], y=viz_df_4[f'MACD_signal_{timeperiod}']*macd_display_scale,
                         mode='lines', name='Signal', line=dict(color='orange')))
# Histogram bars are coloured by the sign of the unscaled value.
fig.add_trace(go.Bar(x=viz_df_4['Date'], y=viz_df_4[f'MACD_hist_{timeperiod}']*macd_display_scale, name='Hist',
                     marker_color=viz_df_4[f'MACD_hist_{timeperiod}'].apply(lambda x: 'green' if x > 0 else 'red')))

# Each EMA trace is labelled with its own period (the fast EMA used to be
# mislabelled 'EMA 26').
fig.add_trace(go.Scatter(x=viz_df_4['Date'], y=viz_df_4[slow_ema_column], mode='lines',
                         name=f'EMA {slow_period}', line=dict(color='#8a039c')))
fig.add_trace(go.Scatter(x=viz_df_4['Date'], y=viz_df_4[fast_ema_column], mode='lines',
                         name=f'EMA {fast_period}', line=dict(color='#b257f7')))

# Adjust the layout
fig.update_layout(title='OHLC with MACD Indicator',
                  xaxis_title='Date',
                  yaxis_title='Price',
                  xaxis_rangeslider_visible=False)

### Description of MACD:

For the MACD, the most common parameters are 12, 26 and 9. These are the values used for my "MACD_9" series.
For the other series, the values 12 and 26 are scaled proportionally to the value used in place of 9.

$$ MACD = EMA_{12} - EMA_{26} $$
$$ Signal = EMA(MACD)_{9} $$
$$ Histogram = MACD - Signal $$

NOTE: The MACD data in the plot is multiplied by 6 for visualization purposes.

In [20]:
timeperiod = 6  #6

# Two stacked panels sharing the x axis: closing price on top, the selected
# ROC column as signed bars underneath (rows dated 2022-01-01 or later).
viz_df_5 = OHLC_df.copy(deep=True).merge(roc_indicators_df, on='Date', how='left')
viz_df_5 = viz_df_5.loc[viz_df_5['Date'] >= '2022-01-01']


def _roc_bar_colour(value):
    """Return 'green' for strictly positive values and 'red' for everything else."""
    if value > 0:
        return 'green'
    return 'red'


price_trace = go.Scatter(x=viz_df_5['Date'], y=viz_df_5['Close'], name='Closing Price')
roc_trace = go.Bar(
    x=viz_df_5['Date'],
    y=viz_df_5[f'ROC_{timeperiod}'],
    marker_color=viz_df_5[f'ROC_{timeperiod}'].apply(_roc_bar_colour),
    name=f'ROC_{timeperiod}',
)

fig = make_subplots(rows=2, cols=1, shared_xaxes=True, subplot_titles=('Closing Price', f'ROC_{timeperiod}'))
fig.add_trace(price_trace, row=1, col=1)
fig.add_trace(roc_trace, row=2, col=1)

fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()


In [21]:
# Line chart of every RSI column for rows dated 2022-01-01 or later.
viz_df_6 = OHLC_df.copy(deep=True).merge(rsi_indicators_df, on='Date', how='left')
viz_df_6 = viz_df_6.loc[viz_df_6['Date'] >= '2022-01-01']

# (column, line colour); the legend label equals the column name.
rsi_line_styles = [
    ('RSI_3',  '#ADD8E6'),
    ('RSI_6',  '#9CC4DF'),
    ('RSI_9',  '#8CB1D8'),
    ('RSI_12', '#7B9DD0'),
    ('RSI_18', '#6B89C9'),
    ('RSI_24', '#5B75C1'),
    ('RSI_30', '#4B61BA'),
]

traces = [
    go.Scatter(x=viz_df_6['Date'], y=viz_df_6[column], line=dict(color=colour, width=2), name=column)
    for column, colour in rsi_line_styles
]

fig = go.Figure(data=traces)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()


In [22]:
# Closing price together with the 9/18/30 Min_Low and Max_High columns,
# restricted to rows dated between 2022-01-01 and 2023-01-01 (inclusive).
viz_df_7 = OHLC_df.copy(deep=True)
viz_df_7 = viz_df_7.merge(low_indicators_df, on='Date', how='left')
viz_df_7 = viz_df_7.merge(high_indicators_df, on='Date', how='left')
viz_df_7 = viz_df_7[viz_df_7['Date'] >= '2022-01-01']
viz_df_7 = viz_df_7[viz_df_7['Date'] <= '2023-01-01']

# (column, line colour). The legend label is taken from the column itself so it
# always names the series that is actually drawn; the labels used to be shifted
# by one window (e.g. Min_Low_9 was shown as 'Min_Low_18').
line_styles = [
    ('Close',       'Light Blue'),
    ('Min_Low_9',   '#FF7F77'),
    ('Min_Low_18',  '#FF3D13'),
    ('Min_Low_30',  '#7F1600'),
    ('Max_High_9',  '#93FF77'),
    ('Max_High_18', '#47FF13'),
    ('Max_High_30', '#157F00'),
]

traces = [
    go.Scatter(x=viz_df_7['Date'], y=viz_df_7[column], line=dict(color=colour, width=2), name=column)
    for column, colour in line_styles
]

fig = go.Figure(data=traces)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()


In [23]:
# Closing price together with the 9/18/30 High_Low_Diff columns for rows dated
# 2019-01-01 or later.
viz_df_8 = OHLC_df.copy(deep=True).merge(high_low_diff_indicators_df, on='Date', how='left')
viz_df_8 = viz_df_8.loc[viz_df_8['Date'] >= '2019-01-01']

# (column, line colour); the legend label equals the column name.
high_low_diff_line_styles = [
    ('Close',            'Light Blue'),
    ('High_Low_Diff_9',  '#FF7F77'),
    ('High_Low_Diff_18', '#FF3D13'),
    ('High_Low_Diff_30', '#7F1600'),
]

traces = [
    go.Scatter(x=viz_df_8['Date'], y=viz_df_8[column], line=dict(color=colour, width=2), name=column)
    for column, colour in high_low_diff_line_styles
]

fig = go.Figure(data=traces)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()




# Scratch expression: evaluates to a one-element list holding the
# High_Low_Diff column name for the current value of `timeperiod`.
[f'High_Low_Diff_{timeperiod}']

['High_Low_Diff_6']

In [24]:
# Closing price together with the 9/18/30 High_Low_Mean columns for rows dated
# 2021-01-01 or later.
viz_df_9 = OHLC_df.copy(deep=True).merge(high_low_mean_indicators_df, on='Date', how='left')
viz_df_9 = viz_df_9.loc[viz_df_9['Date'] >= '2021-01-01']

# (column, line colour); the legend label equals the column name.
high_low_mean_line_styles = [
    ('Close',            'Light Blue'),
    ('High_Low_Mean_9',  '#FF7F77'),
    ('High_Low_Mean_18', '#FF3D13'),
    ('High_Low_Mean_30', '#7F1600'),
]

traces = [
    go.Scatter(x=viz_df_9['Date'], y=viz_df_9[column], line=dict(color=colour, width=2), name=column)
    for column, colour in high_low_mean_line_styles
]

fig = go.Figure(data=traces)
fig.update_layout(xaxis_rangeslider_visible=True)
fig.show()




### Save the data in a dictionary


In [25]:

# Reload every indicator CSV from disk, keep each one individually under its
# file stem, and left-join them all onto the raw OHLC frame to build "all".
datasets = {}
data_dir = "Data/Original_Data/"
primary_dataset = pd.read_csv(data_dir + "VIX_OHLC_2003_2023.csv")

# os.listdir returns entries in arbitrary order; sorting makes the key order of
# `datasets` and the column order of the merged frame reproducible across runs.
for file in sorted(os.listdir(data_dir)):
    if file in ["VIX_OHLC_2003_2023.csv", "datasets.pickle"]:
        continue
    stem, extension = os.path.splitext(file)
    if extension != ".csv":
        # Skip anything that is not an indicator CSV (hidden files, checkpoint
        # folders, ...) instead of letting read_csv fail on it.
        continue
    tmp_df = pd.read_csv(os.path.join(data_dir, file))
    tmp_df = tmp_df[tmp_df["Date"] >= "2003-01-01"].reset_index(drop=True)
    datasets[stem] = tmp_df
    primary_dataset = primary_dataset.merge(tmp_df, on="Date", how="left")

# Rows with any missing value are dropped; this includes the pre-2003 OHLC rows,
# which have no match in the indicator frames after the left joins.
primary_dataset = primary_dataset.dropna().reset_index(drop=True)
datasets["all"] = primary_dataset
print(datasets.keys())
datasets["all"].head()

dict_keys(['AROON', 'BB', 'EMA', 'High_Low_Diff', 'High_Low_Mean', 'MACD', 'Max_High', 'Min_Low', 'ROC', 'RSI', 'all'])


Unnamed: 0,Date,Open,High,Low,Close,AROON_down_3,AROON_up_3,AROON_down_6,AROON_up_6,AROON_down_9,...,ROC_18,ROC_24,ROC_30,RSI_3,RSI_6,RSI_9,RSI_12,RSI_18,RSI_24,RSI_30
0,2003-01-02,28.74,28.74,25.32,25.389999,100.0,33.333333,100.0,66.666667,100.0,...,-15.647844,-2.233346,-8.206799,19.691252,34.252023,39.191736,41.396219,43.241865,44.185714,44.972919
1,2003-01-03,25.549999,25.549999,24.68,24.68,100.0,0.0,100.0,50.0,100.0,...,-14.542933,-9.431192,-9.959867,16.346856,31.251339,36.852182,39.519639,41.936836,43.204966,44.192627
2,2003-01-06,25.32,25.42,24.290001,24.91,100.0,0.0,100.0,33.333333,100.0,...,-19.070828,-9.418182,-1.619273,22.724315,33.515488,38.196716,40.473261,42.531726,43.627919,44.515233
3,2003-01-07,25.129999,25.690001,24.91,25.129999,66.666667,0.0,83.333333,16.666667,88.888889,...,-12.6217,-8.485069,5.543888,30.343566,35.937199,39.581027,41.4368,43.121925,44.043837,44.830803
4,2003-01-08,25.620001,25.76,25.07,25.530001,33.333333,100.0,66.666667,0.0,77.777778,...,-8.033139,-9.883513,10.233164,45.105141,40.653671,42.227901,43.258542,44.224674,44.816301,45.414732



### Train, Val, Test splits 

Train set will be from 2003-01-01 to 2017-12-31

Validation set will be from 2018-01-01 to 2019-12-31

Test set will be from 2020-01-01 to the end of the available data (2023-08-30; the split code applies no upper date bound)

In [26]:
# Split every frame in `datasets` by date into train / val / test and keep the
# unsplit frame under "all". Layout: ml_dataset[split][dataset_key] -> DataFrame.
ml_dataset = {"all": {}, "train": {}, "val": {}, "test": {}}

for key, frame in datasets.items():
    dates = frame["Date"]
    ml_dataset["all"][key] = frame
    ml_dataset["train"][key] = frame[(dates >= "2003-01-01") & (dates <= "2017-12-31")].reset_index(drop=True)
    ml_dataset["val"][key] = frame[(dates >= "2018-01-01") & (dates <= "2019-12-31")].reset_index(drop=True)
    # No upper bound: the test split holds every row dated 2020-01-01 or later.
    ml_dataset["test"][key] = frame[dates >= "2020-01-01"].reset_index(drop=True)

# Pickle dataset
output_path = 'Data/Transformed_Data/datasets_v1.pickle'
# Create the target folder if needed so open() does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'wb') as handle:
    pickle.dump(ml_dataset, handle, protocol=pickle.HIGHEST_PROTOCOL)

ml_dataset["test"]["all"]

Unnamed: 0,Date,Open,High,Low,Close,AROON_down_3,AROON_up_3,AROON_down_6,AROON_up_6,AROON_down_9,...,ROC_18,ROC_24,ROC_30,RSI_3,RSI_6,RSI_9,RSI_12,RSI_18,RSI_24,RSI_30
0,2020-01-02,13.460000,13.720000,12.42,12.47,0.000000,66.666667,33.333333,83.333333,55.555556,...,-14.118458,8.058928,0.080259,30.193183,39.866906,42.927382,44.488843,45.900588,46.512465,46.894977
1,2020-01-03,15.010000,16.200001,13.13,14.02,66.666667,100.000000,16.666667,100.000000,44.444444,...,2.936862,19.319153,9.020224,61.156830,57.518885,55.395951,54.051196,52.309047,51.212375,50.533301
2,2020-01-06,15.450000,16.389999,13.54,13.85,33.333333,100.000000,0.000000,100.000000,33.333333,...,-12.673388,9.746438,8.372462,56.997542,55.379318,53.941882,52.959666,51.599232,50.702496,50.143519
3,2020-01-07,13.840000,14.460000,13.39,13.79,0.000000,66.666667,0.000000,83.333333,22.222222,...,-12.053573,-7.511736,5.026655,55.016633,54.520381,53.385482,52.551059,51.338890,50.517262,50.002699
4,2020-01-08,15.160000,15.240000,12.83,13.45,100.000000,33.333333,33.333333,66.666667,11.111111,...,-10.273516,-15.726818,8.995135,42.470457,49.318838,50.091630,50.158734,49.830397,49.449050,49.192856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
918,2023-08-25,17.209999,17.360001,15.45,15.68,100.000000,0.000000,100.000000,16.666667,0.000000,...,12.562814,12.724662,17.541230,31.814872,41.583550,46.282421,48.603706,50.241605,50.381766,50.102303
919,2023-08-28,16.240000,16.280001,15.00,15.08,100.000000,66.666667,100.000000,0.000000,0.000000,...,-6.277192,8.802311,11.869439,24.733733,36.426712,42.197681,45.236947,47.794194,48.504977,48.608998
920,2023-08-29,15.080000,15.300000,14.34,14.45,100.000000,33.333333,100.000000,0.000000,100.000000,...,-9.233670,9.552693,8.646614,18.313786,31.504064,38.213768,41.911475,45.338773,46.602904,47.084655
921,2023-08-30,14.530000,14.700000,13.83,13.88,100.000000,0.000000,100.000000,0.000000,100.000000,...,-18.830410,-3.678000,0.872092,13.543066,27.473158,34.863505,39.076202,43.212040,44.939171,45.742078
