# Test Machine-Learning Features/Targets Adders

In [75]:
# Autoreload modules we're working on so that changes propagate to this notebook.
# `%autoreload 1` restricts reloading to modules registered via `%aimport`;
# the only module registered here is `ml_features`.
%load_ext autoreload
%autoreload 1
%aimport ml_features

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
import pandas as pd
import holoviews as hv
import hvplot
import hvplot.pandas
import panel as pn
pn.extension()

from ml_features import MLFeaturesAdder
from pathlib import Path

In [77]:
# Cache general plot parameters (page geometry is expressed in inches)
inch = 1
margin = 1 * inch
dpi = 100
landscape_for_presentations = True

# Usable page extent, in inches, after removing the margin on both sides and
# applying the orientation-specific scale factor.
if landscape_for_presentations:
    usable_width_in = (11*inch - 2*margin) * 0.8
    usable_height_in = (8.5*inch - 2*margin) * 0.8
else:
    usable_width_in = (8.5*inch - 2*margin) / 1
    usable_height_in = (11*inch - 2*margin) / 2

# NOTE: int() truncates to whole inches *before* the conversion to pixels,
# so the plot size is always a multiple of `dpi`.
plot_width = int(usable_width_in) * dpi
plot_height = int(usable_height_in) * dpi
print(f"Plot (width, height) = ({plot_width}, {plot_height})")

# Subtitle appended to the titles of the normalized-feature plots
subtitle_normalization = 'Scaled for Machine Learning'

Plot (width, height) = (700, 500)


## Load Test Data

In [107]:
# Load S&P 500 test data from database.
# The CSV carries a two-row column header (price/volume field, then ticker symbol)
# and the dates in its first column, which becomes the DatetimeIndex.
database_path = Path('./data/database_test_sp500_20160713_20210712.csv')
df_database = pd.read_csv(
    database_path,
    header=[0, 1],     # two header rows -> two-level column MultiIndex
    index_col=0,       # first column holds the dates
    parse_dates=True,  # parse the index as datetimes
    # `infer_datetime_format` is intentionally not passed: it is deprecated since
    # pandas 2.0, where it has no effect and only emits a warning.
)
df_database

Unnamed: 0_level_0,Close,Close,Close,Close,Close,Close,Close,Close,Close,Close,...,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume,Volume
Unnamed: 0_level_1,A,AAL,AAP,AAPL,ABBV,ABC,ABMD,ABT,ACN,ADBE,...,XEL,XLNX,XOM,XRAY,XYL,YUM,ZBH,ZBRA,ZION,ZTS
Date,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
2016-07-13,44.457333,33.107258,159.646942,22.623623,52.195896,77.790024,117.510002,38.464836,107.507896,97.339996,...,3601300,1597700,9458200,1359600,928700,8687491,1055300,451100,2856900,2574300
2016-07-14,44.830925,34.481514,161.655838,23.072033,51.269157,78.724274,117.779999,38.748955,107.452675,97.699997,...,3825400,1294200,9293500,1077400,1204000,14672407,1014400,684800,2867700,2617900
2016-07-15,44.706387,34.491123,161.293274,23.069700,51.027390,78.357918,117.139999,38.583984,105.906723,97.830002,...,3380600,1543400,10334500,998000,1051500,5880453,846800,754500,2257700,2089100
2016-07-18,44.830925,34.625668,161.107056,23.314920,51.220806,78.797562,117.470001,38.574818,105.032532,97.790001,...,2836500,1904500,9865900,944800,597500,4915933,1181400,449000,2414100,2756800
2016-07-19,45.003338,34.606449,160.421097,23.324263,51.027390,78.696800,116.589996,38.309032,104.323959,97.790001,...,2378100,2153300,5901800,633400,985500,3152562,1170700,436500,1854500,2305700
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-06,148.979996,21.010000,208.699997,142.020004,115.730003,115.419998,326.709991,118.820000,305.399994,596.900024,...,2637700,2810700,22110800,1202000,963300,963300,1218000,306100,902400,1535200
2021-07-07,149.490005,20.309999,209.440002,144.570007,116.750000,115.040001,334.529999,119.870003,309.609985,605.770020,...,1920300,2733600,21671400,923000,1637400,1182400,1483500,232100,1139800,2651100
2021-07-08,148.830002,20.350000,207.509995,143.240005,116.349998,114.440002,325.040009,119.260002,309.149994,605.950012,...,2155700,2544800,18935900,1300000,856600,1122700,1060900,229200,1690100,2818200
2021-07-09,150.029999,20.889999,213.470001,145.110001,116.580002,115.129997,324.769989,119.739998,312.619995,604.500000,...,2509200,1994000,20084000,1024000,649500,999900,723600,158900,1504200,1521100


## Plot Raw OHLCV data (Example)

In [108]:
# Select data for a single financial instrument
symbol = 'AAPL'
ndays_lookback = 252  # business days, not calendar days!

# Swap the column levels so the ticker symbol is outermost, pick this symbol's
# columns, keep only the most recent `ndays_lookback` rows, and copy last so
# that only the look-back window is duplicated and `df_ohlcv` is an independent frame.
df_ohlcv = df_database.swaplevel(axis=1)[symbol].iloc[-ndays_lookback:].copy()

# Plot raw candlesticks (Volume is dropped; it gets its own plot below)
plot_ohlc = df_ohlcv.drop(columns='Volume').hvplot.ohlc(
    x='Date', y=['Open', 'Low', 'High', 'Close'],  # `hvplot` uses OLHC, not OHLC!
    width=plot_width, height=plot_height, bar_width=2,
    title=f"{symbol} Price vs. Time", ylabel='Price ($US)',
)

# Plot volume as a step chart, 30% as tall as the price plot and without an x-axis
plot_volume = df_ohlcv.hvplot.step(
    x='Date', y='Volume',
    width=plot_width, height=int(0.3 * plot_height),
    title='Volume', xaxis=None,
)

# Create a panel for OHLCV plots: price on top, volume underneath
panel_ohlcv = pn.Column(plot_ohlc, plot_volume)
panel_ohlcv

## Add Machine-Learning Features

### Add raw technical indicators

In [109]:
# Add raw technical indicators -- do not normalize.
# The adder is given a copy of the OHLCV frame rather than `df_ohlcv` itself.
mlfa = MLFeaturesAdder(normalize=False)
ohlcv_for_raw_features = df_ohlcv.copy()
X = mlfa.fit_transform(ohlcv_for_raw_features)
X

Level Percentage	 Price ($)
00.0%		 145.11000061035156
23.6%		 132.5748899230957
38.2%		 124.82011805725097
50.0%		 118.55256271362305
61.8%		 112.2850073699951
100.0%		 91.99512481689452


Unnamed: 0_level_0,Close,High,Low,Open,Volume,SMA5,SMA10,SMA20,SMA50,SMA100,...,FIB_236,FIB_382,FIB_500,FIB_618,FIB_MAX,STDEV,ATR,ADX,DI+,DI-
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-13,94.838470,99.286003,94.619942,96.614005,191649200,,,,,,...,132.57489,124.820118,118.552563,112.285007,145.110001,,,,,
2020-07-14,96.407898,96.604071,93.249181,94.205233,170989200,,,,,,...,132.57489,124.820118,118.552563,112.285007,145.110001,,,,,
2020-07-15,97.070930,98.583240,95.844196,98.327463,153198000,,,,,,...,132.57489,124.820118,118.552563,112.285007,145.110001,,,,,
2020-07-16,95.876472,96.753066,95.263105,95.916206,110577600,,,,,,...,132.57489,124.820118,118.552563,112.285007,145.110001,,,,,
2020-07-17,95.682777,96.497289,95.198537,96.338364,92186800,95.975310,,,,,...,132.57489,124.820118,118.552563,112.285007,145.110001,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-06,142.020004,143.149994,140.070007,140.070007,108181800,138.508005,136.152002,132.683500,129.592783,127.859774,...,132.57489,124.820118,118.552563,112.285007,145.110001,4.381832,2.123572,56.800317,58.255219,4.665985
2021-07-07,144.570007,144.889999,142.660004,143.539993,104911600,140.156006,137.211003,133.575001,129.794352,127.956465,...,132.57489,124.820118,118.552563,112.285007,145.110001,4.892943,2.155001,60.897752,60.819053,4.043854
2021-07-08,143.240005,144.059998,140.669998,141.580002,105575500,141.412006,138.165004,134.380501,129.975910,128.037461,...,132.57489,124.820118,118.552563,112.285007,145.110001,5.097856,2.226430,60.432134,52.709846,14.263424
2021-07-09,145.110001,145.649994,142.649994,142.750000,99788400,142.980005,139.335004,135.330501,130.211040,128.158919,...,132.57489,124.820118,118.552563,112.285007,145.110001,5.243771,2.330002,60.743228,54.036898,12.361634


### Add normalized technical indicators

In [110]:
# Add "normalized" technical indicators.
# The adder is given a copy of the OHLCV frame rather than `df_ohlcv` itself.
mlfa_norm = MLFeaturesAdder(normalize=True)
ohlcv_for_norm_features = df_ohlcv.copy()
X_norm = mlfa_norm.fit_transform(ohlcv_for_norm_features)
X_norm

Level Percentage	 Price ($)
00.0%		 145.11000061035156
23.6%		 132.5748899230957
38.2%		 124.82011805725097
50.0%		 118.55256271362305
61.8%		 112.2850073699951
100.0%		 91.99512481689452


Unnamed: 0_level_0,Close,High,Low,Open,Volume,SMA5*,SMA10*,SMA20*,SMA50*,SMA100*,...,FIB_MIN*,FIB_236*,FIB_382*,FIB_500*,FIB_618*,FIB_MAX*,STDEV,ATR,ADX*,DI_HIST*
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-07-13,94.838470,99.286003,94.619942,96.614005,191649200,,,,,,...,0.107064,-1.420936,-1.128936,-0.892936,-0.656936,-1.892936,,,,
2020-07-14,96.407898,96.604071,93.249181,94.205233,170989200,,,,,,...,0.166160,-1.361840,-1.069840,-0.833840,-0.597840,-1.833840,,,,
2020-07-15,97.070930,98.583240,95.844196,98.327463,153198000,,,,,,...,0.191126,-1.336874,-1.044874,-0.808874,-0.572874,-1.808874,,,,
2020-07-16,95.876472,96.753066,95.263105,95.916206,110577600,,,,,,...,0.146149,-1.381851,-1.089851,-0.853851,-0.617851,-1.853851,,,,
2020-07-17,95.682777,96.497289,95.198537,96.338364,92186800,,,,,,...,0.138856,-1.389144,-1.097144,-0.861144,-0.625144,-1.861144,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-07-06,142.020004,143.149994,140.070007,140.070007,108181800,0.701679,0.730449,0.375791,0.163216,0.172762,...,1.883649,0.355649,0.647649,0.883649,1.119649,-0.116351,4.381832,2.123572,0.568003,0.535892
2021-07-07,144.570007,144.889999,142.660004,143.539993,104911600,0.864307,0.754556,0.452974,0.170569,0.164036,...,1.979667,0.451667,0.743667,0.979667,1.215667,-0.020333,4.892943,2.155001,0.608978,0.567752
2021-07-08,143.240005,144.059998,140.669998,141.580002,105575500,0.922367,0.760177,0.510800,0.174131,0.153215,...,1.929587,0.401587,0.693587,0.929587,1.165587,-0.070413,5.097856,2.226430,0.604321,0.384464
2021-07-09,145.110001,145.649994,142.649994,142.750000,99788400,0.989399,0.768612,0.567312,0.176148,0.143505,...,2.000000,0.472000,0.764000,1.000000,1.236000,0.000000,5.243771,2.330002,0.607432,0.416753


## Plot Machine-Learning Features

### Simple Moving Averages (SMAs)

Q: What should we calculate/plot for "normalized" technical indicators? Attempt 1: a generalized MACD, scaled by the price volatility and by a random-walk factor:

\begin{equation}
    \left(\frac{MA_{\rm fast} - MA_{\rm slow}}{\{\sigma_{\rm close}, ATR\}}\right)  \times  \underbrace{\frac{1}{\sqrt{\tfrac{1}{2} |N_{\rm fast} - N_{\rm slow}|}}}_{\text{"Random Walk"}}
\end{equation}


In [84]:
# Titles for the raw and the normalized SMA plots
title = f"{symbol}: Simple Moving Averages (SMAs)"
title_norm = f"{title} -- {subtitle_normalization}"

# Options shared by both line plots
plot_smas_kwargs = {
    'width': plot_width,
    'height': plot_height,
    'shared_axes': False,
    'legend': 'top_left',
}

# Line plots of every column whose name contains 'SMA', raw and normalized
plot_smas = X.filter(regex='SMA').hvplot.line(title=title, **plot_smas_kwargs)
plot_smas_norm = X_norm.filter(regex='SMA').hvplot.line(title=title_norm, **plot_smas_kwargs)

# Raw SMAs are overlaid on the candlesticks; the normalized SMAs stand alone.
# Landscape: the two plots sit side by side; otherwise they are stacked.
smas_on_ohlc = plot_smas * plot_ohlc
if landscape_for_presentations:
    panel_smas = pn.Column(pn.Row(smas_on_ohlc, plot_smas_norm))
else:
    panel_smas = pn.Column(smas_on_ohlc, plot_smas_norm)
panel_smas

### Exponential Moving Averages (EMAs)

The calculation is analogous to the one for SMAs:

\begin{equation}
    \left(\frac{EMA_{\rm fast} - EMA_{\rm slow}}{\{\sigma_{\rm close}, ATR\}}\right)  \times  \underbrace{\frac{1}{\sqrt{\tfrac{1}{2} |N_{\rm fast} - N_{\rm slow}|}}}_{\text{"Random Walk"}}
\end{equation}

In [89]:
# Titles for the raw and the normalized EMA plots
title = f"{symbol}: Exponential Moving Averages (EMAs, DEMAs)"
title_norm = f"{symbol}: EMAs, DEMAs -- {subtitle_normalization}"

# Options shared by both line plots
plot_emas_kwargs = {
    'width': plot_width,
    'height': plot_height,
    'shared_axes': False,
    'legend': 'top_left',
}

# Line plots of every column whose name contains 'EMA', raw and normalized
plot_emas = X.filter(regex='EMA').hvplot.line(title=title, **plot_emas_kwargs)
plot_emas_norm = X_norm.filter(regex='EMA').hvplot.line(title=title_norm, **plot_emas_kwargs)

# Raw EMAs are overlaid on the candlesticks; the normalized EMAs stand alone.
# Landscape: the two plots sit side by side; otherwise they are stacked.
emas_on_ohlc = plot_emas * plot_ohlc
if landscape_for_presentations:
    panel_emas = pn.Column(pn.Row(emas_on_ohlc, plot_emas_norm))
else:
    panel_emas = pn.Column(emas_on_ohlc, plot_emas_norm)
panel_emas

## Moving-Average Convergence/Divergence (MACD)

In [90]:
# Titles for the raw and the normalized MACD plots
title = 'Moving-Average Convergence/Divergence (MACD)'
title_norm = f"{title} -- Scaled for Machine Learning"

# Options shared by both line plots
plot_macd_kwargs = {
    'width': plot_width,
    'height': plot_height,
    'shared_axes': False,
}

# Line plots of every column whose name contains 'MACD', raw and normalized
plot_macd = X.filter(regex='MACD').hvplot.line(title=title, **plot_macd_kwargs)
plot_macd_norm = X_norm.filter(regex='MACD').hvplot.line(title=title_norm, **plot_macd_kwargs)

# A markdown heading sits above the plots.
# Landscape: the two plots sit side by side; otherwise they are stacked.
macd_heading = f"## {title}"
if landscape_for_presentations:
    panel_macd = pn.Column(macd_heading, pn.Row(plot_macd, plot_macd_norm))
else:
    panel_macd = pn.Column(macd_heading, plot_macd, plot_macd_norm)

panel_macd

In [93]:
# Titles for the raw moving averages and for the normalized ("generalized MACD") plots
title = f"{symbol}: Moving Averages"
title_norm = f"{symbol}: Generalized MACD -- Scaled for Machine Learning"

# Options shared by both line plots.
# This cell now uses its own kwargs dict; previously it unpacked `plot_macd_kwargs`
# from the MACD cell, so it only ran if that cell had been executed first.
plot_gmacd_kwargs = dict(width=plot_width, height=plot_height, shared_axes=False)

# Select every column whose name contains 'MA' followed by a digit.
# The pattern is a raw string so that `\d` is not treated as an (invalid) string escape.
plot_gmacd = X.filter(regex=r'MA\d').hvplot.line(title=title, **plot_gmacd_kwargs)
plot_gmacd_norm = X_norm.filter(regex=r'MA\d').hvplot.line(title=title_norm, ylabel='Unitless', **plot_gmacd_kwargs)

# Raw moving averages are overlaid on the candlesticks; the normalized ones stand alone.
# Landscape: the two plots sit side by side; otherwise they are stacked.
if landscape_for_presentations:
    panel_gmacd = pn.Column(
        pn.Row(
            (plot_gmacd * plot_ohlc),
            plot_gmacd_norm
        )
    )
else:
    panel_gmacd = pn.Column(
        (plot_gmacd * plot_ohlc),
        plot_gmacd_norm
    )
panel_gmacd

## Bollinger Bands (BBANDS)

### Standard Deviation (STDEV) vs. Average True Range (ATR) vs. GARCH

The Average True Range (ATR) of the price is more stable than the standard deviation (STDEV) of the price.

In [87]:
# Titles for the raw and the normalized Bollinger-band plots
title = 'Bollinger Bands (BBANDS)'
title_norm = f"{title} -- Scaled for Machine Learning"

# Options shared by both line plots
plot_bbands_kwargs = {
    'width': plot_width,
    'height': plot_height,
    'shared_axes': False,
}

# Line plots of every column whose name contains 'BBANDS', raw and normalized
plot_bbands = X.filter(regex='BBANDS').hvplot.line(title=title, **plot_bbands_kwargs)
plot_bbands_norm = X_norm.filter(regex='BBANDS').hvplot.line(title=title_norm, **plot_bbands_kwargs)

# A markdown heading sits above the plots; the raw bands are overlaid on the candlesticks.
# Landscape: the two plots sit side by side; otherwise they are stacked.
bbands_heading = f"## {title}"
bbands_on_ohlc = plot_bbands * plot_ohlc
if landscape_for_presentations:
    panel_bbands = pn.Column(bbands_heading, pn.Row(bbands_on_ohlc, plot_bbands_norm))
else:
    panel_bbands = pn.Column(bbands_heading, bbands_on_ohlc, plot_bbands_norm)

panel_bbands

## Relative Strength Index (RSI)

In [96]:
# Titles for the raw and the normalized RSI plots
title = f"{symbol}: Relative Strength Index (RSI)"
title_norm = f"{symbol}: RSI Scaled for Machine Learning: [-1, 1]"

# Options shared by both line plots
plot_rsi_kwargs = {
    'width': plot_width,
    'height': plot_height,
    'shared_axes': False,
}

# Line plots of every column whose name contains 'RSI', raw and normalized
plot_rsi = X.filter(regex='RSI').hvplot.line(title=title, **plot_rsi_kwargs)
plot_rsi_norm = X_norm.filter(regex='RSI').hvplot.line(title=title_norm, **plot_rsi_kwargs)

# Landscape: the two plots sit side by side; otherwise they are stacked.
if landscape_for_presentations:
    panel_rsi = pn.Column(pn.Row(plot_rsi, plot_rsi_norm))
else:
    panel_rsi = pn.Column(plot_rsi, plot_rsi_norm)

panel_rsi

## Stochastic Oscillator (STOCH)

In [98]:
# Titles for the raw and the normalized stochastic-oscillator plots
title = f"{symbol}: Stochastic Oscillator (STOCH)"
title_norm = f"{symbol}: STOCH -- Scaled for Machine Learning"

# Options shared by both line plots
plot_stoch_kwargs = {
    'width': plot_width,
    'height': plot_height,
    'shared_axes': False,
}

# Line plots of every column whose name contains 'STOCH', raw and normalized
plot_stoch = X.filter(regex='STOCH').hvplot.line(title=title, **plot_stoch_kwargs)
plot_stoch_norm = X_norm.filter(regex='STOCH').hvplot.line(title=title_norm, **plot_stoch_kwargs)

# Landscape: the two plots sit side by side; otherwise they are stacked.
if landscape_for_presentations:
    panel_stoch = pn.Column(pn.Row(plot_stoch, plot_stoch_norm))
else:
    panel_stoch = pn.Column(plot_stoch, plot_stoch_norm)

panel_stoch

## Average Directional Index (ADX)

In [113]:
# Titles for the raw and the normalized ADX plots
title = f"{symbol}: Average Directional Index (ADX, DI+, DI-)"
# The separator is a plain ' -- ' (a stray ':' after the title was removed).
title_norm = f"{title} -- Scaled for Machine Learning"

# Options shared by all line plots in this cell
plot_adx_dis_kwargs = dict(width=plot_width, height=plot_height, shared_axes=False)

# Raw indicators: overlay the three series with explicit colors
# (ADX black, DI+ green, DI- red).
plot_adx_dis = \
    X['ADX'].hvplot.line(title=title, c='k', **plot_adx_dis_kwargs) * \
    X['DI+'].hvplot.line(title=title, c='g', **plot_adx_dis_kwargs) * \
    X['DI-'].hvplot.line(title=title, c='r', **plot_adx_dis_kwargs)

# Normalized indicators: every column whose name contains 'ADX' or 'DI'.
# This now uses this cell's own kwargs; previously it unpacked `plot_rsi_kwargs`
# from the RSI cell, so it only ran if that cell had been executed first.
plot_adx_dis_norm = X_norm.filter(regex='ADX|DI').hvplot.line(title=title_norm, **plot_adx_dis_kwargs)

# Landscape: the two plots sit side by side; otherwise they are stacked.
if landscape_for_presentations:
    panel_adx_dis = pn.Column(
        pn.Row(
            plot_adx_dis,
            plot_adx_dis_norm
        )
    )
else:
    panel_adx_dis = pn.Column(
        plot_adx_dis,
        plot_adx_dis_norm
    )

panel_adx_dis

## Fibonacci Retracement

## Ichimoku Cloud

# All Indicators: Un-normalized vs. Normalized for Machine Learning

In [112]:
# Drop the raw OHLCV columns plus ATR and STDEV, then plot every remaining
# column of the normalized feature frame in a single line chart.
# (Alternative selection considered: .filter(regex='^(?!FIB)'))
columns_to_exclude = ['Open', 'High', 'Low', 'Close', 'Volume', 'ATR', 'STDEV']
X_norm_all = X_norm.drop(columns=columns_to_exclude)
ax = X_norm_all.hvplot.line(
    title='Technical Indicators: Input Features Scaled for Machine Learning',
    width=1000,
    height=600,
)
ax