# Machine Learning Tester

This notebook is scaffolding for testing multiple machine-learning classifiers from the scikit-learn toolkit.

In [35]:
# Imports
from pathlib import Path

import hvplot.pandas
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from pandas.tseries.offsets import DateOffset
from sklearn.base import clone
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

# Import a new classifiers from SKLearn
from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from utils.utils import epoch_to_datetime,make_label

## DataPrep

### CryptoCompare Hourly DataSets

In [15]:
# One hourly CSV per asset; the dictionary key becomes the column prefix.
files = {
    "BTC":Path('data/BTC_historical_price.csv'),
    "DAI":Path('data/DAI_historical_price.csv'),
    "ETH":Path('data/ETH_historical_price.csv'),
    "USDC":Path('data/USDC_historical_price.csv'),
    "USDT":Path('data/USDT_historical_price.csv'),
}

datasets = []
for ticker, csv_path in files.items():
    # Index each frame by its timestamp column and discard 'volume_from'.
    frame = pd.read_csv(
        csv_path,
        infer_datetime_format=True,
        parse_dates=True,
        index_col='Time (UTC)'
    ).drop(columns=['volume_from'])

    # Prefix every column with the ticker; 'volume_to' is shortened to 'volume'.
    frame.columns = [
        f"{ticker}_{'volume' if name == 'volume_to' else name}"
        for name in frame.columns
    ]
    datasets.append(frame)

# Place the per-asset frames side by side, aligned on the shared time index.
df = pd.concat(datasets, axis=1)
df

Unnamed: 0_level_0,BTC_open,BTC_high,BTC_low,BTC_close,BTC_volume,DAI_open,DAI_high,DAI_low,DAI_close,DAI_volume,...,USDC_open,USDC_high,USDC_low,USDC_close,USDC_volume,USDT_open,USDT_high,USDT_low,USDT_close,USDT_volume
Time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-07 00:00:00+00:00,36849.86,37354.53,36512.75,37226.92,2.696648e+08,1.0030,1.0170,0.9911,1.0000,78046.66,...,1.0000,1.003,0.9805,0.9997,7285.50,1.0030,1.0060,0.9830,1.0020,7303.75
2021-01-07 01:00:00+00:00,37226.92,37298.39,36762.78,36956.68,1.332165e+08,1.0020,1.0040,1.0010,1.0030,1262688.98,...,0.9997,1.009,0.9949,1.0000,3597.43,1.0020,1.0120,0.9973,1.0030,3606.06
2021-01-07 02:00:00+00:00,36956.68,37159.27,36554.54,37048.01,1.147212e+08,1.0030,1.0030,1.0010,1.0010,722891.83,...,1.0000,1.002,0.9861,0.9994,3110.10,1.0030,1.0060,0.9892,1.0030,3119.93
2021-01-07 03:00:00+00:00,37048.01,37775.67,37036.87,37558.68,1.548582e+08,1.0010,1.0030,1.0010,1.0020,707410.24,...,0.9994,1.006,0.9860,0.9999,4130.76,1.0030,1.0090,0.9891,1.0030,4143.36
2021-01-07 04:00:00+00:00,37558.68,37657.95,37167.05,37542.60,1.113238e+08,1.0020,1.0030,1.0010,1.0030,689094.50,...,0.9999,1.003,0.9898,0.9998,2974.81,1.0030,1.0060,0.9929,1.0030,2984.13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-07 15:00:00+00:00,20905.77,21002.82,20818.39,20873.41,5.108218e+07,0.9995,0.9997,0.9988,0.9997,388379.36,...,1.0000,1.007,0.9977,1.0000,2445.80,0.9993,1.0060,0.9967,0.9994,2443.55
2022-07-07 16:00:00+00:00,20873.41,21186.10,20851.60,21080.74,7.883323e+07,1.0010,1.0050,0.9857,1.0010,33677.01,...,1.0000,1.006,0.9901,1.0010,3756.60,0.9994,1.0050,0.9887,0.9995,3751.22
2022-07-07 17:00:00+00:00,21080.74,21089.08,20826.88,20890.82,5.962933e+07,1.0010,1.0100,0.9959,0.9998,36945.55,...,1.0010,1.010,0.9970,1.0000,2852.44,0.9995,1.0090,0.9964,0.9994,2850.58
2022-07-07 18:00:00+00:00,20890.82,21409.40,20880.97,21361.72,1.137739e+08,0.9998,1.0030,0.9893,1.0000,34950.54,...,1.0000,1.003,0.9786,1.0010,5371.31,0.9994,1.0020,0.9775,1.0000,5365.59


### KuCoin sUSD/USDT pair dataset

In [22]:
# NOTE(review): the file name says sUSD_USDC but the columns are labelled
# sUSD/USDT below - confirm which pair this file actually holds.
ku_df = pd.read_csv(
    Path('data/sUSD_USDC_ku_historical_price.csv'),
    infer_datetime_format=True,
    parse_dates=True,
)

# The 'epoch' column is in milliseconds; convert to seconds before turning
# each value into a datetime with the project helper.
epoch_seconds = ku_df['epoch'] / 1000
ku_df['Time (UTC)'] = epoch_seconds.apply(epoch_to_datetime)

# Replace the raw epoch column with the datetime index.
ku_df = ku_df.drop(columns=['epoch']).set_index('Time (UTC)')

# Prefix the columns with the pair name so they stay distinguishable after merging.
ku_df.columns = [f"sUSD/USDT_{col}" for col in ku_df.columns]
ku_df

Unnamed: 0_level_0,sUSD/USDT_open,sUSD/USDT_high,sUSD/USDT_low,sUSD/USDT_close,sUSD/USDT_volume
Time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2021-01-01 08:00:00+00:00,1.012971,1.012971,1.012971,1.012971,0.0000
2021-01-01 09:00:00+00:00,1.016752,1.016752,1.012971,1.012971,1.2549
2021-01-01 10:00:00+00:00,1.012971,1.012971,1.012971,1.012971,0.2563
2021-01-01 11:00:00+00:00,1.012971,1.012971,1.012000,1.012000,42.5240
2021-01-01 12:00:00+00:00,1.012000,1.012000,1.012000,1.012000,0.0000
...,...,...,...,...,...
2022-07-07 21:00:00+00:00,1.005000,1.005000,1.005000,1.005000,412.1602
2022-07-07 22:00:00+00:00,1.005000,1.005000,1.005000,1.005000,0.7034
2022-07-07 23:00:00+00:00,1.006983,1.006983,1.006891,1.006891,0.8433
2022-07-08 00:00:00+00:00,1.005000,1.006990,1.005000,1.006891,10.2572


In [23]:
# Join the KuCoin pair onto the CryptoCompare frame, aligned on the time index.
# Timestamps present in only one source are kept and filled with NaN.
# Any KuCoin columns left over from a previous run of this cell are dropped
# first, so re-running the cell does not append a duplicate set of columns.
df = pd.concat(
    [df.drop(columns=ku_df.columns, errors='ignore'), ku_df],
    axis=1,
)
df

Unnamed: 0_level_0,BTC_open,BTC_high,BTC_low,BTC_close,BTC_volume,DAI_open,DAI_high,DAI_low,DAI_close,DAI_volume,...,USDT_open,USDT_high,USDT_low,USDT_close,USDT_volume,sUSD/USDT_open,sUSD/USDT_high,sUSD/USDT_low,sUSD/USDT_close,sUSD/USDT_volume
Time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01 08:00:00+00:00,,,,,,,,,,,...,,,,,,1.012971,1.012971,1.012971,1.012971,0.0000
2021-01-01 09:00:00+00:00,,,,,,,,,,,...,,,,,,1.016752,1.016752,1.012971,1.012971,1.2549
2021-01-01 10:00:00+00:00,,,,,,,,,,,...,,,,,,1.012971,1.012971,1.012971,1.012971,0.2563
2021-01-01 11:00:00+00:00,,,,,,,,,,,...,,,,,,1.012971,1.012971,1.012000,1.012000,42.5240
2021-01-01 12:00:00+00:00,,,,,,,,,,,...,,,,,,1.012000,1.012000,1.012000,1.012000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-07 21:00:00+00:00,,,,,,,,,,,...,,,,,,1.005000,1.005000,1.005000,1.005000,412.1602
2022-07-07 22:00:00+00:00,,,,,,,,,,,...,,,,,,1.005000,1.005000,1.005000,1.005000,0.7034
2022-07-07 23:00:00+00:00,,,,,,,,,,,...,,,,,,1.006983,1.006983,1.006891,1.006891,0.8433
2022-07-08 00:00:00+00:00,,,,,,,,,,,...,,,,,,1.005000,1.006990,1.005000,1.006891,10.2572


In [24]:
# Spot-check 50 rows from the middle of the merged frame to confirm that both
# sources are populated where their time ranges overlap.
df.iloc[5000:5050,:]

Unnamed: 0_level_0,BTC_open,BTC_high,BTC_low,BTC_close,BTC_volume,DAI_open,DAI_high,DAI_low,DAI_close,DAI_volume,...,USDT_open,USDT_high,USDT_low,USDT_close,USDT_volume,sUSD/USDT_open,sUSD/USDT_high,sUSD/USDT_low,sUSD/USDT_close,sUSD/USDT_volume
Time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-07-28 15:00:00+00:00,39555.47,39931.68,39440.95,39701.31,76927870.0,1.002,1.006,0.9978,1.002,8700.99,...,1.001,1.006,0.9939,1.0,1938.86,1.0047,1.0047,1.004699,1.004699,192.4814
2021-07-28 16:00:00+00:00,39701.31,40175.56,39597.47,40125.75,95925090.0,1.002,1.003,0.985,1.001,13977.24,...,1.0,1.002,0.9872,1.0,2405.01,1.004699,1.004699,1.004699,1.004699,0.0
2021-07-28 17:00:00+00:00,40125.75,40252.89,38889.88,38985.26,182048000.0,1.001,1.028,0.9981,1.001,32320.56,...,1.0,1.033,0.998,1.0,4615.1,1.0047,1.0065,1.004699,1.004699,228.765192
2021-07-28 18:00:00+00:00,38985.26,40506.83,38809.89,40446.37,229279500.0,1.001,1.002,0.9721,1.0,30041.79,...,1.0,1.002,0.9599,1.0,5761.5,1.004699,1.0068,1.004699,1.0068,131.4936
2021-07-28 19:00:00+00:00,40446.37,40674.6,39972.98,40401.1,137157700.0,1.0,1.01,0.9965,1.002,18492.65,...,1.0,1.007,0.9901,1.001,3399.96,1.004699,1.004699,1.000369,1.003999,2530.9646
2021-07-28 20:00:00+00:00,40401.1,40405.62,39763.92,39999.0,91082700.0,1.002,1.007,0.9978,1.001,13053.55,...,1.001,1.011,0.9953,1.001,2276.57,1.00037,1.003688,1.000369,1.003688,80.7776
2021-07-28 21:00:00+00:00,39999.0,40166.95,39858.53,40101.75,42250980.0,1.001,1.001,1.001,1.001,463778.1,...,1.001,1.002,0.9941,1.0,1056.57,1.003687,1.003689,1.003676,1.003677,270.8753
2021-07-28 22:00:00+00:00,40101.75,40233.5,39620.99,39767.87,59462820.0,1.001,1.01,0.9978,1.001,8787.93,...,1.0,1.012,0.9966,1.0,1489.36,1.003657,1.003657,1.003657,1.003657,0.1
2021-07-28 23:00:00+00:00,39767.87,40207.85,39618.16,40030.01,73946370.0,1.001,1.006,0.9941,1.002,12722.72,...,1.0,1.005,0.99,1.0,1854.04,1.003647,1.003649,1.003647,1.003649,23.8657
2021-07-29 00:00:00+00:00,40030.01,40268.23,39458.06,39458.06,75603700.0,1.001,1.001,1.001,1.001,535036.23,...,1.0,1.021,1.0,1.0,1897.13,1.003627,1.003629,1.0005,1.0005,153.0091


In [None]:
# Define tuning permutations (every combination is evaluated by the main loop)

# Training window lengths, in calendar months counted from the first timestamp.
# NOTE(review): the cleaned data shown in this notebook spans 2021-01-07 to
# 2022-07-07, so an 18-month window leaves less than a day for testing - confirm.
training_dataset_months = [3,6,12,18]

# Window sizes (in rows) for the short/fast simple moving average
short_window_sizes = [3,4,6]

# Window sizes (in rows) for the long/slow simple moving average
long_window_sizes = [60,180,252]

In [27]:
# Keep only timestamps where every source has data. The explicit copy makes the
# result an independent frame, so the column assignments in the labeling cell
# do not raise SettingWithCopyWarning.
df = df.dropna().copy()
df

Unnamed: 0_level_0,BTC_open,BTC_high,BTC_low,BTC_close,BTC_volume,DAI_open,DAI_high,DAI_low,DAI_close,DAI_volume,...,USDT_open,USDT_high,USDT_low,USDT_close,USDT_volume,sUSD/USDT_open,sUSD/USDT_high,sUSD/USDT_low,sUSD/USDT_close,sUSD/USDT_volume
Time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-07 00:00:00+00:00,36849.86,37354.53,36512.75,37226.92,2.696648e+08,1.0030,1.0170,0.9911,1.0000,78046.66,...,1.0030,1.0060,0.9830,1.0020,7303.75,0.969294,0.989499,0.961310,0.980163,136.7788
2021-01-07 01:00:00+00:00,37226.92,37298.39,36762.78,36956.68,1.332165e+08,1.0020,1.0040,1.0010,1.0030,1262688.98,...,1.0020,1.0120,0.9973,1.0030,3606.06,0.980163,0.984562,0.978573,0.984562,585.0986
2021-01-07 02:00:00+00:00,36956.68,37159.27,36554.54,37048.01,1.147212e+08,1.0030,1.0030,1.0010,1.0010,722891.83,...,1.0030,1.0060,0.9892,1.0030,3119.93,0.984562,0.989500,0.984562,0.989034,65.6000
2021-01-07 03:00:00+00:00,37048.01,37775.67,37036.87,37558.68,1.548582e+08,1.0010,1.0030,1.0010,1.0020,707410.24,...,1.0030,1.0090,0.9891,1.0030,4143.36,0.988510,0.988510,0.980163,0.980163,1.0381
2021-01-07 04:00:00+00:00,37558.68,37657.95,37167.05,37542.60,1.113238e+08,1.0020,1.0030,1.0010,1.0030,689094.50,...,1.0030,1.0060,0.9929,1.0030,2984.13,0.980163,0.989500,0.978573,0.986414,140.1833
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-07 15:00:00+00:00,20905.77,21002.82,20818.39,20873.41,5.108218e+07,0.9995,0.9997,0.9988,0.9997,388379.36,...,0.9993,1.0060,0.9967,0.9994,2443.55,1.006900,1.006900,1.002709,1.002709,1.2520
2022-07-07 16:00:00+00:00,20873.41,21186.10,20851.60,21080.74,7.883323e+07,1.0010,1.0050,0.9857,1.0010,33677.01,...,0.9994,1.0050,0.9887,0.9995,3751.22,1.002709,1.006892,1.002709,1.002726,26.6797
2022-07-07 17:00:00+00:00,21080.74,21089.08,20826.88,20890.82,5.962933e+07,1.0010,1.0100,0.9959,0.9998,36945.55,...,0.9995,1.0090,0.9964,0.9994,2850.58,1.002726,1.002726,1.002726,1.002726,0.0000
2022-07-07 18:00:00+00:00,20890.82,21409.40,20880.97,21361.72,1.137739e+08,0.9998,1.0030,0.9893,1.0000,34950.54,...,0.9994,1.0020,0.9775,1.0000,5365.59,1.006892,1.006893,1.002711,1.004992,3.4258


### Add the label (`y`): the trading signal

In [28]:
# Labeling Parameters
# Look-ahead horizon in rows (hours on this hourly data): the label compares the
# current close with the close `period` rows later.
period = 4
# Multiplier applied to the rolling standard deviation to get the trigger offset.
factor = 0.5
# Rolling volatility window in rows: 24 hours * 42 days.
# The rationale for 42 days is not documented here (ask Kevin L.).
std_window = 24*42 # focus on the volatility of the past 42 days, why 42, ask Kevin L.

In [36]:
close_prices = df['sUSD/USDT_close']

# Rolling volatility of the close over the configured window.
rolling_std = close_prices.rolling(std_window).std()
df['sUSD/USDT_std'] = rolling_std

# Price move (as a fraction of the rolling volatility) used as the trigger offset.
df['sUSD/USDT_offset'] = rolling_std * factor

# Close price `period` rows ahead, aligned with the current row.
df['sUSD/USDT_future'] = close_prices.shift(-period)

# NOTE(review): in the output below, rows with NaN std/offset (start of the
# frame) or NaN future price (end of the frame) still get y = 0 - confirm that
# make_label is meant to treat them that way.
label_columns = {
    'current_price_col': 'sUSD/USDT_close',
    'future_price_col': 'sUSD/USDT_future',
    'offset_col': 'sUSD/USDT_offset',
}
df['y'] = df.apply(make_label, axis=1, **label_columns)
df

Unnamed: 0_level_0,BTC_open,BTC_high,BTC_low,BTC_close,BTC_volume,DAI_open,DAI_high,DAI_low,DAI_close,DAI_volume,...,USDT_volume,sUSD/USDT_open,sUSD/USDT_high,sUSD/USDT_low,sUSD/USDT_close,sUSD/USDT_volume,sUSD/USDT_std,sUSD/USDT_offset,sUSD/USDT_future,y
Time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-07 00:00:00+00:00,36849.86,37354.53,36512.75,37226.92,2.696648e+08,1.0030,1.0170,0.9911,1.0000,78046.66,...,7303.75,0.969294,0.989499,0.961310,0.980163,136.7788,,,0.986414,0
2021-01-07 01:00:00+00:00,37226.92,37298.39,36762.78,36956.68,1.332165e+08,1.0020,1.0040,1.0010,1.0030,1262688.98,...,3606.06,0.980163,0.984562,0.978573,0.984562,585.0986,,,0.988370,0
2021-01-07 02:00:00+00:00,36956.68,37159.27,36554.54,37048.01,1.147212e+08,1.0030,1.0030,1.0010,1.0010,722891.83,...,3119.93,0.984562,0.989500,0.984562,0.989034,65.6000,,,0.985000,0
2021-01-07 03:00:00+00:00,37048.01,37775.67,37036.87,37558.68,1.548582e+08,1.0010,1.0030,1.0010,1.0020,707410.24,...,4143.36,0.988510,0.988510,0.980163,0.980163,1.0381,,,0.985000,0
2021-01-07 04:00:00+00:00,37558.68,37657.95,37167.05,37542.60,1.113238e+08,1.0020,1.0030,1.0010,1.0030,689094.50,...,2984.13,0.980163,0.989500,0.978573,0.986414,140.1833,,,0.984406,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-07-07 15:00:00+00:00,20905.77,21002.82,20818.39,20873.41,5.108218e+07,0.9995,0.9997,0.9988,0.9997,388379.36,...,2443.55,1.006900,1.006900,1.002709,1.002709,1.2520,0.006617,0.003308,1.002709,0
2022-07-07 16:00:00+00:00,20873.41,21186.10,20851.60,21080.74,7.883323e+07,1.0010,1.0050,0.9857,1.0010,33677.01,...,3751.22,1.002709,1.006892,1.002709,1.002726,26.6797,0.006620,0.003310,,0
2022-07-07 17:00:00+00:00,21080.74,21089.08,20826.88,20890.82,5.962933e+07,1.0010,1.0100,0.9959,0.9998,36945.55,...,2850.58,1.002726,1.002726,1.002726,1.002726,0.0000,0.006623,0.003312,,0
2022-07-07 18:00:00+00:00,20890.82,21409.40,20880.97,21361.72,1.137739e+08,0.9998,1.0030,0.9893,1.0000,34950.54,...,5365.59,1.006892,1.006893,1.002711,1.004992,3.4258,0.006629,0.003315,,0


In [37]:
# Class balance of the label, counting only rows with no missing values
# (this excludes the rolling-window warm-up rows and the final look-ahead rows).
df.dropna()['y'].value_counts()

 0    8074
 1    2030
-1    2010
Name: y, dtype: int64

In [38]:
# Persist the labelled dataset for reuse.
# NOTE(review): `df` has not been reassigned after a dropna at this point, so the
# CSV also contains the rows whose std/offset/future values are NaN - confirm
# that downstream consumers expect that.
df.to_csv('data/the_data.csv')

In [None]:
# Fixed seed for the estimators that use randomness while fitting, so repeated
# runs of the notebook produce the same models and reports.
RANDOM_STATE = 42

# Registry of candidate classifiers. The key is the model-family name used in
# permutation keys, plot titles and file names; "model" holds an unfitted
# estimator instance with default hyper-parameters.
alternate_models = {
    'SVC': {
        "model": svm.SVC()
    },
    'LogisticRegression': {
        "model": LogisticRegression()
    },
    'DecisionTreeClassifier': {
        "model": DecisionTreeClassifier(random_state=RANDOM_STATE)
    },
    'GradientBoostingClassifier': {
        "model": GradientBoostingClassifier(random_state=RANDOM_STATE)
    },
    'AdaBoostClassifier': {
        "model": AdaBoostClassifier(random_state=RANDOM_STATE)
    },
    'GaussianNB': {
        "model": GaussianNB()
    },
}

In [None]:
# Reusable functions to create signals for ML training

# TODO: these will need to be refactored based on the shape of the dataframe
# TODO: refactor these functions to library file
# create helper methods to facilitate assessing permutations

def make_signals_df(short_window, long_window, price_df=None):
    """Build the returns / moving-average / signal frame for one window pair.

    Parameters
    ----------
    short_window : int
        Rolling window (in rows) for the fast simple moving average.
    long_window : int
        Rolling window (in rows) for the slow simple moving average.
    price_df : pandas.DataFrame, optional
        Frame with a "close" column. When omitted, the notebook-level
        ``ohlcv_df`` is used, which keeps existing two-argument calls working.

    Returns
    -------
    pandas.DataFrame
        A new frame with columns "close", "Actual Returns", "SMA_Fast",
        "SMA_Slow", "Signal" and "Strategy Returns". Rows without a complete
        set of returns and moving averages are dropped. The input is not modified.
    """
    source_df = ohlcv_df if price_df is None else price_df

    # Work on an independent copy of the close column so the source is untouched.
    signals_df = source_df.loc[:, ["close"]].copy()

    # Row-over-row percentage change of the close price.
    signals_df["Actual Returns"] = signals_df["close"].pct_change()

    # Fast and slow simple moving averages over the requested windows.
    signals_df['SMA_Fast'] = signals_df['close'].rolling(window=short_window).mean()
    signals_df['SMA_Slow'] = signals_df['close'].rolling(window=long_window).mean()

    # Drop the warm-up rows; the explicit copy avoids SettingWithCopyWarning on
    # the assignments below.
    signals_df = signals_df.dropna().copy()

    # Signal is +1 when the row's return is non-negative and -1 when negative.
    signals_df['Signal'] = 0.0
    signals_df.loc[signals_df['Actual Returns'] >= 0, 'Signal'] = 1
    signals_df.loc[signals_df['Actual Returns'] < 0, 'Signal'] = -1

    # Each row's return is multiplied by the previous row's signal, so the first
    # remaining row has no strategy return (NaN).
    signals_df['Strategy Returns'] = signals_df['Actual Returns'] * signals_df['Signal'].shift()

    return signals_df

def create_train_test_datasets(months):
    """Split the notebook-level ``X`` / ``y`` by time and standardise the features.

    The training window runs from the first timestamp in ``X`` through that
    timestamp plus ``months`` calendar months; the test window starts one hour
    after the training window ends and runs to the end of the data.

    Returns ``(X_train_scaled, y_train, X_test_scaled, y_test)``. The scaler is
    fitted on the training features only and then applied to both feature sets.
    """
    first_timestamp = X.index.min()
    cutoff = first_timestamp + DateOffset(months=months)

    train_window = slice(first_timestamp, cutoff)
    test_window = slice(cutoff + DateOffset(hours=1), None)

    # Features and labels for each window.
    X_train, y_train = X.loc[train_window], y.loc[train_window]
    X_test, y_test = X.loc[test_window], y.loc[test_window]

    # Fit on the training features only, so no test-set statistics leak into scaling.
    X_scaler = StandardScaler().fit(X_train)

    return (
        X_scaler.transform(X_train),
        y_train,
        X_scaler.transform(X_test),
        y_test,
    )

def get_predictions(model,X_train_scaled,y_train,X_test_scaled):
    """Fit ``model`` on the training data and return its predictions for the test features.

    Predictions come from the object returned by ``model.fit(...)``.
    """
    fitted_model = model.fit(X_train_scaled, y_train)
    return fitted_model.predict(X_test_scaled)
    
def backtest_model(model_name, y_test, y_predictions, actual_returns):
    """Backtest a set of predictions against realised returns.

    Parameters
    ----------
    model_name : str
        Identifier of the model permutation. Currently unused; kept so existing
        callers keep working and so it is available for labelling plots.
    y_test : pandas.Series
        Test labels; only its index is used, as the index of the result.
    y_predictions : array-like
        Predicted signal for each row of ``y_test``, in the same order.
    actual_returns : pandas.Series
        Realised returns; aligned to ``y_test`` by index.

    Returns
    -------
    pandas.DataFrame
        Columns "Predicted", "Actual Returns", "Strategy Returns",
        "strategy_cum_return" and "actual_cum_return". Rows with a missing
        prediction or return are dropped.
    """
    # No classification report is built here: the result was never used by this
    # function, and the caller already produces its own report.
    backtest_df = pd.DataFrame(index=y_test.index)

    # Predictions are assigned positionally; returns are aligned on the index.
    backtest_df['Predicted'] = y_predictions
    backtest_df['Actual Returns'] = actual_returns

    # Explicit copy so the assignments below act on an independent frame.
    backtest_df = backtest_df.dropna().copy()

    # The strategy earns each row's return multiplied by that row's prediction.
    backtest_df['Strategy Returns'] = backtest_df['Actual Returns'] * backtest_df['Predicted']

    # Growth of one unit of capital under the strategy and under buy-and-hold.
    backtest_df['strategy_cum_return'] = (1 + backtest_df["Strategy Returns"]).cumprod()
    backtest_df['actual_cum_return'] = (1 + backtest_df["Actual Returns"]).cumprod()

    return backtest_df

In [None]:
# main loop
# TODO: Modify to fit data and looping needs
# NOTE(review): create_train_test_datasets reads the notebook-level X and y, and
# make_signals_df reads ohlcv_df; none of them is defined in the cells shown here.
# As written the train/test split does not depend on the SMA window sizes, so
# permutations that differ only in window size are fitted on the same data.

# savefig does not create missing directories, so make sure the output folder exists.
Path('images').mkdir(parents=True, exist_ok=True)

models = {}
# construct the model permutations and evaluate the model permutations
for name in alternate_models:
    returns = None
    # Start below any achievable value so the best permutation is always selected.
    max_return = float('-inf')
    selected_model = None
    for training_months in training_dataset_months:
        for short_window_size in short_window_sizes:
            for long_window_size in long_window_sizes:
                # create a key for this model permutation
                model_key = f"{name}-tr({training_months})-sw({short_window_size})-lw({long_window_size})"

                # configure model permutation
                # Each permutation gets its own unfitted clone of the estimator, so
                # fitting one permutation does not overwrite the model stored for another.
                models[model_key] = {
                    "model_name":name,
                    "training_months":training_months,
                    "short_window_size":short_window_size,
                    "long_window_size":long_window_size,
                    "model":clone(alternate_models[name]["model"]),
                }
                permutation = models[model_key]

                # create the signals data set with the actual returns, fast and slow SMA, signal and strategy returns
                permutation['signals_df'] = make_signals_df(short_window_size,long_window_size)

                # create training and testing datasets
                (
                    permutation['X_train_scaled'],
                    permutation['y_train'],
                    permutation['X_test_scaled'],
                    permutation['y_test'],
                ) = create_train_test_datasets(training_months)

                # get predictions
                permutation['y_predictions'] = get_predictions(
                    permutation['model'],
                    permutation['X_train_scaled'],
                    permutation['y_train'],
                    permutation['X_test_scaled'])

                # Classification reports
                permutation['classification_report'] = classification_report(permutation['y_test'], permutation['y_predictions'])

                # Print the classification report
                print(f"""
                {model_key} classification report: 
                ---------------------------------------------------------------------------
                {models[model_key]['classification_report']}
                ---------------------------------------------------------------------------
                """)

                # backtest model against the actual returns from the start of the test window
                permutation['backtest'] = backtest_model(
                    model_key,
                    permutation['y_test'],
                    permutation['y_predictions'],
                    permutation['signals_df'].loc[permutation['y_test'].index.min():,'Actual Returns'],
                )

                # add the cumulative return to the collection of returns for plotting
                if returns is None:
                    # First permutation for this model family: seed the collection with
                    # the baseline actual and signal cumulative returns.
                    returns = {
                        "actual": (1 + permutation['signals_df']['Actual Returns']).cumprod(),
                        "signal": (1 + permutation['signals_df']['Strategy Returns']).cumprod()
                    }
                returns[model_key] = permutation['backtest']['strategy_cum_return']

                # track the permutation with the highest final cumulative return
                if returns[model_key].iloc[-1] > max_return:
                    max_return = returns[model_key].iloc[-1]
                    selected_model = model_key

    # create a plot for the family of returns for the range of training months, and SMA window sizes
    returns_df = pd.DataFrame(returns)
    model_family_plot = returns_df.plot(
        figsize=(15,15),
        title=f'{name} Cumulative Returns for various training and SMA window sizes'
    )

    # save plot
    model_family_plot.figure.savefig(f'images/{name}_returns.png', bbox_inches='tight')

    # show the final returns for the family
    display(returns_df.iloc[-1:,:].T)

    # show the max return achieved with the model
    print(f"maximum cumulative return for {name} models was {max_return} from model permutation {selected_model}")