In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
from sklearn.model_selection import train_test_split
import os

from utils import *

# Data Wrangling Process Overview
1. Load the data
1. Break out composite feature files (MACD, RSI, Bollinger Bands, Hilbert transforms) into one DataFrame per feature
1. Add lags of all variables as additional features
1. Filter all data to only the intersection of dates across datasets
1. Convert the target returns into binary class labels (1 if the return is positive, 0 otherwise); consider extending this to 3 classes
1. Split all the datasets into train, test, and cross-validation sets
1. Melt train, test, and cross-validation sets into tall DataFrames
1. Concatenate columns into train, test, and cross-validation sets for model training

## Loading the data

In [2]:
# Load every raw workbook into `dfs`, keyed by dataset name.
# The workbooks are read from a `data` folder located in the parent of the current working directory.
data_path = os.path.dirname(os.getcwd()) + '/data'
data_prefix = '/broad_assets_'

# Each name maps to the file `<data_path>/broad_assets_<name>.xlsx`.
names = [
    'weekly_rets',
    'roc_4w', 'roc_6w', 'roc_12w', 'roc_26w', 'roc_52w',
    'bb_20d', 'bb_90d', 'bb_180d', 'bb_240d',
    'macd', 'rsi', 'ht_transforms',
    'rolling_vol_20', 'rolling_vol_60', 'rolling_vol_126', 'rolling_vol_252', 'rolling_vol_504',
    'rolling_skew_20', 'rolling_skew_60', 'rolling_skew_126', 'rolling_skew_252', 'rolling_skew_504',
    'rolling_kurt_20', 'rolling_kurt_60', 'rolling_kurt_126', 'rolling_kurt_252', 'rolling_kurt_504',
    'ewma_vol', 'skew', 'kurtosis',
]

# First sheet of each workbook, first column as a parsed date index.
dfs = {
    name: pd.read_excel(f'{data_path}{data_prefix}{name}.xlsx', sheet_name=0, index_col=0, parse_dates=True)
    for name in names
}

In [3]:
# Re-key the weekly returns frame as 'target'. Popping and re-inserting moves it to the
# end of `dfs`. The last line displays the frame.
dfs.update(target=dfs.pop('weekly_rets'))
dfs['target']

Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-07-31,0.0152,0.0757,-0.0023,0.0009,0.0422,0.0050,0.0373,0.0021,-0.0025,-0.0084,-0.0318
2000-08-07,0.0679,0.0526,0.0159,0.0134,0.0365,-0.0237,-0.0196,-0.0116,0.0241,-0.0266,0.0346
2000-08-14,0.0157,0.0023,0.0152,0.0063,0.0171,0.0254,0.0294,0.0369,0.0085,0.0063,0.0481
2000-08-21,0.0095,0.0566,-0.0018,-0.0003,0.0068,-0.0264,0.0016,-0.0024,0.0077,0.0018,0.0384
2000-08-28,0.0184,0.0653,0.0019,0.0041,0.0379,-0.0353,0.0195,-0.0170,0.0019,-0.0095,0.0313
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-12,0.0607,0.0718,-0.0440,-0.0147,0.0210,0.0425,0.0431,0.0882,0.0093,0.0503,0.0583
2024-08-19,0.1006,0.1346,0.0199,-0.0004,0.1037,0.0321,0.0931,0.0771,-0.0238,0.0239,-0.0377
2024-08-26,0.0020,-0.0266,0.0028,0.0064,0.0450,0.0540,0.0290,-0.0180,-0.0195,0.0097,0.0332
2024-08-30,0.0108,0.0050,-0.0310,-0.0094,-0.0007,0.0091,0.0127,-0.0055,0.0175,-0.0126,-0.0348


## Breaking out features

### MACD

In [4]:
# The MACD frame holds three indicators; give each one its own entry in `dfs`.
# A column belongs to an indicator when the indicator name is a substring of the column label.
macd_names = ['MACD_Line', 'Signal_Line', 'MACD_Histogram']
for indicator in macd_names:
    dfs[indicator] = dfs['macd'].filter(like=indicator, axis='columns')

In [5]:
# Spot-check one of the three frames split out of the MACD data.
dfs['MACD_Histogram']

Unnamed: 0_level_0,MACD_Histogram_Asset 1,MACD_Histogram_Asset 2,MACD_Histogram_Asset 3,MACD_Histogram_Asset 4,MACD_Histogram_Asset 5,MACD_Histogram_Asset 6,MACD_Histogram_Asset 7,MACD_Histogram_Asset 8,MACD_Histogram_Asset 9,MACD_Histogram_Asset 10,MACD_Histogram_Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-07-31,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2000-08-01,-0.0003,-0.0080,0.0006,0.0003,-0.0034,0.0023,-0.0042,0.0005,0.0008,0.0006,0.0030
2000-08-02,-0.0011,-0.0105,0.0006,0.0004,-0.0040,0.0004,-0.0048,0.0002,0.0013,0.0009,0.0049
2000-08-03,-0.0004,-0.0055,0.0009,0.0003,-0.0050,-0.0016,-0.0069,-0.0006,0.0013,-0.0006,0.0051
2000-08-04,-0.0002,-0.0071,0.0010,0.0005,-0.0041,-0.0030,-0.0050,-0.0003,0.0007,-0.0001,0.0066
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-30,-0.0002,-0.0003,-0.0016,-0.0005,-0.0001,0.0001,-0.0011,-0.0010,0.0014,-0.0015,-0.0014
2024-09-03,-0.0032,-0.0045,0.0005,0.0002,-0.0045,-0.0002,-0.0037,-0.0034,0.0012,-0.0017,-0.0035
2024-09-04,-0.0024,-0.0031,0.0020,0.0008,-0.0033,-0.0003,-0.0031,-0.0022,0.0002,-0.0010,-0.0033
2024-09-05,-0.0020,-0.0018,0.0022,0.0007,-0.0030,-0.0009,-0.0025,-0.0011,-0.0001,0.0004,-0.0015


In [6]:
# The combined MACD frame was split into per-indicator frames above, so remove it from `dfs`.
del dfs['macd']

### RSI

In [7]:
# The RSI frame holds both plain RSI and Stochastic RSI columns; split it into two frames.
# Every label containing 'stoch_rsi' also contains 'rsi', so the plain-RSI columns are the
# ones that match 'rsi' but are not Stochastic RSI columns.
all_rsi_labels = list(dfs['rsi'].columns)
stoch_rsi_cols = [label for label in all_rsi_labels if 'stoch_rsi' in label]
plain_rsi_cols = [label for label in all_rsi_labels if 'rsi' in label and label not in stoch_rsi_cols]

# Order matters: 'stoch_rsi' is extracted first, because the second assignment overwrites
# dfs['rsi'] with the plain-RSI columns only.
# NOTE: for the same reason, re-running this cell would leave dfs['stoch_rsi'] without columns.
rsi_split = {'stoch_rsi': stoch_rsi_cols, 'rsi': plain_rsi_cols}
for key, labels in rsi_split.items():
    dfs[key] = dfs['rsi'].loc[:, labels]

In [8]:
# Spot-check the Stochastic RSI frame produced by the split.
dfs['stoch_rsi']

Unnamed: 0_level_0,stoch_rsi_Asset 1,stoch_rsi_Asset 2,stoch_rsi_Asset 3,stoch_rsi_Asset 4,stoch_rsi_Asset 5,stoch_rsi_Asset 6,stoch_rsi_Asset 7,stoch_rsi_Asset 8,stoch_rsi_Asset 9,stoch_rsi_Asset 10,stoch_rsi_Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-09-06,0.0000,0.0000,0.5547,0.8624,0.7024,0.4091,0.0000,0.0000,1.0000,0.2343,1.0000
2000-09-07,0.0000,0.0053,0.2675,0.6862,0.7508,0.6821,0.0845,0.0000,0.8614,0.0898,1.0000
2000-09-08,0.0000,0.0000,0.2712,0.6470,0.1682,1.0000,0.0000,0.0222,0.8236,0.1154,0.2458
2000-09-11,0.0000,0.0000,0.1728,0.5327,0.0474,1.0000,0.0000,0.0000,0.9052,0.3432,0.4348
2000-09-12,0.0000,0.0000,0.0429,0.4991,0.0000,1.0000,0.1115,0.0000,0.7717,0.4391,0.4206
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-30,0.7792,0.6320,0.0729,0.1892,1.0000,0.9775,0.9121,0.3188,1.0000,0.0000,0.0635
2024-09-03,0.1587,0.0000,0.2123,0.2097,0.5829,0.9177,0.4005,0.0000,1.0000,0.0000,0.0000
2024-09-04,0.0485,0.0000,0.3790,0.5222,0.5883,0.9059,0.2992,0.0686,0.8607,0.2423,0.0000
2024-09-05,0.0000,0.0000,0.9582,1.0000,0.2977,0.8876,0.0195,0.0000,0.6245,0.3394,0.0000


### Bollinger Bands

In [9]:
# Each Bollinger frame holds three bands; store every (window, band) pair as its own frame
# under the key '<window>_<band>'. Columns are assigned to a band by substring match on the label.
bb_names = ['sma', 'upper', 'lower']
bb_frames = ['bb_20d', 'bb_90d', 'bb_180d', 'bb_240d']

dfs.update({
    f'{window}_{band}': dfs[window].filter(like=band, axis='columns')
    for window in bb_frames
    for band in bb_names
})

In [10]:
# Spot-check one of the per-band Bollinger frames.
dfs['bb_90d_upper']

Unnamed: 0_level_0,upper90_Asset 1,upper90_Asset 2,upper90_Asset 3,upper90_Asset 4,upper90_Asset 5,upper90_Asset 6,upper90_Asset 7,upper90_Asset 8,upper90_Asset 9,upper90_Asset 10,upper90_Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-12-05,0.0478,0.1299,0.0169,0.0106,0.0608,0.0303,0.0437,0.0413,0.0236,0.0233,0.0453
2000-12-06,0.0477,0.1280,0.0174,0.0109,0.0596,0.0301,0.0425,0.0422,0.0239,0.0246,0.0455
2000-12-07,0.0474,0.1281,0.0173,0.0108,0.0596,0.0284,0.0424,0.0419,0.0238,0.0246,0.0452
2000-12-08,0.0486,0.1326,0.0173,0.0108,0.0626,0.0290,0.0430,0.0429,0.0237,0.0245,0.0448
2000-12-11,0.0485,0.1318,0.0171,0.0107,0.0634,0.0295,0.0431,0.0435,0.0237,0.0242,0.0454
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-30,0.0362,0.0519,0.0390,0.0157,0.0558,0.0410,0.0357,0.0392,0.0133,0.0434,0.0324
2024-09-03,0.0370,0.0531,0.0395,0.0159,0.0568,0.0410,0.0363,0.0396,0.0133,0.0434,0.0329
2024-09-04,0.0370,0.0531,0.0400,0.0161,0.0568,0.0410,0.0363,0.0396,0.0133,0.0432,0.0326
2024-09-05,0.0366,0.0523,0.0401,0.0161,0.0564,0.0410,0.0359,0.0391,0.0131,0.0435,0.0327


In [11]:
# The combined Bollinger frames were split per band above, so remove them from `dfs`.
for combined_key in bb_frames:
    dfs.pop(combined_key)

### Hilbert Transforms

In [12]:
# The Hilbert Transforms frame holds five features; give each one its own entry in `dfs`.
# NOTE(review): columns are matched by substring, so 'sine' would also pick up any label that
# merely contains it (e.g. a 'leadsine' column, if the workbook has one) — confirm against the file.
ht_names = ['ht_dcperiod', 'ht_dcphase', 'inphase', 'sine', 'ht_trendmode']

dfs.update({
    component: dfs['ht_transforms'].filter(like=component, axis='columns')
    for component in ht_names
})

In [13]:
# Spot-check one of the frames split out of the Hilbert Transforms data.
dfs['ht_dcperiod']

Unnamed: 0_level_0,ht_dcperiod_Asset 1,ht_dcperiod_Asset 2,ht_dcperiod_Asset 3,ht_dcperiod_Asset 4,ht_dcperiod_Asset 5,ht_dcperiod_Asset 6,ht_dcperiod_Asset 7,ht_dcperiod_Asset 8,ht_dcperiod_Asset 9,ht_dcperiod_Asset 10,ht_dcperiod_Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-10-27,22.9056,36.7185,22.0138,20.6361,21.0196,27.1222,23.8259,25.8897,22.0544,16.1676,24.6703
2000-10-30,22.2560,35.4042,23.5627,21.7373,21.0822,28.7781,24.0764,25.2568,22.1548,15.7801,23.8106
2000-10-31,21.7633,33.8106,24.6628,22.6617,21.2878,30.9482,24.6472,25.0396,22.4161,15.4657,22.8284
2000-11-01,21.2824,32.0770,25.3371,23.3965,21.4470,33.3688,25.0424,24.8796,22.9361,15.3363,22.0150
2000-11-02,20.8243,30.4675,26.5159,24.5571,21.4857,34.8065,25.1196,24.6085,23.5289,15.4506,21.4205
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-30,19.7883,21.5685,16.1287,18.8742,20.7283,19.2176,20.7838,25.0923,21.8228,16.9015,20.9577
2024-09-03,19.9034,21.3596,16.2487,18.8142,20.8586,20.7360,21.3793,24.6678,22.1340,17.9949,20.0412
2024-09-04,20.0680,21.2081,16.4225,18.6878,21.0514,22.5393,22.0583,24.2788,22.2113,18.9384,19.3261
2024-09-05,20.2467,21.0688,16.5598,18.5762,21.2879,23.7503,22.7931,23.8953,22.1642,19.5856,18.7763


In [14]:
# The combined Hilbert Transforms frame was split per feature above, so remove it from `dfs`.
del dfs['ht_transforms']

## Dates Cleanup

In [15]:
# Move every target value up one row, so each date holds the return originally recorded on the
# following row. Rows containing NaN are then dropped, including the final row, which has no successor.
# NOTE: not idempotent — running this cell again shifts the target by one more row.
shifted_target = dfs['target'].shift(-1)
dfs['target'] = shifted_target.dropna()

In [16]:
# Collect the dates that appear in the index of every frame in `dfs`, in ascending order.
frame_iter = iter(dfs.values())
dates_inter = next(frame_iter).index
for frame in frame_iter:
    dates_inter = set(dates_inter).intersection(frame.index)
dates_inter = sorted(dates_inter)
print(f'Count: {len(dates_inter)}\nFirst: {dates_inter[0]}\nLast:{dates_inter[-1]}')

Count: 1153
First: 2002-08-05 00:00:00
Last:2024-08-30 00:00:00


In [17]:
# Restrict every frame to the shared dates; rows come back in the order of `dates_inter`.
for key in list(dfs):
    dfs[key] = dfs[key].loc[dates_inter]

In [18]:
# Inspect one frame after restricting it to the shared dates.
dfs['kurtosis']

Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2002-08-05,1.6640,2.4623,0.8770,1.9608,0.5218,3.7164,2.9311,1.1800,0.7778,9.9041,1.3716
2002-08-12,1.5768,2.4247,0.8373,1.8495,0.5227,3.5994,2.7889,1.1641,0.7241,9.5074,1.3532
2002-08-19,1.5230,2.3602,0.7757,1.7816,0.4857,3.4877,2.7153,1.1861,0.7296,9.2300,1.3602
2002-08-26,1.4843,2.3660,0.7422,1.7375,0.4521,3.4850,2.6512,1.2117,0.7326,9.2312,1.3727
2002-08-30,1.4894,2.3645,0.7214,1.7298,0.4306,3.5034,2.5790,1.2152,0.7148,9.1211,1.3771
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-05,10.5468,8.3711,3.5426,3.5074,6.1892,21.2872,9.2754,18.8675,1.6149,5.8244,3.1960
2024-08-12,10.5394,8.3660,3.5324,3.5030,6.1849,21.2892,9.2810,18.8699,1.6179,5.8164,3.1948
2024-08-19,10.5331,8.3580,3.5293,3.5005,6.1814,21.3058,9.2833,18.8751,1.6161,5.8132,3.1957
2024-08-26,10.5372,8.3601,3.5266,3.4982,6.1742,21.3152,9.2875,18.8784,1.6139,5.8141,3.1953


## Adding Lags to all variables

In [19]:
# Feature names are all keys of `dfs` except the target; the cell output is their count.
features = [*dfs]
features.remove('target')
len(features)

45

In [20]:
# For each lag, add a copy of every feature in `features` shifted down by that many rows and
# store it under '<feature>_lag<k>'. The frames were already restricted to the shared dates,
# so a lag of k means k rows of that index. Rows left with NaN by the shift are dropped.
lag_steps = (1, 2, 4, 6, 12)
for step in lag_steps:
    for base in features:
        dfs[f'{base}_lag{step}'] = dfs[base].shift(step).dropna()

In [21]:
# Save the feature names (base and lagged), one per line, so the performance-evaluation
# step can read them back.
# NOTE: this rebinds `features` to include the lagged names created above.
features = [*dfs]
features.remove('target')

with open('classifier_features.txt', 'w') as f:
    f.writelines(f'{feature}\n' for feature in features)

In [22]:
# Recompute the dates shared by every frame now that the lagged frames (which lost their
# leading rows) are part of `dfs`; the result is in ascending order.
frame_iter = iter(dfs.values())
dates_inter = next(frame_iter).index
for frame in frame_iter:
    dates_inter = set(dates_inter).intersection(frame.index)
dates_inter = sorted(dates_inter)
print(f'Count: {len(dates_inter)}\nFirst: {dates_inter[0]}\nLast:{dates_inter[-1]}')

Count: 1141
First: 2002-10-28 00:00:00
Last:2024-08-30 00:00:00


In [23]:
# Restrict every frame, including the lagged ones, to the shared dates.
for key in list(dfs):
    dfs[key] = dfs[key].loc[dates_inter]

## Modifying Target Variable

In [24]:
# Preview the binary labelling without modifying dfs['target']:
# a cell becomes 1 where the return is strictly greater than 0, and 0 otherwise.
target_before = dfs['target']
target_after = pd.DataFrame(
    np.where(target_before > 0, 1, 0),
    index=target_before.index,
    columns=target_before.columns,
)
for caption, table in (('Before:', target_before), ('After:', target_after)):
    print(caption)
    display(table)

Before:


Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2002-10-28,0.0412,0.1394,0.0133,0.0082,0.0992,0.0389,0.0583,0.0278,-0.0194,0.0180,-0.0108
2002-11-04,-0.0700,-0.1403,0.0606,0.0220,-0.0915,-0.0248,-0.0571,0.0071,-0.0297,0.0198,-0.0351
2002-11-11,0.0562,0.1513,-0.0253,-0.0196,0.0737,0.0220,0.0704,0.0344,0.0137,-0.0162,0.0266
2002-11-18,0.0727,0.1578,-0.0294,-0.0216,0.1183,0.0280,0.0126,0.0391,0.0253,-0.0074,0.0001
2002-11-25,0.0028,-0.0144,-0.0101,-0.0044,0.0175,0.0311,0.0254,0.0218,-0.0043,-0.0022,0.0248
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-05,0.0607,0.0718,-0.0440,-0.0147,0.0210,0.0425,0.0431,0.0882,0.0093,0.0503,0.0583
2024-08-12,0.1006,0.1346,0.0199,-0.0004,0.1037,0.0321,0.0931,0.0771,-0.0238,0.0239,-0.0377
2024-08-19,0.0020,-0.0266,0.0028,0.0064,0.0450,0.0540,0.0290,-0.0180,-0.0195,0.0097,0.0332
2024-08-26,0.0108,0.0050,-0.0310,-0.0094,-0.0007,0.0091,0.0127,-0.0055,0.0175,-0.0126,-0.0348


After:


Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2002-10-28,1,1,1,1,1,1,1,1,0,1,0
2002-11-04,0,0,1,1,0,0,0,1,0,1,0
2002-11-11,1,1,0,0,1,1,1,1,1,0,1
2002-11-18,1,1,0,0,1,1,1,1,1,0,1
2002-11-25,1,0,0,0,1,1,1,1,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-05,1,1,0,0,1,1,1,1,1,1,1
2024-08-12,1,1,1,0,1,1,1,1,0,1,0
2024-08-19,1,0,1,1,1,1,1,0,0,1,1
2024-08-26,1,1,0,0,0,1,1,0,1,0,0


In [25]:
# Replace the target returns with class labels: 1 where the return is > 0, else 0
# (a return of exactly 0 is labelled 0). Index and columns are carried over unchanged.
target_returns = dfs['target']
dfs['target'] = pd.DataFrame(
    np.where(target_returns > 0, 1, 0),
    index=target_returns.index,
    columns=target_returns.columns,
)

## Train-Test Split

In [26]:
# Splitting each frame into train-test sets.
# The split is positional (no shuffling): the first TRAIN_FRACTION of rows form the train
# set and all remaining rows form the test set.
TRAIN_FRACTION = 0.8

n_rows = len(dfs['target'].index)
train_len = round(n_rows * TRAIN_FRACTION)
test_len = n_rows - train_len

train_dfs, test_dfs = {}, {}
for name, frame in dfs.items():
    # All frames must have the target's row count, otherwise the same positional cut
    # would select different dates in different frames.
    if len(frame.index) != n_rows:
        raise ValueError(
            f"Frame '{name}' has {len(frame.index)} rows but the target has {n_rows}; "
            'filter all frames to the same dates before splitting.'
        )
    train_dfs[name] = frame.iloc[:train_len]
    # Slice from train_len instead of using -test_len: when test_len == 0,
    # iloc[-0:] is iloc[0:] and would return every row instead of none.
    test_dfs[name] = frame.iloc[train_len:]

In [27]:
# Check the split boundary: the last five train rows followed by the first five test rows.
display(train_dfs['target'].tail(), test_dfs['target'].head())

Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-03-23,1,1,1,1,1,1,1,1,0,1,1
2020-03-30,1,1,0,0,0,0,0,1,1,1,1
2020-04-06,1,1,0,0,1,1,1,1,0,1,1
2020-04-13,1,1,1,1,0,0,0,1,1,0,0
2020-04-20,1,1,0,0,1,1,1,1,0,1,0


Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-04-27,0,0,0,1,0,0,0,0,0,0,1
2020-05-04,1,1,0,0,1,1,1,1,1,0,1
2020-05-11,1,1,1,0,1,0,1,1,0,1,1
2020-05-18,1,1,1,1,1,1,0,0,1,1,0
2020-05-22,1,1,0,1,1,1,1,1,0,1,1


In [28]:
# List the dataset names carried into the train split (the target plus base and lagged features).
train_dfs.keys()

dict_keys(['roc_4w', 'roc_6w', 'roc_12w', 'roc_26w', 'roc_52w', 'rsi', 'rolling_vol_20', 'rolling_vol_60', 'rolling_vol_126', 'rolling_vol_252', 'rolling_vol_504', 'rolling_skew_20', 'rolling_skew_60', 'rolling_skew_126', 'rolling_skew_252', 'rolling_skew_504', 'rolling_kurt_20', 'rolling_kurt_60', 'rolling_kurt_126', 'rolling_kurt_252', 'rolling_kurt_504', 'ewma_vol', 'skew', 'kurtosis', 'target', 'MACD_Line', 'Signal_Line', 'MACD_Histogram', 'stoch_rsi', 'bb_20d_sma', 'bb_20d_upper', 'bb_20d_lower', 'bb_90d_sma', 'bb_90d_upper', 'bb_90d_lower', 'bb_180d_sma', 'bb_180d_upper', 'bb_180d_lower', 'bb_240d_sma', 'bb_240d_upper', 'bb_240d_lower', 'ht_dcperiod', 'ht_dcphase', 'inphase', 'sine', 'ht_trendmode', 'roc_4w_lag1', 'roc_6w_lag1', 'roc_12w_lag1', 'roc_26w_lag1', 'roc_52w_lag1', 'rsi_lag1', 'rolling_vol_20_lag1', 'rolling_vol_60_lag1', 'rolling_vol_126_lag1', 'rolling_vol_252_lag1', 'rolling_vol_504_lag1', 'rolling_skew_20_lag1', 'rolling_skew_60_lag1', 'rolling_skew_126_lag1', 'rol

In [29]:
# Exporting Train and Test data in portfolio evaluation format before re-shaping for training
train_path = data_path + '/classifier_train'
test_path = data_path + '/classifier_test'
for name in train_dfs.keys():
    train_dfs[name].to_excel(train_path + f'/broad_assets_{name}_train.xlsx', sheet_name=f'broad_assets_{name}_train')
    test_dfs[name].to_excel(test_path + f'/broad_assets_{name}_test.xlsx', sheet_name=f'broad_assets_{name}_test')

## Melting Train and Test into tall datasets

In [30]:
# Reshape the wide train frames into one tall table with a single column per dataset.
# melt() stacks a frame's columns end to end and the label column ('variable') is dropped,
# so rows are matched across datasets purely by position.
train_columns = []
expected_shape = None
for name, frame in train_dfs.items():
    # Positional matching is only valid when every frame has the same number of rows and columns.
    if expected_shape is None:
        expected_shape = frame.shape
    elif frame.shape != expected_shape:
        raise ValueError(
            f"Frame '{name}' has shape {frame.shape}, expected {expected_shape}; "
            'melted columns would not line up row-for-row.'
        )
    train_columns.append(frame.melt(value_name=name).drop(columns='variable'))

# Concatenate once at the end: calling pd.concat inside the loop re-copies the growing
# table on every iteration.
train_df = pd.concat(train_columns, axis=1) if train_columns else pd.DataFrame()

train_df

Unnamed: 0,roc_4w,roc_6w,roc_12w,roc_26w,roc_52w,rsi,rolling_vol_20,rolling_vol_60,rolling_vol_126,rolling_vol_252,...,bb_180d_upper_lag12,bb_180d_lower_lag12,bb_240d_sma_lag12,bb_240d_upper_lag12,bb_240d_lower_lag12,ht_dcperiod_lag12,ht_dcphase_lag12,inphase_lag12,sine_lag12,ht_trendmode_lag12
0,0.1799,-0.0160,0.1135,-0.3316,-0.3533,69.8340,0.0480,0.0426,0.0405,0.0322,...,0.0543,-0.0608,-0.0026,0.0555,-0.0607,27.9317,21.5566,0.0539,0.3674,1.0000
1,0.3281,0.1725,-0.0097,-0.2867,-0.3558,58.9595,0.0402,0.0402,0.0405,0.0320,...,0.0571,-0.0617,-0.0019,0.0574,-0.0612,23.8178,79.4941,-0.0469,0.9832,1.0000
2,0.0789,0.1426,-0.1661,-0.3623,-0.4174,45.7714,0.0357,0.0392,0.0398,0.0322,...,0.0593,-0.0628,-0.0013,0.0592,-0.0618,20.3409,187.2039,0.0283,-0.1254,1.0000
3,0.0002,0.3045,-0.1131,-0.3478,-0.4193,57.1682,0.0264,0.0385,0.0400,0.0323,...,0.0593,-0.0633,-0.0010,0.0597,-0.0617,18.2963,180.9663,0.0266,-0.0169,1.0000
4,0.0971,0.2223,0.0190,-0.2896,-0.3836,56.5647,0.0264,0.0388,0.0402,0.0324,...,0.0593,-0.0635,-0.0007,0.0587,-0.0602,18.6736,224.8681,0.0049,-0.7055,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10038,-0.4219,-0.4142,-0.5271,-0.4841,-0.5308,20.6627,0.0524,0.0342,0.0261,0.0232,...,0.0363,-0.0365,0.0004,0.0343,-0.0334,19.3831,191.1857,0.0177,-0.1940,1.0000
10039,-0.3399,-0.4112,-0.5144,-0.4267,-0.5140,34.2219,0.0588,0.0375,0.0281,0.0244,...,0.0366,-0.0365,0.0005,0.0343,-0.0334,25.3414,189.0703,0.0307,-0.1576,1.0000
10040,-0.1278,-0.3798,-0.4734,-0.4075,-0.5131,56.6867,0.0521,0.0394,0.0292,0.0250,...,0.0365,-0.0368,0.0002,0.0340,-0.0337,35.5674,183.0628,0.0310,-0.0534,1.0000
10041,0.0482,-0.2932,-0.4511,-0.3987,-0.4930,64.1125,0.0458,0.0398,0.0294,0.0251,...,0.0365,-0.0364,0.0002,0.0339,-0.0336,35.5255,209.6138,-0.0735,-0.4942,0.0000


In [31]:
# Reshape the wide test frames into one tall table with a single column per dataset.
# melt() stacks a frame's columns end to end and the label column ('variable') is dropped,
# so rows are matched across datasets purely by position.
test_columns = []
expected_shape = None
for name, frame in test_dfs.items():
    # Positional matching is only valid when every frame has the same number of rows and columns.
    if expected_shape is None:
        expected_shape = frame.shape
    elif frame.shape != expected_shape:
        raise ValueError(
            f"Frame '{name}' has shape {frame.shape}, expected {expected_shape}; "
            'melted columns would not line up row-for-row.'
        )
    test_columns.append(frame.melt(value_name=name).drop(columns='variable'))

# Concatenate once at the end: calling pd.concat inside the loop re-copies the growing
# table on every iteration.
test_df = pd.concat(test_columns, axis=1) if test_columns else pd.DataFrame()

test_df

Unnamed: 0,roc_4w,roc_6w,roc_12w,roc_26w,roc_52w,rsi,rolling_vol_20,rolling_vol_60,rolling_vol_126,rolling_vol_252,...,bb_180d_upper_lag12,bb_180d_lower_lag12,bb_240d_sma_lag12,bb_240d_upper_lag12,bb_240d_lower_lag12,ht_dcperiod_lag12,ht_dcphase_lag12,inphase_lag12,sine_lag12,ht_trendmode_lag12
0,0.1870,0.4074,-0.2815,-0.1763,-0.1228,66.4231,0.0543,0.0788,0.0547,0.0406,...,0.0323,-0.0293,0.0014,0.0313,-0.0285,28.2630,214.8414,-0.1927,-0.5713,0.0000
1,0.1320,0.5730,-0.3428,-0.2179,-0.1393,50.3505,0.0491,0.0792,0.0551,0.0409,...,0.0329,-0.0288,0.0016,0.0319,-0.0286,24.4350,302.4536,-0.0744,-0.8438,1.0000
2,0.1206,0.2293,-0.3134,-0.1740,-0.0048,70.7707,0.0366,0.0794,0.0553,0.0409,...,0.0330,-0.0284,0.0017,0.0320,-0.0285,20.3095,124.5496,0.1614,0.8236,1.0000
3,0.0914,0.2223,-0.2338,-0.1804,-0.0101,58.7011,0.0350,0.0801,0.0558,0.0411,...,0.0332,-0.0301,0.0013,0.0327,-0.0300,17.6469,212.7406,0.0573,-0.5408,0.0000
4,0.0534,0.1399,-0.1599,-0.1855,0.0015,62.5326,0.0312,0.0786,0.0559,0.0411,...,0.0389,-0.0371,0.0009,0.0372,-0.0354,18.4343,314.6006,-0.2828,-0.7120,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2503,-0.1386,-0.1309,-0.1467,-0.0182,-0.1647,25.1601,0.0162,0.0168,0.0148,0.0165,...,0.0337,-0.0333,0.0008,0.0347,-0.0330,18.0478,6.8301,-0.0080,0.1189,1.0000
2504,-0.0596,-0.0957,-0.1316,0.0060,-0.1039,50.5970,0.0199,0.0177,0.0152,0.0167,...,0.0339,-0.0335,0.0008,0.0344,-0.0329,16.2415,128.1865,0.0136,0.7860,1.0000
2505,-0.0522,-0.1228,-0.1337,-0.0303,-0.1222,50.8652,0.0200,0.0174,0.0155,0.0168,...,0.0336,-0.0340,0.0007,0.0342,-0.0328,16.0384,192.7800,0.0477,-0.2212,0.0000
2506,0.0266,-0.0650,-0.0546,0.0125,-0.1129,66.2795,0.0215,0.0170,0.0156,0.0169,...,0.0340,-0.0355,0.0002,0.0338,-0.0335,18.3349,242.7373,-0.0502,-0.8889,1.0000


## Exporting for Model Training

In [32]:
# Write the tall train and test tables to the data folder for model training.
exports = (
    (train_df, '/broad_assets_classifier_train_data.xlsx', 'broad_assets_train'),
    (test_df, '/broad_assets_classifier_test_data.xlsx', 'broad_assets_test'),
)
for table, file_suffix, sheet in exports:
    table.to_excel(data_path + file_suffix, sheet_name=sheet)