In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from functools import reduce
from sklearn.model_selection import train_test_split
import os

from utils import *

# Data Wrangling Process Overview
1. Load the data
1. Break out some features into separate DataFrames
1. Add lags of all variables as additional features
1. Filter all data to only the intersection of dates across datasets
1. Modify the target dataset to be a binary classification problem (consider 3 classes)
1. Split all the datasets into train, test, and cross-validation sets
1. Melt train, test, and cross-validation sets into tall DataFrames
1. Concatenate columns into train, test, and cross-validation sets for model training

## Loading the data

In [2]:
# Load data
# Every input workbook lives in <parent of the working directory>/data and is
# named 'broad_assets_<name>.xlsx'. Only the first sheet is read; its first
# column becomes the index and is parsed as dates.
data_path = os.path.dirname(os.getcwd()) + '/data'
data_prefix = '/broad_assets_'

# 'weekly_rets' is renamed to 'target' in the next cell; the remaining names are
# feature workbooks ('macd', 'rsi' and 'ht_transforms' are broken out further below).
names = ['weekly_rets', 'roc_4w', 'roc_6w', 'roc_12w', 'roc_26w', 'roc_52w', 'macd', 'rsi', 'ht_transforms', 'ewma_vol', 'skew', 'kurtosis']

# Map each dataset name to its DataFrame, preserving the order of `names`.
dfs = {
    name: pd.read_excel(f'{data_path}{data_prefix}{name}.xlsx', sheet_name=0, index_col=0, parse_dates=True)
    for name in names
}

In [3]:
# Store the weekly returns under the key 'target'.
# Guarded so that re-running this cell (after 'weekly_rets' has already been
# moved) does not raise KeyError.
if 'weekly_rets' in dfs:
    dfs['target'] = dfs.pop('weekly_rets')
dfs['target']

Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-07-31,0.0152,0.0757,-0.0023,0.0009,0.0422,0.0050,0.0373,0.0021,-0.0025,-0.0084,-0.0318
2000-08-07,0.0679,0.0526,0.0159,0.0134,0.0365,-0.0237,-0.0196,-0.0116,0.0241,-0.0266,0.0346
2000-08-14,0.0157,0.0023,0.0152,0.0063,0.0171,0.0254,0.0294,0.0369,0.0085,0.0063,0.0481
2000-08-21,0.0095,0.0566,-0.0018,-0.0003,0.0068,-0.0264,0.0016,-0.0024,0.0077,0.0018,0.0384
2000-08-28,0.0184,0.0653,0.0019,0.0041,0.0379,-0.0353,0.0195,-0.0170,0.0019,-0.0095,0.0313
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-12,0.0607,0.0718,-0.0440,-0.0147,0.0210,0.0425,0.0431,0.0882,0.0093,0.0503,0.0583
2024-08-19,0.1006,0.1346,0.0199,-0.0004,0.1037,0.0321,0.0931,0.0771,-0.0238,0.0239,-0.0377
2024-08-26,0.0020,-0.0266,0.0028,0.0064,0.0450,0.0540,0.0290,-0.0180,-0.0195,0.0097,0.0332
2024-08-30,0.0108,0.0050,-0.0310,-0.0094,-0.0007,0.0091,0.0127,-0.0055,0.0175,-0.0126,-0.0348


## Breaking out features

### MACD

In [4]:
# The combined MACD frame holds three indicator components; give each its own
# entry in `dfs`, selecting columns whose label contains the component name.
macd_names = ['MACD_Line', 'Signal_Line', 'MACD_Histogram']
macd_frame = dfs['macd']
dfs.update({
    component: macd_frame.loc[:, [label for label in macd_frame.columns if component in label]]
    for component in macd_names
})

In [5]:
# Spot-check one of the frames produced by the MACD breakout.
dfs['MACD_Histogram']

Unnamed: 0_level_0,MACD_Histogram_Asset 1,MACD_Histogram_Asset 2,MACD_Histogram_Asset 3,MACD_Histogram_Asset 4,MACD_Histogram_Asset 5,MACD_Histogram_Asset 6,MACD_Histogram_Asset 7,MACD_Histogram_Asset 8,MACD_Histogram_Asset 9,MACD_Histogram_Asset 10,MACD_Histogram_Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-07-31,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000
2000-08-01,-0.0003,-0.0080,0.0006,0.0003,-0.0034,0.0023,-0.0042,0.0005,0.0008,0.0006,0.0030
2000-08-02,-0.0011,-0.0105,0.0006,0.0004,-0.0040,0.0004,-0.0048,0.0002,0.0013,0.0009,0.0049
2000-08-03,-0.0004,-0.0055,0.0009,0.0003,-0.0050,-0.0016,-0.0069,-0.0006,0.0013,-0.0006,0.0051
2000-08-04,-0.0002,-0.0071,0.0010,0.0005,-0.0041,-0.0030,-0.0050,-0.0003,0.0007,-0.0001,0.0066
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-30,-0.0002,-0.0003,-0.0016,-0.0005,-0.0001,0.0001,-0.0011,-0.0010,0.0014,-0.0015,-0.0014
2024-09-03,-0.0032,-0.0045,0.0005,0.0002,-0.0045,-0.0002,-0.0037,-0.0034,0.0012,-0.0017,-0.0035
2024-09-04,-0.0024,-0.0031,0.0020,0.0008,-0.0033,-0.0003,-0.0031,-0.0022,0.0002,-0.0010,-0.0033
2024-09-05,-0.0020,-0.0018,0.0022,0.0007,-0.0030,-0.0009,-0.0025,-0.0011,-0.0001,0.0004,-0.0015


In [6]:
# The combined MACD frame has been split into its components, so drop it.
# Guarded so that re-running this cell does not raise KeyError.
if 'macd' in dfs:
    del dfs['macd']

### RSI

In [7]:
# RSI dataframe has RSI and Stochastic RSI, break these out into two frames
rsi_source = dfs['rsi']

# Stochastic RSI labels also contain the substring 'rsi', so identify them
# first and exclude them from the plain RSI columns.
stoch_rsi_cols = [col for col in rsi_source.columns if 'stoch_rsi' in col]
rsi_cols = [col for col in rsi_source.columns if 'rsi' in col and col not in stoch_rsi_cols]

# dfs['rsi'] is overwritten with the plain RSI columns below, so on a re-run it
# no longer contains any stochastic columns. Skip in that case instead of
# replacing dfs['stoch_rsi'] with an empty frame.
if stoch_rsi_cols:
    dfs['stoch_rsi'] = rsi_source.loc[:, stoch_rsi_cols]
    dfs['rsi'] = rsi_source.loc[:, rsi_cols]

In [8]:
# Spot-check the Stochastic RSI frame produced by the RSI breakout.
dfs['stoch_rsi']

Unnamed: 0_level_0,stoch_rsi_rsi_Asset 1,stoch_rsi_rsi_Asset 2,stoch_rsi_rsi_Asset 3,stoch_rsi_rsi_Asset 4,stoch_rsi_rsi_Asset 5,stoch_rsi_rsi_Asset 6,stoch_rsi_rsi_Asset 7,stoch_rsi_rsi_Asset 8,stoch_rsi_rsi_Asset 9,stoch_rsi_rsi_Asset 10,stoch_rsi_rsi_Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-09-06,0.0000,0.0000,0.5547,0.8624,0.7024,0.4091,0.0000,0.0000,1.0000,0.2343,1.0000
2000-09-07,0.0000,0.0053,0.2675,0.6862,0.7508,0.6821,0.0845,0.0000,0.8614,0.0898,1.0000
2000-09-08,0.0000,0.0000,0.2712,0.6470,0.1682,1.0000,0.0000,0.0222,0.8236,0.1154,0.2458
2000-09-11,0.0000,0.0000,0.1728,0.5327,0.0474,1.0000,0.0000,0.0000,0.9052,0.3432,0.4348
2000-09-12,0.0000,0.0000,0.0429,0.4991,0.0000,1.0000,0.1115,0.0000,0.7717,0.4391,0.4206
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-30,0.7792,0.6320,0.0729,0.1892,1.0000,0.9775,0.9121,0.3188,1.0000,0.0000,0.0635
2024-09-03,0.1587,0.0000,0.2123,0.2097,0.5829,0.9177,0.4005,0.0000,1.0000,0.0000,0.0000
2024-09-04,0.0485,0.0000,0.3790,0.5222,0.5883,0.9059,0.2992,0.0686,0.8607,0.2423,0.0000
2024-09-05,0.0000,0.0000,0.9582,1.0000,0.2977,0.8876,0.0195,0.0000,0.6245,0.3394,0.0000


### Hilbert Transforms

In [9]:
# The Hilbert Transforms frame bundles five features; store each one under its
# own key in `dfs`.
ht_names = ['ht_dcperiod', 'ht_dcphase', 'inphase', 'sine', 'ht_trendmode']
ht_frame = dfs['ht_transforms']

for feature_name in ht_names:
    # Substring match on the column label selects this feature's columns.
    matching = list(filter(lambda label: feature_name in label, ht_frame.columns))
    dfs[feature_name] = ht_frame.loc[:, matching]

In [10]:
# Spot-check one of the frames produced by the Hilbert Transforms breakout.
dfs['ht_dcperiod']

Unnamed: 0_level_0,ht_dcperiod_Asset 1,ht_dcperiod_Asset 2,ht_dcperiod_Asset 3,ht_dcperiod_Asset 4,ht_dcperiod_Asset 5,ht_dcperiod_Asset 6,ht_dcperiod_Asset 7,ht_dcperiod_Asset 8,ht_dcperiod_Asset 9,ht_dcperiod_Asset 10,ht_dcperiod_Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2000-10-27,22.9056,36.7185,22.0138,20.6361,21.0196,27.1222,23.8259,25.8897,22.0544,16.1676,24.6703
2000-10-30,22.2560,35.4042,23.5627,21.7373,21.0822,28.7781,24.0764,25.2568,22.1548,15.7801,23.8106
2000-10-31,21.7633,33.8106,24.6628,22.6617,21.2878,30.9482,24.6472,25.0396,22.4161,15.4657,22.8284
2000-11-01,21.2824,32.0770,25.3371,23.3965,21.4470,33.3688,25.0424,24.8796,22.9361,15.3363,22.0150
2000-11-02,20.8243,30.4675,26.5159,24.5571,21.4857,34.8065,25.1196,24.6085,23.5289,15.4506,21.4205
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-30,19.7883,21.5685,16.1287,18.8742,20.7283,19.2176,20.7838,25.0923,21.8228,16.9015,20.9577
2024-09-03,19.9034,21.3596,16.2487,18.8142,20.8586,20.7360,21.3793,24.6678,22.1340,17.9949,20.0412
2024-09-04,20.0680,21.2081,16.4225,18.6878,21.0514,22.5393,22.0583,24.2788,22.2113,18.9384,19.3261
2024-09-05,20.2467,21.0688,16.5598,18.5762,21.2879,23.7503,22.7931,23.8953,22.1642,19.5856,18.7763


In [11]:
# The combined Hilbert Transforms frame has been split per feature, so drop it.
# Guarded so that re-running this cell does not raise KeyError.
if 'ht_transforms' in dfs:
    del dfs['ht_transforms']

## Dates Cleanup

In [12]:
# Move every target row up by one so that each date carries the value of the
# following row, i.e. what the features on that date should predict.
# NOTE: not idempotent - each additional run of this cell shifts the target by
# one more row.
next_period_target = dfs['target'].shift(-1)
# The shift leaves the final row empty; dropna() removes it (and any other row
# containing a missing value).
dfs['target'] = next_period_target.dropna()

In [13]:
# Find the dates that are present in every frame's index
all_frames = list(dfs.values())
shared_dates = all_frames[0].index
for other_frame in all_frames[1:]:
    shared_dates = set(shared_dates).intersection(other_frame.index)

# Sorted so that later positional slicing follows chronological order
dates_inter = sorted(list(shared_dates))
print(f'Count: {len(dates_inter)}\nFirst: {dates_inter[0]}\nLast:{dates_inter[-1]}')

Count: 1100
First: 2003-08-11 00:00:00
Last:2024-08-30 00:00:00


In [14]:
# Keep only the shared dates in every frame, in the order given by dates_inter.
dfs.update({key: table.loc[dates_inter] for key, table in dfs.items()})

In [15]:
# Spot-check one frame after restricting all frames to the shared dates.
dfs['kurtosis']

Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2003-08-11,1.0794,2.6887,0.8889,1.4834,0.2593,2.3380,1.8598,0.9178,0.5774,5.0575,1.0969
2003-08-18,1.0896,2.7038,0.9973,1.5761,0.2569,2.3438,1.8815,0.8904,0.5750,4.9841,1.1055
2003-08-25,1.1106,2.7326,0.9672,1.5161,0.2544,2.3423,1.9078,0.8891,0.5819,4.9187,1.1226
2003-08-29,1.1257,2.7547,0.9542,1.4983,0.2622,2.3346,1.9160,0.8804,0.5847,4.7922,1.1355
2003-09-08,1.1313,2.7749,0.9515,1.4891,0.2584,2.3171,1.9103,0.8692,0.5671,4.7830,1.1371
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-05,10.5468,8.3711,3.5426,3.5074,6.1892,21.2872,9.2754,18.8675,1.6149,5.8244,3.1960
2024-08-12,10.5394,8.3660,3.5324,3.5030,6.1849,21.2892,9.2810,18.8699,1.6179,5.8164,3.1948
2024-08-19,10.5331,8.3580,3.5293,3.5005,6.1814,21.3058,9.2833,18.8751,1.6161,5.8132,3.1957
2024-08-26,10.5372,8.3601,3.5266,3.4982,6.1742,21.3152,9.2875,18.8784,1.6139,5.8141,3.1953


## Adding Lags to all variables

In [16]:
# Feature names are all keys of `dfs` except the prediction target.
features = [*dfs]
features.remove('target')
features

['roc_4w',
 'roc_6w',
 'roc_12w',
 'roc_26w',
 'roc_52w',
 'rsi',
 'ewma_vol',
 'skew',
 'kurtosis',
 'MACD_Line',
 'Signal_Line',
 'MACD_Histogram',
 'stoch_rsi',
 'ht_dcperiod',
 'ht_dcphase',
 'inphase',
 'sine',
 'ht_trendmode']

In [17]:
# Lag horizons, counted in rows of the date-aligned frames.
LAGS = [1, 2, 4, 6, 12]

# Only lag the un-lagged features. `features` is later rebuilt to include the
# lag frames, so without this filter a re-run would add lag-of-lag frames
# (e.g. 'roc_4w_lag1_lag1').
base_features = [feature for feature in features if '_lag' not in feature]

for lag in LAGS:
    for feature in base_features:
        # shift() leaves the first `lag` rows empty; dropna() removes them, so
        # each lag frame starts `lag` rows later than its source frame.
        dfs[f'{feature}_lag{lag}'] = dfs[feature].shift(lag).dropna()

In [18]:
# Persist the feature names (every dfs key except 'target', now including the
# lag frames) one per line, to be read in for performance evaluation.
features = [*dfs]
features.remove('target')

with open('classifier_features.txt', 'w') as features_file:
    features_file.writelines(f'{feature_name}\n' for feature_name in features)

In [19]:
# Recompute the shared dates: the lag frames added above start later than
# their source frames, so the intersection shrinks.
frames_with_lags = list(dfs.values())
common_dates = frames_with_lags[0].index
for lagged_or_base in frames_with_lags[1:]:
    common_dates = set(common_dates).intersection(lagged_or_base.index)

dates_inter = sorted(list(common_dates))
print(f'Count: {len(dates_inter)}\nFirst: {dates_inter[0]}\nLast:{dates_inter[-1]}')

Count: 1088
First: 2003-11-03 00:00:00
Last:2024-08-30 00:00:00


In [20]:
# Re-align every frame (base, lagged and target) to the updated shared dates.
for key in list(dfs):
    dfs[key] = dfs[key].loc[dates_inter]

## Modifying Target Variable

In [21]:
# Make the target data a binary classification
# Preview only: nothing is written back to `dfs` in this cell.
target_values = dfs['target']
# 1 where the target is strictly positive, 0 otherwise.
target_binary_preview = pd.DataFrame(
    np.where(target_values > 0, 1, 0),
    columns=target_values.columns,
    index=target_values.index,
)

for caption, table in (('Before:', target_values), ('After:', target_binary_preview)):
    print(caption)
    display(table)

Before:


Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2003-11-03,-0.0216,-0.0396,-0.0218,-0.0121,-0.0174,0.0142,-0.0123,-0.0318,-0.0148,0.0495,0.0584
2003-11-10,-0.0060,-0.0306,0.0700,0.0465,-0.0269,0.0182,0.0441,-0.0256,-0.0319,0.0258,0.0395
2003-11-17,0.0160,0.0350,-0.0079,-0.0073,0.0502,-0.0194,0.0051,0.0031,0.0012,0.0014,-0.0563
2003-11-24,0.0349,0.0390,-0.0263,-0.0218,0.0567,0.0607,0.0795,0.1033,-0.0281,0.0553,0.0403
2003-12-01,-0.0011,-0.0406,0.0215,0.0211,-0.0415,-0.0010,0.0022,-0.0294,-0.0367,0.0221,0.0793
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-05,0.0607,0.0718,-0.0440,-0.0147,0.0210,0.0425,0.0431,0.0882,0.0093,0.0503,0.0583
2024-08-12,0.1006,0.1346,0.0199,-0.0004,0.1037,0.0321,0.0931,0.0771,-0.0238,0.0239,-0.0377
2024-08-19,0.0020,-0.0266,0.0028,0.0064,0.0450,0.0540,0.0290,-0.0180,-0.0195,0.0097,0.0332
2024-08-26,0.0108,0.0050,-0.0310,-0.0094,-0.0007,0.0091,0.0127,-0.0055,0.0175,-0.0126,-0.0348


After:


Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2003-11-03,0,0,0,0,0,1,0,0,0,1,1
2003-11-10,0,0,1,1,0,1,1,0,0,1,1
2003-11-17,1,1,0,0,1,0,1,1,1,1,0
2003-11-24,1,1,0,0,1,1,1,1,0,1,1
2003-12-01,0,0,1,1,0,0,1,0,0,1,1
...,...,...,...,...,...,...,...,...,...,...,...
2024-08-05,1,1,0,0,1,1,1,1,1,1,1
2024-08-12,1,1,1,0,1,1,1,1,0,1,0
2024-08-19,1,0,1,1,1,1,1,0,0,1,1
2024-08-26,1,1,0,0,0,1,1,0,1,0,0


In [22]:
# Replace the target with its binary label: 1 where the value is strictly
# positive, 0 otherwise. Index and columns are unchanged.
is_positive = dfs['target'] > 0
dfs['target'] = is_positive.astype(int)

## Train-Test Split

In [23]:
# Splitting each frame into train-test sets
# The split is positional: the first TRAIN_FRACTION of rows go to train and the
# remaining rows to test. Rows are in sorted date order, so test is the most
# recent period.
TRAIN_FRACTION = 0.8

n_rows = len(dfs['target'].index)
train_len = round(n_rows * TRAIN_FRACTION)
test_len = n_rows - train_len

train_dfs, test_dfs = {}, {}
for name, frame in dfs.items():
    train_dfs[name] = frame.iloc[:train_len]
    # Slice from train_len rather than with -test_len: `iloc[-0:]` would return
    # the whole frame when test_len is 0, and slicing from train_len guarantees
    # train and test never overlap.
    test_dfs[name] = frame.iloc[train_len:]

In [24]:
# Show the rows on either side of the split boundary: the last five training
# rows followed by the first five test rows of the target.
display(
    train_dfs['target'].tail(5),
    test_dfs['target'].head(5),
)

Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-06-01,1,1,0,0,1,1,1,1,0,0,1
2020-06-08,0,0,1,1,0,0,0,0,1,1,0
2020-06-15,1,1,0,0,1,0,1,1,1,1,1
2020-06-22,0,0,1,1,0,0,0,0,1,1,0
2020-06-29,1,1,0,0,1,1,1,1,0,1,1


Unnamed: 0_level_0,Asset 1,Asset 2,Asset 3,Asset 4,Asset 5,Asset 6,Asset 7,Asset 8,Asset 9,Asset 10,Asset 11
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2020-07-06,0,0,1,1,0,0,0,0,0,1,1
2020-07-13,1,1,1,1,1,1,1,1,0,1,1
2020-07-20,0,0,1,1,1,1,1,1,0,1,1
2020-07-27,1,1,1,1,1,1,0,0,0,1,1
2020-08-03,1,1,1,0,1,1,1,1,1,1,1


In [25]:
# List the dataset names in the train split (the same keys as `dfs`).
train_dfs.keys()

dict_keys(['roc_4w', 'roc_6w', 'roc_12w', 'roc_26w', 'roc_52w', 'rsi', 'ewma_vol', 'skew', 'kurtosis', 'target', 'MACD_Line', 'Signal_Line', 'MACD_Histogram', 'stoch_rsi', 'ht_dcperiod', 'ht_dcphase', 'inphase', 'sine', 'ht_trendmode', 'roc_4w_lag1', 'roc_6w_lag1', 'roc_12w_lag1', 'roc_26w_lag1', 'roc_52w_lag1', 'rsi_lag1', 'ewma_vol_lag1', 'skew_lag1', 'kurtosis_lag1', 'MACD_Line_lag1', 'Signal_Line_lag1', 'MACD_Histogram_lag1', 'stoch_rsi_lag1', 'ht_dcperiod_lag1', 'ht_dcphase_lag1', 'inphase_lag1', 'sine_lag1', 'ht_trendmode_lag1', 'roc_4w_lag2', 'roc_6w_lag2', 'roc_12w_lag2', 'roc_26w_lag2', 'roc_52w_lag2', 'rsi_lag2', 'ewma_vol_lag2', 'skew_lag2', 'kurtosis_lag2', 'MACD_Line_lag2', 'Signal_Line_lag2', 'MACD_Histogram_lag2', 'stoch_rsi_lag2', 'ht_dcperiod_lag2', 'ht_dcphase_lag2', 'inphase_lag2', 'sine_lag2', 'ht_trendmode_lag2', 'roc_4w_lag4', 'roc_6w_lag4', 'roc_12w_lag4', 'roc_26w_lag4', 'roc_52w_lag4', 'rsi_lag4', 'ewma_vol_lag4', 'skew_lag4', 'kurtosis_lag4', 'MACD_Line_lag4',

In [26]:
# Exporting Train and Test data in portfolio evaluation format before re-shaping for training
# One workbook per dataset and split, written under <data_path>/classifier_train
# and <data_path>/classifier_test.
train_path = data_path + '/classifier_train'
test_path = data_path + '/classifier_test'

# to_excel does not create missing directories, so make sure both exist.
for out_dir in (train_path, test_path):
    os.makedirs(out_dir, exist_ok=True)

# NOTE(review): some sheet names are longer than Excel's 31-character limit
# (e.g. 'broad_assets_MACD_Histogram_train' is 33 characters) - confirm the
# Excel writer engine in use tolerates this.
for name in train_dfs.keys():
    train_dfs[name].to_excel(train_path + f'/broad_assets_{name}_train.xlsx', sheet_name=f'broad_assets_{name}_train')
    test_dfs[name].to_excel(test_path + f'/broad_assets_{name}_test.xlsx', sheet_name=f'broad_assets_{name}_test')

## Melting Train and Test into tall datasets

In [27]:
# Melt each wide training frame into one tall column named after its dataset.
# melt() stacks the frame's columns one after another and discards the date
# index; the helper 'variable' column (the original column labels) is dropped.
train_cols = [
    frame.melt(value_name=name).drop(columns='variable')
    for name, frame in train_dfs.items()
]

# The melted columns are aligned purely by row position, so every frame must
# contribute the same number of rows.
assert len({len(col) for col in train_cols}) <= 1, 'train frames differ in shape; melted columns would be misaligned'

# Concatenate once rather than growing a DataFrame inside the loop.
train_df = pd.concat(train_cols, axis=1) if train_cols else pd.DataFrame()

train_df

Unnamed: 0,roc_4w,roc_6w,roc_12w,roc_26w,roc_52w,rsi,ewma_vol,skew,kurtosis,target,...,kurtosis_lag12,MACD_Line_lag12,Signal_Line_lag12,MACD_Histogram_lag12,stoch_rsi_lag12,ht_dcperiod_lag12,ht_dcphase_lag12,inphase_lag12,sine_lag12,ht_trendmode_lag12
0,0.0491,0.0724,0.1680,0.3082,0.3456,56.4857,0.0411,0.2886,1.1973,0,...,1.0794,0.0006,-0.0009,0.0014,0.6933,15.7097,-25.7678,-0.0080,-0.4347,1.0000
1,0.0044,0.0834,0.0986,0.2301,0.4155,51.4365,0.0377,0.2916,1.2179,0,...,1.0896,0.0025,0.0009,0.0016,1.0000,14.2814,114.0176,0.0059,0.9134,1.0000
2,-0.0004,0.0204,0.1054,0.2878,0.3323,48.7950,0.0374,0.2927,1.2318,1,...,1.1106,-0.0004,0.0006,-0.0011,1.0000,14.3447,203.7704,0.0071,-0.4031,1.0000
3,0.0427,0.0143,0.0910,0.2738,0.2618,50.1757,0.0421,0.2915,1.2330,1,...,1.1257,0.0014,0.0007,0.0007,0.9597,16.0535,226.4582,-0.0008,-0.7249,1.0000
4,0.0226,0.0510,0.0777,0.2281,0.3023,63.1201,0.0404,0.2877,1.2434,0,...,1.1313,0.0019,0.0016,0.0003,0.6537,19.0817,166.1285,0.0104,0.2397,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9565,0.1681,0.0859,-0.0838,-0.3957,-0.4007,69.2456,0.0777,-0.2471,3.0857,1,...,3.1633,-0.0162,-0.0050,-0.0112,0.0000,24.1971,-38.9357,-0.0694,-0.6284,1.0000
9566,0.2085,0.3003,0.1328,-0.3788,-0.3592,73.3603,0.0735,-0.2476,3.0827,0,...,3.1618,-0.0112,-0.0079,-0.0033,0.2227,22.4317,-10.6575,-0.1622,-0.1849,1.0000
9567,0.0828,0.2115,0.1686,-0.4231,-0.3865,62.1184,0.0746,-0.2501,3.0800,1,...,3.1439,0.0008,-0.0058,0.0066,0.3453,26.1559,-5.4861,-0.1234,-0.0956,1.0000
9568,0.1433,0.2326,0.1802,-0.4012,-0.3946,65.2476,0.0670,-0.2511,3.0822,0,...,3.1340,0.0052,0.0027,0.0025,0.7136,29.7095,10.0433,-0.0059,0.1744,1.0000


In [28]:
# Melt each wide test frame into one tall column named after its dataset.
# melt() stacks the frame's columns one after another and discards the date
# index; the helper 'variable' column (the original column labels) is dropped.
test_cols = [
    frame.melt(value_name=name).drop(columns='variable')
    for name, frame in test_dfs.items()
]

# The melted columns are aligned purely by row position, so every frame must
# contribute the same number of rows.
assert len({len(col) for col in test_cols}) <= 1, 'test frames differ in shape; melted columns would be misaligned'

# Concatenate once rather than growing a DataFrame inside the loop.
test_df = pd.concat(test_cols, axis=1) if test_cols else pd.DataFrame()

test_df

Unnamed: 0,roc_4w,roc_6w,roc_12w,roc_26w,roc_52w,rsi,ewma_vol,skew,kurtosis,target,...,kurtosis_lag12,MACD_Line_lag12,Signal_Line_lag12,MACD_Histogram_lag12,stoch_rsi_lag12,ht_dcperiod_lag12,ht_dcphase_lag12,inphase_lag12,sine_lag12,ht_trendmode_lag12
0,-0.0367,0.1522,0.3134,-0.1269,0.0404,62.2076,0.0926,-0.1419,11.2861,0,...,11.5778,0.0099,0.0104,-0.0005,1.0000,24.8592,92.2367,-0.0953,0.9992,1.0000
1,0.0575,0.0612,0.2404,-0.1622,-0.0016,54.2233,0.0830,-0.1416,11.2857,1,...,11.5128,0.0034,0.0079,-0.0044,0.5636,24.8072,164.1157,0.2824,0.2737,1.0000
2,0.0867,0.0077,0.2696,-0.1321,0.0816,76.3484,0.0746,-0.1426,11.2881,0,...,11.4678,0.0037,0.0037,0.0000,0.4865,22.4607,196.5017,0.0679,-0.2840,1.0000
3,0.1260,0.1147,0.2935,-0.0921,0.0484,64.7365,0.0671,-0.1425,11.2923,1,...,11.4308,-0.0025,0.0010,-0.0036,0.0000,21.9373,219.3485,0.1637,-0.6340,1.0000
4,0.0740,0.1157,0.2587,-0.0635,0.2243,66.2843,0.0615,-0.1430,11.2955,1,...,11.4261,0.0011,0.0005,0.0006,0.9177,20.3256,188.6485,-0.1126,-0.1504,1.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2393,-0.1386,-0.1309,-0.1467,-0.0182,-0.1647,25.1601,0.0485,-0.3279,3.1960,1,...,3.1927,0.0009,-0.0006,0.0015,0.9401,18.0478,6.8301,-0.0080,0.1189,1.0000
2394,-0.0596,-0.0957,-0.1316,0.0060,-0.1039,50.5970,0.0503,-0.3285,3.1948,0,...,3.1951,0.0031,0.0009,0.0022,1.0000,16.2415,128.1865,0.0136,0.7860,1.0000
2395,-0.0522,-0.1228,-0.1337,-0.0303,-0.1222,50.8652,0.0473,-0.3277,3.1957,1,...,3.1954,-0.0006,0.0001,-0.0007,0.3681,16.0384,192.7800,0.0477,-0.2212,0.0000
2396,0.0266,-0.0650,-0.0546,0.0125,-0.1129,66.2795,0.0487,-0.3277,3.1953,0,...,3.1889,-0.0046,-0.0014,-0.0032,0.0000,18.3349,242.7373,-0.0502,-0.8889,1.0000


## Exporting for Model Training

In [29]:
# Write the tall train and test tables to the data directory, one workbook each.
model_exports = (
    (train_df, '/broad_assets_classifier_train_data.xlsx', 'broad_assets_train'),
    (test_df, '/broad_assets_classifier_test_data.xlsx', 'broad_assets_test'),
)
for table, file_name, sheet in model_exports:
    table.to_excel(data_path + file_name, sheet_name=sheet)