In [2]:
import talib as ta
import numpy as np
import pandas as pd

In [14]:
# Load the asset table and express every asset's weight as a share of the
# total weight, keyed by Asset_ID.
ASSET_DETAILS_CSV = './data/asset_details.csv'
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
weight_share = df_asset_details.Weight / df_asset_details.Weight.sum()
add_weight_map = dict(zip(df_asset_details.Asset_ID, weight_share))

# Load the supplemental training data, attach each row's asset weight, order
# the rows by time, index them by (timestamp, Asset_ID) and discard Target.
sup_train = pd.read_csv('./data/supplemental_train.csv')
sup_train = (
    sup_train
    .assign(Weight=sup_train['Asset_ID'].map(add_weight_map))
    .sort_values('timestamp')
    .set_index(["timestamp", 'Asset_ID'])
    .drop(columns='Target')
)


def log_return(series, periods=5):
    """Return the log return of `series` over `periods` rows.

    The natural log is taken element-wise and then differenced against the
    value `periods` rows earlier, so the first `periods` entries are NaN.

    Parameters
    ----------
    series : pandas Series or DataFrame of positive values.
    periods : int, row offset used for the difference (default 5).

    Returns
    -------
    Object of the same type and index as `series`.
    """
    logged = np.log(series)
    return logged.diff(periods=periods)
# Per-asset 15-row log return of Close.
# `transform` always returns a result aligned row-for-row with sup_train.
# `groupby.apply` with a like-indexed result prepends the group key to the
# index on pandas >= 2.0, which would break the column assignment below.
lr_15 = sup_train.groupby('Asset_ID')[['Close']].transform(
    lambda close: log_return(close, 15)
)
sup_train['lr_15'] = lr_15['Close']

# Weighted market aggregates per timestamp: the weighted sum of lr_15 and the
# weighted sum of Close. Multiplying once and using the built-in group sum
# avoids a Python-level lambda call per timestamp; NaNs are skipped, as before.
weighted = sup_train[['lr_15', 'Close']].multiply(sup_train['Weight'], axis='index')
mkt_lr_15 = weighted.groupby(level='timestamp').sum()
mkt_lr_15.columns = ['Mkt_lrt_15','Crypto_Index']

# Broadcast the per-timestamp aggregates back onto every (timestamp, Asset_ID) row.
firsts = sup_train.index.get_level_values('timestamp')
sup_train[['Mkt_lrt_15','Crypto_Index']] = mkt_lr_15.loc[firsts].values

In [17]:
# Earliest and latest value of the 'timestamp' index level of sup_train.
ts_level = sup_train.index.get_level_values('timestamp')
ts_level.min(), ts_level.max()

(1632182460, 1641772800)

In [25]:
# (rows, columns) of sup_train; the index levels are not counted as columns.
sup_train.shape

(2236496, 11)

In [90]:
def lag_features(df, fastk1,fastk2,adx,macd_s,macd_l,macd_sig,rsi,vol_sum,std_Crypto_Index,std_lr_15,std_Mkt_lrt_15, **kwargs):
    """Add TA-Lib technical-indicator and rolling-volatility columns for one asset.

    The input frame is not modified: the features are written to a copy, which
    is returned. pandas documents mutating the group passed to a
    `groupby.apply` function as unsupported, and writing into a slice triggers
    chained-assignment warnings.

    Parameters
    ----------
    df : DataFrame with columns High, Low, Close, Volume, lr_15, Mkt_lrt_15
        and Crypto_Index. Rows are presumably one asset in time order (the
        notebook calls this once per Asset_ID group) -- the function itself
        does not sort or check this.
    fastk1 : fast-%K period for the slow stochastic; its slow-%K and slow-%D
        smoothing periods are both int(3 * fastk1 / 5).
    fastk2 : fast-%K period for the fast stochastic; its fast-%D period is
        int(3 * fastk2 / 5).
    adx : period for ADX.
    macd_s, macd_l, macd_sig : fast, slow and signal periods for MACD.
    rsi : period for RSI.
    vol_sum : window length of the rolling Volume sum.
    std_Crypto_Index, std_lr_15, std_Mkt_lrt_15 : window lengths of the
        rolling standard deviation of the respective column.
    **kwargs : ignored; lets callers pass a parameter dict that also holds
        keys this function does not use.

    Returns
    -------
    DataFrame: a copy of `df` with the added columns slowK, slowD, fastK,
    fastD, rsi_{rsi}, macd_{macd_s}_{macd_l}, macd_signal_{macd_sig},
    macd_hist, adx_{adx}, vol_sum_{vol_sum}, std_lr_15_{std_lr_15},
    std_Mkt_lrt_15_{std_Mkt_lrt_15}, std_Crypto_Index_{std_Crypto_Index}.
    The leading rows of each indicator are NaN until its look-back is filled.
    """
    # Work on a private copy so the caller's frame (or groupby group) is untouched.
    df = df.copy()

    #### TECH indicators
    slow_smooth = int(3*fastk1/5)
    df['slowK'], df['slowD'] = ta.STOCH(df.High, df.Low, df.Close,
                                        fastk_period=fastk1, slowk_period=slow_smooth, slowd_period=slow_smooth,
                                        slowk_matype=0, slowd_matype=0)
    df['fastK'], df['fastD'] = ta.STOCHF(df.High, df.Low, df.Close,
                                         fastk_period=fastk2, fastd_period=int(3*fastk2/5),
                                         fastd_matype=0)
    df[f'rsi_{rsi}'] = ta.RSI(df['Close'], timeperiod=rsi)
    df[f'macd_{macd_s}_{macd_l}'],df[f'macd_signal_{macd_sig}'], df['macd_hist'] = \
                ta.MACD(df['Close'],fastperiod=macd_s, slowperiod=macd_l, signalperiod=macd_sig)
    df[f'adx_{adx}'] = ta.ADX(df['High'], df['Low'],df['Close'], timeperiod=adx)#Average Directional Movement Index
    #df['AD'] = ta.AD(df['High'], df['Low'],df['Close'], df['Volume'])#Accumulation Distribution Line
    # Rolling sum of Volume, obtained as the simple moving average times the window length.
    df[f'vol_sum_{vol_sum}'] = ta.SMA(df['Volume'],vol_sum)*vol_sum

    #### std volatility
    df[f'std_lr_15_{std_lr_15}'] = ta.STDDEV(df.lr_15,timeperiod=std_lr_15, nbdev=1)
    df[f'std_Mkt_lrt_15_{std_Mkt_lrt_15}'] = ta.STDDEV(df.Mkt_lrt_15,timeperiod=std_Mkt_lrt_15, nbdev=1)
    df[f'std_Crypto_Index_{std_Crypto_Index}'] = ta.STDDEV(df.Crypto_Index,timeperiod=std_Crypto_Index, nbdev=1)
    return df



In [91]:
# Feature parameters, unpacked as keyword arguments into lag_features(**fdict).
# Numeric values are window / period lengths in rows.
# 'lrtn', 'beta_s' and 'beta_l' are not named in lag_features' signature, so
# they are absorbed by its **kwargs; 'lrtn' is read by the pre_minute
# calculation elsewhere in this notebook.
fdict={'std_lr_15': 5,
 'std_Mkt_lrt_15': 10,
 'std_Crypto_Index': 15,
 'rsi': 60,
 'vol_sum':15,
 'macd_sig': 5,
 'macd_s': 15,
 'macd_l': 25,
 'lrtn': 50,
 'fastk2': 15,
 'fastk1': 5,
 'beta_s': '6h',
 'beta_l': '2d',
 'adx': 40}

# Number of distinct assets. Row counts are converted to minutes below on the
# assumption that every minute has one row per asset.
n_assets = sup_train.index.get_level_values('Asset_ID').nunique()

pre_minute = 2000
# Take the trailing window as an independent copy so the feature columns are
# never written into a slice of sup_train.
sup_train2 = sup_train.iloc[-n_assets*pre_minute:,:].copy()
# group_keys=False keeps the original (timestamp, Asset_ID) index and row
# order. On pandas >= 2.0 the default would prepend Asset_ID and regroup the
# rows by asset, so the final iloc[-n_assets:] would no longer be "the last
# minute of every asset".
sup_train2 = sup_train2.groupby('Asset_ID', group_keys=False).apply(lambda x: lag_features(x,**fdict))

# Count the NaN warm-up rows per column and express them as minutes per asset.
nan_num = []
nan_mins = []
for col in sup_train2.columns:
    nan_num.append(sup_train2[col].isna().sum())
    nan_mins.append(nan_num[-1]/n_assets)
    print(f"{col}: {nan_num[-1]} rows, {nan_mins[-1]} mins")
print(f"max nan mins={max(nan_mins)}")

# Rows that still contain at least one NaN; kept in a variable for inspection
# because only the last expression of a cell is displayed.
nan_rows = sup_train2[sup_train2.isna().any(axis=1)]
# Feature columns for the final minute of every asset.
sup_train2.iloc[-n_assets:,8:]

Count: 0 rows, 0.0 mins
Open: 0 rows, 0.0 mins
High: 0 rows, 0.0 mins
Low: 0 rows, 0.0 mins
Close: 0 rows, 0.0 mins
Volume: 0 rows, 0.0 mins
VWAP: 0 rows, 0.0 mins
Weight: 0 rows, 0.0 mins
lr_15: 0 rows, 0.0 mins
Mkt_lrt_15: 0 rows, 0.0 mins
Crypto_Index: 0 rows, 0.0 mins
slowK: 112 rows, 8.0 mins
slowD: 112 rows, 8.0 mins
fastK: 308 rows, 22.0 mins
fastD: 308 rows, 22.0 mins
rsi_60: 840 rows, 60.0 mins
macd_15_25: 392 rows, 28.0 mins
macd_signal_5: 392 rows, 28.0 mins
macd_hist: 392 rows, 28.0 mins
adx_40: 1106 rows, 79.0 mins
vol_sum_15: 196 rows, 14.0 mins
std_lr_15_5: 56 rows, 4.0 mins
std_Mkt_lrt_15_10: 126 rows, 9.0 mins
std_Crypto_Index_15: 196 rows, 14.0 mins
max nan mins=79.0


Unnamed: 0_level_0,Unnamed: 1_level_0,lr_15,Mkt_lrt_15,Crypto_Index,slowK,slowD,fastK,fastD,rsi_60,macd_15_25,macd_signal_5,macd_hist,adx_40,vol_sum_15,std_lr_15_5,std_Mkt_lrt_15_10,std_Crypto_Index_15
timestamp,Asset_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1641772800,13,-0.003493,-0.003263,7532.261935,65.391086,55.022644,22.058824,20.405045,43.458805,-1.1e-05,-1e-05,-3.124546e-07,17.208594,10203470.0,0.000853,0.001532,3.874972
1641772800,10,-0.003462,-0.003263,7532.261935,27.791994,26.12339,17.623554,60.294651,46.323024,1.481427,1.885207,-0.4037797,18.68402,29.6216,0.003268,0.001532,3.874972
1641772800,9,-0.004696,-0.003263,7532.261935,29.197168,25.312999,27.911018,21.72333,41.846081,-0.050992,-0.043601,-0.00739064,19.544074,5065.137,0.001116,0.001532,3.874972
1641772800,8,-0.001899,-0.003263,7532.261935,70.696046,72.645704,65.358362,57.977789,49.931053,0.001525,0.001004,0.0005207067,79.307572,53506.08,0.013499,0.001532,3.874972
1641772800,6,-0.00311,-0.003263,7532.261935,26.109021,25.788997,23.688969,38.170847,37.944579,-1.164433,-1.024059,-0.1403742,28.3281,5335.395,0.001051,0.001532,3.874972
1641772800,7,-0.003969,-0.003263,7532.261935,28.621263,29.365428,12.822458,18.629441,42.486766,-0.006396,-0.004225,-0.002171462,18.835485,5021.665,0.001014,0.001532,3.874972
1641772800,1,-0.002324,-0.003263,7532.261935,46.712745,41.673744,34.74166,39.723022,39.217776,-9.133145,-7.835613,-1.297532,32.151451,530.9589,0.000776,0.001532,3.874972
1641772800,4,-0.0035,-0.003263,7532.261935,42.674167,45.043987,15.257143,30.31746,40.538579,-3.1e-05,-2.2e-05,-8.873135e-06,19.521657,4875971.0,0.000893,0.001532,3.874972
1641772800,0,-0.002281,-0.003263,7532.261935,50.5436,53.592032,21.961674,43.766773,42.279728,-0.004755,0.011159,-0.01591443,22.395121,5152.49,0.00088,0.001532,3.874972
1641772800,2,-0.002764,-0.003263,7532.261935,63.298828,59.702026,28.915663,32.137188,43.65335,-0.033553,-0.026942,-0.006610882,24.349438,729.5325,0.000848,0.001532,3.874972


In [92]:
# Same check with a much shorter history (200 minutes per asset), to see how
# the final-minute features depend on the amount of warm-up data.
n_assets = sup_train.index.get_level_values('Asset_ID').nunique()

pre_minute = 200
# Independent copy: the feature columns must not be written into a slice of sup_train.
sup_train2 = sup_train.iloc[-n_assets*pre_minute:,:].copy()
# group_keys=False keeps the original (timestamp, Asset_ID) index and row
# order on every pandas version, so the final iloc[-n_assets:] still selects
# the last minute of every asset.
sup_train2 = sup_train2.groupby('Asset_ID', group_keys=False).apply(lambda x: lag_features(x,**fdict))

# Rows that still contain at least one NaN; kept in a variable for inspection
# because only the last expression of a cell is displayed.
nan_rows = sup_train2[sup_train2.isna().any(axis=1)]
# Feature columns for the final minute of every asset.
sup_train2.iloc[-n_assets:,8:]

Unnamed: 0_level_0,Unnamed: 1_level_0,lr_15,Mkt_lrt_15,Crypto_Index,slowK,slowD,fastK,fastD,rsi_60,macd_15_25,macd_signal_5,macd_hist,adx_40,vol_sum_15,std_lr_15_5,std_Mkt_lrt_15_10,std_Crypto_Index_15
timestamp,Asset_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
1641772800,13,-0.003493,-0.003263,7532.261935,65.391086,55.022644,22.058824,20.405045,43.186819,-1.1e-05,-1e-05,-3.124574e-07,17.855194,10203470.0,0.000853,0.001532,3.874972
1641772800,10,-0.003462,-0.003263,7532.261935,27.791994,26.12339,17.623554,60.294651,46.081308,1.481427,1.885206,-0.4037796,19.749248,29.6216,0.003268,0.001532,3.874972
1641772800,9,-0.004696,-0.003263,7532.261935,29.197168,25.312999,27.911018,21.72333,41.847646,-0.050992,-0.043601,-0.007390644,19.565463,5065.137,0.001116,0.001532,3.874972
1641772800,8,-0.001899,-0.003263,7532.261935,70.696046,72.645704,65.358362,57.977789,49.920025,0.001525,0.001004,0.0005207069,80.001199,53506.08,0.013499,0.001532,3.874972
1641772800,6,-0.00311,-0.003263,7532.261935,26.109021,25.788997,23.688969,38.170847,37.691425,-1.164433,-1.024059,-0.1403742,28.996887,5335.395,0.001051,0.001532,3.874972
1641772800,7,-0.003969,-0.003263,7532.261935,28.621263,29.365428,12.822458,18.629441,42.248069,-0.006396,-0.004225,-0.002171462,18.83831,5021.665,0.001014,0.001532,3.874972
1641772800,1,-0.002324,-0.003263,7532.261935,46.712745,41.673744,34.74166,39.723022,39.076006,-9.133139,-7.835606,-1.297533,32.056602,530.9589,0.000776,0.001532,3.874972
1641772800,4,-0.0035,-0.003263,7532.261935,42.674167,45.043987,15.257143,30.31746,40.315984,-3.1e-05,-2.2e-05,-8.873141e-06,20.532677,4875971.0,0.000893,0.001532,3.874972
1641772800,0,-0.002281,-0.003263,7532.261935,50.5436,53.592032,21.961674,43.766773,42.278914,-0.004755,0.011159,-0.01591446,22.231004,5152.49,0.00088,0.001532,3.874972
1641772800,2,-0.002764,-0.003263,7532.261935,63.298828,59.702026,28.915663,32.137188,43.87251,-0.033552,-0.026942,-0.006610892,23.752491,729.5325,0.000848,0.001532,3.874972


In [54]:
# Show the parameter names currently stored in fdict.
fdict.keys()

dict_keys(['std_lr_15', 'std_Mkt_lrt_15', 'std_Crypto_Index', 'rsi', 'macd_sig', 'macd_s', 'macd_l', 'lrtn', 'fastk2', 'fastk1', 'beta_s', 'beta_l', 'adx'])

In [53]:
## pre_minute: how many trailing minutes of history to keep per asset so that
## every feature built from fdict is non-NaN on the last minute.
## Each entry is an upper bound on the rows one feature needs, derived from the
## same period arithmetic lag_features uses rather than from fixed allowances.
## NOTE(review): non-NaN is not the same as converged. In this notebook's
## outputs the last-minute rsi/adx values differ slightly between a
## 2000-minute and a 200-minute history, so a larger margin may be wanted.
pre_minute = max([
    fdict['std_lr_15'] + 15,                              # rolling std on top of the 15-row log return
    fdict['std_Mkt_lrt_15'],
    fdict['std_Crypto_Index'],
    fdict['vol_sum'],                                     # rolling Volume sum window
    fdict['rsi'] + 1,
    fdict['macd_l'] + fdict['macd_sig'],                  # slow EMA followed by signal smoothing
    fdict['lrtn'] + 1,
    fdict['fastk2'] + int(3 * fdict['fastk2'] / 5),       # fast %K window plus %D smoothing
    fdict['fastk1'] + 2 * int(3 * fdict['fastk1'] / 5),   # fast %K window plus slow %K and slow %D smoothing
    fdict['adx'] * 2 + 1,
])
pre_minute

81