In [12]:
import talib as ta
import numpy as np
import pandas as pd

In [13]:
# Per-asset weights from the asset details file, rescaled so they sum to 1.
ASSET_DETAILS_CSV = './data/asset_details.csv'
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
add_weight_map = (
    df_asset_details
    .set_index('Asset_ID')['Weight']
    .div(df_asset_details.Weight.sum())
    .to_dict()
)

# Load the supplemental rows, attach each row's asset weight, order by
# timestamp, index by (timestamp, Asset_ID) and discard the Target column.
sup_train = pd.read_csv('./data/supplemental_train.csv')
sup_train['Weight'] = sup_train['Asset_ID'].map(add_weight_map)
sup_train = (
    sup_train
    .sort_values('timestamp')
    .set_index(["timestamp", 'Asset_ID'])
    .drop('Target', axis=1)
)


def log_return(series, periods=5):
    """Return the log return of ``series`` over ``periods`` rows.

    Each output element is ``log(series[i]) - log(series[i - periods])``; the
    first ``periods`` elements have no earlier value to compare with and are NaN.
    Works on anything ``np.log`` turns into a pandas object with ``shift``
    (Series or DataFrame).
    """
    logged = np.log(series)
    return logged - logged.shift(periods)
# 15-row log return of Close, computed separately for every asset.
# SeriesGroupBy.transform always returns a result aligned with sup_train's own
# index. DataFrame.groupby(...).apply(...) instead prepends the group key to the
# index on pandas >= 2.0, so its result no longer lines up with the
# (timestamp, Asset_ID) index when assigned back.
# NOTE(review): the difference is taken over rows, not minutes - this assumes
# every asset has one row per minute with no gaps; confirm against the data.
lr_15 = (
    sup_train.groupby(level='Asset_ID')['Close']
    .transform(log_return, periods=15)
    .to_frame('Close')
)
sup_train['lr_15'] = lr_15['Close']

# Weight-scaled sum per timestamp of the 15-row log return ("Mkt_lrt_15") and
# of Close ("Crypto_Index"). NaNs are skipped by the group sum, as they were by
# sum(skipna=True). Scaling once and summing with groupby avoids one Python
# call per timestamp.
mkt_lr_15 = (
    sup_train[["lr_15", "Close"]]
    .multiply(sup_train["Weight"], axis="index")
    .groupby(level='timestamp')
    .sum()
)
mkt_lr_15.columns = ['Mkt_lrt_15','Crypto_Index']

# Broadcast the per-timestamp values back onto every (timestamp, Asset_ID) row.
firsts = sup_train.index.get_level_values('timestamp')
sup_train[['Mkt_lrt_15','Crypto_Index']] = mkt_lr_15.loc[firsts].values

In [14]:
# Earliest and latest timestamp present in the index.
ts_level = sup_train.index.get_level_values('timestamp')
ts_level.min(), ts_level.max()

(1632182460, 1642982400)

In [15]:
# (number of rows, number of columns) of sup_train after the derived columns were added.
sup_train.shape

(2518278, 11)

In [16]:
def lag_features(df, fastk1,fastk2,adx,macd_s,macd_l,macd_sig,rsi,vol_sum,std_Crypto_Index,std_lr_15,std_Mkt_lrt_15, **kwargs):
    """Return a copy of ``df`` with TA-Lib indicator and rolling-std columns added.

    The input frame is NOT modified. The function is called through
    ``groupby(...).apply`` in this notebook, and pandas documents functions that
    mutate the object passed to ``apply`` as unsupported, so all new columns are
    written to a copy.

    ``df`` must provide the columns Open, High, Low, Close, Volume, lr_15,
    Mkt_lrt_15 and Crypto_Index.
    NOTE(review): indicators run over the rows in their existing order, so
    ``df`` is presumably a single asset sorted by time - confirm at call sites.

    Parameters
    ----------
    fastk1 : int
        Fast-%K period of ``ta.STOCH``; the slow-%K and slow-%D periods are
        both ``int(3*fastk1/5)``. Produces ``slowK`` / ``slowD``.
    fastk2 : int
        Fast-%K period of ``ta.STOCHF``; the fast-%D period is
        ``int(3*fastk2/5)``. Produces ``fastK`` / ``fastD``.
    adx : int
        ``ta.ADX`` period -> ``adx_{adx}``.
    macd_s, macd_l, macd_sig : int
        Fast, slow and signal periods of ``ta.MACD`` on Close ->
        ``macd_{macd_s}_{macd_l}``, ``macd_signal_{macd_sig}``, ``macd_hist``.
    rsi : int
        ``ta.RSI`` period on Close -> ``rsi_{rsi}``.
    vol_sum : int
        Window of the Volume moving average, multiplied back by the window
        length -> ``vol_sum_{vol_sum}``.
    std_Crypto_Index, std_lr_15, std_Mkt_lrt_15 : int
        ``ta.STDDEV`` windows for the Crypto_Index, lr_15 and Mkt_lrt_15 columns
        -> ``std_Crypto_Index_{n}``, ``std_lr_15_{n}``, ``std_Mkt_lrt_15_{n}``.
    **kwargs
        Must contain ``ATR`` (``ta.ATR`` period -> ``ATR_{n}``) and ``willr``
        (``ta.WILLR`` period -> ``willr_{n}``). Any other keys are ignored, which
        lets callers pass a larger parameter dict unchanged.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the columns above plus ``TRENDLINE``
        (``ta.HT_TRENDLINE`` of Open). Leading rows of each new column are NaN
        until the indicator has enough history.

    Raises
    ------
    KeyError
        If ``ATR`` or ``willr`` is missing from ``kwargs``.
    """
    # Fail before doing any indicator work if a required extra period is absent.
    missing = [key for key in ('ATR', 'willr') if key not in kwargs]
    if missing:
        raise KeyError(f"lag_features() missing required keyword argument(s): {missing}")
    atr_period = kwargs['ATR']
    willr_period = kwargs['willr']

    # Work on a copy so the caller's frame (often a groupby slice) is untouched.
    df = df.copy()

    ####TECH indicators
    slow_smooth = int(3*fastk1/5)
    df['slowK'], df['slowD'] = ta.STOCH(df.High, df.Low, df.Close,
                                        fastk_period=fastk1, slowk_period=slow_smooth, slowd_period=slow_smooth,
                                        slowk_matype=0, slowd_matype=0)
    df['fastK'], df['fastD'] = ta.STOCHF(df.High, df.Low, df.Close,
                                         fastk_period=fastk2, fastd_period=int(3*fastk2/5),
                                         fastd_matype=0)
    df[f'rsi_{rsi}'] = ta.RSI(df['Close'], timeperiod=rsi)
    df[f'macd_{macd_s}_{macd_l}'], df[f'macd_signal_{macd_sig}'], df['macd_hist'] = \
                ta.MACD(df['Close'], fastperiod=macd_s, slowperiod=macd_l, signalperiod=macd_sig)
    df[f'adx_{adx}'] = ta.ADX(df['High'], df['Low'], df['Close'], timeperiod=adx)  # Average Directional Movement Index
    # Moving average times the window length, i.e. the rolling sum of Volume.
    df[f'vol_sum_{vol_sum}'] = ta.SMA(df['Volume'], vol_sum)*vol_sum

    ####std volatility
    df[f'std_lr_15_{std_lr_15}'] = ta.STDDEV(df.lr_15, timeperiod=std_lr_15, nbdev=1)
    df[f'std_Mkt_lrt_15_{std_Mkt_lrt_15}'] = ta.STDDEV(df.Mkt_lrt_15, timeperiod=std_Mkt_lrt_15, nbdev=1)
    df[f'std_Crypto_Index_{std_Crypto_Index}'] = ta.STDDEV(df.Crypto_Index, timeperiod=std_Crypto_Index, nbdev=1)

    ###NEW
    df[f"ATR_{atr_period}"] = ta.ATR(df['High'], df['Low'], df['Close'], timeperiod=atr_period)
    df['TRENDLINE'] = ta.HT_TRENDLINE(df['Open'])
    df[f"willr_{willr_period}"] = ta.WILLR(df['High'], df['Low'], df['Close'], timeperiod=willr_period)
    return df



In [17]:
# Look-back periods passed to lag_features(). Keys that lag_features() neither
# names nor reads from **kwargs (lrtn, beta_s, beta_l) are ignored by it.
fdict={'willr': 60,
 'vol_sum': 15,
 'std_lr_15': 30,
 'std_Mkt_lrt_15': 10,
 'std_Crypto_Index': 30,
 'rsi': 30,
 'macd_sig': 15,
 'macd_s': 10,
 'macd_l': 60,
 'lrtn': 50,
 'fastk2': 10,
 'fastk1': 15,
 'beta_s': '6h',
 'beta_l': '2d',
 'adx': 50,
 'ATR': 60}

# Measure how many leading minutes each feature needs before it stops being NaN.
pre_minute = 2000
# Rows per minute. The tail slice below assumes every asset has a row for each
# of the last `pre_minute` minutes.
n_assets = sup_train.index.get_level_values('Asset_ID').nunique()
# .copy(): take a real frame rather than a view-like slice of sup_train.
sup_train2 = sup_train.iloc[-n_assets*pre_minute:,:].copy()
# group_keys=False keeps the original (timestamp, Asset_ID) index and row order
# on pandas >= 2.0, where apply() would otherwise prepend Asset_ID to the index
# and return the rows grouped by asset.
sup_train2 = sup_train2.groupby('Asset_ID', group_keys=False).apply(lambda x: lag_features(x,**fdict))

# NaN count per column, and the same count expressed in minutes per asset.
nan_num = sup_train2.isna().sum().tolist()
nan_mins = [count / n_assets for count in nan_num]
for col, count, mins in zip(sup_train2.columns, nan_num, nan_mins):
    print(f"{col}: {count} rows, {mins} mins")
print(f"max nan mins={max(nan_mins)}")

# Rows that still contain a NaN, kept under a name for inspection (the bare
# expression was previously evaluated and discarded).
rows_with_nan = sup_train2[sup_train2.isna().any(axis=1)]
# Final minute, derived columns only (positions 0-7 are Count..Weight).
sup_train2.iloc[-n_assets:,8:]

Count: 0 rows, 0.0 mins
Open: 0 rows, 0.0 mins
High: 0 rows, 0.0 mins
Low: 0 rows, 0.0 mins
Close: 0 rows, 0.0 mins
Volume: 0 rows, 0.0 mins
VWAP: 0 rows, 0.0 mins
Weight: 0 rows, 0.0 mins
lr_15: 0 rows, 0.0 mins
Mkt_lrt_15: 0 rows, 0.0 mins
Crypto_Index: 0 rows, 0.0 mins
slowK: 420 rows, 30.0 mins
slowD: 420 rows, 30.0 mins
fastK: 196 rows, 14.0 mins
fastD: 196 rows, 14.0 mins
rsi_30: 420 rows, 30.0 mins
macd_10_60: 1022 rows, 73.0 mins
macd_signal_15: 1022 rows, 73.0 mins
macd_hist: 1022 rows, 73.0 mins
adx_50: 1386 rows, 99.0 mins
vol_sum_15: 196 rows, 14.0 mins
std_lr_15_30: 406 rows, 29.0 mins
std_Mkt_lrt_15_10: 126 rows, 9.0 mins
std_Crypto_Index_30: 406 rows, 29.0 mins
ATR_60: 840 rows, 60.0 mins
TRENDLINE: 882 rows, 63.0 mins
willr_60: 826 rows, 59.0 mins
max nan mins=99.0


Unnamed: 0_level_0,Unnamed: 1_level_0,lr_15,Mkt_lrt_15,Crypto_Index,slowK,slowD,fastK,fastD,rsi_30,macd_10_60,macd_signal_15,macd_hist,adx_50,vol_sum_15,std_lr_15_30,std_Mkt_lrt_15_10,std_Crypto_Index_30,ATR_60,TRENDLINE,willr_60
timestamp,Asset_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1642982400,13,-0.000622,-0.001114,6494.648269,59.905404,69.549659,33.418367,40.759107,52.838253,0.000142,0.000206,-6.3e-05,17.109192,21424840.0,0.005697,0.001848,8.745965,0.000182,0.057491,-39.491889
1642982400,10,-0.002008,-0.001114,6494.648269,49.483092,61.626629,61.600136,43.506434,54.529856,5.285849,7.954096,-2.668247,14.244778,87.34481,0.004661,0.001848,8.745965,5.95426,1832.947071,-31.55221
1642982400,9,-0.001254,-0.001114,6494.648269,65.325459,72.574171,22.751323,48.151957,57.476814,0.794795,0.93306,-0.138265,34.110821,18579.78,0.006084,0.001848,8.745965,0.444724,111.926883,-21.180556
1642982400,8,-0.001189,-0.001114,6494.648269,68.490289,76.730922,21.428571,53.335891,59.018494,0.008934,0.010039,-0.001105,28.819432,107681.6,0.00725,0.001848,8.745965,0.003728,0.811013,-12.207792
1642982400,6,0.000148,-0.001114,6494.648269,55.321706,61.557516,44.483031,52.695343,58.505906,19.188244,23.032518,-3.844274,38.3607,19103.85,0.00368,0.001848,8.745965,9.827786,2535.533645,-13.01596
1642982400,7,-0.004492,-0.001114,6494.648269,45.866008,55.486098,20.496124,44.379531,54.217994,0.159849,0.202309,-0.04246,25.371531,41825.28,0.005412,0.001848,8.745965,0.099856,25.074783,-24.857834
1642982400,1,-0.001337,-0.001114,6494.648269,56.802987,65.693501,25.221347,45.744561,54.342695,131.897521,173.636165,-41.738643,31.022512,1216.447,0.004165,0.001848,8.745965,122.043891,36252.19578,-30.736478
1642982400,4,-6.6e-05,-0.001114,6494.648269,59.490364,67.344531,10.682353,39.765283,52.390914,0.000592,0.000676,-8.4e-05,24.574737,21438910.0,0.008182,0.001848,8.745965,0.000633,0.141385,-45.175258
1642982400,0,-0.001288,-0.001114,6494.648269,67.653708,77.355154,23.777116,44.70378,56.280375,2.667685,3.317364,-0.64968,33.059342,12032.41,0.005818,0.001848,8.745965,1.081932,383.292427,-23.5625
1642982400,2,0.001066,-0.001114,6494.648269,65.34212,69.915164,44.5,49.284144,62.271397,2.409332,2.78677,-0.377437,28.322375,3515.688,0.004626,0.001848,8.745965,0.887057,301.624064,-11.411501


In [18]:
# Repeat the feature computation with only 200 minutes of history per asset and
# compare the final minute with the 2000-minute run. In the recorded outputs the
# stochastic, vol_sum, std_*, TRENDLINE and willr columns are identical, while
# rsi, macd*, adx and ATR differ slightly (e.g. adx_50 for asset 13:
# 17.109192 with 2000 minutes vs 17.723073 with 200), so those columns depend on
# how much history is supplied even once they are non-NaN.
pre_minute = 200
# Rows per minute. The tail slice below assumes every asset has a row for each
# of the last `pre_minute` minutes.
n_assets = sup_train.index.get_level_values('Asset_ID').nunique()
# .copy(): take a real frame rather than a view-like slice of sup_train.
sup_train2 = sup_train.iloc[-n_assets*pre_minute:,:].copy()
# group_keys=False keeps the original (timestamp, Asset_ID) index and row order
# on pandas >= 2.0, where apply() would otherwise prepend Asset_ID to the index.
sup_train2 = sup_train2.groupby('Asset_ID', group_keys=False).apply(lambda x: lag_features(x,**fdict))
# Rows that still contain a NaN, kept under a name for inspection (the bare
# expression was previously evaluated and discarded).
rows_with_nan = sup_train2[sup_train2.isna().any(axis=1)]
# Final minute, derived columns only (positions 0-7 are Count..Weight).
sup_train2.iloc[-n_assets:,8:]

Unnamed: 0_level_0,Unnamed: 1_level_0,lr_15,Mkt_lrt_15,Crypto_Index,slowK,slowD,fastK,fastD,rsi_30,macd_10_60,macd_signal_15,macd_hist,adx_50,vol_sum_15,std_lr_15_30,std_Mkt_lrt_15_10,std_Crypto_Index_30,ATR_60,TRENDLINE,willr_60
timestamp,Asset_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1642982400,13,-0.000622,-0.001114,6494.648269,59.905404,69.549659,33.418367,40.759107,52.83309,0.000142,0.000206,-6.3e-05,17.723073,21424840.0,0.005697,0.001848,8.745965,0.000181,0.057491,-39.491889
1642982400,10,-0.002008,-0.001114,6494.648269,49.483092,61.626629,61.600136,43.506434,54.52223,5.275542,7.940582,-2.66504,14.361313,87.34481,0.004661,0.001848,8.745965,5.912055,1832.947071,-31.55221
1642982400,9,-0.001254,-0.001114,6494.648269,65.325459,72.574171,22.751323,48.151957,57.466329,0.794298,0.932409,-0.138111,32.435046,18579.78,0.006084,0.001848,8.745965,0.442222,111.926883,-21.180556
1642982400,8,-0.001189,-0.001114,6494.648269,68.490289,76.730922,21.428571,53.335891,59.029531,0.008932,0.010037,-0.001105,30.288226,107681.6,0.00725,0.001848,8.745965,0.003706,0.811013,-12.207792
1642982400,6,0.000148,-0.001114,6494.648269,55.321706,61.557516,44.483031,52.695343,58.504715,19.171123,23.010071,-3.838948,38.541788,19103.85,0.00368,0.001848,8.745965,9.847167,2535.533645,-13.01596
1642982400,7,-0.004492,-0.001114,6494.648269,45.866008,55.486098,20.496124,44.379531,54.2111,0.159903,0.20238,-0.042477,25.632911,41825.28,0.005412,0.001848,8.745965,0.099073,25.074783,-24.857834
1642982400,1,-0.001337,-0.001114,6494.648269,56.802987,65.693501,25.221347,45.744561,54.341882,131.800449,173.508891,-41.708443,31.827496,1216.447,0.004165,0.001848,8.745965,121.287226,36252.19578,-30.736478
1642982400,4,-6.6e-05,-0.001114,6494.648269,59.490364,67.344531,10.682353,39.765283,52.379913,0.000591,0.000674,-8.4e-05,22.677791,21438910.0,0.008182,0.001848,8.745965,0.000628,0.141385,-45.175258
1642982400,0,-0.001288,-0.001114,6494.648269,67.653708,77.355154,23.777116,44.70378,56.285974,2.667455,3.317063,-0.649608,33.871578,12032.41,0.005818,0.001848,8.745965,1.073387,383.292427,-23.5625
1642982400,2,0.001066,-0.001114,6494.648269,65.34212,69.915164,44.5,49.284144,62.266294,2.409192,2.786585,-0.377394,28.383728,3515.688,0.004626,0.001848,8.745965,0.883349,301.624064,-11.411501


In [19]:
# List the parameter names held in fdict.
fdict.keys()

dict_keys(['willr', 'vol_sum', 'std_lr_15', 'std_Mkt_lrt_15', 'std_Crypto_Index', 'rsi', 'macd_sig', 'macd_s', 'macd_l', 'lrtn', 'fastk2', 'fastk1', 'beta_s', 'beta_l', 'adx', 'ATR'])

In [20]:
## pre_minute: number of trailing minutes (rows per asset) to feed lag_features
## so that the last minute has non-NaN values in every feature column.
## Each term is "leading NaN rows + 1" for one feature, matching the NaN counts
## printed by the 2000-minute run above (slowK 30, fastK 14, rsi 30, macd 73,
## adx 99, vol_sum 14, ATR 60, TRENDLINE 63, willr 59).
## Non-NaN does not mean converged: rsi, macd, adx and ATR still change slightly
## when more history is supplied (compare the 200- and 2000-minute outputs).
pre_minute = max([
    fdict['std_lr_15']+15,       # rolling std of lr_15, plus 15 rows for lr_15 itself
    fdict['std_Mkt_lrt_15'],
    fdict['std_Crypto_Index'],
    fdict['rsi']+1,
    # MACD warms up over the slow EMA and then the signal EMA on top of it;
    # the previous macd_l+5 understated this (73 NaN rows measured).
    max(fdict['macd_s'], fdict['macd_l'])+fdict['macd_sig']-1,
    fdict['lrtn']+1,
    # STOCHF: fast-%K window followed by the fast-%D smoothing window.
    fdict['fastk2']+int(3*fdict['fastk2']/5)-1,
    # STOCH: fast-%K window followed by two smoothing windows of equal length;
    # the previous fastk1+10 understated this (30 NaN rows measured).
    fdict['fastk1']+2*(int(3*fdict['fastk1']/5)-1),
    fdict['adx']*2+1,
    # Features that were missing from the formula altogether.
    fdict['vol_sum'],
    fdict['ATR']+1,
    fdict['willr'],
    64,                          # HT_TRENDLINE: 63 leading NaN rows, no period parameter
])
pre_minute

101

In [26]:
# Scratch check: removing a key from a dict in place.
dd = dict(a=1, b=2)
dd.pop('a')
dd

{'b': 2}