In [1]:
import traceback
import numpy as np
import pandas as pd
#import datatable as dt
import gc
import time
import os
import talib as ta
from script.crypto_API import *
import IPython

import tensorflow as tf
from tensorflow import keras
import tensorflow_probability as tfp

In [64]:
ASSET_DETAILS_CSV = './data/asset_details.csv'
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")
# sup_train=pd.read_csv('./data/supplemental_train.csv')
# assets_order = sup_train.Asset_ID[:14]
# assets_order = dict((t,i) for i,t in enumerate(assets_order))
mod_order = [3, 2, 0, 1, 4, 5, 7, 6, 8, 9, 10, 13, 12, 11]
features_list = ['Count', 'Open', 'High', 'Low', 'Close', 'Volume',
        'lr_15', 'Mkt_lrt_15', 'Crypto_Index', 'beta_6h',
       'lr_15_resid_6h', 'beta_2d', 'lr_15_resid_2d', 'lrtn_index_50', 'slowK',
       'slowD', 'fastK', 'fastD', 'rsi_30', 'macd_10_60', 'macd_signal_15',
       'macd_hist', 'adx_50', 'vol_sum_15', 'std_lr_15_30',
       'std_Mkt_lrt_15_10', 'std_Crypto_Index_30']

psets={'std_lr_15': 30, 'std_Mkt_lrt_15': 10, 'std_Crypto_Index': 30, 
            'rsi': 30, 'adx': 50, 'macd_sig': 15, 'macd_s': 10, 'macd_l': 60, 'lrtn': 50, 
            'fastk2': 10, 'fastk1': 15, 'beta_s': '6h', 'beta_l': '2d', 'vol_sum': 15}


version = 0
retrained = False #set to False to get correct out-train score
pre_minute=101 #psets['adx']*2+1 #for smoothing sensitive indicators
lstm_window = 15

MOD_FOLDER = "./trainedNN" + f"/kaggleLSTM_{version}"
MOD_FOLDER

'./trainedNN/kaggleLSTM_0'

In [3]:
def masked_cosine(y_true, y_pred):
    mask = tf.math.not_equal(y_true, 0.)
    y_true_masked = tf.boolean_mask(y_true, mask)
    y_pred_masked = tf.boolean_mask(y_pred, mask)
    return tf.keras.losses.cosine_similarity(y_true_masked, y_pred_masked)

def Correlation(y_true,y_pred): 
    return tf.math.abs(tfp.stats.correlation(y_pred,y_true, sample_axis=None, event_axis=None))

In [7]:
from os.path import exists
if exists(MOD_FOLDER):
    model = keras.models.load_model(MOD_FOLDER, 
                                custom_objects={"masked_cosine": masked_cosine, 'Correlation':Correlation}, 
                                compile =False)
    IPython.display.clear_output()
    print('finish loading the LSTM')
else:
    raise ValueError

finish loading the LSTM


In [11]:

def log_return(series, periods=5):
    return np.log(series).diff(periods)

def lag_features(df, fastk1,fastk2,adx,macd_s,macd_l,macd_sig,vol_sum,rsi,std_Crypto_Index,std_lr_15,std_Mkt_lrt_15, **kwargs):    
    if vol_sum >0 :
        df[f'vol_sum_{vol_sum}'] = ta.SMA(df['Volume'],vol_sum)*vol_sum
    ####TECH indicators
    df['slowK'], df['slowD'] = ta.STOCH(df.High, df.Low, df.Close, 
                                        fastk_period=fastk1, slowk_period=int(3*fastk1/5), slowd_period=int(3*fastk1/5),
                                        slowk_matype=0, slowd_matype=0)
    df['fastK'], df['fastD'] = ta.STOCHF(df.High, df.Low, df.Close,
                                         fastk_period=fastk2, fastd_period=int(3*fastk2/5), 
                                         fastd_matype=0)
    df[f'macd_{macd_s}_{macd_l}'],df[f'macd_signal_{macd_sig}'], df['macd_hist'] = \
                ta.MACD(df['Close'],fastperiod=macd_s, slowperiod=macd_l, signalperiod=macd_sig)
    ##smoothing sensitive ADX 150, RSI250, AD unKNOWN
    df[f'rsi_{rsi}'] = ta.RSI(df['Close'], timeperiod=rsi)
    df[f'adx_{adx}'] = ta.ADX(df['High'], df['Low'],df['Close'], timeperiod=adx)

    ####std volatility
    df[f'std_lr_15_{std_lr_15}'] = ta.STDDEV(df.lr_15,timeperiod=std_lr_15, nbdev=1)
    df[f'std_Mkt_lrt_15_{std_Mkt_lrt_15}'] = ta.STDDEV(df.Mkt_lrt_15,timeperiod=std_Mkt_lrt_15, nbdev=1)
    df[f'std_Crypto_Index_{std_Crypto_Index}'] = ta.STDDEV(df.Crypto_Index,timeperiod=std_Crypto_Index, nbdev=1)

def beta_resid(df, width): 
    b = ((ta.MULT(df.Mkt_lrt_15,df.lr_15).mean())/ \
        (ta.MULT(df.Mkt_lrt_15,df.Mkt_lrt_15).mean()))
    if b in [np.nan,np.inf,-np.inf]:
        b=0
    return b 
def get_features(df_feat, psets):
    pd.options.mode.chained_assignment = None  # default='warn'
    df_feat[f"lr_15_resid_{psets['beta_s']}"] = ta.SUB(df_feat.lr_15, ta.MULT(df_feat[f"beta_{psets['beta_s']}"], df_feat.Mkt_lrt_15)).rename(f"lr_15_resid_{psets['beta_s']}")
    df_feat[f"lr_15_resid_{psets['beta_l']}"] = ta.SUB(df_feat.lr_15, ta.MULT(df_feat[f"beta_{psets['beta_l']}"], df_feat.Mkt_lrt_15)).rename(f"lr_15_resid_{psets['beta_l']}")
    df_feat[f"lrtn_index_{psets['lrtn']}"] = log_return(df_feat.Crypto_Index, psets['lrtn'])
    lag_features(df_feat, **psets)
    return df_feat

## API submission

In [9]:
start = datestring_to_timestamp('2021-06-13T00:00:00')#1623542400
#end = datestring_to_timestamp('2021-09-21T00:00:00')#1632182400
end = datestring_to_timestamp('2021-06-13T01:00:00') #1623628800
train_df = read_csv_slice(file_path='./data/train.csv', 
                          use_window=[start, end])


In [12]:
def beta_window(beta):
    num, unit = int(beta[:-1]),beta[-1]
    if unit == 'h':
        width = 60*num
    elif unit == 'd':
        width = 60*24*num
    return width

beta_sw = beta_window(psets['beta_s'])
beta_lw = beta_window(psets['beta_l'])
beta_lw

2880

In [13]:
pre_minute_beta =  beta_lw + 15#for beta calculations
pre_minute, pre_minute_beta

(101, 2895)

In [145]:
add_weight_map = dict(zip(df_asset_details.Asset_ID, 
                        df_asset_details.Weight/df_asset_details.Weight.sum()))

###load sup_train
sup_train = pd.read_csv('./data/supplemental_train.csv')
sup_train = sup_train.sort_values('timestamp').set_index("timestamp")
ind = sup_train.index.unique()
###consistent timestamp for all 14 assets
def reindex(df):
    df = df.reindex(range(ind[0],ind[-1]+60,60),method='nearest')
    df = df.fillna(method="ffill").fillna(method="bfill")
    return df
sup_train = sup_train.groupby('Asset_ID').apply(reindex).reset_index(0, drop=True).sort_index()
sup_train = sup_train.iloc[(-14*pre_minute_beta):,:]
#add weight
sup_train['Weight'] = sup_train['Asset_ID'].map(add_weight_map)
sup_train.drop('Target',axis=1, inplace=True)

sup_train.set_index('Asset_ID',append=True, inplace=True)
##reorder by mod_order
sup_train = sup_train.reindex(list(zip(sup_train.index.get_level_values('timestamp'),
                           mod_order*pre_minute_beta)))

#######################################add lr_15,mkt_lr_15,crypto_index, beta,lr_mkt_resid
lr_15 = sup_train.groupby('Asset_ID').apply( 
        lambda x: log_return(x[['Close']],15)
        )
sup_train['lr_15'] = lr_15['Close']

mkt_lr_15 = sup_train.groupby('timestamp').apply( 
    lambda x: x[["lr_15", "Close"]].multiply(x["Weight"], axis="index").sum()
    )
mkt_lr_15.columns = ['Mkt_lrt_15','Crypto_Index']
firsts = sup_train.index.get_level_values('timestamp')
sup_train[['Mkt_lrt_15','Crypto_Index']] = mkt_lr_15.loc[firsts].values

####make beta,lr_mkt_resid, placeholder
sup_train[f"beta_{psets['beta_s']}"] = 0
sup_train[f"beta_{psets['beta_l']}"] = 0

sup_train.iloc[-14:,:]

Unnamed: 0_level_0,Unnamed: 1_level_0,Count,Open,High,Low,Close,Volume,VWAP,Weight,lr_15,Mkt_lrt_15,Crypto_Index,beta_6h,beta_2d
timestamp,Asset_ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1642982400,3,457.0,1.124904,1.1262,1.119907,1.121151,249923.2,1.123218,0.107797,-0.001616,-0.001114,6494.648269,0,0
1642982400,2,139.0,302.506,302.77,301.8,302.112,209.6457,302.422931,0.058657,0.001066,-0.001114,6494.648269,0,0
1642982400,0,420.0,383.901333,384.003,382.371,382.93,1490.571,383.253768,0.105286,-0.001288,-0.001114,6494.648269,0,0
1642982400,1,2917.0,36262.038571,36302.0,36176.45,36221.987143,110.0532,36247.575361,0.16585,-0.001337,-0.001114,6494.648269,0,0
1642982400,4,227.0,0.141784,0.1419,0.1413,0.141391,1009688.0,0.141649,0.086971,-6.6e-05,-0.001114,6494.648269,0,0
1642982400,5,468.0,2.27295,2.2777,2.26,2.2648,202194.0,2.269373,0.033911,-0.002785,-0.001114,6494.648269,0,0
1642982400,7,280.0,25.128433,25.191,24.984,25.028067,6709.816,25.093705,0.050867,-0.004492,-0.001114,6494.648269,0,0
1642982400,6,2510.0,2540.678571,2543.1,2533.96,2536.151429,1057.766,2538.323926,0.144188,0.000148,-0.001114,6494.648269,0,0
1642982400,8,49.0,0.814567,0.8153,0.8112,0.8124,6019.27,0.813656,0.026874,-0.001189,-0.001114,6494.648269,0,0
1642982400,9,214.0,112.293,112.47,111.84,111.983333,1243.86,112.127619,0.058657,-0.001254,-0.001114,6494.648269,0,0


In [148]:
num_asset_test = df_test.shape[0]
row_asset_id_map = dict(zip(df_test.row_id, df_test.Asset_ID))
test_timestamp = df_test.timestamp.values[0]
print(f"{test_timestamp}", end = "\r")

timestamp_list = sup_train.index.get_level_values('timestamp').unique().values
timestamp_list = np.append(timestamp_list,test_timestamp)
#######################################format df_test 
###add weight and index
df_test['Weight'] = df_test['Asset_ID'].map(add_weight_map)
###fillin missing assets as nan
df_test.set_index(['timestamp','Asset_ID'],inplace=True)
#df_test = df_test.reindex(list(zip([test_timestamp]*14,range(14))))
df_test = df_test.reindex(list(zip([test_timestamp]*14,mod_order)))

########################################concat to sup_train, add lr_15,mkt_lr_15,crypto_index
sup_train = pd.concat([sup_train,df_test.drop('row_id',axis=1)],join='outer')
test_lr_15 = sup_train.loc[timestamp_list[[-16,-1]]].groupby('Asset_ID').apply(
    lambda x: np.log(x[['Close']]).diff()
)#same as mod_order
sup_train.loc[test_timestamp, 'lr_15'] = test_lr_15.loc[test_timestamp,'Close'].values
sup_train.loc[test_timestamp, ['Mkt_lrt_15','Crypto_Index']] = sup_train.loc[test_timestamp, ["lr_15", "Close"]].multiply(sup_train.loc[test_timestamp,"Weight"], axis="index").sum(skipna=True).values

########################################beta_sl, reorder by mod_order
beta_s = sup_train[['lr_15','Mkt_lrt_15']].iloc[-14*(beta_sw):,:].groupby('Asset_ID').apply(
    lambda x: beta_resid(x,beta_sw)
).rename(f"beta_{psets['beta_s']}").reindex(mod_order)
beta_l = sup_train[['lr_15','Mkt_lrt_15']].iloc[-14*(beta_lw):,:].groupby('Asset_ID').apply(
    lambda x: beta_resid(x,beta_lw)
).rename(f"beta_{psets['beta_l']}").reindex(mod_order)
sup_train.loc[test_timestamp, [f"beta_{psets['beta_s']}",f"beta_{psets['beta_l']}"]] = \
    pd.concat([beta_s, beta_l],axis=1).values
#########################################fill in missing assets as forward
if num_asset_test <14:
    #ffill in missing
    sup_train = sup_train.groupby('Asset_ID').apply(lambda x: x.fillna(method="ffill")).iloc[14:,:]
else:
    sup_train = sup_train.iloc[14:,:]
#######################################add features to test timestamp, make sure no nan in this lstm window
sup_train2 = sup_train.iloc[(-14*(pre_minute+lstm_window)):,:].copy()
xx_test=sup_train2.groupby('Asset_ID').apply(
    lambda x: get_features(x,psets)
).iloc[(-14*lstm_window):,:]

1623542400

In [153]:
xx_test.shape

(1, 15, 14, 27)

In [146]:
myapi = API(train_df)
from datetime import datetime 
start_time = datetime.now()

for df_test, df_pred in myapi:
    break
    num_asset_test = df_test.shape[0]
    row_asset_id_map = dict(zip(df_test.row_id, df_test.Asset_ID))
    test_timestamp = df_test.timestamp.values[0]
    print(f"{test_timestamp}", end = "\r")
    
    timestamp_list = sup_train.index.get_level_values('timestamp').unique().values
    timestamp_list = np.append(timestamp_list,test_timestamp)
    #######################################format df_test 
    ###add weight and index
    df_test['Weight'] = df_test['Asset_ID'].map(add_weight_map)
    ###fillin missing assets as nan
    df_test.set_index(['timestamp','Asset_ID'],inplace=True)
    #df_test = df_test.reindex(list(zip([test_timestamp]*14,range(14))))
    df_test = df_test.reindex(list(zip([test_timestamp]*14,mod_order)))
    
    ########################################concat to sup_train, add lr_15,mkt_lr_15,crypto_index
    sup_train = pd.concat([sup_train,df_test.drop('row_id',axis=1)],join='outer')
    test_lr_15 = sup_train.loc[timestamp_list[[-16,-1]]].groupby('Asset_ID').apply(
        lambda x: np.log(x[['Close']]).diff()
    )#same as mod_order
    sup_train.loc[test_timestamp, 'lr_15'] = test_lr_15.loc[test_timestamp,'Close'].values
    sup_train.loc[test_timestamp, ['Mkt_lrt_15','Crypto_Index']] = sup_train.loc[test_timestamp, ["lr_15", "Close"]].multiply(sup_train.loc[test_timestamp,"Weight"], axis="index").sum(skipna=True).values
    
    ########################################beta_sl, reorder by mod_order
    beta_s = sup_train[['lr_15','Mkt_lrt_15']].iloc[-14*(beta_sw):,:].groupby('Asset_ID').apply(
        lambda x: beta_resid(x,beta_sw)
    ).rename(f"beta_{psets['beta_s']}").reindex(mod_order)
    beta_l = sup_train[['lr_15','Mkt_lrt_15']].iloc[-14*(beta_lw):,:].groupby('Asset_ID').apply(
        lambda x: beta_resid(x,beta_lw)
    ).rename(f"beta_{psets['beta_l']}").reindex(mod_order)
    sup_train.loc[test_timestamp, [f"beta_{psets['beta_s']}",f"beta_{psets['beta_l']}"]] = \
        pd.concat([beta_s, beta_l],axis=1).values
    #########################################fill in missing assets as forward
    if num_asset_test <14:
        #ffill in missing
        sup_train = sup_train.groupby('Asset_ID').apply(lambda x: x.fillna(method="ffill")).iloc[14:,:]
    else:
        sup_train = sup_train.iloc[14:,:]
    #######################################add features to test timestamp, make sure no nan in this lstm window
    sup_train2 = sup_train.iloc[(-14*(pre_minute+lstm_window)):,:].copy()
    xx_test=sup_train2.groupby('Asset_ID').apply(
        lambda x: get_features(x,psets)
    ).iloc[(-14*lstm_window):,:]
    ########################################reorder and make np.array format
    #xx_test = xx_test.reindex(list(zip(xx_test.index.get_level_values('timestamp'),mod_order*lstm_window)))
    xx_test = np.array(xx_test[features_list])#(210,27)
    xx_test = xx_test.reshape(-1, 14, 27) #(15, 14, 27), len(features_list)
    xx_test = np.expand_dims(xx_test, axis=0) #(none , 15, 14, 27)
    #######################################make prediction
    y_pred = model.predict(xx_test).squeeze()#.reshape(-1, 1).squeeze()
    y = dict(zip(mod_order, y_pred))#dict(asset_id:pred)
    #match with row_id
    df_pred['Target']= [y[row_asset_id_map[r]] for r in df_pred['row_id']]
    myapi.predict(df_pred)

time_elapsed = datetime.now() - start_time
print('Time elapsed total (hh:mm:ss.ms) {}'.format(time_elapsed))
print(f'time elapsed per iteration {time_elapsed/myapi.init_num_times}')
print(f'Submission time estimate {129600*time_elapsed/myapi.init_num_times}')

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set. ;)
Time elapsed total (hh:mm:ss.ms) 0:00:00.003552
time elapsed per iteration 0:00:00.000059
Submission time estimate 0:00:07.672320


In [144]:
model.predict(xx_test).squeeze()#.reshape(-1, 1).squeeze()

array([ 0.07281665, -0.46627435,  2.070983  , -0.8853696 , -1.768797  ,
       -0.6417947 , -1.3258556 ,  0.00251037,  0.86014843, -0.03998267,
       -0.66459185, -0.25610355, -0.41952956, -0.1513101 ], dtype=float32)

In [139]:
##scoring
id_2_weight = dict(zip(df_asset_details.Asset_ID, df_asset_details.Weight))
df, score = myapi.score(id_2_weight)
print(f"Your LB score is {round(score, 4)}")
myapi.predictions

Your LB score is 0.2764


[            row_id    Target
 22221694  22221694 -0.404724
 22221695  22221695 -0.183793
 22221696  22221696  2.278096
 22221697  22221697 -0.476878
 22221698  22221698 -1.776959
 22221699  22221699 -0.644963
 22221700  22221700 -1.777952
 22221701  22221701  0.271466
 22221702  22221702  0.999579
 22221703  22221703 -0.207778
 22221704  22221704 -0.290701
 22221705  22221705 -0.008112
 22221706  22221706 -0.415728
 22221707  22221707 -0.212668,
             row_id    Target
 22221708  22221708 -0.362784
 22221709  22221709 -0.196250
 22221710  22221710  2.264895
 22221711  22221711 -0.525895
 22221712  22221712 -1.776985
 22221713  22221713 -0.596066
 22221714  22221714 -1.733016
 22221715  22221715  0.260795
 22221716  22221716  1.079230
 22221717  22221717 -0.188165
 22221718  22221718 -0.348255
 22221719  22221719 -0.010486
 22221720  22221720 -0.472349
 22221721  22221721 -0.209048,
             row_id    Target
 22221722  22221722 -0.316932
 22221723  22221723 -0.213432
 2222172