# XGB model cv selection scoring

- `xgboost.cv`
- Compare the best candidate models on the test set, using the G-Research weighted correlation metric.

In [15]:
import os
import pandas as pd
import gc
import talib as ta
import numpy as np
import json
import pickle
import xgboost as xgb
from os.path import exists


## 1. version_num

- `model_nof_{version number}`

In [16]:
version_num = 40
# Leave this False to obtain the correct out-train score; True switches to the
# retrained model folder instead.
retrained = False
MOD_FOLDER = (
    f"./trainedXGB/retrained/model_nof_{version_num}_sub_sub"
    if retrained
    else f"./trainedXGB/model_nof_{version_num}"
)

MOD_FOLDER

'./trainedXGB/model_nof_40'

## 2. organized data set  

- new_data.ftr

In [17]:
# Read a fixed subset of columns from the prepared feather file.
df_train = pd.read_feather(
    './data/new_data.ftr',
    columns=[
        'timestamp', 'Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close',
        'Volume', 'Target', 'Weight', 'lr_15', 'Mkt_lrt_15',
        'Crypto_Index', 'beta', 'lr_mkt_resid',
    ],
)

In [18]:
# Show the five most recent rows; Target is blank (missing) for them in the
# recorded output.
df_train.tail(n=5)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,Target,Weight,lr_15,Mkt_lrt_15,Crypto_Index,beta,lr_mkt_resid
26473297,1641772800,9,218.0,130.842571,130.94,130.478,130.707429,1011.042,,0.058657,-0.004696,-0.003263,7532.261935,0.963412,-0.001552
26473298,1641772800,10,20.0,2122.416667,2124.01,2116.95,2119.743333,2.084755,,0.026874,-0.003462,-0.003263,7532.261935,1.507092,0.001455
26473299,1641772800,13,118.0,0.066109,0.06613,0.066058,0.066082,1265238.0,,0.04383,-0.003493,-0.003263,7532.261935,0.772527,-0.000973
26473300,1641772800,12,112.0,0.261055,0.261185,0.260474,0.260682,118758.1,,0.050867,-0.003568,-0.003263,7532.261935,0.938556,-0.000505
26473301,1641772800,11,52.0,190.732,191.16,189.8,190.506,135.5932,,0.03937,-0.000454,-0.003263,7532.261935,0.57614,0.001426


In [19]:
# Asset metadata, ordered by Asset_ID so models are loaded in id order.
ASSET_DETAILS_CSV = './data/asset_details.csv'
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values(by="Asset_ID")

def model_reload_train():
    """Load the per-asset XGBoost boosters saved under ``MOD_FOLDER``.

    For every ``Asset_ID`` in ``df_asset_details`` the file
    ``{MOD_FOLDER}/model_{asset_id}.json`` is loaded if it exists. Assets
    without a saved model are skipped, and the skipped ids are printed so a
    later ``models[...][asset_id]`` lookup failure is easy to diagnose.

    Returns:
        dict: maps asset id to the loaded ``xgb.Booster``.
    """
    print(f"model from {MOD_FOLDER}")
    models = {}
    missing = []
    for asset_id in df_asset_details['Asset_ID']:
        model_file = MOD_FOLDER + f"/model_{asset_id}.json"
        if not exists(model_file):
            missing.append(asset_id)
            continue
        model = xgb.Booster()
        model.load_model(model_file)
        models[asset_id] = model
    if missing:
        # Keep the best-effort behaviour (no exception), but make it visible.
        print("no model file found for Asset_ID(s): " + ", ".join(map(str, missing)))
    return models

# Registry of loaded boosters, keyed by model version number.
models = {version_num: model_reload_train()}

model from ./trainedXGB/model_nof_40


## 3. Features

In [20]:
# psets34 = dict(zip(['lrtn','fastk1','fastk2','adx','macd_s','macd_l','rsi','std_Crypto_Index','std_lr_15','std_Mkt_lrt_15'],
#                    [15, 15, 5, 30, 5, 25, 60, 5, 15, 30]))
# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# feature files produced by your own training runs.
with open(f"{MOD_FOLDER}/feature_best{version_num}", "rb") as f:
    psets = pickle.load(f)

# For versions below 44, 'vol_sum' is forced to 0; lag_features only adds the
# volume-sum column when vol_sum > 0.
if version_num < 44:
    psets['vol_sum'] = 0
psets

{'std_lr_15': 5,
 'std_Mkt_lrt_15': 10,
 'std_Crypto_Index': 15,
 'rsi': 60,
 'macd_sig': 5,
 'macd_s': 15,
 'macd_l': 25,
 'lrtn': 50,
 'fastk2': 15,
 'fastk1': 5,
 'beta_s': '6h',
 'beta_l': '2d',
 'adx': 60,
 'vol_sum': 0}

In [21]:
def beta_window(beta):
    """Convert a window label such as ``'6h'`` or ``'2d'`` into a width in minutes.

    Args:
        beta: string made of an integer count followed by a one-letter unit,
            ``'h'`` (hours) or ``'d'`` (days).

    Returns:
        int: ``60 * count`` for hours, ``60 * 24 * count`` for days.

    Raises:
        ValueError: if the count is not an integer or the unit letter is not
            one of the supported units.
    """
    minutes_per_unit = {'h': 60, 'd': 60 * 24}
    num, unit = int(beta[:-1]), beta[-1]
    if unit not in minutes_per_unit:
        # Previously this fell through and failed with an unbound `width`.
        raise ValueError(
            f"unsupported window unit {unit!r} in {beta!r}; expected one of {sorted(minutes_per_unit)}"
        )
    return minutes_per_unit[unit] * num

# Short and long beta window widths derived from the tuned parameter set.
beta_sw, beta_lw = (beta_window(psets[key]) for key in ('beta_s', 'beta_l'))
(beta_sw, beta_lw)

(360, 2880)

In [22]:
######mod>=36
def log_return(series, periods=5):
    """Return the change in the natural log of ``series`` over ``periods`` rows.

    The first ``periods`` entries of the result are NaN because there is no
    earlier value to difference against.
    """
    log_values = np.log(series)
    return log_values.diff(periods=periods)

def beta_resid(df, window):
    """Compute a rolling market beta and the matching residual return.

    Args:
        df: frame with ``lr_15`` and ``Mkt_lrt_15`` columns.
        window: window label understood by ``beta_window`` (e.g. ``'6h'``,
            ``'2d'``); it is also used as the suffix of the output column names.

    Returns:
        DataFrame with two columns: ``beta_{window}`` (rolling mean of
        ``Mkt_lrt_15 * lr_15`` divided by rolling mean of ``Mkt_lrt_15 ** 2``,
        with NaN/inf replaced by 0) and ``lr_15_resid_{window}``
        (``lr_15 - beta * Mkt_lrt_15``).
    """
    # Reuse the shared parser instead of duplicating the unit handling here.
    width = beta_window(window)
    market = df.Mkt_lrt_15
    cross_mean = ta.MULT(market, df.lr_15).rolling(width).mean()
    market_sq_mean = ta.MULT(market, market).rolling(width).mean()
    b = (cross_mean / market_sq_mean).rename(f"beta_{window}")
    # Warm-up rows and zero-variance windows yield NaN/inf; treat them as beta 0.
    b = b.replace([np.nan, np.inf, -np.inf], 0)
    resids = ta.SUB(df.lr_15, ta.MULT(b, market)).rename(f"lr_15_resid_{window}")
    return pd.concat([b, resids], axis=1)



def lag_features(df,fastk1,fastk2,adx,macd_s,macd_l,macd_sig,rsi,vol_sum,std_Crypto_Index,std_lr_15,std_Mkt_lrt_15,**kwargs):
    """Add technical-indicator and rolling-volatility columns to ``df`` in place.

    Each named parameter is the period of the matching indicator. Any extra
    keys of the parameter dict (for example ``lrtn``, ``beta_s``, ``beta_l``)
    arrive through ``**kwargs`` and are ignored. Nothing is returned.
    """
    high, low, close = df.High, df.Low, df.Close

    # Slow stochastic: both smoothing periods are 3/5 of the fast-K period, truncated.
    slow_smooth = int(3*fastk1/5)
    slow_k, slow_d = ta.STOCH(high, low, close,
                              fastk_period=fastk1,
                              slowk_period=slow_smooth, slowk_matype=0,
                              slowd_period=slow_smooth, slowd_matype=0)
    df['slowK'] = slow_k
    df['slowD'] = slow_d

    # Fast stochastic with its own fast-K period.
    fast_k, fast_d = ta.STOCHF(high, low, close,
                               fastk_period=fastk2,
                               fastd_period=int(3*fastk2/5),
                               fastd_matype=0)
    df['fastK'] = fast_k
    df['fastD'] = fast_d

    df[f'rsi_{rsi}'] = ta.RSI(close, timeperiod=rsi)

    macd_line, macd_signal, macd_hist = ta.MACD(close, fastperiod=macd_s,
                                                slowperiod=macd_l, signalperiod=macd_sig)
    df[f'macd_{macd_s}_{macd_l}'] = macd_line
    df[f'macd_signal_{macd_sig}'] = macd_signal
    df['macd_hist'] = macd_hist

    # Average Directional Movement Index
    df[f'adx_{adx}'] = ta.ADX(high, low, close, timeperiod=adx)
    # Accumulation Distribution Line
    df['AD'] = ta.AD(high, low, close, df['Volume'])

    # Rolling volume sum (SMA times the period); skipped when vol_sum is 0.
    if vol_sum > 0:
        df[f'vol_sum_{vol_sum}'] = ta.SMA(df['Volume'], vol_sum) * vol_sum

    # Rolling standard deviations used as volatility features.
    for column, period in (('lr_15', std_lr_15),
                           ('Mkt_lrt_15', std_Mkt_lrt_15),
                           ('Crypto_Index', std_Crypto_Index)):
        df[f'std_{column}_{period}'] = ta.STDDEV(df[column], timeperiod=period, nbdev=1)

def get_features(df_feat, fpara_dict):
    """Add every model feature column to ``df_feat`` and return the same frame.

    Adds, in order: beta and residual columns for the short and long windows
    (``fpara_dict['beta_s']`` / ``['beta_l']``), the log return of
    ``Crypto_Index`` over ``fpara_dict['lrtn']`` rows, and the columns produced
    by ``lag_features``. As a side effect, pandas' chained-assignment warning
    is switched off globally.
    """
    pd.options.mode.chained_assignment = None  # default='warn'
    for window_key in ('beta_s', 'beta_l'):
        window = fpara_dict[window_key]
        df_feat[[f"beta_{window}", f"lr_15_resid_{window}"]] = beta_resid(df_feat, window=window)
    lrtn_periods = fpara_dict['lrtn']
    df_feat[f"lrtn_index_{lrtn_periods}"] = log_return(df_feat.Crypto_Index, lrtn_periods)
    lag_features(df_feat, **fpara_dict)
    return df_feat

## 4. Test set with features

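- The training set spans timestamps from `df_train['timestamp'].quantile(0.45)` to `df_train['timestamp'].quantile(0.95)`.
- The out-train test set is everything from `df_train['timestamp'].quantile(0.95)` onward.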

In [23]:
# Epoch seconds for 2021-09-21 00:00:00, the end of the API evaluation window.
int(pd.Timestamp(year=2021, month=9, day=21).timestamp())

1632182400

In [24]:
# Decode the epoch-second bounds of the supplemental window into datetimes.
# `infer_datetime_format` is dropped: it is deprecated in pandas 2.0 and has no
# effect when `unit` is given.
pd.to_datetime([1632182460, 1641772800], unit="s")

DatetimeIndex(['2021-09-21 00:01:00', '2022-01-10 00:00:00'], dtype='datetime64[ns]', freq=None)

In [25]:
import datetime
import time

#test_days= 30*3
def make_testset(df_train, psets, mode, **kwargs):
    """Build a feature-augmented evaluation frame from ``df_train``.

    Args:
        df_train: frame with ``timestamp``, ``Asset_ID`` and the columns that
            ``get_features`` reads.
        psets: feature-parameter dict forwarded to ``get_features``.
        mode: ``'out-train'`` keeps rows from the 0.95 timestamp quantile
            onward, reindexed per asset onto a 60-second grid.
            ``'api'`` keeps the 2021-06-13 .. 2021-09-21 window, preceded by
            placeholder rows that are removed again after the features have
            been computed.
        **kwargs: ``pre_minute_beta`` (int) is required for ``mode='api'``;
            ``14 * pre_minute_beta`` placeholder rows are used. Other keys are
            ignored.

    Returns:
        DataFrame with the feature columns added, rows containing NaN dropped
        and the index reset.

    Raises:
        ValueError: if ``mode`` is neither ``'out-train'`` nor ``'api'``
            (previously this silently returned ``None``).
        KeyError: if ``mode='api'`` and ``pre_minute_beta`` is not supplied.
    """
    if mode == 'out-train':
        ##same as df_test in notebook
        start = df_train['timestamp'].quantile(0.95)
        df2 = df_train[df_train['timestamp']>=start].set_index("timestamp").copy()
        ind = df2.index.unique()
        def reindex(df):
            # Put every asset on the same 60-second grid, then fill what is left.
            df = df.reindex(range(ind[0],ind[-1]+60,60),method='nearest')
            # .ffill()/.bfill() replace the deprecated fillna(method=...) spelling.
            return df.ffill().bfill()
        df2 = df2.groupby('Asset_ID').apply(reindex).reset_index(0, drop=True).sort_index()
        ###add features
        df2 = df2.groupby('Asset_ID').apply(lambda x: get_features(x, psets))
        return df2.dropna(axis = 0).reset_index()
    if mode == 'api':
        # NOTE(review): 14 presumably equals the number of assets - confirm.
        n_placeholder = 14 * kwargs['pre_minute_beta']
        #'2021-09-21 00:01:00', '2022-01-10 00:00:00'
        sup_tt = df_train[(df_train['timestamp']>=1632182460) & (df_train['timestamp']<=1641772800)]
        # .assign returns a new frame, so Target is not written onto a slice of df_train.
        sup_tt = sup_tt.iloc[(-n_placeholder):,:].assign(Target=0)

        start = 1623542400#2021-06-13
        end = 1632182400#2021-09-21
        df2 = df_train[(df_train['timestamp']>=start) & (df_train['timestamp']<=end)]
        df2  = pd.concat([sup_tt,df2], join='outer')

        df2 = df2.groupby('Asset_ID').apply(lambda x: get_features(x, psets))
        # NOTE(review): assumes groupby.apply keeps the concatenated row order,
        # so the placeholder rows are still the leading ones - confirm.
        df2 = df2.iloc[n_placeholder:,:] #delete sup_tt placeholder
        return df2.dropna(axis = 0).reset_index()
    raise ValueError(f"unknown mode {mode!r}; expected 'out-train' or 'api'")

In [26]:
################################################out-train test same as tune df_test
df_test = make_testset(df_train, psets, mode='out-train')
# `infer_datetime_format` is dropped: it is deprecated in pandas 2.0 and has no
# effect when `unit` is given.
out_train_span = pd.to_datetime([df_test['timestamp'].min(), df_test['timestamp'].max()], unit='s')
print(f"made out-train data\n {out_train_span}")

made out-train data
 DatetimeIndex(['2021-11-05 09:18:00', '2022-01-10 00:00:00'], dtype='datetime64[ns]', freq=None)


In [27]:
#############################################in-train test same as API

# pre_minute_beta sets how many placeholder rows make_testset prepends;
# pre_minute is passed through but not read by make_testset.
df_test2 = make_testset(df_train, psets, mode='api',
                        pre_minute=120, pre_minute_beta=beta_lw + 15)

# `infer_datetime_format` is dropped: it is deprecated in pandas 2.0 and has no
# effect when `unit` is given.
api_span = pd.to_datetime(df_test2['timestamp'].iloc[[0, -1]], unit='s')
print(f"made API data\n {api_span}")


made API data
 0         2021-06-13
2015111   2021-09-21
Name: timestamp, dtype: datetime64[ns]


## Performance on test set

In [28]:
from script.weighted_corr_metrics import weighted_correlation
pd.options.mode.chained_assignment = None  # default='warn'

# Per-version result frames (one entry is appended per scored model folder).
result = []
result2 = []

# Per-asset prediction frames for the out-train and API test sets.
result_id=[]
result_id2=[]
# `asset_id` replaces the former loop variable `id`, which shadowed the builtin.
for asset_id in range(0,14):
    model = models[version_num][asset_id]
    feature_cols = model.feature_names
    #out-train
    x = df_test[df_test['Asset_ID']==asset_id]
    # .assign builds a new frame instead of writing a column onto a filtered slice.
    x = x.assign(Pred=model.predict(xgb.DMatrix(x[feature_cols])))
    result_id.append(x[['timestamp','Asset_ID','Weight','Target','Pred']])
    #api
    x2 = df_test2[df_test2['Asset_ID']==asset_id]
    x2 = x2.assign(Pred2=model.predict(xgb.DMatrix(x2[feature_cols])))
    result_id2.append(x2[['timestamp','Asset_ID','Weight','Target','Pred2']])

result.append(pd.concat(result_id, axis=0))
result2.append(pd.concat(result_id2, axis=0))
# Weighted correlation between Target and prediction over all assets together.
score=weighted_correlation(a=result[-1]['Target'],
                    b=result[-1]['Pred'],
                    weights=result[-1]['Weight'])
score2=weighted_correlation(a=result2[-1]['Target'],
                    b=result2[-1]['Pred2'],
                    weights=result2[-1]['Weight'])
print(f'{MOD_FOLDER} finished. Out-train-score:{score}, api-score:{score2}')

./trainedXGB/model_nof_40 finished. Out-train-score:0.01112561919094972, api-score:0.038313918954451985
