# XGB model cv selection scoring

- `xgboost.cv`
- Compare the best candidate models on the test sets using the G-Research weighted correlation metric.

In [1]:
import os
import pandas as pd
import gc
import talib as ta
import numpy as np
import json
import pickle
import xgboost as xgb
from os.path import exists


## 1. version_num

- `model_nof_{version number}`

In [2]:
# Model version to score; it selects the `model_nof_{version_num}` folder.
version_num = 55
# Keep False to get a correct out-train score.
retrained = False

# Only versions below 46 keep their retrained models in a separate folder;
# every other combination uses the plain per-version folder.
MOD_FOLDER = (
    "./trainedXGB/retrained" + f"/model_nof_{version_num}_sub_sub"
    if version_num < 46 and retrained
    else "./trainedXGB" + f"/model_nof_{version_num}"
)

MOD_FOLDER

'./trainedXGB/model_nof_55'

## 2. organized data set  

- new_data.ftr

In [3]:
# Versions above 46 read new_data3.ftr; all other versions read new_data.ftr.
# Both files are loaded with the same column subset.
df_train = pd.read_feather(
    './data'+'/new_data3.ftr' if version_num > 46 else './data'+'/new_data.ftr',
    columns=['timestamp', 'Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close',
             'Volume', 'Target', 'Weight', 'lr_15', 'Mkt_lrt_15', 'Crypto_Index'],
)

In [4]:
# Display the loaded frame as the cell output.
df_train

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,Target,Weight,lr_15,Mkt_lrt_15,Crypto_Index
0,1514764860,2,40.0,2376.58000,2399.500000,2357.140000,2374.590000,1.923301e+01,-0.004218,0.058657,,0.000000,2571.477256
1,1514764860,0,5.0,8.53000,8.530000,8.530000,8.530000,7.838000e+01,-0.014399,0.105286,,0.000000,2571.477256
2,1514764860,1,229.0,13835.19400,14013.800000,13666.110000,13850.176000,3.155006e+01,-0.014643,0.165850,,0.000000,2571.477256
3,1514764860,5,32.0,7.65960,7.659600,7.656700,7.657600,6.626713e+03,-0.013922,0.033911,,0.000000,2571.477256
4,1514764860,7,5.0,25.92000,25.920000,25.874000,25.877000,1.210873e+02,-0.008264,0.050867,,0.000000,2571.477256
...,...,...,...,...,...,...,...,...,...,...,...,...,...
26755079,1642982400,9,214.0,112.29300,112.470000,111.840000,111.983333,1.243860e+03,,0.058657,-0.001254,-0.001114,6494.648269
26755080,1642982400,10,25.0,1833.56500,1834.700000,1831.050000,1833.018333,4.190290e+00,,0.026874,-0.002008,-0.001114,6494.648269
26755081,1642982400,13,235.0,0.05752,0.057618,0.057389,0.057478,8.988253e+05,,0.043830,-0.000622,-0.001114,6494.648269
26755082,1642982400,12,492.0,0.20094,0.201066,0.199352,0.200093,2.983796e+06,,0.050867,-0.002306,-0.001114,6494.648269


In [5]:
# Asset metadata, ordered by Asset_ID (model_reload_train iterates it in this order).
ASSET_DETAILS_CSV = './data/asset_details.csv'
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV)
df_asset_details = df_asset_details.sort_values(by="Asset_ID")

def model_reload_train():
    """Load the saved XGBoost booster of each asset from ``MOD_FOLDER``.

    Reads the module-level ``MOD_FOLDER``, ``version_num``, ``retrained`` and
    ``df_asset_details``. Assets whose model file does not exist are skipped.

    Returns:
        dict: ``Asset_ID`` -> loaded ``xgb.Booster``.
    """
    print(f"model from {MOD_FOLDER}")
    # The "_alldata" file name is only used for retrained models of version 46 and later.
    use_alldata = version_num >= 46 and retrained
    loaded = {}
    for asset_id, _asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
        if use_alldata:
            model_file = MOD_FOLDER + f"/model_{asset_id}_alldata.json"
        else:
            model_file = MOD_FOLDER + f"/model_{asset_id}.json"

        if not exists(model_file):
            continue
        booster = xgb.Booster()
        booster.load_model(model_file)
        loaded[asset_id] = booster
    return loaded

# Loaded boosters keyed by model version, then by Asset_ID.
models = {version_num: model_reload_train()}

model from ./trainedXGB/model_nof_55


## 3. Features

In [6]:
# Disabled hard-coded parameter set:
# psets34 = dict(zip(['lrtn','fastk1','fastk2','adx','macd_s','macd_l','rsi','std_Crypto_Index','std_lr_15','std_Mkt_lrt_15'],
#                    [15, 15, 5, 30, 5, 25, 60, 5, 15, 30]))
# NOTE(review): pickle.load can execute arbitrary code; only load feature files you produced yourself.
with open(f"{MOD_FOLDER}/feature_best{version_num}", "rb") as feature_file:
    psets = pickle.load(feature_file)

# For versions below 44, force vol_sum to 0 so that lag_features skips the vol_sum column.
if version_num < 44:
    psets['vol_sum'] = 0
psets

{'vol_sum': 15,
 'std_lr_15': 30,
 'std_Mkt_lrt_15': 10,
 'std_Crypto_Index': 30,
 'macd_sig': 15,
 'macd_s': 10,
 'macd_l': 60,
 'lrtn': 50,
 'fastk2': 10,
 'fastk1': 15,
 'beta_s': '6h',
 'beta_l': '2d',
 'adx': 50,
 'ATR': 60}

In [7]:
def beta_window(beta):
    """Convert a beta window label such as ``'6h'`` or ``'2d'`` into minutes.

    Args:
        beta: String made of an integer count followed by a one-letter unit,
            ``'h'`` (hours) or ``'d'`` (days).

    Returns:
        int: Window width in minutes.

    Raises:
        ValueError: If the unit is not ``'h'`` or ``'d'``, or if the count
            cannot be parsed as an integer.
    """
    minutes_per_unit = {'h': 60, 'd': 60 * 24}
    num, unit = int(beta[:-1]), beta[-1]
    if unit not in minutes_per_unit:
        # Previously an unknown unit fell through and failed with an
        # UnboundLocalError on `width`; report the real problem instead.
        raise ValueError(
            f"unsupported beta window unit {unit!r} in {beta!r}; expected 'h' or 'd'"
        )
    return minutes_per_unit[unit] * num

# Short and long beta window widths, as returned by beta_window.
beta_sw, beta_lw = (beta_window(psets[name]) for name in ('beta_s', 'beta_l'))
beta_sw, beta_lw

(360, 2880)

In [8]:
######mod>=36
def log_return(series, periods=5):
    """Return the ``periods``-row difference of the natural log of ``series``.

    For a positive ``periods`` the first ``periods`` entries of the result are NaN.
    """
    log_values = np.log(series)
    return log_values.diff(periods)

def beta_resid(df, window):
    """Rolling market beta of ``lr_15`` and the matching residual return.

    Args:
        df: Frame with ``lr_15`` and ``Mkt_lrt_15`` columns.
        window: Window label accepted by ``beta_window`` (e.g. ``'6h'``, ``'2d'``).

    Returns:
        DataFrame with two columns, ``beta_{window}`` and
        ``lr_15_resid_{window}``.
    """
    # Use the shared parser rather than a second copy of the unit handling.
    width = beta_window(window)
    mkt = df.Mkt_lrt_15
    # beta = rolling mean(mkt * asset) / rolling mean(mkt * mkt)
    cross_mean = ta.MULT(mkt, df.lr_15).rolling(width).mean()
    mkt_sq_mean = ta.MULT(mkt, mkt).rolling(width).mean()
    b = (cross_mean / mkt_sq_mean).rename(f"beta_{window}")
    # NaN (window not yet full) and +/-inf (zero denominator) become a beta of 0.
    b = b.replace([np.nan, np.inf, -np.inf], 0)
    resids = ta.SUB(df.lr_15, ta.MULT(b, mkt)).rename(f"lr_15_resid_{window}")
    return pd.concat([b, resids], axis=1)

def lag_features(df,fastk1,fastk2,adx,macd_s,macd_l,macd_sig,vol_sum,std_Crypto_Index,std_lr_15,std_Mkt_lrt_15,**kwargs):
    """Add TA-Lib indicator and rolling-std columns to ``df`` in place.

    Args:
        df: Frame with ``Open``, ``High``, ``Low``, ``Close``, ``Volume``,
            ``lr_15``, ``Mkt_lrt_15`` and ``Crypto_Index`` columns.
        fastk1: fastk period of the slow stochastic (STOCH).
        fastk2: fastk period of the fast stochastic (STOCHF).
        adx: ADX time period.
        macd_s, macd_l, macd_sig: MACD fast, slow and signal periods.
        vol_sum: Window of the rolling volume sum; skipped unless > 0.
        std_Crypto_Index, std_lr_15, std_Mkt_lrt_15: STDDEV time periods for
            the respective columns.
        **kwargs: Optional ``ATR`` (adds ``ATR_{n}`` and ``TRENDLINE``) and
            ``willr`` (adds ``willr_{n}``); any other key is ignored.

    Returns:
        None. The columns are written directly onto ``df``.
    """
    high, low, close = df.High, df.Low, df.Close

    #### Technical indicators
    # Both smoothing periods of the slow stochastic are 3/5 of its fastk period.
    slow_period = int(3*fastk1/5)
    slow_k, slow_d = ta.STOCH(high, low, close,
                              fastk_period=fastk1, slowk_period=slow_period, slowd_period=slow_period,
                              slowk_matype=0, slowd_matype=0)
    df['slowK'] = slow_k
    df['slowD'] = slow_d

    fast_k, fast_d = ta.STOCHF(high, low, close,
                               fastk_period=fastk2, fastd_period=int(3*fastk2/5),
                               fastd_matype=0)
    df['fastK'] = fast_k
    df['fastD'] = fast_d

    #df[f'rsi_{rsi}'] = ta.RSI(df['Close'], timeperiod=rsi)
    macd_line, macd_signal, macd_hist = ta.MACD(close, fastperiod=macd_s, slowperiod=macd_l, signalperiod=macd_sig)
    df[f'macd_{macd_s}_{macd_l}'] = macd_line
    df[f'macd_signal_{macd_sig}'] = macd_signal
    df['macd_hist'] = macd_hist

    df[f'adx_{adx}'] = ta.ADX(high, low, close, timeperiod=adx)  # Average Directional Movement Index
    df['AD'] = ta.AD(high, low, close, df['Volume'])  # Accumulation Distribution Line
    if vol_sum > 0:
        # SMA * window length gives the rolling sum of Volume.
        df[f'vol_sum_{vol_sum}'] = ta.SMA(df['Volume'], vol_sum)*vol_sum

    #### Rolling standard deviations (volatility)
    for column, period in (('lr_15', std_lr_15),
                           ('Mkt_lrt_15', std_Mkt_lrt_15),
                           ('Crypto_Index', std_Crypto_Index)):
        df[f'std_{column}_{period}'] = ta.STDDEV(df[column], timeperiod=period, nbdev=1)

    #### Optional features (original note: "new after mod 49")
    if 'ATR' in kwargs:
        atr_period = kwargs['ATR']
        df[f"ATR_{atr_period}"] = ta.ATR(high, low, close, timeperiod=atr_period)
        df['TRENDLINE'] = ta.HT_TRENDLINE(df['Open'])
    if 'willr' in kwargs:
        willr_period = kwargs['willr']
        df[f"willr_{willr_period}"] = ta.WILLR(high, low, close, timeperiod=willr_period)
        

def get_features(df_feat, fpara_dict):
    """Add all model feature columns to ``df_feat`` in place and return it.

    Args:
        df_feat: Rows of one asset, with the raw columns read by
            ``beta_resid``, ``log_return`` and ``lag_features``.
        fpara_dict: Feature parameters; must hold ``beta_s``, ``beta_l`` and
            ``lrtn`` plus the keyword arguments of ``lag_features``.

    Returns:
        The same ``df_feat`` object, now carrying the feature columns.
    """
    # Turns pandas' chained-assignment warning off globally (default='warn').
    pd.options.mode.chained_assignment = None
    # Short-window beta/residual first, then the long-window pair.
    for beta_key in ('beta_s', 'beta_l'):
        label = fpara_dict[beta_key]
        df_feat[[f"beta_{label}", f"lr_15_resid_{label}"]] = beta_resid(df_feat, window=label)
    lrtn_periods = fpara_dict['lrtn']
    df_feat[f"lrtn_index_{lrtn_periods}"] = log_return(df_feat.Crypto_Index, lrtn_periods)
    lag_features(df_feat, **fpara_dict)
    return df_feat

## 4. Test set with features

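- Train set: timestamps from `df_train['timestamp'].quantile(0.45)` to `quantile(0.95)`.
- Test sets: `out-train` uses timestamps at or after `quantile(0.95)`; `in-train` uses `quantile(0.80)` to `quantile(0.95)`.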

In [9]:
# Epoch seconds for 2021-09-21 00:00:00 (the `end` bound of the 'api' window in make_testset).
int(pd.Timestamp('2021-09-21T00:00:00').timestamp())

1632182400

In [10]:
# Decode the two epoch bounds that make_testset uses to select `sup_tt`.
# `infer_datetime_format` is dropped: it is deprecated since pandas 2.0 and
# has no effect when `unit` is given.
pd.to_datetime([1632182460, 1641772800], unit="s")

DatetimeIndex(['2021-09-21 00:01:00', '2022-01-10 00:00:00'], dtype='datetime64[ns]', freq=None)

In [11]:
import datetime
import time

#test_days= 30*3
def _features_by_asset(df, psets):
    """Run ``get_features`` on each asset's rows, keeping the original row index."""
    # group_keys=False: since pandas 2.0 the default would prepend Asset_ID to
    # the index, and the later reset_index() would clash with the Asset_ID column.
    return df.groupby('Asset_ID', group_keys=False).apply(lambda x: get_features(x, psets))


def make_testset(df_train, psets, mode, **kwargs):
    """Build a feature-augmented evaluation set from ``df_train``.

    Args:
        df_train: Frame with ``timestamp``, ``Asset_ID``, ``Target`` and the
            raw columns read by ``get_features``.
        psets: Feature parameter dict forwarded to ``get_features``.
        mode: One of
            ``'out-train'`` - rows with timestamp >= the 0.95 quantile;
            ``'in-train'``  - rows between the 0.80 and 0.95 quantiles;
            ``'api'``       - rows from 1623542400 (2021-06-13) to 1632182400
            (2021-09-21), with placeholder rows prepended before the features
            are computed and removed afterwards.
        **kwargs: ``pre_minute_beta`` is required by mode ``'api'``; any other
            key is ignored.

    Returns:
        DataFrame with the feature columns, rows containing any NaN dropped,
        and the index reset (the previous index is kept as a column).

    Raises:
        ValueError: If ``mode`` is not one of the three supported values.
        KeyError: If mode is ``'api'`` and ``pre_minute_beta`` is missing.
    """
    timestamps = df_train['timestamp']

    if mode == 'out-train':
        start = timestamps.quantile(0.95)
        df2 = df_train[timestamps >= start].copy()
    elif mode == 'in-train':
        start, end = timestamps.quantile(0.80), timestamps.quantile(0.95)
        df2 = df_train[(timestamps >= start) & (timestamps <= end)].copy()
    elif mode == 'api':
        # Number of placeholder rows: 14 (hard-coded) times the requested minutes.
        n_placeholder = 14 * kwargs['pre_minute_beta']

        # Placeholder rows come from '2021-09-21 00:01:00' .. '2022-01-10 00:00:00'.
        # Copy before writing Target so the assignment never targets a view of df_train.
        sup_tt = df_train[(timestamps >= 1632182460) & (timestamps <= 1641772800)]
        sup_tt = sup_tt.iloc[(-n_placeholder):, :].copy()
        sup_tt['Target'] = 0

        start = 1623542400  # 2021-06-13
        end = 1632182400  # 2021-09-21
        df2 = df_train[(timestamps >= start) & (timestamps <= end)]
        df2 = pd.concat([sup_tt, df2], join='outer')

        df2 = _features_by_asset(df2, psets)
        df2 = df2.iloc[n_placeholder:, :]  # delete sup_tt placeholder
        return df2.dropna(axis=0).reset_index()
    else:
        # Previously an unknown mode silently returned None.
        raise ValueError(
            f"unknown mode {mode!r}; expected 'out-train', 'in-train' or 'api'"
        )

    df2 = _features_by_asset(df2, psets)
    return df2.dropna(axis=0).reset_index()

In [12]:
# Preview the feature columns on the 0.90-0.95 timestamp-quantile slice.
start, end = df_train['timestamp'].quantile(0.90), df_train['timestamp'].quantile(0.95)
df2 = df_train[(df_train['timestamp'] >= start) & (df_train['timestamp'] <= end)].copy()
# group_keys=False: since pandas 2.0 the default would prepend Asset_ID to the
# index, and reset_index() would then fail because Asset_ID is already a column.
df2.groupby('Asset_ID', group_keys=False).apply(lambda x: get_features(x, psets)).reset_index()

Unnamed: 0,index,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,Target,...,macd_signal_15,macd_hist,adx_50,AD,vol_sum_15,std_lr_15_30,std_Mkt_lrt_15_10,std_Crypto_Index_30,ATR_60,TRENDLINE
0,24079571,1631508360,3,906.0,2.421633,2.423500,2.416158,2.418004,3.075439e+05,-0.001720,...,,,,-1.529063e+05,,,,,,
1,24079572,1631508360,2,74.0,623.790000,624.210000,623.400000,623.661667,3.963215e+01,-0.001648,...,,,,-1.402619e+01,,,,,,
2,24079573,1631508360,0,222.0,398.890500,399.100000,398.600000,398.738000,4.951066e+02,-0.001406,...,,,,-2.218078e+02,,,,,,
3,24079574,1631508360,1,1618.0,44591.749091,44610.000000,44556.660000,44584.658750,5.880664e+01,0.000177,...,,,,2.929858e+00,,,,,,
4,24079575,1631508360,4,369.0,0.239422,0.239700,0.239100,0.239351,1.171516e+06,-0.000443,...,,,,-1.913476e+05,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1337762,25417333,1637244120,9,397.0,215.323286,215.860000,215.000000,215.277286,1.374382e+03,0.001744,...,-0.627480,-0.425282,13.849152,3.743344e+06,2.123426e+04,0.010531,0.001623,37.031179,0.892095,216.820243
1337763,25417334,1637244120,10,29.0,2920.117950,2925.351400,2918.800000,2919.222950,1.156347e+00,-0.001148,...,-3.751343,-5.853549,14.062468,7.586451e+03,5.114181e+01,0.011117,0.001623,37.031179,7.936887,2933.666953
1337764,25417335,1637244120,13,731.0,0.105071,0.105280,0.104950,0.105043,3.539281e+06,-0.001108,...,-0.000156,-0.000115,13.106632,7.760901e+09,5.041386e+07,0.006762,0.001623,37.031179,0.000302,0.105520
1337765,25417336,1637244120,12,213.0,0.335453,0.336306,0.335100,0.335430,1.973036e+05,-0.001144,...,-0.000098,-0.000401,9.363663,2.722182e+08,2.976035e+06,0.007720,0.001623,37.031179,0.001138,0.336810


In [13]:
################################################out-train test same as tune df_test
df_test = make_testset(df_train, psets, mode = 'out-train')
# `infer_datetime_format` removed: deprecated since pandas 2.0 and without effect when `unit` is given.
print(f"made out-train data\n {pd.to_datetime([df_test['timestamp'].min(),df_test['timestamp'].max()],unit='s')}")

made out-train data
 DatetimeIndex(['2021-11-18 15:41:00', '2022-01-23 23:44:00'], dtype='datetime64[ns]', freq=None)


In [14]:
#############################################in-train test / API
# NOTE: mode 'in-train' does not read pre_minute / pre_minute_beta; they are passed but ignored.
df_test2 = make_testset(df_train, psets, mode = 'in-train', 
                        pre_minute=120, pre_minute_beta=beta_lw + 15)

# `infer_datetime_format` removed: deprecated since pandas 2.0 and without effect when `unit` is given.
print(f"made in-train data\n {pd.to_datetime(df_test2['timestamp'].iloc[[0,-1]],unit='s')}")


made in-train data
 0         2021-05-03 12:11:00
4011889   2021-11-18 14:02:00
Name: timestamp, dtype: datetime64[ns]


## Performance on test set

In [15]:
from script.weighted_corr_metrics import weighted_correlation
pd.options.mode.chained_assignment = None  # default='warn'

result = []
result2 = []

result_id=[]
result_id2=[]
# `asset_id` instead of `id`: the original loop variable shadowed the builtin
# id() for every later cell.
for asset_id in range(0,14):
    model = models[version_num][asset_id]
    # out-train: .assign builds a new frame, so nothing is written into a
    # filtered slice of df_test.
    x = df_test[df_test['Asset_ID']==asset_id]
    x = x.assign(Pred=model.predict(xgb.DMatrix(x[model.feature_names])))
    result_id.append(x[['timestamp','Asset_ID','Weight','Target','Pred']])
    # in-train set (df_test2); reported as "intrain-api-score" below
    x2 = df_test2[df_test2['Asset_ID']==asset_id]
    x2 = x2.assign(Pred2=model.predict(xgb.DMatrix(x2[model.feature_names])))
    result_id2.append(x2[['timestamp','Asset_ID','Weight','Target','Pred2']])
    
# Stack the per-asset predictions and score each set with the weighted correlation.
result.append(pd.concat(result_id, axis=0))
result2.append(pd.concat(result_id2, axis=0))
score=weighted_correlation(a=result[-1]['Target'], 
                    b=result[-1]['Pred'], 
                    weights=result[-1]['Weight'])
score2=weighted_correlation(a=result2[-1]['Target'], 
                    b=result2[-1]['Pred2'], 
                    weights=result2[-1]['Weight'])
print(f'{MOD_FOLDER} finished. Out-train-score:{score}, intrain-api-score:{score2}')

./trainedXGB/model_nof_55 finished. Out-train-score:0.035764680028872155, intrain-api-score:0.4823039091008447
