# XGB model cv selection scoring

- `xgboost.cv`
- Comparing the optimal candidates by test set, using the g-research weighted correlation metrics.

In [11]:
import os
import pandas as pd
import gc
import talib as ta
import numpy as np

## 1. cv tuned model folders

- `model_nof_{version number}`

In [12]:
sorted([int(s.split('_')[-1]) for s in os.listdir('./trainedXGB/') if 'model_nof' in s])[-2:]

[33, 34]

## 2. organized data set  

- new_data.ftr

In [13]:
df_train = pd.read_feather('./data'+'/new_data.ftr',
                           columns=['timestamp', 'Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close',
                                'Volume', 'Target', 'Weight', 'lr_15', 'Mkt_lrt_15','Crypto_Index','beta','lr_mkt_resid'])
df_train.tail()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,Target,Weight,lr_15,Mkt_lrt_15,Crypto_Index,beta,lr_mkt_resid
26473297,1641772800,9,218.0,130.842571,130.94,130.478,130.707429,1011.042,,0.058657,-0.004696,-0.003263,7532.261935,0.963412,-0.001552
26473298,1641772800,10,20.0,2122.416667,2124.01,2116.95,2119.743333,2.084755,,0.026874,-0.003462,-0.003263,7532.261935,1.507092,0.001455
26473299,1641772800,13,118.0,0.066109,0.06613,0.066058,0.066082,1265238.0,,0.04383,-0.003493,-0.003263,7532.261935,0.772527,-0.000973
26473300,1641772800,12,112.0,0.261055,0.261185,0.260474,0.260682,118758.1,,0.050867,-0.003568,-0.003263,7532.261935,0.938556,-0.000505
26473301,1641772800,11,52.0,190.732,191.16,189.8,190.506,135.5932,,0.03937,-0.000454,-0.003263,7532.261935,0.57614,0.001426


## 3. Features

In [14]:
# lrtn,fastk1,fastk2,adx,macd_s,macd_l,rsi,std_Crypto_Index,std_lr_15,std_Mkt_lrt_15 = \
#     (30, 5, 15, 30, 15, 25, 60, 15, 15, 5)
psets31 = dict(zip(['lrtn','fastk1','fastk2','adx','macd_s','macd_l','rsi','std_Crypto_Index','std_lr_15','std_Mkt_lrt_15'],
                   [30, 5, 15, 30, 15, 25, 60, 15, 15, 5]))
psets32 = dict(zip(['lrtn','fastk1','fastk2','adx','macd_s','macd_l','rsi','std_Crypto_Index','std_lr_15','std_Mkt_lrt_15'],
                   [30, 15, 15, 30, 10, 25, 60, 30, 15, 15]))
psets33 = dict(zip(['lrtn','fastk1','fastk2','adx','macd_s','macd_l','rsi','std_Crypto_Index','std_lr_15','std_Mkt_lrt_15'],
                   [15, 15, 5, 30, 5, 25, 60, 5, 15, 30]))
psets34 = dict(zip(['lrtn','fastk1','fastk2','adx','macd_s','macd_l','rsi','std_Crypto_Index','std_lr_15','std_Mkt_lrt_15'],
                   [15, 15, 5, 30, 5, 25, 60, 5, 15, 30]))


psets = dict(zip(sorted([int(s.split('_')[-1]) for s in os.listdir('./trainedXGB/') if 'model_nof' in s])[-2:],
                 [psets33,psets34]))

In [15]:
def log_return(series, periods=5):
    return np.log(series).diff(periods)

def lag_features(df, fastk1,fastk2,adx,macd_s,macd_l,rsi,std_Crypto_Index,std_lr_15,std_Mkt_lrt_15, **kwargs):    
    ####TECH indicators
    df['slowK'], df['slowD'] = ta.STOCH(df.High, df.Low, df.Close, 
                                        fastk_period=fastk1, slowk_period=int(3*fastk1/5), slowd_period=int(3*fastk1/5),
                                        slowk_matype=0, slowd_matype=0)
    df['fastK'], df['fastD'] = ta.STOCHF(df.High, df.Low, df.Close,
                                         fastk_period=fastk2, fastd_period=int(3*fastk2/5), 
                                         fastd_matype=0)
    df[f'rsi_{rsi}'] = ta.RSI(df['Close'], timeperiod=rsi)
    df[f'macd_{macd_s}_{macd_l}'],df['macd_signal'], df['macd_hist'] = \
                ta.MACD(df['Close'],fastperiod=macd_s, slowperiod=macd_l, signalperiod=5)
    df[f'adx_{adx}'] = ta.ADX(df['High'], df['Low'],df['Close'], timeperiod=adx)#Average Directional Movement Index
    df['AD'] = ta.AD(df['High'], df['Low'],df['Close'], df['Volume'])#Accumulation Distribution Line
    ####std volatility
    df[f'std_lr_15_{std_lr_15}'] = ta.STDDEV(df.lr_15,timeperiod=std_lr_15, nbdev=1)
    df[f'std_Mkt_lrt_15_{std_Mkt_lrt_15}'] = ta.STDDEV(df.Mkt_lrt_15,timeperiod=std_Mkt_lrt_15, nbdev=1)
    df[f'std_Crypto_Index_{std_Crypto_Index}'] = ta.STDDEV(df.Crypto_Index,timeperiod=std_Crypto_Index, nbdev=1)
    ####candidates
    # df[f'DI_plus_{DI_plus}'] = ta.PLUS_DI(df['High'], df['Low'],df['Close'], timeperiod=DI_plus)
    # df[f'DI_minus_{DI_minus}'] = ta.MINUS_DI(df['High'], df['Low'],df['Close'], timeperiod=DI_minus)
    # df['TRENDLINE'] =ta.HT_TRENDLINE(df['Open'])#Hilbert Transform - Instantaneous Trendline
    # df[f'willr_{willr}'] = ta.WILLR(df['High'], df['Low'],df['Close'], timeperiod=willr)#Williams Percent Range
    # df[f'ADOSC_{ADOSC_s}_{ADOSC_l}'] = \
    #             ta.ADOSC(df['High'], df['Low'],df['Close'], df['Volume'], fastperiod = ADOSC_s, slowperiod=ADOSC_l)
    # df[f'ATR_{ATR}'] = ta.ATR(df['High'], df['Low'],df['Close'], timeperiod=ATR)
def get_features(df_feat, psets):
    pd.options.mode.chained_assignment = None  # default='warn'
    df_feat[f"lrtn_index_{psets['lrtn']}"] = log_return(df_feat.Crypto_Index, psets['lrtn'])
    lag_features(df_feat, **psets)
    return df_feat

## 4. Test set with features

- train set range is after `df_train['timestamp'].quantile(0.5)`
- test set

In [16]:
df_train['timestamp'].quantile(0.5)

int(pd.Timestamp('2021-09-21T00:00:00').timestamp())
pd.to_datetime(df_train['timestamp'].quantile(0.95),unit="s",
               infer_datetime_format=True)

Timestamp('2021-11-05 07:18:57')

In [17]:
import datetime
import time

#test_days= 30*3
def make_testset(df_train, start,end, psets):
    dd = df_train[(start<=df_train['timestamp']) & (df_train['timestamp']<=end)].copy()
    ###consistent timestamp for all 14 assets
    dd = dd.sort_values('timestamp').set_index("timestamp").sort_index()
    ind = dd.index.unique()
    def reindex(df):
        df = df.reindex(range(ind[0],ind[-1]+60,60),method='nearest')
        df = df.fillna(method="ffill").fillna(method="bfill")
        return df
    dd = dd.groupby('Asset_ID').apply(reindex).reset_index(0, drop=True).sort_index()
    ###add features
    dd = dd.groupby('Asset_ID').apply(lambda x: get_features(x,psets))
    dd.dropna(axis = 0, inplace= True)
    return dd.reset_index()
    



## Performance on test set

In [18]:
import xgboost as xgb
from os.path import exists

ASSET_DETAILS_CSV = './data/asset_details.csv'
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")

def model_reload_train(param_version):
    models = {}
    for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
        model_file = f"./trainedXGB/model_nof_{param_version}/model_{asset_id}.json"
        if exists(model_file):
            model = xgb.Booster()
            model.load_model(model_file)
            models[asset_id] = model
    return models

In [19]:
from script.weighted_corr_metrics import weighted_correlation
pd.options.mode.chained_assignment = None  # default='warn'

result = []
result2 = []
models = {}

for version_num in sorted([int(s.split('_')[-1]) for s in os.listdir('./trainedXGB/') if 'model_nof' in s])[-2:]:
    models[version_num] =model_reload_train(param_version= version_num)
    ################################################out-train test
    test_end= df_train['timestamp'].max()
    test_start= df_train['timestamp'].quantile(0.95)
    df_test = make_testset(df_train, test_start,test_end, psets[version_num])
    #############################################in-train test same as API
    test_end2= 1632182400
    test_start2= 1623542400 - 60*64
    df_test2 = make_testset(df_train, test_start2,test_end2, psets[version_num])

    result_id=[]
    result_id2=[]
    for id in range(0,14):
        model = models[version_num][id]
        x = df_test[df_test['Asset_ID']==id]
        x['Pred'] = model.predict(xgb.DMatrix(x[model.feature_names]))
        result_id.append(x[['timestamp','Asset_ID','Weight','Target','Pred']])
        x2 = df_test2[df_test2['Asset_ID']==id]
        x2['Pred2'] = model.predict(xgb.DMatrix(x2[model.feature_names]))
        result_id2.append(x2[['timestamp','Asset_ID','Weight','Target','Pred2']])
        
    result.append(pd.concat(result_id, axis=0))
    result2.append(pd.concat(result_id2, axis=0))
    score=weighted_correlation(a=result[-1]['Target'], 
                     b=result[-1]['Pred'], 
                     weights=result[-1]['Weight'])
    score2=weighted_correlation(a=result2[-1]['Target'], 
                     b=result2[-1]['Pred2'], 
                     weights=result2[-1]['Weight'])
    #print(f"features: {models[0].feature_names}")
    print(f'model_nof_{version_num} finished. Out-train-score:{score}, In-train-score:{score2}')

model_nof_33 finished. Out-train-score:0.01320315622979851, In-train-score:0.15900677535789493
model_nof_34 finished. Out-train-score:0.01320315622979851, In-train-score:0.15900677535789493


In [20]:
xgb.plot_importance(models[31][2], max_num_features=20, height=2)

KeyError: 31

In [None]:
models[31][5].feature_names

In [None]:
!cat trainedXGB/model_nof_31/params_xgb31.txt