# XGB model hyperparameters tuning

- `xgboost.cv`
- Compare the optimal candidates on held-out test sets, using the G-Research weighted correlation metric.

## 1. cv tuned model folders

- `model_nof_{version number}`

In [1]:
import os
# List everything under ./trainedXGB/; the tuned runs are the model_nof_* folders.
os.listdir('./trainedXGB/')

['XGB0', '.DS_Store', 'model_nof_14', 'model_nof_13', 'model_nof_15']

## organized data set  

- new_data.ftr

In [2]:
import pandas as pd
import gc
# Read only the columns this notebook works with from the organized feather file.
TRAIN_COLUMNS = [
    'timestamp', 'Asset_ID', 'Count', 'Open', 'High', 'Low',
    'Close', 'Volume', 'VWAP', 'Target', 'Crypto_Index', 'Weight',
]
df_train = pd.read_feather('./data' + '/new_data.ftr', columns=TRAIN_COLUMNS)

In [3]:
# Summarise the time span covered by the data (timestamps are epoch seconds).
# `infer_datetime_format` is omitted: it only concerns parsing of datetime strings
# and is deprecated in pandas 2.x, while the input here is numeric with unit="s".
try:
    # pandas 1.1-1.5: request the numeric-style summary (mean / min / quantiles / max).
    timestamp_summary = pd.to_datetime(df_train["timestamp"], unit="s").describe(
        datetime_is_numeric=True
    )
except TypeError:
    # The keyword is not accepted (removed in pandas 2.0, where the numeric-style
    # summary is the default), so call describe() without it.
    timestamp_summary = pd.to_datetime(df_train["timestamp"], unit="s").describe()
timestamp_summary

count                         25759299
mean     2020-01-19 15:11:25.009727232
min                2017-08-17 00:00:00
25%                2019-02-09 18:04:00
50%                2020-02-07 21:00:00
75%                2021-01-13 07:15:00
max                2021-12-22 23:43:00
Name: timestamp, dtype: object

In [4]:
from script.XGB.xgbmodel import get_features
# Switch off pandas' SettingWithCopyWarning for the session -- presumably because
# get_features() writes new columns onto the slice it receives (TODO confirm).
pd.set_option('mode.chained_assignment', None)  # default='warn'

# Preview the engineered features for a single asset (Asset_ID 5).
get_features(
    df_train[df_train['Asset_ID'] == 5]
)

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target,Crypto_Index,Weight,Upper_Shadow,Lower_Shadow
9537149,1514764860,5,32.0,7.6596,7.6596,7.6567,7.6576,6626.713370,7.657713,-0.013922,4137.615640,1.386294,0.0000,0.0009
9537150,1514764920,5,10.0,7.6568,7.6569,7.6567,7.6567,3277.475494,7.656749,-0.014534,3912.356671,1.386294,0.0001,0.0000
9537151,1514764980,5,22.0,7.6569,7.6569,7.6500,7.6512,5623.557585,7.654267,-0.012546,3904.157543,1.386294,0.0000,0.0012
9537152,1514765040,5,12.0,7.6500,7.6503,7.6358,7.6358,1696.632459,7.649660,-0.011170,3897.749143,1.386294,0.0003,0.0000
9537153,1514765100,5,19.0,7.6358,7.6358,7.6100,7.6100,2268.362218,7.628076,-0.006154,3885.251133,1.386294,0.0000,0.0000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11587169,1637969940,5,113.0,3.8790,3.8790,3.8600,3.8630,26508.500000,3.869500,0.000899,9722.884153,1.386294,0.0000,0.0030
11587170,1637970000,5,107.0,3.8630,3.8670,3.8590,3.8590,16279.300000,3.863000,0.000149,9717.191088,1.386294,0.0040,0.0000
11587171,1637970060,5,90.0,3.8580,3.8590,3.8500,3.8530,7815.000000,3.854500,0.000618,9703.766371,1.386294,0.0010,0.0030
11587172,1637970120,5,145.0,3.8510,3.8570,3.8390,3.8390,15173.600000,3.848000,0.000931,9689.570361,1.386294,0.0060,0.0000


## test set
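- The training range is the data after `df_train['timestamp'].quantile(0.5)` (the median timestamp).
- Two 60-day test windows are built below: an out-of-train window ending one minute before the median timestamp, and an in-train window ending one minute before the 0.9 quantile.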

In [5]:
# Median timestamp in epoch seconds; the out-of-train test window is anchored on it.
df_train['timestamp'].quantile(0.5)

1581109200.0

In [6]:
# Show the median timestamp as a calendar date. The value is numeric epoch seconds,
# so `unit="s"` is all that is needed; `infer_datetime_format` (string parsing only,
# deprecated in pandas 2.x) is omitted.
pd.to_datetime(df_train['timestamp'].quantile(0.5), unit="s")

Timestamp('2020-02-07 21:00:00')

In [7]:
import datetime
import time

test_days = 30*2
# Length of each test window in seconds (timestamps are epoch seconds).
window_seconds = 60*(test_days*24*60)

# out-train test: the window ends 60 s before the median timestamp, i.e. just
# before the range the notebook describes as the training range.
test_end = df_train['timestamp'].quantile(0.5) - 60
test_start = test_end - window_seconds
in_out_window = df_train['timestamp'].between(test_start, test_end)  # inclusive on both ends
df_test = get_features(df_train[in_out_window])

# in-train test: same window length, ending 60 s before the 0.9 quantile.
test_end2 = df_train['timestamp'].quantile(0.9) - 60
test_start2 = test_end2 - window_seconds
in_train_window = df_train['timestamp'].between(test_start2, test_end2)
df_test2 = get_features(df_train[in_train_window])

# The full frame is not used after this point; drop it and force a collection.
del df_train
gc.collect()
# print([test_start,test_end])
# pd.to_datetime([test_start,test_end],
#                unit="s", infer_datetime_format=True)

0

## Performance on test set

In [8]:
import xgboost as xgb
from os.path import exists

# Asset metadata, ordered by Asset_ID so that per-asset loops run in id order.
ASSET_DETAILS_CSV = './data/asset_details.csv'
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV)
df_asset_details = df_asset_details.sort_values(by="Asset_ID")

def model_reload_train(param_version):
    """Load the saved per-asset XGBoost boosters of one tuning run.

    For every ``Asset_ID`` listed in ``df_asset_details`` the file
    ``./trainedXGB/model_nof_{param_version}/model_{asset_id}.json`` is loaded
    if it exists.

    Args:
        param_version: Version number identifying the ``model_nof_*`` folder.

    Returns:
        dict mapping asset id to its loaded ``xgb.Booster``. Assets whose model
        file does not exist are left out of the dict rather than raising.
    """
    models = {}
    # Only the ids are needed here; the asset names were iterated but never used.
    for asset_id in df_asset_details['Asset_ID']:
        model_file = f"./trainedXGB/model_nof_{param_version}/model_{asset_id}.json"
        if not exists(model_file):
            continue
        booster = xgb.Booster()
        booster.load_model(model_file)
        models[asset_id] = booster
    return models

In [10]:
from script.weighted_corr_metrics import weighted_correlation

# Asset ids scored for every model version (0..13, as in the original loop).
ASSET_IDS = range(0, 14)


def _predict_per_asset(models, frame, pred_col):
    """Predict `frame` asset by asset and return the stacked result.

    Args:
        models: dict mapping asset id to a loaded ``xgb.Booster``.
        frame: feature frame holding at least 'timestamp', 'Asset_ID', 'Weight',
            'Target' and every column named in each booster's ``feature_names``.
        pred_col: name of the column that receives the predictions.

    Returns:
        DataFrame with columns timestamp, Asset_ID, Weight, Target and `pred_col`,
        concatenated over all ids in ``ASSET_IDS``.

    Raises:
        KeyError: if `models` has no entry for one of the ids, so a version with a
            missing model file is never silently scored on fewer assets.
    """
    keep = ['timestamp', 'Asset_ID', 'Weight', 'Target', pred_col]
    parts = []
    for asset_id in ASSET_IDS:
        model = models[asset_id]
        rows = frame[frame['Asset_ID'] == asset_id]
        preds = model.predict(xgb.DMatrix(rows[model.feature_names]))
        # assign() returns a new frame, so nothing is written onto a filtered
        # slice and no SettingWithCopyWarning suppression is relied upon.
        parts.append(rows.assign(**{pred_col: preds})[keep])
    return pd.concat(parts, axis=0)


# One entry per model version: predictions on the out-of-train window (result)
# and on the in-train window (result2).
result = []
result2 = []
for version_num in [13, 14, 15]:
    models = model_reload_train(param_version=version_num)
    result.append(_predict_per_asset(models, df_test, 'Pred'))
    result2.append(_predict_per_asset(models, df_test2, 'Pred2'))

    # Weighted correlation between target and prediction on each window.
    score = weighted_correlation(a=result[-1]['Target'],
                                 b=result[-1]['Pred'],
                                 weights=result[-1]['Weight'])
    score2 = weighted_correlation(a=result2[-1]['Target'],
                                  b=result2[-1]['Pred2'],
                                  weights=result2[-1]['Weight'])

    print(f'predict with model_nof_{version_num} finished. Out-train-score:{score}, In-train-score:{score2}')
    


predict with model_nof_13 finished. Out-train-score:0.0008034555356094846, In-train-score:0.95380248609865
predict with model_nof_14 finished. Out-train-score:0.0028732162014596272, In-train-score:0.9749513709489366
predict with model_nof_15 finished. Out-train-score:0.0069584889147281245, In-train-score:0.7719050344916323
