# XGBoost and LightGBM model


**Reference:**
- [Kaggle1](https://www.kaggle.com/yamqwe/crypto-prediction-xgb-regressor)
- [Kaggle2](https://www.kaggle.com/tensorchoko/g-research-for-jp-en-coins)
- [XGBoost Official doc](https://xgboost.readthedocs.io/en/latest/python/index.html)
- [https://mljar.com/blog/xgboost-save-load-python/](https://mljar.com/blog/xgboost-save-load-python/)

In [2]:
import IPython.display
import traceback
import gc
import numpy as np
import pandas as pd
import datatable as dt
import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
print(f'xgboost version:{xgb.__version__}')
import talib as ta


# Paths of the datatable `.jay` files holding the training rows and the per-asset details.
TRAIN_JAY = './data/cryptocurrency-extra-data-binance-coin/orig_train.jay'
ASSET_DETAILS_JAY = './data/cryptocurrency-extra-data-binance-coin/orig_asset_details.jay'
# Read both files with datatable and convert them to pandas DataFrames.
df_train = dt.fread(TRAIN_JAY).to_pandas()
# Asset details are kept ordered by Asset_ID.
df_asset_details = dt.fread(ASSET_DETAILS_JAY).to_pandas().sort_values("Asset_ID")

xgboost version:1.5.0


## Missing/infinite values

In [3]:
# Some Target values are missing: within each asset, sort by timestamp and fill gaps with the previous valid value.
#df_train.loc[df_train.isin([np.nan, np.inf, -np.inf]).any(axis=1)]
def fill_target(x):
    """Return one asset's rows in time order with gaps forward-filled.

    Args:
        x: DataFrame with a ``timestamp`` column. It is not modified.

    Returns:
        A new DataFrame sorted by ``timestamp`` in which every NaN (in any
        column, not only ``Target``) is replaced by the previous valid value
        of that column. Leading NaNs have no predecessor and stay NaN.
    """
    # `ffill()` is the supported spelling of `fillna(method='pad')`, which is
    # deprecated in pandas 2.x and emits a FutureWarning there.
    return x.sort_values('timestamp').ffill()
# Forward-fill each asset separately (grouping by Asset_ID keeps values from
# leaking between assets), then drop the rows whose Target is still missing.
df_train = df_train.groupby('Asset_ID').apply(fill_target).reset_index(drop=True)
df_train = df_train.dropna(subset=['Target'])

# Make VWAP finite: replace +inf / -inf with the largest / smallest finite VWAP.
# NOTE: np.nan_to_num also turns any remaining NaN into 0.0.
vwap_is_finite = np.isfinite(df_train.VWAP)  # computed once, reused for both bounds
VWAP_max = np.max(df_train.VWAP[vwap_is_finite])
VWAP_min = np.min(df_train.VWAP[vwap_is_finite])
df_train['VWAP'] = np.nan_to_num(df_train.VWAP, posinf=VWAP_max, neginf=VWAP_min)

# Sanity check: number of finite entries per column (should equal the row count).
# A vectorised column sum replaces the builtin sum(), which iterated over every row in Python.
np.isfinite(df_train).values.sum(axis=0)

array([24236289, 24236289, 24236289, 24236289, 24236289, 24236289,
       24236289, 24236289, 24236289, 24236289])

In [4]:
## Reduce memory usage by converting columns to smaller dtypes (the conversions are logged below)
from script.memory_usage import reduce_mem_usage
df_train= reduce_mem_usage(df_train)
# Force a garbage-collection pass; its return value is what the cell displays.
gc.collect()

Memory usage of dataframe is 2033.99 MB
int64->int32
int64->int8
float64->float32
float64->float32
float64->float32
float64->float32
float64->float32
float64->float32
float64->float32
float64->float32
Memory usage after optimization is: 1040.11 MB
Decreased by 48.9%


0

In [5]:
# Preview the first rows of the training table after cleaning and dtype reduction.
df_train.head()

Unnamed: 0,timestamp,Asset_ID,Count,Open,High,Low,Close,Volume,VWAP,Target
0,1514764860,0,5.0,8.53,8.53,8.53,8.53,78.379997,8.53,-0.014399
1,1514764920,0,7.0,8.53,8.53,8.5145,8.5145,71.389999,8.520215,-0.015875
2,1514764980,0,45.0,8.5065,8.5299,8.4848,8.4848,1546.819946,8.501393,-0.01541
3,1514765040,0,14.0,8.5009,8.5066,8.4744,8.5009,125.800003,8.47981,-0.012524
4,1514765100,0,5.0,8.5007,8.5007,8.456,8.456,125.010002,8.458435,-0.00594


In [6]:
## Time range covered by the data (timestamps are interpreted as Unix seconds)
# `infer_datetime_format` only concerns string parsing and is deprecated since
# pandas 2.0; with numeric input and `unit="s"` it has no effect, so it is omitted.
pd.to_datetime(df_train["timestamp"], unit="s")

0          2018-01-01 00:01:00
1          2018-01-01 00:02:00
2          2018-01-01 00:03:00
3          2018-01-01 00:04:00
4          2018-01-01 00:05:00
                   ...        
24236801   2021-09-20 23:56:00
24236802   2021-09-20 23:57:00
24236803   2021-09-20 23:58:00
24236804   2021-09-20 23:59:00
24236805   2021-09-21 00:00:00
Name: timestamp, Length: 24236289, dtype: datetime64[ns]

## Utility functions to train a model

- TA-lib
- feature engineering: [`get_features()`, `add_features()`](./script/XGB/xgbmodel.py)
- XGBoost model configuration: [`params_xgb`](./script/XGB/xgbmodel.py)
- model fit
- model predict
- model evaluate

In [7]:
from script.XGB.xgbmodel import *

# Silence pandas' chained-assignment warning before the feature helpers run.
pd.options.mode.chained_assignment = None  # default='warn'

# Build the feature table for the rows of Asset_ID 0 only, then discard
# every row that contains a NaN in any column.
df = add_features(get_features(df_train[df_train["Asset_ID"] == 0]))
df = df.dropna(axis=0)

# Split off the regression target; `df` no longer contains 'Target' afterwards.
y = df.pop('Target')
# Display only: the result is not assigned, so `df` itself keeps both columns.
df.drop(columns=['timestamp', 'Asset_ID'])

Unnamed: 0,Count,Open,High,Low,Close,Volume,VWAP,Upper_Shadow,Lower_Shadow,lrtn_1,...,adx,DI_plus,DI_minus,ROCP,momentam,APO,PPO,CMO,MIDPOINT,TRENDLINE
63,13.0,8.332600,8.332600,8.299300,8.299300,194.720001,8.307179,0.000000,0.000000,-0.004688,...,26.842100,7.710664,27.392996,-0.004980,-0.041700,0.003904,0.046761,-10.926632,8.347350,8.349214
64,3.0,8.327200,8.327200,8.269500,8.269500,58.230000,8.298365,0.000000,0.000000,-0.003597,...,29.388427,6.906968,29.921036,-0.002480,-0.020700,0.000732,0.008770,-13.087717,8.347350,8.348563
65,5.0,8.308500,8.327200,8.308500,8.327200,18.910000,8.326820,0.000000,0.000000,0.006953,...,31.752873,6.209908,26.901368,-0.007620,-0.063800,-0.004309,-0.051621,-20.299357,8.341450,8.347516
66,3.0,8.327200,8.327200,8.327100,8.327100,75.300003,8.327168,0.000000,0.000000,-0.000012,...,33.948430,6.208737,26.896295,-0.004483,-0.037500,-0.006311,-0.075602,-10.431340,8.341400,8.346575
67,2.0,8.305500,8.327200,8.305500,8.327200,2.970000,8.320113,0.000000,0.000000,0.000012,...,36.300265,5.947009,29.958509,-0.001779,-0.014800,-0.010520,-0.126043,-18.759487,8.338900,8.345790
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1942614,277.0,364.111511,364.200012,362.700012,362.917999,773.176025,363.458771,0.088501,0.217987,-0.003282,...,20.253548,22.109629,19.181532,0.009576,3.453522,0.996409,0.275506,26.237914,362.313004,361.786063
1942615,296.0,362.926514,363.200012,362.100006,362.200012,596.347778,362.655640,0.273499,0.100006,-0.001980,...,18.998302,20.547849,21.679586,0.002230,0.807526,1.074526,0.296993,8.105143,362.313004,361.978759
1942616,319.0,362.230988,362.399994,361.700012,361.858002,820.909729,361.962982,0.169006,0.157990,-0.000945,...,18.260086,19.599101,23.317047,0.000930,0.336487,1.098014,0.303395,-0.893123,362.313004,362.106636
1942617,588.0,362.011993,364.299988,361.799988,364.114502,1268.055176,363.465607,0.185486,0.212006,0.006217,...,18.193979,28.104836,19.800623,0.001998,0.721985,0.988579,0.273081,-3.613520,362.464493,362.203774


## Loop over all assets  

- Train one XGBoost model per asset and save it to a `.json` file.
- Train one LightGBM model per asset and save it to a `.txt` file.

In [8]:
from script.XGB.xgbmodel import *
from os.path import exists
# Registry of models keyed by Asset_ID; filled in by model_reload_train().
models = {}

def model_reload_train(type: str, folder="./trainedXGB/XGB0"):
    """Load the saved model of every asset, training and saving the missing ones.

    Iterates over ``df_asset_details`` and stores one model per asset in the
    module-level ``models`` dict, keyed by ``Asset_ID``. Nothing is returned.

    Args:
        type: Model family: ``'xgb'`` (files named ``model_<id>.json``) or
            ``'light'`` (files named ``model_<id>.txt``). The name shadows the
            builtin but is kept because callers pass it by keyword.
        folder: Directory that holds the per-asset model files. It is created
            on demand before a new model is trained.

    Raises:
        ValueError: If ``type`` is neither ``'xgb'`` nor ``'light'``. The check
            happens up front, before any asset is processed.
    """
    import os  # local import so the function does not rely on other cells for it

    extensions = {'xgb': 'json', 'light': 'txt'}
    if type not in extensions:
        raise ValueError(f"unknown model type {type!r}; expected 'xgb' or 'light'")

    for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
        model_file = os.path.join(folder, f"model_{asset_id}.{extensions[type]}")

        if os.path.exists(model_file):
            print(f"model for {asset_name:<16} (ID={asset_id:<2}) exists")
            if type == 'xgb':
                model = xgb.XGBRegressor()
                model.load_model(model_file)
            else:
                # Booster's first positional parameter is `params`, so the
                # path has to be passed through the `model_file` keyword.
                model = lgb.Booster(model_file=model_file)
            models[asset_id] = model
        else:
            print(f"Training {type} model for {asset_name:<16} (ID={asset_id:<2})")
            pd.options.mode.chained_assignment = None  # default='warn'
            # Make sure the output folder exists before the expensive fit,
            # so a missing directory cannot fail the save afterwards.
            os.makedirs(folder, exist_ok=True)
            if type == 'xgb':
                # Presumably returns a fitted XGBRegressor -- confirm in xgbmodel.py.
                models[asset_id] = get_Xy_and_model_for_asset(df_train, asset_id)
                models[asset_id].save_model(model_file)
            else:
                # Presumably returns a fitted LGBMRegressor (it exposes `booster_`) -- confirm.
                # NOTE(review): the reload branch stores a raw Booster instead.
                models[asset_id] = get_Xy_and_model_for_asset_1(df_train, asset_id)
                models[asset_id].booster_.save_model(model_file)

# Fill `models` with one XGBoost model per asset, reusing any model file already present in the folder.
model_reload_train(type='xgb',folder = "./trainedXGB/XGB0")

model for Binance Coin     (ID=0 ) exists
model for Bitcoin          (ID=1 ) exists
model for Bitcoin Cash     (ID=2 ) exists
model for Cardano          (ID=3 ) exists
model for Dogecoin         (ID=4 ) exists
model for EOS.IO           (ID=5 ) exists
model for Ethereum         (ID=6 ) exists
model for Ethereum Classic (ID=7 ) exists
model for IOTA             (ID=8 ) exists
model for Litecoin         (ID=9 ) exists
model for Maker            (ID=10) exists
model for Monero           (ID=11) exists
model for Stellar          (ID=12) exists
model for TRON             (ID=13) exists


In [9]:
# Show the XGBoost hyper-parameter dict (presumably provided by the star import from script.XGB.xgbmodel -- confirm).
params_xgb

{'learning_rate': 0.05,
 'max_depth': 11,
 'n_estimators': 500,
 'subsample': 0.9,
 'colsample_bytree': 0.7,
 'missing': -999,
 'random_state': 2020,
 'objective': 'reg:squarederror'}

In [10]:
## Hyper-parameters of the model stored for Asset_ID 0
models[0]

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.05, max_delta_step=0,
             max_depth=11, min_child_weight=1, missing=-999,
             monotone_constraints='()', n_estimators=500, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=2020,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
             tree_method='gpu_hist', validate_parameters=1, verbosity=None)

## model prediction test

In [11]:
# Smoke-test the prediction interface on a single training row.
# The row at position 1 is run through the feature helpers as a one-row frame.
x = pd.DataFrame([get_features(df_train.iloc[1])])
# NOTE(review): NaNs are replaced by -1 here, while params_xgb declares
# missing=-999 -- confirm that -1 is the intended placeholder.
x = add_features(x).fillna(-1)

# Remove the non-feature columns before calling the model of Asset_ID 0.
x = x.drop(columns=['timestamp', 'Asset_ID', 'Target'])
y_pred = models[0].predict(x)
y_pred[0]

0.13024198

## Scores

Measured with weighted correlation.

In [12]:
from script.kagglescore import weighted_correlation
# Show the scoring function's docstring (a bare expression displays its escaped repr).
weighted_correlation.__doc__

"\n    'a' and 'b' are the expected and predicted targets, \n        and ' weights' include the weight of each row, determined by its asset\n    "