References:
- [Kaggle](https://www.kaggle.com/yamqwe/crypto-prediction-xgb-regressor)
- [Official doc](https://xgboost.readthedocs.io/en/latest/python/index.html)
- [https://mljar.com/blog/xgboost-save-load-python/](https://mljar.com/blog/xgboost-save-load-python/)

In [1]:
import traceback
import numpy as np
import pandas as pd
import datatable as dt
from lightgbm import LGBMRegressor
import xgboost as xgb
# Show the installed xgboost version so the run can be reproduced.
print(xgb.__version__)

# Locations of the .jay files holding the training data and the asset-details table.
TRAIN_JAY = './data/cryptocurrency-extra-data-binance-coin/orig_train.jay'
ASSET_DETAILS_JAY = './data/cryptocurrency-extra-data-binance-coin/orig_asset_details.jay'

1.5.0


In [2]:
# Load both tables through the path constants defined in the first cell rather
# than repeating the literal paths, so the file locations live in one place.
df_train = dt.fread(TRAIN_JAY).to_pandas()
# Asset details are sorted by Asset_ID so later loops visit assets in ID order.
df_asset_details = dt.fread(ASSET_DETAILS_JAY).to_pandas().sort_values("Asset_ID")

In [3]:
# Replace missing targets with 0.
df_train['Target'] = df_train['Target'].fillna(0)

# Make VWAP finite: +inf / -inf are clamped to the largest / smallest finite
# VWAP observed in the frame. np.nan_to_num also maps NaN to 0.0 (its default).
finite_vwap = df_train.loc[np.isfinite(df_train['VWAP']), 'VWAP']
VWAP_max = finite_vwap.max()
VWAP_min = finite_vwap.min()
df_train['VWAP'] = np.nan_to_num(df_train['VWAP'], posinf=VWAP_max, neginf=VWAP_min)

## Utility functions to train a model for one asset

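**Main Training Function** and **Feature Extraction**: both helpers (`get_Xy_and_model_for_asset` and `get_features`, used in the cells below) are imported from the `script.XGB.xgbmodel` module.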

In [4]:
from script.XGB.xgbmodel import *

## Loop over all assets

In [5]:
from os import makedirs
from os.path import exists

# Per-asset training inputs, targets and fitted regressors, keyed by Asset_ID.
# An asset whose training failed is stored as None in all three dicts.
Xs = {}
ys = {}
models = {}

# Make sure the cache directory exists before any model is written to it.
makedirs("./trainedXGB", exist_ok=True)

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    model_path = f"./trainedXGB/model_{asset_id}.json"

    if exists(model_path):
        # A saved model is available: load it instead of retraining, and
        # rebuild X / y from the training frame so they are still populated.
        print(f"model for {asset_name:<16} (ID={asset_id:<2}) exists")
        model = xgb.XGBRegressor()
        model.load_model(model_path)
        df = df_train[df_train["Asset_ID"] == asset_id]
        df_proc = get_features(df)
        df_proc['y'] = df['Target']
        df_proc = df_proc.dropna(how="any")
        X = df_proc.drop("y", axis=1)
        y = df_proc["y"]
        Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model
        continue

    print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
    try:
        X, y, model = get_Xy_and_model_for_asset(df_train, asset_id)
    except Exception:
        # Report the failure instead of hiding it, and mark the asset as
        # unavailable so downstream cells can fall back to a default.
        traceback.print_exc()
        X, y, model = None, None, None
    Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model

    # Only persist a model that was actually trained; calling save_model on
    # the None placeholder would raise AttributeError.
    if model is not None:
        model.save_model(model_path)

model for Binance Coin     (ID=0 ) exists
model for Bitcoin          (ID=1 ) exists
model for Bitcoin Cash     (ID=2 ) exists
model for Cardano          (ID=3 ) exists
model for Dogecoin         (ID=4 ) exists
model for EOS.IO           (ID=5 ) exists
model for Ethereum         (ID=6 ) exists
model for Ethereum Classic (ID=7 ) exists
model for IOTA             (ID=8 ) exists
model for Litecoin         (ID=9 ) exists
model for Maker            (ID=10) exists
model for Monero           (ID=11) exists
model for Stellar          (ID=12) exists
model for TRON             (ID=13) exists


In [7]:
# Smoke-test the prediction interface: extract features from a single training
# row and pass them to the Asset_ID 0 model as a one-row DataFrame.
sample_row = df_train.iloc[1]
x = get_features(sample_row)
x_frame = pd.DataFrame([x])
y_pred = models[0].predict(x_frame)
y_pred[0]

-0.00012690053

In [8]:
# Display the dict of regressors (keyed by Asset_ID) to inspect their hyper-parameters.
models

{0: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=11, min_child_weight=1, missing=-999,
              monotone_constraints='()', n_estimators=500, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=2020,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.9,
              tree_method='auto', validate_parameters=1, verbosity=None),
 1: XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.7, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.05, max_delta_step=0,
              max_depth=11, min_child_weight=1, missing=-999,
     

# Submit To Kaggle

In [9]:
data_path = './data/'
# One prediction frame per locally saved test batch.
submission = []
for i in range(4):
    test_csv = f'{data_path}temp/{i}test_df.csv'
    sample_prediction_df_csv = f'{data_path}temp/{i}sample_prediction_df.csv'
    # usecols skips column 0 of each CSV (presumably a saved index column -- verify).
    df_test = pd.read_csv(test_csv, usecols=range(1, 11))
    df_pred = pd.read_csv(sample_prediction_df_csv, usecols=range(1, 3))
    for j, row in df_test.iterrows():
        # Default prediction, used when the asset has no usable model or
        # when feature extraction / prediction fails for this row.
        y_pred = 0
        # .get() so an asset missing from `models` falls back to the default
        # instead of raising KeyError outside the try block.
        model = models.get(row['Asset_ID'])
        if model is not None:
            try:
                x_test = get_features(row)
                y_pred = model.predict(pd.DataFrame([x_test]))[0]
            except Exception:
                # Report which asset failed and why, then keep going with 0.
                print(row['Asset_ID'])
                traceback.print_exc()
                y_pred = 0
        df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
    submission.append(df_pred)

# Show the four batches side by side.
print(pd.concat(submission,axis=1))


    row_id    Target  row_id    Target  row_id    Target  row_id    Target
0        0 -0.000253      14 -0.000038      28 -0.000200      42 -0.000101
1        1  0.000019      15  0.000319      29 -0.000023      43 -0.000036
2        2 -0.000027      16 -0.000168      30 -0.000029      44  0.000028
3        3  0.000076      17 -0.000043      31  0.000004      45 -0.000117
4        4 -0.000037      18 -0.000040      32  0.000016      46 -0.000033
5        5  0.000015      19  0.000058      33  0.000031      47  0.000034
6        6 -0.000008      20  0.000067      34 -0.000197      48 -0.000031
7        7  0.000196      21  0.000095      35  0.000186      49  0.000188
8        8  0.000122      22  0.000182      36  0.000355      50  0.000501
9        9 -0.000090      23 -0.000029      37 -0.000029      51 -0.000047
10      10 -0.000035      24 -0.000073      38 -0.000039      52 -0.000073
11      11  0.000041      25  0.000064      39 -0.000002      53  0.000048
12      12  0.000007     