Reference: 
- [Kaggle1](https://www.kaggle.com/yamqwe/crypto-prediction-xgb-regressor)
- [Kaggle2](https://www.kaggle.com/tensorchoko/g-research-for-jp-en-coins)
- [Official doc](https://xgboost.readthedocs.io/en/latest/python/index.html)
- [https://mljar.com/blog/xgboost-save-load-python/](https://mljar.com/blog/xgboost-save-load-python/)

In [63]:
import IPython.display
import traceback
import gc
import numpy as np
import pandas as pd
import datatable as dt
import lightgbm as lgb
from lightgbm import LGBMRegressor
import xgboost as xgb
print(xgb.__version__)
import talib as ta


TRAIN_JAY = './data/cryptocurrency-extra-data-binance-coin/orig_train.jay'
ASSET_DETAILS_JAY = './data/cryptocurrency-extra-data-binance-coin/orig_asset_details.jay'

1.5.0


In [64]:
df_train = dt.fread('./data/cryptocurrency-extra-data-binance-coin/orig_train.jay').to_pandas()
df_asset_details = dt.fread('./data/cryptocurrency-extra-data-binance-coin/orig_asset_details.jay').to_pandas().sort_values("Asset_ID")

In [65]:
#VWAP make finite
VWAP_max = np.max(df_train[np.isfinite(df_train.VWAP)].VWAP)
VWAP_min = np.min(df_train[np.isfinite(df_train.VWAP)].VWAP)
df_train['VWAP'] = np.nan_to_num(df_train.VWAP, posinf=VWAP_max, neginf=VWAP_min)

## Utility functions to train a model for one asset

**Main Training Function**, **Feature Extraction**

In [66]:
from script.XGB.xgbmodel import *
# Memory saving function credit to https://www.kaggle.com/gemartin/load-data-reduce-memory-usage
def reduce_mem_usage(df):
    """ 
        iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:
        col_type = df[col].dtype.name
        
        if col_type not in ['object', 'category', 'datetime64[ns, UTC]', 'datetime64[ns]']:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                elif c_min > np.finfo(np.float64).min and c_max < np.finfo(np.float64).max:
                    df[col] = df[col].astype(np.float64)
                else:
                    next
        print(f'{col_type}->{df[col].dtype.name}')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

df_train= reduce_mem_usage(df_train)
gc.collect()

Memory usage of dataframe is 1849.12 MB
int64->int32
int64->int8
float64->float32
float64->float32
float64->float32
float64->float32
float64->float32
float64->float32
float64->float32
float64->float32
Memory usage after optimization is: 855.22 MB
Decreased by 53.7%


255

## Loop over all assets

In [67]:
from os.path import exists
Xs = {}
ys = {}
models = {}

for asset_id, asset_name in zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name']):
    
    if exists(f"./trainedXGB/XGB1/model_{asset_id}.txt"):
        print(f"model for {asset_name:<16} (ID={asset_id:<2}) exists")
        ## for xgbregressor0
        # model = xgb.XGBRegressor()
        # model.load_model(f"./trainedXGB/XGB0/model_{asset_id}.json")
        
        # df = df_train[df_train["Asset_ID"] == asset_id]
        # df_proc = get_features(df)
        
        # df_proc['y'] = df['Target']
        # df_proc = df_proc.dropna(how="any")
        # X = df_proc.drop("y", axis=1)
        # y = df_proc["y"]
        
        ## for lightgbm1
        model = lgb.Booster(model_file=f"./trainedXGB/XGB1/model_{asset_id}.txt")
        df = df_train[df_train["Asset_ID"] == asset_id]
        df = df.dropna(subset=['Target'])
        y = df['Target'] 
        df = df.drop(['Target'],axis=1)
        df_proc = get_features(df)
        df_proc = add_features(df_proc)
        df_proc = df_proc.fillna(-1)
        df_proc = df_proc.drop(['timestamp', 'Asset_ID','hour','dayofweek', 'day'],axis=1)
        X= df_proc 

        Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model
        next
    else:
        print(f"Training model for {asset_name:<16} (ID={asset_id:<2})")
        try:
            X, y, model = get_Xy_and_model_for_asset_1(df_train, asset_id)    
            Xs[asset_id], ys[asset_id], models[asset_id] = X, y, model
        except:         
            Xs[asset_id], ys[asset_id], models[asset_id] = None, None, None    
        #save model
        models[asset_id].booster_.save_model(f"./trainedXGB/XGB1/model_{asset_id}.txt")

model for Binance Coin     (ID=0 ) exists
model for Bitcoin          (ID=1 ) exists
model for Bitcoin Cash     (ID=2 ) exists
model for Cardano          (ID=3 ) exists
model for Dogecoin         (ID=4 ) exists
model for EOS.IO           (ID=5 ) exists
model for Ethereum         (ID=6 ) exists
model for Ethereum Classic (ID=7 ) exists
model for IOTA             (ID=8 ) exists
model for Litecoin         (ID=9 ) exists
model for Maker            (ID=10) exists
model for Monero           (ID=11) exists
model for Stellar          (ID=12) exists
model for TRON             (ID=13) exists


In [68]:
# Check the model interface
x = get_features(df_train.iloc[1])
x=pd.DataFrame([x])
x = add_features(x)
x = x.fillna(-1)
x = x.drop(['timestamp', 'Asset_ID','hour','dayofweek', 'day','Target'],axis=1)
y_pred = models[0].predict(x)
y_pred[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cacher_needs_updating = self._check_is_chained_assignment_possible()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_with_indexer_missing(indexer, value)


-0.036999944382306124

## Submit to Kaggle Lightxgb

- attached with previous data(x_prior)

In [69]:
data_path = './data/'
train = pd.read_csv(data_path+'/train.csv').set_index('timestamp')
WINDOW_SIZE = 100


In [70]:
submission = []
for i in range(4):
    test_csv,sample_prediction_df_csv = f'{data_path}kagglesubmission/{i}test_df.csv', f'{data_path}kagglesubmission/{i}sample_prediction_df.csv'
    df_test, df_pred= pd.read_csv(test_csv,usecols =range(1,11)),pd.read_csv(sample_prediction_df_csv,usecols =range(1,3))
    if i ==0:
        #add placeholder
        ts_last = df_test['timestamp'].unique().item()
        ts_placeholder = list(range(ts_last-60*(WINDOW_SIZE-1),ts_last, 60))
        placeholder = train.loc[ts_placeholder,:].reset_index() #previous sample
        placeholder = placeholder.drop('Target',axis =1)
        print(placeholder.shape)
    for j , row in df_test.iterrows():
        if models[row['Asset_ID']] is not None:
            try:
                x_prior = placeholder.loc[placeholder['Asset_ID']==row['Asset_ID'],:]#99X9
                x_test = x_prior.append(row[x_prior.columns])
                placeholder = placeholder.append(row[placeholder.columns])#add to placeholder

                x_test = get_features(x_test)
                x_test = add_features(x_test)
                x_test = x_test.fillna(-1)
                x_test = x_test.drop(['timestamp', 'Asset_ID','hour','dayofweek', 'day'],axis=1)

                model = models[row['Asset_ID']]
                y_pred = model.predict(x_test.iloc[-2:,:]) #100
                df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred[-1]
            except:
                
                df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
                traceback.print_exc()
        else: 
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
    submission.append(df_pred)
print(placeholder.shape)
print(pd.concat(submission,axis=1))

(1385, 9)
(1441, 9)
    row_id    Target  row_id    Target  row_id    Target  row_id    Target
0        0 -0.014661      14 -0.005366      28 -0.001190      42 -0.012154
1        1  0.005207      15 -0.005224      29  0.003402      43  0.001301
2        2 -0.040518      16 -0.011133      30 -0.020008      44 -0.028566
3        3  0.004674      17 -0.001437      31 -0.003973      45 -0.005514
4        4 -0.029864      18 -0.027143      32 -0.043370      46 -0.055095
5        5 -0.005070      19 -0.011641      33 -0.013088      47 -0.022040
6        6  0.028932      20  0.011306      34 -0.007918      48 -0.025200
7        7 -0.001971      21  0.001072      35 -0.000579      49 -0.004929
8        8 -0.022743      22 -0.038666      36 -0.045859      50 -0.054999
9        9 -0.008685      23 -0.003194      37 -0.000035      51 -0.003864
10      10 -0.029130      24 -0.028767      38 -0.028118      52 -0.033359
11      11 -0.023421      25 -0.026586      39 -0.028743      53 -0.027078
12   

# Submit To Kaggle XGBoost

In [71]:
data_path = './data/'
submission = []
for i in range(4):
    test_csv,sample_prediction_df_csv = f'{data_path}kagglesubmission/{i}test_df.csv', f'{data_path}kagglesubmission/{i}sample_prediction_df.csv'
    df_test, df_pred= pd.read_csv(test_csv,usecols =range(1,11)),pd.read_csv(sample_prediction_df_csv,usecols =range(1,3))
    for j , row in df_test.iterrows():
        if models[row['Asset_ID']] is not None:
            try:
                model = models[row['Asset_ID']]
                x_test = get_features(row)
                y_pred = model.predict(pd.DataFrame([x_test]))[0]
                df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = y_pred
            except:
                print(row['Asset_ID'])
                df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
                traceback.print_exc()
        else: 
            df_pred.loc[df_pred['row_id'] == row['row_id'], 'Target'] = 0
    submission.append(df_pred)

print(pd.concat(submission,axis=1))


3.0
2.0
0.0
1.0
4.0
5.0
7.0
6.0
8.0
9.0
10.0
13.0
12.0
11.0
3.0
2.0
0.0
1.0
4.0
5.0
7.0
6.0
8.0
9.0
10.0
13.0
12.0
11.0
3.0
2.0
0.0
1.0
4.0
5.0
7.0
6.0
8.0
9.0
10.0
13.0
12.0
11.0
3.0
2.0
0.0
1.0
4.0
5.0
7.0
6.0
8.0
9.0
10.0
13.0
12.0
11.0
    row_id  Target  row_id  Target  row_id  Target  row_id  Target
0        0     0.0      14     0.0      28     0.0      42     0.0
1        1     0.0      15     0.0      29     0.0      43     0.0
2        2     0.0      16     0.0      30     0.0      44     0.0
3        3     0.0      17     0.0      31     0.0      45     0.0
4        4     0.0      18     0.0      32     0.0      46     0.0
5        5     0.0      19     0.0      33     0.0      47     0.0
6        6     0.0      20     0.0      34     0.0      48     0.0
7        7     0.0      21     0.0      35     0.0      49     0.0
8        8     0.0      22     0.0      36     0.0      50     0.0
9        9     0.0      23     0.0      37     0.0      51     0.0
10      10     0.0     

[LightGBM] [Fatal] The number of features in data (12) is not the same as it was in training data (36).
You can set ``predict_disable_shape_check=true`` to discard this error, but please be aware what you are doing.
Traceback (most recent call last):
  File "/var/folders/_0/sgv1wmb92l11hndgfl0tyvjr0000gn/T/ipykernel_44551/3870017929.py", line 11, in <module>
    y_pred = model.predict(pd.DataFrame([x_test]))[0]
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages/lightgbm/basic.py", line 3538, in predict
    return predictor.predict(data, start_iteration, num_iteration,
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages/lightgbm/basic.py", line 848, in predict
    preds, nrow = self.__pred_for_np2d(data, start_iteration, num_iteration, predict_type)
  File "/opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages/lightgbm/basic.py", line 938, in __pred_for_np2d
    return inner_predict(

## Scores