In [1]:
import talib as ta
import numpy as np
import pandas as pd
import gc
import time
import xgboost as xgb
import os
from random import sample
from os.path import exists
import json

from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV   #Perforing grid search
from sklearn import metrics
import matplotlib.pylab as plt
%matplotlib inline

## data

In [2]:
# Presumably the folder for previously trained XGBoost models
# (an alternative location noted by the author: '../input/mytrainedxgb').
PRE_MOD_FOLDER = "./trainedXGB"

# Asset metadata CSV; read into `df_asset_details` when the data is loaded.
ASSET_DETAILS_CSV = './data/asset_details.csv'

In [3]:
# Asset metadata, ordered by Asset_ID.
df_asset_details = pd.read_csv(ASSET_DETAILS_CSV).sort_values("Asset_ID")

# Read only the columns that the rest of the notebook works with.
df_train = pd.read_feather(
    './data/new_data.ftr',
    columns=[
        'timestamp', 'Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close',
        'Volume', 'Target', 'Weight', 'lr_15', 'Mkt_lrt_15', 'Crypto_Index',
        'beta', 'lr_mkt_resid',
    ],
)

print('finished loading')
print(df_train.columns)

finished loading
Index(['timestamp', 'Asset_ID', 'Count', 'Open', 'High', 'Low', 'Close',
       'Volume', 'Target', 'Weight', 'lr_15', 'Mkt_lrt_15', 'Crypto_Index',
       'beta', 'lr_mkt_resid'],
      dtype='object')


In [4]:
# Time-based split on the timestamp quantiles:
#   hold-out test set = the latest 5% (timestamp >= 95th percentile),
#   retrain window    = strictly between the 45th and 95th percentiles.
_ts = df_train['timestamp']
_t_start = _ts.quantile(0.45)
_t_split = _ts.quantile(0.95)

df_test = df_train[_ts >= _t_split]
df_retrain = df_train[(_ts > _t_start) & (_ts < _t_split)]

# The full frame is no longer needed; release it to keep kernel memory down.
del df_train
gc.collect()

0

## features

In [5]:
## Initial feature window settings used during hyperparameter selection.
## These module-level names are read by get_features / lag_features.
lrtn = 30               # periods for the Crypto_Index log return
fastk1 = 5              # fastk_period for ta.STOCH
fastk2 = 15             # fastk_period for ta.STOCHF
adx = 30                # timeperiod for ta.ADX
macd_s = 15             # fastperiod for ta.MACD
macd_l = 25             # slowperiod for ta.MACD
rsi = 60                # timeperiod for ta.RSI
std_Crypto_Index = 15   # timeperiod for STDDEV of Crypto_Index
std_lr_15 = 15          # timeperiod for STDDEV of lr_15
std_Mkt_lrt_15 = 5      # timeperiod for STDDEV of Mkt_lrt_15

def log_return(series, periods=5):
    """Return the difference of log(series) over `periods` rows.

    The first `periods` entries of the result are NaN because there is no
    earlier value to difference against.
    """
    log_values = np.log(series)
    return log_values.diff(periods)

def upper_shadow(df):
    """Return High minus the larger of Close and Open, row by row (via ta.SUB)."""
    body_top = np.maximum(df['Close'], df['Open'])
    return ta.SUB(df['High'], body_top)

def lower_shadow(df):
    """Return the smaller of Close and Open minus Low, row by row (via ta.SUB)."""
    body_bottom = np.minimum(df['Close'], df['Open'])
    return ta.SUB(body_bottom, df['Low'])

def lag_features(df):
    """Add technical-indicator and rolling-volatility columns to `df` in place.

    Window lengths are read from the notebook-level settings (fastk1, fastk2,
    rsi, macd_s, macd_l, adx, std_lr_15, std_Mkt_lrt_15, std_Crypto_Index).
    `df` must provide High, Low, Close, Volume, lr_15, Mkt_lrt_15 and
    Crypto_Index. Columns are appended in a fixed order, which determines the
    feature order seen by the model. Nothing is returned.
    """
    high, low, close = df['High'], df['Low'], df['Close']

    #### technical indicators
    slow_period = int(3*fastk1/5)
    slow_k, slow_d = ta.STOCH(high, low, close,
                              fastk_period=fastk1,
                              slowk_period=slow_period, slowd_period=slow_period,
                              slowk_matype=0, slowd_matype=0)
    df['slowK'] = slow_k
    df['slowD'] = slow_d

    fast_k, fast_d = ta.STOCHF(high, low, close,
                               fastk_period=fastk2,
                               fastd_period=int(3*fastk2/5),
                               fastd_matype=0)
    df['fastK'] = fast_k
    df['fastD'] = fast_d

    df[f'rsi_{rsi}'] = ta.RSI(close, timeperiod=rsi)

    macd_line, macd_signal, macd_hist = ta.MACD(close, fastperiod=macd_s,
                                                slowperiod=macd_l, signalperiod=5)
    df[f'macd_{macd_s}_{macd_l}'] = macd_line
    df['macd_signal'] = macd_signal
    df['macd_hist'] = macd_hist

    # Average Directional Movement Index
    df[f'adx_{adx}'] = ta.ADX(high, low, close, timeperiod=adx)
    # Accumulation/Distribution line
    df['AD'] = ta.AD(high, low, close, df['Volume'])

    #### rolling standard deviations (volatility proxies)
    df[f'std_lr_15_{std_lr_15}'] = ta.STDDEV(df['lr_15'], timeperiod=std_lr_15, nbdev=1)
    df[f'std_Mkt_lrt_15_{std_Mkt_lrt_15}'] = ta.STDDEV(df['Mkt_lrt_15'],
                                                       timeperiod=std_Mkt_lrt_15, nbdev=1)
    df[f'std_Crypto_Index_{std_Crypto_Index}'] = ta.STDDEV(df['Crypto_Index'],
                                                           timeperiod=std_Crypto_Index, nbdev=1)

def get_features(df_feat):
    """Add all model features to `df_feat` in place and return the same frame.

    Adds the Crypto_Index log return over `lrtn` rows, then the columns
    produced by `lag_features`. As a side effect, pandas' chained-assignment
    warning is switched off for the whole session.
    """
    # Global pandas option: silences SettingWithCopyWarning (default is 'warn').
    pd.set_option('mode.chained_assignment', None)
    index_return_col = f'lrtn_index_{lrtn}'
    df_feat[index_return_col] = log_return(df_feat.Crypto_Index, lrtn)
    lag_features(df_feat)
    return df_feat

## Tuning starts here

In [6]:
#### Baseline parameters: the starting point that the grid searches refine.
# https://xgboost.readthedocs.io/en/stable/parameter.html#general-parameters
params_general = dict(booster='gbtree', verbosity=0, validate_parameters=1)

params_booster = dict(
    learning_rate=0.3,
    gamma=0,                  # minimum loss reduction needed to split
    max_depth=6,
    min_child_weight=1,       # minimum sum of instance weight (hessian) in a child
    subsample=0.8,
    colsample_bytree=1,
    reg_lambda=1,             # L2 regularization term on weights
    reg_alpha=0,              # L1 regularization term on weights
    tree_method='hist',       # alternatives: gpu_hist
    predictor='auto',         # alternatives: gpu_predictor
)

params_learning = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    base_score=0.5,
    seed=2021,
)

# Merge the three groups into the single dict used by both xgboost APIs.
params_xgb = dict(params_general)
params_xgb.update(params_booster)
params_xgb.update(params_learning)

# https://xgboost.readthedocs.io/en/stable/parameter.html#command-line-parameters
params_train = dict(
    num_boost_round=500,      # called 'n_estimators' in the sklearn API
    early_stopping_rounds=10,
    verbose_eval=False,
)

############ sklearn-API alias for the number of boosting rounds
params_sklearn = {'n_estimators': params_train['num_boost_round']}
xgb_sklearn = XGBRegressor(**params_xgb, **params_sklearn)

print("finish paramx_xgb initialization")

finish paramx_xgb initialization


In [7]:
# Inspect the full parameter set of the freshly constructed estimator.
xgb_sklearn.get_params()

{'objective': 'reg:squarederror',
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': 1,
 'enable_categorical': False,
 'gamma': 0,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': 0.3,
 'max_delta_step': None,
 'max_depth': 6,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 500,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': 'auto',
 'random_state': None,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': None,
 'subsample': 0.8,
 'tree_method': 'hist',
 'validate_parameters': 1,
 'verbosity': 0,
 'eval_metric': 'rmse',
 'seed': 2021}

In [8]:
#make tune data
def make_tune_data(df, asset_id=1, df_holdout=None):
    """Build the per-asset train/test frames and DMatrices used for tuning.

    Parameters
    ----------
    df : DataFrame
        Training rows for all assets. Only rows with `Asset_ID == asset_id`
        are used. (Previously this argument was ignored and the notebook-level
        `df_retrain` was always read instead.)
    asset_id : int, default 1
        Asset to build the tuning data for.
    df_holdout : DataFrame, optional
        Hold-out rows for all assets. Defaults to the notebook-level `df_test`
        so existing calls keep working.

    Returns
    -------
    (tune_train, dtrain, tune_test, dtest)
        Feature-augmented frames with NaN rows dropped, and the matching
        `xgb.DMatrix` objects. Every column except timestamp, Asset_ID,
        Target and Weight is used as a feature, with Target as the label.
    """
    pd.options.mode.chained_assignment = None  # default='warn'
    if df_holdout is None:
        df_holdout = df_test  # backward-compatible fallback to the notebook global

    non_feature_cols = ['timestamp', 'Asset_ID', 'Target', 'Weight']

    def _prepare(frame):
        # Copy first so feature columns are never written onto a slice of the caller's frame.
        asset_rows = get_features(frame[frame["Asset_ID"] == asset_id].copy())
        # The indicators need a warm-up window, so the leading rows contain NaN.
        asset_rows = asset_rows.dropna(axis=0)
        dmatrix = xgb.DMatrix(asset_rows.drop(non_feature_cols, axis=1),
                              label=asset_rows['Target'])
        return asset_rows, dmatrix

    tune_train, dtrain = _prepare(df)
    tune_test, dtest = _prepare(df_holdout)
    return tune_train,dtrain,tune_test,dtest

In [9]:
# Build the tuning frames and DMatrices for asset 1 (train side from df_retrain).
tune_train,dtrain,tune_test,dtest = make_tune_data(df=df_retrain, asset_id=1)

In [10]:
def modelfit(xgb_sklearn, tune_train, dtrain, useTrainCV=True, cv_folds=5, early_stopping_rounds=10,
             test_frame=None):
    '''
    Fit `xgb_sklearn` on `tune_train` and print train/test mean squared error.

    xgb_sklearn: sklearn-API XGBoost estimator; it is modified in place
        (n_estimators may be updated, then the estimator is fitted).
    tune_train: feature-augmented training frame (includes Target).
    dtrain: xgb.DMatrix built from `tune_train`, used only for xgb.cv.
    useTrainCV: is for auto select best n_estimators or num_boost_round.
        When True, xgb.cv with early stopping is run and n_estimators is set
        to the number of rounds it kept.
    cv_folds: number of folds for xgb.cv.
    early_stopping_rounds: patience passed to xgb.cv.
    test_frame: hold-out frame to score on. Defaults to the notebook-level
        `tune_test` so existing calls keep working.
    '''
    if test_frame is None:
        test_frame = tune_test  # backward-compatible fallback to the notebook global

    non_feature_cols = ['timestamp', 'Asset_ID', 'Target', 'Weight']

    if useTrainCV:
        xgb_param = xgb_sklearn.get_xgb_params()
        cvresult = xgb.cv(xgb_param, dtrain,
                          num_boost_round=xgb_sklearn.get_params()['n_estimators'],
                          nfold=cv_folds,
                          metrics=['rmse'],
                          early_stopping_rounds=early_stopping_rounds,
                          verbose_eval=False, as_pandas=True)
        # One row per retained boosting round.
        xgb_sklearn.set_params(n_estimators=cvresult.shape[0])

    # Build the feature matrix once; it is reused for fitting and for the train-set score.
    train_features = tune_train.drop(non_feature_cols, axis=1)

    # Fit the algorithm on the data.
    # `eval_metric` is no longer passed to fit(): that argument is deprecated/removed in
    # newer XGBoost releases and has no effect here because no eval_set is supplied.
    xgb_sklearn.fit(X=train_features, y=tune_train['Target'])

    # Predict on the hold-out and training sets.
    dtest_predictions = xgb_sklearn.predict(test_frame.drop(non_feature_cols, axis=1))
    dtrain_predictions = xgb_sklearn.predict(train_features)

    test_mse = metrics.mean_squared_error(y_true=test_frame['Target'], y_pred=dtest_predictions)
    train_mse = metrics.mean_squared_error(y_true=tune_train['Target'], y_pred=dtrain_predictions)
    # The 11-space pad reproduces the original message byte for byte.
    print(f"test-set-mse: {test_mse}, " + " " * 11 + f"train-set-mse: {train_mse}")

In [11]:
# Fit the baseline estimator (n_estimators chosen via xgb.cv) and print train/test MSE.
modelfit(xgb_sklearn, tune_train, dtrain)
# Show the estimator's parameters after fitting.
xgb_sklearn.get_params()

test-set-mse: 2.1307763342596816e-05,            train-set-mse: 2.4657125145904318e-06


{'objective': 'reg:squarederror',
 'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'enable_categorical': False,
 'gamma': 0,
 'gpu_id': -1,
 'importance_type': None,
 'interaction_constraints': '',
 'learning_rate': 0.3,
 'max_delta_step': 0,
 'max_depth': 6,
 'min_child_weight': 1,
 'missing': nan,
 'monotone_constraints': '()',
 'n_estimators': 500,
 'n_jobs': 8,
 'num_parallel_tree': 1,
 'predictor': 'auto',
 'random_state': 2021,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'subsample': 0.8,
 'tree_method': 'hist',
 'validate_parameters': 1,
 'verbosity': 0,
 'eval_metric': 'rmse',
 'seed': 2021}

In [12]:
def gsearch_tune(grid_dict, xgb_sklearn):
    """Grid-search `grid_dict` starting from `xgb_sklearn`'s current parameters.

    A fresh XGBRegressor is built from `xgb_sklearn.get_params()` and tuned
    with 5-fold GridSearchCV (scored by negative RMSE) on the notebook-level
    `tune_train` frame. Prints the best score and parameters, and returns the
    best estimator refitted on all of `tune_train`.

    NOTE(review): an integer `cv` presumably yields unshuffled K-fold splits,
    so folds may mix past and future rows of this time series. Confirm
    whether a time-ordered splitter is wanted.
    """
    base_estimator = XGBRegressor(**xgb_sklearn.get_params())
    features = tune_train.drop(['timestamp', 'Asset_ID','Target','Weight'], axis=1)
    target = tune_train['Target']

    gsearch = GridSearchCV(
        estimator=base_estimator,
        param_grid=grid_dict,
        scoring='neg_root_mean_squared_error',
        cv=5,
        refit=True,
    )
    gsearch.fit(features, target)

    print(f"gsearch.best_score_: {gsearch.best_score_}, gsearch.best_params_: {gsearch.best_params_}")
    return gsearch.best_estimator_

### max_depth, min_child_weight

In [14]:
## Grid around the current optimum: tree depth and minimum child weight.
param_test = dict(
    max_depth=[5, 8, 10],
    min_child_weight=[0.8, 1, 1.5, 2, 2.2],
)
xgb_sklearn = gsearch_tune(grid_dict=param_test, xgb_sklearn=xgb_sklearn)
# modelfit(xgb_sklearn, tune_train, dtrain)
# xgb_sklearn.get_params()['n_estimators']

### Gamma

In [None]:
## Grid around the current optimum: minimum split loss (gamma).
param_test = dict(gamma=[0, 0.001])
xgb_sklearn = gsearch_tune(grid_dict=param_test, xgb_sklearn=xgb_sklearn)
# modelfit(xgb_sklearn, tune_train, dtrain)
# xgb_sklearn.get_params()['n_estimators']

### subsample and colsample_bytree

In [None]:
## Grid around the current optimum: row and column subsampling ratios.
param_test = dict(
    subsample=[0.6, 0.7, 0.8, 0.9],
    colsample_bytree=[0.6, 0.7, 0.8, 0.9],
)
xgb_sklearn = gsearch_tune(grid_dict=param_test, xgb_sklearn=xgb_sklearn)
#modelfit(xgb_sklearn, tune_train, dtrain)
#xgb_sklearn.get_params()['n_estimators']

### Regularization Parameters

In [None]:
## Grid around the current optimum: L1 (reg_alpha) and L2 (reg_lambda) penalties.
param_test = dict(
    reg_alpha=[0, 0.001, 0.01, 0.1, 1],
    reg_lambda=[1, 1.2, 1.5, 2],
)
xgb_sklearn = gsearch_tune(grid_dict=param_test, xgb_sklearn=xgb_sklearn)
# modelfit(xgb_sklearn, tune_train, dtrain)
# xgb_sklearn.get_params()['n_estimators']

### Tuning the learning rate

In [None]:
## Grid over the learning rate; candidates are at or above the baseline 0.3.
param_test = dict(learning_rate=[0.3, 0.5, 0.7])
xgb_sklearn = gsearch_tune(grid_dict=param_test, xgb_sklearn=xgb_sklearn)
# modelfit(xgb_sklearn, tune_train, dtrain)
# xgb_sklearn.get_params()['n_estimators']

## n_estimators

In [None]:
# modelfit(xgb_sklearn, tune_train, dtrain)
# xgb_sklearn.get_params()['n_estimators']

## Final evaluation

In [None]:
# Show the parameters of the current (tuned) estimator.
xgb_sklearn.get_params()

In [None]:
# Copy the tuned booster parameters back into the dict used by xgb.train.
tuned_booster_params = xgb_sklearn.get_xgb_params()
params_xgb.update(tuned_booster_params)

# 'num_boost_round' is the native-API name for the sklearn 'n_estimators'.
params_train['num_boost_round'] = xgb_sklearn.get_params()['n_estimators']

In [None]:
def weighted_correlation(a, b, weights):
    """Return the weighted Pearson correlation between `a` and `b`.

    Parameters
    ----------
    a, b : array-like
        Values to correlate; flattened with `np.ravel`.
    weights : array-like
        Per-observation weights; flattened with `np.ravel`.

    Returns
    -------
    float
        cov_w(a, b) / sqrt(var_w(a) * var_w(b)). The result is NaN (with a
        NumPy runtime warning) when the weights sum to zero or either input
        has zero weighted variance.

    The covariance uses the centred form sum(w * (a - mean_a) * (b - mean_b)),
    matching the variance terms. The previous uncentred form
    E[ab] - E[a]E[b] loses precision through cancellation when the means are
    large relative to the spread.
    """
    w = np.ravel(weights)
    a = np.ravel(a)
    b = np.ravel(b)

    sum_w = np.sum(w)
    mean_a = np.sum(a * w) / sum_w
    mean_b = np.sum(b * w) / sum_w

    dev_a = a - mean_a
    dev_b = b - mean_b

    var_a = np.sum(w * np.square(dev_a)) / sum_w
    var_b = np.sum(w * np.square(dev_b)) / sum_w
    cov = np.sum(w * dev_a * dev_b) / sum_w

    return cov / np.sqrt(var_a * var_b)

def make_testset(df):
    """Put every asset on a common 60-unit timestamp grid and add the model features.

    Steps:
      1. Index the rows by timestamp.
      2. Per asset, reindex onto the grid spanning the smallest to the largest
         timestamp in `df` (step 60), taking the nearest existing row for
         missing grid points and forward/backward filling any remaining gaps.
      3. Per asset, add the features via `get_features`.
      4. Drop rows with NaN and restore `timestamp` as a column.

    Returns a new DataFrame; `df` itself is not modified.
    """
    ###consistent timestamp for all 14 assets
    df2 = df.set_index("timestamp")
    ind = df2.index.unique()
    # Use min/max rather than the first/last unique value so the grid does not
    # depend on the incoming row order.
    grid = range(ind.min(), ind.max() + 60, 60)

    def reindex(asset_frame):
        asset_frame = asset_frame.reindex(grid, method='nearest')
        # ffill()/bfill() replace fillna(method=...), which newer pandas deprecates.
        return asset_frame.ffill().bfill()

    df2 = df2.groupby('Asset_ID').apply(reindex).reset_index(0, drop=True).sort_index()
    ###add features
    df2 = df2.groupby('Asset_ID').apply(lambda x: get_features(x))
    return df2.dropna(axis = 0).reset_index()

In [None]:
def train_model_for_asset(df_train, df_val,asset_id):
    """Train one native-API XGBoost booster for a single asset.

    Rows of `df_train` with the given `asset_id` are copied, augmented with
    `get_features`, stripped of NaN rows, and turned into a DMatrix (every
    column except timestamp, Asset_ID, Target and Weight is a feature, with
    Target as the label). The booster is trained with the notebook-level
    `params_xgb` and `params_train`.

    `df_val` is accepted but currently unused.

    NOTE(review): the only evaluation set is the training matrix, so the
    early stopping configured in `params_train` monitors training error.
    Confirm this is intended.
    """
    pd.options.mode.chained_assignment = None  # default='warn'

    asset_rows = df_train[df_train["Asset_ID"] == asset_id].copy()
    asset_rows = get_features(asset_rows)
    asset_rows.dropna(axis=0, inplace=True)

    #https://xgboost.readthedocs.io/en/stable/python/python_api.html#xgboost.DMatrix
    feature_frame = asset_rows.drop(['timestamp', 'Asset_ID','Target','Weight'], axis=1)
    dmat_train = xgb.DMatrix(data=feature_frame, label=asset_rows['Target'])

    # The DMatrix holds what training needs; free the pandas frames.
    del asset_rows, feature_frame
    gc.collect()

    #https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.training
    booster = xgb.train(
        params_xgb,
        dtrain=dmat_train,
        evals=[(dmat_train, 'train')],
        **params_train,
    )
    return booster



def model_reload_train(df_train, df_val):
    """Train one model per asset listed in `df_asset_details`.

    Returns a dict mapping Asset_ID to the booster returned by
    `train_model_for_asset`. A progress line is printed for each asset.
    """
    trained = {}
    id_name_pairs = zip(df_asset_details['Asset_ID'], df_asset_details['Asset_Name'])
    for asset_id, asset_name in id_name_pairs:
        print(f"training model for {asset_name:<16} (ID={asset_id:<2})")
        booster = train_model_for_asset(df_train, df_val, asset_id)
        trained[asset_id] = booster
    return trained

# Train one booster per asset on the retrain window.
models = model_reload_train(df_train=df_retrain, df_val=df_test)

# Spot-check asset 0: booster configuration and the feature names it expects.
print(models[0].save_config())
print(models[0].feature_names)

In [None]:
# Plot feature importance for the asset-0 booster.
# NOTE(review): the original remark below is unverified; max_depth is 6 in the baseline parameters.
xgb.plot_importance(models[0])#trees are decision stumps with depth 0.

## Score by weighted correlation

In [None]:
# Rebuild the hold-out set on a common timestamp grid with features added.
# NOTE: this overwrites df_test with its own transform, so the cell is not idempotent;
# re-running it feeds the already-processed frame back into make_testset.
df_test = make_testset(df_test)

In [None]:
# Predict every asset's hold-out rows with its own model, then score all
# predictions together with the weighted correlation.
result_frame = []
# Iterate over the trained models instead of a hard-coded range(0, 14); this
# also avoids shadowing the builtin `id`.
for asset_id in sorted(models):
    model = models[asset_id]
    # Copy so the new column is not written onto a slice of df_test.
    asset_rows = df_test[df_test['Asset_ID'] == asset_id].copy()
    if asset_rows.empty:
        # No hold-out rows for this asset, so there is nothing to predict.
        continue
    asset_rows['Pred'] = model.predict(xgb.DMatrix(asset_rows[model.feature_names]))
    result_frame.append(asset_rows[['timestamp', 'Asset_ID', 'Weight', 'Target', 'Pred']])
result = pd.concat(result_frame, axis=0)
########################################
score = weighted_correlation(a=result['Target'],
                             b=result['Pred'],
                             weights=result['Weight'])
score

In [None]:
# Display the per-row predictions alongside Target and Weight.
result