In [1]:
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
import lightgbm as lgb 
import xgboost as xgb 
import catboost as cbt 
import numpy as np 
import joblib 
import os 

In [2]:
def generate_features(df):
    """Add imbalance features to ``df`` and return the model inputs and target.

    The new columns are written onto ``df`` itself, so the caller's frame is
    modified. Three families of features are created:

    * ``imb_s1`` / ``imb_s2`` -- normalised differences of size columns;
    * ``{a}_{b}_imb`` -- ``(a-b)/(a+b)`` for every pair of price columns;
    * ``{a}_{b}_{c}_imb2`` -- ``(max-mid)/(mid-min)`` for every triple of
      price columns (row-wise max / middle / min of the three values).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the raw columns named in ``feature_cols`` below and a
        ``target`` column.

    Returns
    -------
    tuple
        ``(df[feature_cols], df['target'])``.
    """
    price_cols = ['reference_price', 'far_price', 'near_price',
                  'ask_price', 'bid_price', 'wap']
    feature_cols = [
        'seconds_in_bucket', 'imbalance_buy_sell_flag',
        'imbalance_size', 'matched_size', 'bid_size', 'ask_size',
        *price_cols,
        'imb_s1', 'imb_s2',
    ]

    df['imb_s1'] = df.eval('(bid_size-ask_size)/(bid_size+ask_size)')
    df['imb_s2'] = df.eval('(imbalance_size-matched_size)/(matched_size+imbalance_size)')

    n_prices = len(price_cols)

    # Pairwise features: `a` is always the later-listed price column, `b` an
    # earlier one, which fixes both the column names and their order.
    for i in range(1, n_prices):
        a = price_cols[i]
        for j in range(i):
            b = price_cols[j]
            pair_name = f'{a}_{b}_imb'
            df[pair_name] = df.eval(f'({a}-{b})/({a}+{b})')
            feature_cols.append(pair_name)

    # Triple features: indices satisfy i > j > k.
    for i in range(2, n_prices):
        a = price_cols[i]
        for j in range(1, i):
            b = price_cols[j]
            for k in range(j):
                c = price_cols[k]
                trio = df[[a, b, c]]
                highest = trio.max(axis=1)
                lowest = trio.min(axis=1)
                # The middle value is whatever remains of the row sum once
                # the smallest and largest values are removed.
                middle = trio.sum(axis=1) - lowest - highest

                triple_name = f'{a}_{b}_{c}_imb2'
                df[triple_name] = (highest - middle) / (middle - lowest)
                feature_cols.append(triple_name)

    return df[feature_cols], df['target']

In [3]:
# Load the training data, build the feature matrix, and make a hold-out split.
df = pd.read_csv('Data/train.csv')
X, y = generate_features(df)

# Work with plain NumPy arrays from here on. The original assigned the target
# array to an unused `Y`, leaving `y` as a pandas Series while `X` was an ndarray.
X = X.values
y = y.values

# Drop rows whose target is NaN/inf; compute the mask once so X and y are
# filtered with exactly the same rows.
finite_target = np.isfinite(y)
X = X[finite_target]
y = y[finite_target]

# Random 80/20 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
def objective(space, X_train, y_train, X_test, y_test):
    """Train one XGBoost regressor for a sampled setting and score it by MAE.

    Presumably intended as a hyperopt objective (it takes a sampled ``space``
    and returns a ``status`` field) -- confirm against the caller.

    Parameters
    ----------
    space : mapping
        Sampled hyper-parameters. Keys read: ``n_estimators``, ``max_depth``,
        ``gamma``, ``reg_alpha``, ``reg_lambda``, ``min_child_weight``,
        ``colsample_bytree``.
    X_train, y_train
        Data the model is fitted on.
    X_test, y_test
        Data used for early stopping and for the reported score.

    Returns
    -------
    dict
        ``{'loss': mae, 'MAE': mae, 'status': 'ok'}``.
    """
    clf = xgb.XGBRegressor(
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        gamma=space['gamma'],
        # Regularisation strengths are continuous; int() would silently drop
        # fractional values such as 0.025 or 1.75.
        reg_alpha=float(space['reg_alpha']),
        reg_lambda=float(space['reg_lambda']),
        min_child_weight=int(space['min_child_weight']),
        # colsample_bytree is a fraction in (0, 1]; the previous int() cast
        # collapsed every sampled value to 0 or 1.
        colsample_bytree=float(space['colsample_bytree']),
        # Set on the estimator rather than in fit(), matching how the rest of
        # this notebook passes `early_stopping_rounds` as a model parameter.
        eval_metric="mae",
        early_stopping_rounds=40,
    )

    evaluation = [(X_train, y_train), (X_test, y_test)]
    clf.fit(X_train, y_train, eval_set=evaluation, verbose=False)

    y_pred = clf.predict(X_test)
    mae = mean_absolute_error(y_test, y_pred)

    # 'ok' is the value of hyperopt's STATUS_OK constant, which was referenced
    # here without ever being imported (NameError). 'loss' is the key hyperopt
    # minimises; 'MAE' is kept for any existing consumer of the result.
    return {'loss': mae, 'MAE': mae, 'status': 'ok'}

In [5]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
import scipy.stats as stats

# Sampling distributions for a randomized hyper-parameter search.
# Entries given as one-element lists are held fixed at that value.
param_dist = dict(
    max_depth=stats.randint(3, 10),
    learning_rate=stats.uniform(0.01, 0.1),   # scipy uniform takes (loc, scale)
    subsample=stats.uniform(0.5, 0.5),
    n_estimators=stats.randint(500, 1000),
    tree_method=['hist'],
    objective=['reg:absoluteerror'],
    early_stopping_rounds=[20],
)
# Create the XGBoost model object
#xgb_model = xgb.XGBRegressor()

# Create the RandomizedSearchCV object
#random_search = RandomizedSearchCV(xgb_model, param_distributions=param_dist, n_iter=20, cv=3, scoring='neg_mean_absolute_error')

# Fit the RandomizedSearchCV object to the training data
#random_search.fit(X_train, y_train)

# Print the best set of hyperparameters and the corresponding score
#print("Best set of hyperparameters: ", random_search.best_params_)
#print("Best score: ", random_search.best_score_)

In [16]:
# Fixed XGBoost configuration: histogram tree builder with an absolute-error
# objective, plus the regularisation and early-stopping settings below.
params = dict(
    tree_method="hist",
    objective='reg:absoluteerror',
    random_state=0,
    colsample_bytree=0.7,
    learning_rate=0.07,
    max_depth=6,
    n_estimators=450,
    reg_alpha=0.025,
    reg_lambda=1.75,
    min_child_weight=1000,
    early_stopping_rounds=100,
)

# Start from a default regressor and apply the configuration in one call.
model_xgb = xgb.XGBRegressor()
model_xgb.set_params(**params)

In [17]:
# Fit on the training split, passing (X_test, y_test) as the evaluation set;
# verbose=1 prints the evaluation metric as training proceeds.
model_xgb.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=1)

[0]	validation_0-mae:6.30895
[1]	validation_0-mae:6.30153
[2]	validation_0-mae:6.29628
[3]	validation_0-mae:6.29230
[4]	validation_0-mae:6.28815
[5]	validation_0-mae:6.28448
[6]	validation_0-mae:6.28356
[7]	validation_0-mae:6.28223
[8]	validation_0-mae:6.28164
[9]	validation_0-mae:6.27998
[10]	validation_0-mae:6.27957
[11]	validation_0-mae:6.27893
[12]	validation_0-mae:6.27833
[13]	validation_0-mae:6.27762
[14]	validation_0-mae:6.27687
[15]	validation_0-mae:6.27593
[16]	validation_0-mae:6.27582
[17]	validation_0-mae:6.27507
[18]	validation_0-mae:6.27462
[19]	validation_0-mae:6.27450
[20]	validation_0-mae:6.27434
[21]	validation_0-mae:6.27421
[22]	validation_0-mae:6.27406
[23]	validation_0-mae:6.27398
[24]	validation_0-mae:6.27222
[25]	validation_0-mae:6.27204
[26]	validation_0-mae:6.27184
[27]	validation_0-mae:6.27123
[28]	validation_0-mae:6.27114
[29]	validation_0-mae:6.27083
[30]	validation_0-mae:6.27052
[31]	validation_0-mae:6.27020
[32]	validation_0-mae:6.27008
[33]	validation_0-ma

In [None]:
# NOTE(review): predictions are made on the training split (X_train), not on
# X_test -- confirm this is intended.
y_pred = model_xgb.predict(X_train)

In [None]:
from lightgbm import log_evaluation, early_stopping, LGBMRegressor as LGBMR

# LightGBM configuration: gradient-boosted trees with an L1 objective.
# NOTE(review): per the LightGBM parameter docs, `subsample` only takes effect
# when `subsample_freq` > 0 -- confirm whether bagging was intended.
params = {'device'            : "cpu",
        'objective'         : 'regression_l1',
        'boosting_type'     : 'gbdt',
        'random_state'      : 0,
        'colsample_bytree'  : 0.7,
        'subsample'         : 0.65,
        'learning_rate'     : 0.065,
        'max_depth'         : 6,
        'n_estimators'      : 400,
        'num_leaves'        : 150,
        'reg_alpha'         : 0.01,
        'reg_lambda'        : 3.25,
        'verbose'           : -1,
        }

# Apply the configuration. Previously `params` was built but never handed to
# the estimator, so the model was trained with library defaults.
model_lgbm = lgb.LGBMRegressor()
model_lgbm.set_params(**params)

# Logging and early stopping are controlled through callbacks, so the extra
# `verbose` keyword to fit() is dropped (newer LightGBM releases reject it).
model_lgbm.fit(X_train, y_train,
               eval_set=[(X_test, y_test)],
               eval_metric="mae",
               callbacks=[log_evaluation(0),
                          early_stopping(100, verbose=False)],
               )

# NOTE(review): predictions are made on the training split (X_train), not on
# X_test -- confirm this is intended.
y_pred = model_lgbm.predict(X_train)