In [1]:
import numpy as np
import pandas as pd

In [2]:
def b_i(row):
    if row['imbalance_buy_sell_flag'] == 0:
        buy_interest = row['matched_size']
    elif row['imbalance_buy_sell_flag'] > 0:
        buy_interest = row['matched_size'] + row['imbalance_size']
    else:
        buy_interest = row['matched_size']
    return buy_interest

def s_i(row):
    if row['imbalance_buy_sell_flag'] == 0:
        sell_interest = row['matched_size']
    elif row['imbalance_buy_sell_flag'] > 0:
        sell_interest = row['matched_size']
    else:
        sell_interest = row['matched_size'] + row['imbalance_size']
    return sell_interest

In [3]:
def generate_features(df, train_flag):
    
    df_out = df.drop(['far_price', 'near_price', 'row_id'], axis = 1)
    
    df_out['buy_interest'] = df_out.apply(b_i, axis = 1)
    df_out['sell_interest'] = df_out.apply(s_i, axis = 1)

    features = ['stock_id', 'date_id', 'seconds_in_bucket', 'imbalance_buy_sell_flag', 'imbalance_size', 'matched_size', 
                'bid_size', 'ask_size', 'reference_price', 'ask_price', 'bid_price', 'wap',
                'imb_s1', 'imb_s2', 'imb_s3', 'buy_interest', 'sell_interest'
               ]
    
    df_out['imb_s1'] = df_out.eval('(ask_size - bid_size)/(ask_size + bid_size)')
    df_out['imb_s2'] = df_out.eval('(imbalance_size - matched_size)/(matched_size + imbalance_size)')
    df_out['imb_s3'] = np.log(df_out.ask_size) - np.log(df_out.bid_size)
    
    prices = ['reference_price', 'ask_price', 'bid_price', 'wap']
    
    for i, a in enumerate(prices):
        for j, b in enumerate(prices):
            if i > j:
                df_out[f'{a}_{b}_imb'] = df_out.eval(f'({a} - {b})/({a} + {b})')
                features.append(f'{a}_{b}_imb')    
                    
    for i, a in enumerate(prices):
        for j, b in enumerate(prices):
            for k, c in enumerate(prices):
                if i > j and j > k:
                    max_ = df_out[[a,b,c]].max(axis = 1)
                    min_ = df_out[[a,b,c]].min(axis = 1)
                    mid_ = df_out[[a,b,c]].sum(axis = 1) - min_ - max_

                    df_out[f'{a}_{b}_{c}_imb2'] = (max_ - mid_)/(mid_ - min_)
                    features.append(f'{a}_{b}_{c}_imb2')
    
    if train_flag:
        df_out = df_out.dropna()
        df_out.replace([np.inf, -np.inf], 0, inplace = True)
    else:
        df_out.fillna(0, inplace = True)
        df_out.replace([np.inf, -np.inf], 0, inplace = True)
        
    if train_flag:
        return df_out[features], df_out['target']
    else:
        return df_out[features], None

In [4]:
%%time

train_df = pd.read_csv('/kaggle/input/optiver-trading-at-the-close/train.csv')
X, y = generate_features(train_df, True)

  result = getattr(ufunc, method)(*inputs, **kwargs)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out.replace([np.inf, -np.inf], 0, inplace = True)


CPU times: user 4min 31s, sys: 6.65 s, total: 4min 37s
Wall time: 4min 41s


In [5]:
import xgboost as xgb
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

xgb.set_config(verbosity = 0)

In [6]:
best_params = {'n_estimators': 4267, 
               'learning_rate': 0.16987645395358383, 
               'max_depth': 30, 
               'min_child_weight': 165, 
               'gamma': 0.0008669890550996714, 
               'alpha': 1.3874741641288368, 
               'lambda': 0.06464543052706935, 
               'colsample_bytree': 0.781499920579469, 
               'subsample': 0.7322463216704231
              }

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.4, random_state = 1977)
dtrain = xgb.DMatrix(data = X_train, label = y_train)
dval = xgb.DMatrix(data = X_val, label = y_val)

regressor = xgb.train(best_params, dtrain)
y_pred = regressor.predict(dval)
mae = mean_absolute_error(y_val, y_pred)

print("Mean Absolute Error: {}".format(mae))

Mean Absolute Error: 6.264716025562697


In [8]:
### Test dataset:
test_df = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/example_test_files/test.csv")
test_targets = pd.read_csv("/kaggle/input/optiver-trading-at-the-close/example_test_files/revealed_targets.csv")

In [9]:
df = pd.merge(test_df, test_targets[['date_id', 'seconds_in_bucket', 'stock_id', 'revealed_target']], how = 'outer', on = ['date_id', 'seconds_in_bucket', 'stock_id'])
df.rename(index = str, columns = {'revealed_target': 'target'}, inplace = True)

In [10]:
X_test, y_test = generate_features(df, True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_out.replace([np.inf, -np.inf], 0, inplace = True)


In [11]:
dtest = xgb.DMatrix(data = X_test, label = y_test)

y_pred = regressor.predict(dtest)
mae_test = mean_absolute_error(y_test, y_pred)

print("Mean Absolute Error: {}".format(mae_test))

Mean Absolute Error: 5.492949694380493


In [12]:
import optiver2023
env = optiver2023.make_env()
iter_test = env.iter_test()

In [13]:
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    X_test, _ = generate_features(test, False)
    dtest = xgb.DMatrix(data = X_test)
    pred = regressor.predict(dtest)
    sample_prediction['target'] = pred
    env.predict(sample_prediction)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
