In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
import lightgbm as lgb
import xgboost as xgb
import catboost as cbt
import joblib
import re


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Echo the full path of every file found under the read-only Kaggle input tree.
input_file_paths = (
    os.path.join(folder, entry)
    for folder, _subfolders, entries in os.walk('/kaggle/input')
    for entry in entries
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

%load_ext autoreload
%autoreload 2

In [2]:
# Load the training set; the relative path assumes train.csv sits in the working directory.
train = pd.read_csv("train.csv")

# Features:

## Volume and Duration
1. Breadth (may not be relevant) - number of transactions in an interval
2. VolumeAll: the total number of shares transacted in the interval
3. VolumeAvg: the avg number of shares transacted in the interval
4. VolumeMax: maximum num of shares transacted in one transaction in the interval.

## Return and Imbalance:
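1. Lambda: the bid–ask price difference per unit of volume, computed in `make_data` as `(bid_price - ask_price) / VolumeAll`, where `VolumeAll = bid_size + ask_size + matched_size + imbalance_size`.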

## Exploratory Data Analysis
The training data contain 481 days, covering the period of 3:50pm to 3:59pm at the Nasdaq

Note: the reference price is available for every second, but the far price is only available for seconds_in_bucket 300-540,
in other words for 3:55-3:59 pm.

In [3]:
from classifytrades import TradeClassification 
import datetime

def lee_ready(data):
    """Attach trade-initiator labels from the `lee_ready` classifier to `data`.

    A working copy of the input is reshaped into the layout handed to
    `TradeClassification`: a trades frame (with `reference_price` exposed as
    `price` and a summed `vol` column) plus separate ask and bid quote frames.
    The classifier is run with ``method='lee_ready'`` and its `Initiator` and
    `midpoint` columns are left-joined back onto the untouched input.

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide `bid_size`, `ask_size`, `matched_size`, `imbalance_size`,
        `reference_price`, `ask_price`, `bid_price`, `seconds_in_bucket`,
        `date_id` and `stock_id`.

    Returns
    -------
    pandas.DataFrame
        `data` left-merged on (`date_id`, `stock_id`, `seconds_in_bucket`)
        with the classifier's `Initiator` and `midpoint` columns. The input
        frame itself is not modified.
    """
    join_keys = ["date_id", "stock_id", "seconds_in_bucket"]
    trade_cols = ["vol", "seconds_in_bucket", "price", "date_id", "stock_id", "ask_price", "bid_price", "ask_size", "bid_size"]

    # `vol` sums both book sides with the matched and imbalance sizes.
    total_size = data.eval('bid_size + ask_size + matched_size + imbalance_size')
    book = data.assign(vol=total_size).rename(columns={"reference_price": "price"})

    trades = book[trade_cols].copy()
    # The row index doubles as the timestamp; no wall-clock offset is applied.
    trades['time'] = trades.index

    # Quote frames reuse the trade frame's `price` / `vol` column names.
    ask_quotes = trades[['ask_price', 'ask_size', 'time']].rename(
        columns={"ask_price": "price", "ask_size": "vol"}
    )
    bid_quotes = trades[['bid_price', 'bid_size', 'time']].rename(
        columns={"bid_price": "price", "bid_size": "vol"}
    )

    classifier = TradeClassification(trades, Ask=ask_quotes, Bid=bid_quotes)
    classifier.classify(method='lee_ready', freq=1, reduce_precision=False)

    labels = classifier.df_tr[join_keys + ["Initiator", "midpoint"]]
    return data.merge(labels, on=join_keys, how="left")

Here we can see that some stocks don't have an auction.

In [4]:

def make_data(df, training=False, full_data=False):
    """Engineer the model features from a raw order-book frame.

    The frame is first passed through `lee_ready` (which adds `Initiator` and
    `midpoint`), then the derived columns below are appended.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw rows with `stock_id`, `date_id`, `wap`, `bid_size`, `ask_size`,
        `bid_price`, `ask_price`, `matched_size`, `imbalance_size` and
        `imbalance_buy_sell_flag` (plus `target` when `training=True`).
    training : bool, default False
        When true, return ``(X, y)`` with every row that contains a
        non-finite feature removed from both arrays.
    full_data : bool, default False
        When true, return the whole engineered DataFrame (NaNs untouched)
        instead of a feature matrix. Takes precedence over `training`.

    Returns
    -------
    pandas.DataFrame | tuple[numpy.ndarray, numpy.ndarray] | numpy.ndarray
        * `full_data=True`: the engineered frame.
        * `training=True`: ``(X, y)`` restricted to fully finite rows.
        * otherwise: `X` with exactly one row per input row, in which NaN
          and +/-inf are replaced by 0 so that every row can be scored
          (estimators such as scikit-learn's reject non-finite input).
    """
    data = lee_ready(df)

    # Imbalance size signed by its buy/sell direction flag.
    data["imbalance_buy_sell"] = data.imbalance_size * data.imbalance_buy_sell_flag

    # Previous WAP within the same (date, stock) series. The first row of each
    # group has no predecessor, so its PastReturn is NaN until the fill below.
    # NOTE(review): if a frame holds a single time step per stock (as the test
    # batches presumably do), PastReturn is 0 for every row - confirm intended.
    data["prev_wap"] = data.groupby(["date_id", "stock_id"]).wap.shift(1)
    data["PastReturn"] = 1 - data.eval('wap /prev_wap')

    # Division by a zero matched_size (or zero sizes below) yields +/-inf.
    data["matched_ratio"] = data.imbalance_buy_sell_flag*data.imbalance_size/data.matched_size

    data["imb_s1"] = data.eval('(bid_size - ask_size)/(bid_size + ask_size)')
    data["imb_s2"] = data.eval('(imbalance_size - matched_size)/(matched_size + imbalance_size)')

    data["VolumeAll"] = data.eval('bid_size + ask_size + matched_size + imbalance_size') # explore

    data["lambda"] = data.eval('(bid_price - ask_price)/VolumeAll')

    data["LobImbalance"] = data.eval('(ask_size - bid_size)/VolumeAll')
    data["TxnImbalance"] = data.eval('VolumeAll*Initiator')

    if full_data:
        return data

    data = data.fillna(0)
    features = ["PastReturn", "lambda", "LobImbalance", "matched_ratio", "imb_s1", "imb_s2", "imbalance_buy_sell_flag"]
    X = data[features].to_numpy()

    if training:
        y = data.target.to_numpy()
        # fillna(0) does not touch +/-inf, so drop those rows from X and y together.
        finite_rows = np.isfinite(X).all(axis=1)
        return X[finite_rows], y[finite_rows]

    # Inference needs one prediction per input row, so rows cannot be dropped
    # here; neutralise +/-inf the same way NaNs were neutralised above.
    return np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)


# Hyperparameter Tuning
We split these 480 days into 5 folds and perform 5-fold cross-validation on them.

In [5]:
import getpass
X, y = make_data(train, training=True)


# Set to True to re-run the hyperparameter search; otherwise the saved trial CSVs are reused.
TUNING = False

from tune import TuningSession

# Known per-user locations of the hyperparameter search-space file.
# Set the HYPERPARAM_DISTS_PATH environment variable to use another location.
HYPERPARAM_DISTS_PATHS = {
    "vinicius": '/Users/vinicius/Projects/kaggle/kaggle-optiver/kaggle_optiver/hyperparameters.yaml',
    "ephraimsutherland": '/home/ephraimsutherland/Documents/kaggle-optiver/kaggle_optiver/hyperparameters.yaml',
}

if TUNING:
    current_user = getpass.getuser()
    hyperparam_dists_path = os.environ.get("HYPERPARAM_DISTS_PATH") or HYPERPARAM_DISTS_PATHS.get(current_user)
    if hyperparam_dists_path is None:
        # Previously this surfaced as a NameError on the TuningSession line.
        raise RuntimeError(
            f"No hyperparameters.yaml location is known for user {current_user!r}; "
            "set the HYPERPARAM_DISTS_PATH environment variable."
        )

    ts = TuningSession(hyperparam_dists_path=hyperparam_dists_path)
    ts.run(data=X, labels=y)
    ts.trials_dict['LGBMRegressor'].to_csv("lgbm_trials.csv")
    ts.trials_dict['RFRegressor'].to_csv("rf_trials.csv")

# Always load the trials from disk, so the following cells receive the same
# frames (including the saved index column) whether or not tuning just ran.
trials_LGBM = pd.read_csv("lgbm_trials.csv")
trials_RF = pd.read_csv("rf_trials.csv")


  s = tick_rule(self.df_tr.price.values.astype(int), prices, index_p)
  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# The trial CSVs were saved with their index, which read_csv returns as "Unnamed: 0".
# Re-bind rather than mutate in place, and tolerate a missing column, so this cell can be re-run safely.
trials_RF = trials_RF.drop(columns=["Unnamed: 0"], errors="ignore")
trials_LGBM = trials_LGBM.drop(columns=["Unnamed: 0"], errors="ignore")

In [7]:
percentage_of_trials=0.10
# Keep the lowest-`value` fraction of the trials (presumably the tuning objective is minimised - TODO confirm),
# but always at least one so the ensemble is not silently left without LGBM models when there are few trials.
n_best_LGBM = max(1, int(len(trials_LGBM)*percentage_of_trials))
best_trials_LGBM = (
    trials_LGBM
    .sort_values('value')
    .head(n_best_LGBM)
    # Built as a new frame: dropping in place on a slice triggers SettingWithCopyWarning.
    .drop(columns=["number","value","datetime_start", "datetime_complete","duration", "state"])
    # Strip only the leading "params_" so the remaining columns are plain keyword names.
    .rename(columns=lambda name: re.sub('^params_', '', name))
)


In [8]:
# Keep the three lowest-`value` random-forest trials (presumably the tuning objective is minimised - TODO confirm).
best_trials_RF = (
    trials_RF
    .sort_values('value')
    .head(3)
    # Built as a new frame: dropping in place on a slice triggers SettingWithCopyWarning.
    .drop(columns=["number","value","datetime_start", "datetime_complete","duration", "state"])
    # Strip only the leading "params_" so the remaining columns are plain keyword names.
    .rename(columns=lambda name: re.sub('^params_', '', name))
)

In [9]:
# Collapse trials that share an identical hyperparameter row so each configuration appears once.
best_trials_LGBM = best_trials_LGBM.drop_duplicates()
best_trials_RF = best_trials_RF.drop_duplicates()

In [10]:
def models(model, i, training=False):
    """Fit and persist a model, or load a previously persisted one.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)``. Ignored when `training` is false.
    i : str or int
        Identifier inserted into the file name ``./models/model_{i}.model``.
    training : bool, default False
        When true, fit `model` on the notebook-level `X` and `y` and save it;
        otherwise load the saved model for `i` from disk.

    Returns
    -------
    The freshly fitted `model`, or the object loaded from disk.

    Notes
    -----
    Training reads the global `X` and `y`, so they must still hold the
    training arrays when this is called.
    """
    model_path = f'./models/model_{i}.model'

    if training:
        model.fit(X,y)
        # joblib.dump does not create missing parent directories.
        os.makedirs(os.path.dirname(model_path), exist_ok=True)
        joblib.dump(model, model_path)
        return model

    # NOTE: joblib.load unpickles the file - only load model files you trust.
    return joblib.load(model_path)



In [11]:
# Linear baseline: fitted here, but not added to the ensemble below.
model1 = LinearRegression()

model1.fit(X,y)
model_list = []

# to_dict(orient="records") yields one {param: value} dict per trial and keeps
# each column's own type. A row taken with .iloc[i] is a single Series, which can
# upcast integer hyperparameters to float when every column is numeric.
for i, vals in enumerate(best_trials_LGBM.to_dict(orient="records")):
    print("LGBM", i)

    model = lgb.LGBMRegressor(objective='regression_l1', **vals)
    # training=False: the estimator built above is discarded in favour of the saved one.
    model_fit = models(model, "LGBM"+str(i), training=False)
    model_list.append(model_fit)


for i, vals in enumerate(best_trials_RF.to_dict(orient="records")):
    print("RF", i)

    model = RandomForestRegressor(**vals)
    model_fit = models(model, "RF"+str(i), training=False)
    model_list.append(model_fit)


LGBM 0
LGBM 1
LGBM 2
LGBM 3
LGBM 4
RF 0
RF 1
RF 2


In [12]:
# Competition API module used to stream the test set.
import optiver2023

# Create the API environment and the iterator that yields
# (test, revealed_targets, sample_prediction) batches to the prediction loop.
env = optiver2023.make_env()
iter_test = env.iter_test()


In [13]:
counter = 0
for (test, revealed_targets, sample_prediction) in iter_test:
    # Keep the batch features under their own name: rebinding the global `X` here
    # would replace the training matrix that `models(..., training=True)` and
    # `model1.fit(X, y)` read if those cells are run again.
    X_test = make_data(test)
    # Ensemble prediction: element-wise mean over every model in model_list.
    sample_prediction['target'] = np.mean([model.predict(X_test) for model in model_list], axis=0)
    env.predict(sample_prediction)
    counter += 1

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.


In [14]:
!tail submission.csv
# sub = pd.read_csv("./example_test_files/sample_submission.csv")
# Predictions file (presumably written by env.predict in the loop above - TODO confirm).
sub = pd.read_csv("submission.csv")

# Example test rows; note this rebinds `test`, the loop variable of the prediction loop.
test = pd.read_csv("./example_test_files/test.csv")

# Revealed targets shipped with the example test files.
revealed = pd.read_csv("./example_test_files/revealed_targets.csv")

480_540_190,0.669063343502831
480_540_191,-0.10323479642061406
480_540_192,-0.4726706178442343
480_540_193,-0.07967248250678398
480_540_194,0.5186614778554246
480_540_195,0.41593818706091495
480_540_196,-0.3835122479397541
480_540_197,0.9056433419265066
480_540_198,0.5690122873193335
480_540_199,-0.00045112135934086833


In [15]:
# Drop rows whose target is the literal string "target" (presumably header lines
# repeated inside submission.csv - TODO confirm), then cast the column to float.
# assign() builds a new frame; setting a column on the filtered slice instead
# triggers pandas' SettingWithCopyWarning.
sub = sub[sub.target != "target"].assign(target=lambda rows: rows.target.astype(float))

In [16]:
# Right join keeps every submitted row. Both frames carry a `target` column, so pandas
# suffixes them: target_x comes from train (actual), target_y from sub (predicted).
ans = train[["row_id", "target"]].merge(sub, on="row_id", how="right")

In [17]:
# Mean absolute error between the actual (target_x) and predicted (target_y) values.
abs_error = (ans.target_x - ans.target_y).abs()
mae = abs_error.mean()
mae

5.267661304787218

Original Naive Estimate: 5.27941

With bid-ask size imbalance: 5.279170945560908

As above but adding imbalance-matched difference ratio: 5.278630645573058


Same as above but using lgb: 5.278595988137708

Now using LGBM and RF: 5.273744433871616

Adding the LR initiator flag: 5.268223119261944
