# **DATA3888 Project: Optiver**

In [None]:
import os
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

In [3]:
def load_data(directory: str) -> pd.DataFrame:

    all_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".csv"):
                all_files.append(os.path.join(root, file))

    if not all_files:
        raise FileNotFoundError("No CSV files found in the given directory.")

    df = dd.read_csv(all_files)
    return df.compute()

data_path = "./Data/individual_book_train"
df = load_data(data_path)

## **Data Exploration**

In [None]:
df.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,5,0,1.000129,1.000386,0.999871,1.000643,302,615,500,400,13
1,5,1,1.000129,1.000386,0.999871,1.000643,602,515,400,500,13
2,5,2,1.000129,1.000386,0.999871,1.000643,502,515,400,500,13
3,5,3,1.000129,1.000386,0.999871,1.000643,502,515,400,500,13
4,5,4,1.000129,1.000386,0.999871,1.000643,502,515,400,600,13


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167253289 entries, 0 to 962099
Data columns (total 11 columns):
 #   Column             Dtype  
---  ------             -----  
 0   time_id            int64  
 1   seconds_in_bucket  int64  
 2   bid_price1         float64
 3   ask_price1         float64
 4   bid_price2         float64
 5   ask_price2         float64
 6   bid_size1          int64  
 7   ask_size1          int64  
 8   bid_size2          int64  
 9   ask_size2          int64  
 10  stock_id           int64  
dtypes: float64(4), int64(7)
memory usage: 15.0 GB


In [None]:
df.describe()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
count,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0
mean,16022.37,296.9969,0.9997121,1.000283,0.9995184,1.000479,928.5549,923.3744,1181.631,1146.534,62.71922
std,9370.937,173.4195,0.003811545,0.003810885,0.003821979,0.00382081,5782.958,5263.738,7168.244,6121.242,36.92018
min,5.0,0.0,0.8807735,0.8876458,0.8806137,0.8898833,1.0,1.0,1.0,1.0,0.0
25%,7837.0,146.0,0.9984497,0.9989405,0.9982569,0.9991112,100.0,100.0,100.0,100.0,32.0
50%,15845.0,296.0,0.9998062,1.000211,0.9996398,1.00038,161.0,161.0,159.0,161.0,62.0
75%,23958.0,447.0,1.001055,1.001535,1.000888,1.001728,400.0,397.0,500.0,500.0,95.0
max,32767.0,599.0,1.125048,1.12715,1.12457,1.127245,1051433.0,646294.0,980137.0,850139.0,126.0


In [None]:
df.isnull().sum()

time_id              0
seconds_in_bucket    0
bid_price1           0
ask_price1           0
bid_price2           0
ask_price2           0
bid_size1            0
ask_size1            0
bid_size2            0
ask_size2            0
stock_id             0
dtype: int64

## **Feature Engineering**

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, window_size=100, delta=5):
        self.window_size = window_size
        self.delta = delta

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Compute midpoints
        df["midpoint1"] = (df["ask_price1"] + df["bid_price1"]) / 2.0
        df["midpoint2"] = (df["ask_price2"] + df["bid_price2"]) / 2.0

        # Compute bid-ask spreads
        df["bid_ask_spread1"] = df["ask_price1"] - df["bid_price1"]
        df["bid_ask_spread2"] = df["ask_price2"] - df["bid_price2"]

        # Log returns
        df["log_return"] = np.log(df["midpoint1"] / df["midpoint1"].shift(1))

        # Realized volatility using rolling window
        df["realized_volatility"] = df["log_return"].rolling(window=self.window_size).apply(
            lambda x: np.sqrt(np.sum(x ** 2)), raw=True
        )

        # Integrated variance and volatility
        df["integrated_variance"] = (df["log_return"] ** 2).cumsum()
        df["integrated_volatility"] = np.sqrt(df["integrated_variance"])

        # Normalized spreads and order book imbalance
        df["normalized_spread1"] = df["bid_ask_spread1"] / df["midpoint1"]
        df["normalized_spread2"] = df["bid_ask_spread2"] / df["midpoint2"]
        df["order_book_imbalance"] = df["bid_ask_spread1"] / df["bid_ask_spread2"]
        df["cumulative_order_book_imbalance"] = df["order_book_imbalance"].cumsum()

        # Future midpoint and adverse costs
        df["future_midpoint"] = df["midpoint1"].shift(-self.delta)
        df["adverse_cost_buy"] = df["future_midpoint"] - df["ask_price1"]
        df["adverse_cost_sell"] = df["bid_price1"] - df["future_midpoint"]

        # Vectorized computation of LOB entropy
        lob_columns = ["bid_size1", "bid_size2", "ask_size1", "ask_size2"]
        volumes = df[lob_columns].values.astype(float)
        total_volume = volumes.sum(axis=1, keepdims=True)
        # Avoid division by zero using np.errstate
        with np.errstate(divide="ignore", invalid="ignore"):
            p = np.divide(volumes, total_volume, where=total_volume != 0)
        # Set probabilities to zero for rows with no volume
        p = np.where(total_volume == 0, 0, p)
        # Compute entropy in a vectorized way; avoid log(0) issues
        entropy = -np.nansum(np.where(p > 0, p * np.log(p), 0), axis=1)
        df["LOB_entropy"] = entropy
        # Normalize by maximum entropy for 4 levels
        df["LOB_entropy_normalized"] = entropy / np.log(4)

        # Additional features: microprice and LOB slopes
        df["microprice"] = (
            (df["ask_price1"] * df["bid_size1"] + df["bid_price1"] * df["ask_size1"])
            / (df["bid_size1"] + df["ask_size1"])
        )
        df["lob_slope"] = (
            ((df["ask_price2"] - df["ask_price1"]) / (df["ask_size2"] + 1e-9))
            - ((df["bid_price1"] - df["bid_price2"]) / (df["bid_size2"] + 1e-9))
        )
        df["lob_slope_top2"] = (
            ((df["ask_price2"] - df["ask_price1"]) + (df["bid_price1"] - df["bid_price2"]))
            / ((df["ask_size2"] + df["ask_size1"]) + (df["bid_size1"] + df["bid_size2"]))
        )

        # Additional computed features: trade sign and jump in quotes
        df["trade_sign"] = np.sign(df["log_return"])
        df["jump_in_quotes"] = df["ask_price1"].diff().abs() + df["bid_price1"].diff().abs()

        return df.dropna()

feature_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineer(window_size=100, delta=5))
])

In [None]:
df = feature_pipeline.fit_transform(df)
df.info()

## **Plotting**