# **DATA3888 Project: Optiver**

In [None]:
import os
import pandas as pd
import numpy as np
import dask.dataframe as dd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from statsmodels.tsa.stattools import acf, pacf

In [3]:
def load_data(directory: str) -> pd.DataFrame:

    all_files = []
    for root, _, files in os.walk(directory):
        for file in files:
            if file.endswith(".csv"):
                all_files.append(os.path.join(root, file))

    if not all_files:
        raise FileNotFoundError("No CSV files found in the given directory.")

    df = dd.read_csv(all_files)
    return df.compute()

data_path = "./Data/individual_book_train"
df = load_data(data_path)

## **Data Exploration**

In [None]:
df.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,5,0,1.000129,1.000386,0.999871,1.000643,302,615,500,400,13
1,5,1,1.000129,1.000386,0.999871,1.000643,602,515,400,500,13
2,5,2,1.000129,1.000386,0.999871,1.000643,502,515,400,500,13
3,5,3,1.000129,1.000386,0.999871,1.000643,502,515,400,500,13
4,5,4,1.000129,1.000386,0.999871,1.000643,502,515,400,600,13


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167253289 entries, 0 to 962099
Data columns (total 11 columns):
 #   Column             Dtype  
---  ------             -----  
 0   time_id            int64  
 1   seconds_in_bucket  int64  
 2   bid_price1         float64
 3   ask_price1         float64
 4   bid_price2         float64
 5   ask_price2         float64
 6   bid_size1          int64  
 7   ask_size1          int64  
 8   bid_size2          int64  
 9   ask_size2          int64  
 10  stock_id           int64  
dtypes: float64(4), int64(7)
memory usage: 15.0 GB


In [None]:
df.describe()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
count,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0
mean,16022.37,296.9969,0.9997121,1.000283,0.9995184,1.000479,928.5549,923.3744,1181.631,1146.534,62.71922
std,9370.937,173.4195,0.003811545,0.003810885,0.003821979,0.00382081,5782.958,5263.738,7168.244,6121.242,36.92018
min,5.0,0.0,0.8807735,0.8876458,0.8806137,0.8898833,1.0,1.0,1.0,1.0,0.0
25%,7837.0,146.0,0.9984497,0.9989405,0.9982569,0.9991112,100.0,100.0,100.0,100.0,32.0
50%,15845.0,296.0,0.9998062,1.000211,0.9996398,1.00038,161.0,161.0,159.0,161.0,62.0
75%,23958.0,447.0,1.001055,1.001535,1.000888,1.001728,400.0,397.0,500.0,500.0,95.0
max,32767.0,599.0,1.125048,1.12715,1.12457,1.127245,1051433.0,646294.0,980137.0,850139.0,126.0


In [None]:
df.isnull().sum()

time_id              0
seconds_in_bucket    0
bid_price1           0
ask_price1           0
bid_price2           0
ask_price2           0
bid_size1            0
ask_size1            0
bid_size2            0
ask_size2            0
stock_id             0
dtype: int64

## **Feature Engineering**

In [None]:
def bipower_variation(returns: pd.Series, window: int = 20) -> pd.Series:
    """
    Compute rolling bipower variation of a returns series.
    """
    abs_r = returns.abs()
    product_series = abs_r * abs_r.shift(1)        # |r_t| * |r_(t-1)|
    rolling_sum = product_series.rolling(window=window).sum()
    bv = (np.pi / 2.0) * rolling_sum
    return bv

def rolling_integrated_variance(returns: pd.Series, window: int = 20) -> pd.Series:
    """
    Compute rolling integrated variance as sum of squared returns over a window.
    """
    sq_returns = returns**2
    riv = sq_returns.rolling(window=window).sum()
    return riv

def rolling_acf_pacf(series: pd.Series, window: int = 100, max_lag: int = 5) -> pd.DataFrame:
    """
    Compute rolling ACF/PACF for 'max_lag' lags on a rolling window of 'window' length.
    """
    results = {f'acf_lag{k}': [] for k in range(1, max_lag + 1)}
    for k in range(1, max_lag + 1):
        results[f'pacf_lag{k}'] = []

    for i in range(len(series)):
        if i < window:
            # Not enough data in the window
            for k in range(1, max_lag + 1):
                results[f'acf_lag{k}'].append(np.nan)
                results[f'pacf_lag{k}'].append(np.nan)
        else:
            window_data = series.iloc[i - window + 1 : i + 1].dropna()
            if len(window_data) < 2:
                for k in range(1, max_lag + 1):
                    results[f'acf_lag{k}'].append(np.nan)
                    results[f'pacf_lag{k}'].append(np.nan)
                continue

            acf_vals = acf(window_data, nlags=max_lag, fft=False)
            pacf_vals = pacf(window_data, nlags=max_lag)

            for k in range(1, max_lag + 1):
                results[f'acf_lag{k}'].append(acf_vals[k])
                results[f'pacf_lag{k}'].append(pacf_vals[k])

    return pd.DataFrame(results, index=series.index)

def rsi(series: pd.Series, window: int = 14) -> pd.Series:
    """
    Compute RSI (Relative Strength Index).
    """
    delta = series.diff()
    gain = delta.clip(lower=0)
    loss = -1 * delta.clip(upper=0)
    avg_gain = gain.rolling(window=window).mean()
    avg_loss = loss.rolling(window=window).mean()

    rs = avg_gain / (avg_loss + 1e-9)
    rsi_ = 100 - (100 / (1 + rs))
    return rsi_

def bollinger_bands(series: pd.Series, window: int = 20, num_std: float = 2.0):
    """
    Compute Bollinger Bands: middle (SMA), upper, and lower.
    """
    sma = series.rolling(window=window).mean()
    std = series.rolling(window=window).std(ddof=0)
    upper = sma + num_std * std
    lower = sma - num_std * std
    return sma, upper, lower

In [None]:
class FeatureEngineer(BaseEstimator, TransformerMixin):
    def __init__(self, 
                 window_size=20, 
                 delta=5,
                 acf_pacf_window=100,
                 acf_pacf_max_lag=5):
        """
        window_size: default rolling window for features like realized volatility,
                     bipower variation, RSI, Bollinger, etc.
        delta: lookahead for future midpoint and adverse cost calculations
        acf_pacf_window: rolling window size for ACF/PACF
        acf_pacf_max_lag: maximum lag for ACF/PACF
        """
        self.window_size = window_size
        self.delta = delta
        self.acf_pacf_window = acf_pacf_window
        self.acf_pacf_max_lag = acf_pacf_max_lag

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        df = X.copy()

        # Midpoints
        df["midpoint1"] = (df["ask_price1"] + df["bid_price1"]) / 2.0
        df["midpoint2"] = (df["ask_price2"] + df["bid_price2"]) / 2.0

        # Bid-Ask spreads
        df["bid_ask_spread1"] = df["ask_price1"] - df["bid_price1"]
        df["bid_ask_spread2"] = df["ask_price2"] - df["bid_price2"]

        # Log return
        df["log_return"] = np.log(df["midpoint1"] / df["midpoint1"].shift(1))

        # Realized volatility (rolling)
        df["realized_volatility"] = df["log_return"].rolling(window=self.window_size).apply(
            lambda x: np.sqrt(np.sum(x ** 2)), raw=True
        )

        # Integrated variance & volatility (cumulative)
        df["integrated_variance"] = (df["log_return"] ** 2).cumsum()
        df["integrated_volatility"] = np.sqrt(df["integrated_variance"])

        # Normalized spreads, order book imbalance
        df["normalized_spread1"] = df["bid_ask_spread1"] / df["midpoint1"]
        df["normalized_spread2"] = df["bid_ask_spread2"] / df["midpoint2"]
        df["order_book_imbalance1"] = df["bid_size1"] / (df["ask_size1"] + df["bid_size1"])
        df["order_book_imbalance2"] = df["bid_size2"] / (df["ask_size2"] + df["bid_size2"])
        df["OBI_L2"] = (df["bid_size1"] + df["bid_size2"]) / (
            df["ask_size1"] + df["bid_size1"] + df["ask_size2"] + df["bid_size2"]
        )

        # Future midpoint & adverse costs
        df["future_midpoint"] = df["midpoint1"].shift(-self.delta)
        df["adverse_cost_buy"] = df["future_midpoint"] - df["ask_price1"]
        df["adverse_cost_sell"] = df["bid_price1"] - df["future_midpoint"]

        # LOB entropy
        lob_columns = ["bid_size1", "bid_size2", "ask_size1", "ask_size2"]
        volumes = df[lob_columns].values.astype(float)
        total_volume = volumes.sum(axis=1, keepdims=True)
        with np.errstate(divide="ignore", invalid="ignore"):
            p = np.divide(volumes, total_volume, where=total_volume != 0)
        p = np.where(total_volume == 0, 0, p)
        entropy = -np.nansum(np.where(p > 0, p * np.log(p), 0), axis=1)
        df["LOB_entropy"] = entropy
        df["LOB_entropy_normalized"] = entropy / np.log(4)

        # Microprice & LOB slopes
        df["microprice"] = (
            (df["ask_price1"] * df["bid_size1"] + df["bid_price1"] * df["ask_size1"])
            / (df["bid_size1"] + df["ask_size1"])
        )
        df["lob_slope"] = (
            ((df["ask_price2"] - df["ask_price1"]) / (df["ask_size2"] + 1e-9))
            - ((df["bid_price1"] - df["bid_price2"]) / (df["bid_size2"] + 1e-9))
        )
        df["lob_slope_top2"] = (
            ((df["ask_price2"] - df["ask_price1"]) + (df["bid_price1"] - df["bid_price2"]))
            / ((df["ask_size2"] + df["ask_size1"]) + (df["bid_size1"] + df["bid_size2"]))
        )

        df["trade_sign"] = np.sign(df["log_return"])
        df["jump_in_quotes"] = df["ask_price1"].diff().abs() + df["bid_price1"].diff().abs()

        # Bipower variation
        df["bipower_var"] = bipower_variation(df["log_return"], window=self.window_size)

        # Rolling integrated variance
        df["rolling_integrated_variance"] = rolling_integrated_variance(df["log_return"], window=self.window_size)
        df["rolling_integrated_vol"] = np.sqrt(df["rolling_integrated_variance"])

        # Rolling ACF/PACF
        df_acf_pacf = rolling_acf_pacf(
            df["log_return"],
            window=self.acf_pacf_window,
            max_lag=self.acf_pacf_max_lag
        )
        df = df.join(df_acf_pacf)

        # RSI (using midpoint1)
        df["rsi_14"] = rsi(df["midpoint1"], window=self.window_size)

        # Bollinger Bands (using midpoint1)
        sma, upper, lower = bollinger_bands(df["midpoint1"], window=self.window_size, num_std=2.0)
        df["bb_mid"] = sma
        df["bb_upper"] = upper
        df["bb_lower"] = lower

        df = df.dropna()

        return df

feature_pipeline = Pipeline([
    ('feature_engineering', FeatureEngineer(
        window_size=20,      
        delta=5, 
        acf_pacf_window=100,
        acf_pacf_max_lag=5
    ))
])

In [None]:
df = feature_pipeline.fit_transform(df)
df.info()

## **Plotting**