# **DATA3888 Project: Optiver**

In [1]:
import os
import pandas as pd
import dask.dataframe as dd

In [2]:
def load_data(directory: str) -> pd.DataFrame:
    """Load every CSV file found under ``directory`` into one pandas DataFrame.

    The directory tree is walked recursively and every file whose name ends
    with ``.csv`` is collected. The paths are sorted before being handed to
    dask so that the row order of the result does not depend on the order in
    which the operating system happens to list directory entries.

    Parameters
    ----------
    directory : str
        Root directory to search.

    Returns
    -------
    pd.DataFrame
        The materialised result of ``dd.read_csv`` over all collected files.
        NOTE(review): the ``df.info()`` output in this notebook reports an
        index running "0 to 962099" for ~167M rows, i.e. index labels repeat;
        do not rely on the index being unique.

    Raises
    ------
    FileNotFoundError
        If no ``.csv`` file exists under ``directory``.
    """
    all_files = sorted(
        os.path.join(root, file)
        for root, _, files in os.walk(directory)
        for file in files
        if file.endswith(".csv")
    )

    if not all_files:
        raise FileNotFoundError("No CSV files found in the given directory.")

    # read_csv only builds a lazy dask frame; compute() turns it into an
    # in-memory pandas DataFrame.
    df = dd.read_csv(all_files)
    return df.compute()

# Root folder of the order-book CSV files; load_data searches it recursively.
data_path = "./Data/individual_book_train"
# Materialises the whole dataset in memory (df.info() below reports ~15 GB).
df = load_data(data_path)

## **Data Exploration**

In [None]:
# Preview the first rows to check that the expected columns were parsed.
df.head()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
0,5,0,1.000129,1.000386,0.999871,1.000643,302,615,500,400,13
1,5,1,1.000129,1.000386,0.999871,1.000643,602,515,400,500,13
2,5,2,1.000129,1.000386,0.999871,1.000643,502,515,400,500,13
3,5,3,1.000129,1.000386,0.999871,1.000643,502,515,400,500,13
4,5,4,1.000129,1.000386,0.999871,1.000643,502,515,400,600,13


In [None]:
# Row count, per-column dtypes and total memory footprint of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167253289 entries, 0 to 962099
Data columns (total 11 columns):
 #   Column             Dtype  
---  ------             -----  
 0   time_id            int64  
 1   seconds_in_bucket  int64  
 2   bid_price1         float64
 3   ask_price1         float64
 4   bid_price2         float64
 5   ask_price2         float64
 6   bid_size1          int64  
 7   ask_size1          int64  
 8   bid_size2          int64  
 9   ask_size2          int64  
 10  stock_id           int64  
dtypes: float64(4), int64(7)
memory usage: 15.0 GB


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column.
df.describe()

Unnamed: 0,time_id,seconds_in_bucket,bid_price1,ask_price1,bid_price2,ask_price2,bid_size1,ask_size1,bid_size2,ask_size2,stock_id
count,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0,167253300.0
mean,16022.37,296.9969,0.9997121,1.000283,0.9995184,1.000479,928.5549,923.3744,1181.631,1146.534,62.71922
std,9370.937,173.4195,0.003811545,0.003810885,0.003821979,0.00382081,5782.958,5263.738,7168.244,6121.242,36.92018
min,5.0,0.0,0.8807735,0.8876458,0.8806137,0.8898833,1.0,1.0,1.0,1.0,0.0
25%,7837.0,146.0,0.9984497,0.9989405,0.9982569,0.9991112,100.0,100.0,100.0,100.0,32.0
50%,15845.0,296.0,0.9998062,1.000211,0.9996398,1.00038,161.0,161.0,159.0,161.0,62.0
75%,23958.0,447.0,1.001055,1.001535,1.000888,1.001728,400.0,397.0,500.0,500.0,95.0
max,32767.0,599.0,1.125048,1.12715,1.12457,1.127245,1051433.0,646294.0,980137.0,850139.0,126.0


In [None]:
# Count missing values per column.
df.isnull().sum()

time_id              0
seconds_in_bucket    0
bid_price1           0
ask_price1           0
bid_price2           0
ask_price2           0
bid_size1            0
ask_size1            0
bid_size2            0
ask_size2            0
stock_id             0
dtype: int64

## **Feature Engineering**

In [4]:
def create_features(df, window=10):
    """Derive order-book features and per-bucket rolling statistics.

    The input frame is not modified: it is first sorted into a new frame and
    all feature columns are added to that copy.

    Rows are ordered by ``time_id``, then ``stock_id`` and
    ``seconds_in_bucket`` (the latter two only if present). Rolling statistics
    are computed separately for every ``(time_id, stock_id)`` group (or every
    ``time_id`` when there is no ``stock_id`` column), so a window never mixes
    rows from different stocks or different time buckets.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain ``time_id`` and the level-1/level-2 columns
        ``bid_price1``, ``ask_price1``, ``bid_price2``, ``ask_price2``,
        ``bid_size1``, ``ask_size1``, ``bid_size2``, ``ask_size2``.
    window : int, default 10
        Number of consecutive rows in each rolling window.

    Returns
    -------
    pd.DataFrame
        Sorted copy of ``df`` (original index labels preserved) with 20
        additional feature columns. The first ``window - 1`` rows of every
        group have NaN rolling values.

    Raises
    ------
    KeyError
        If ``time_id`` or any required price/size column is missing.
    """
    # Sort first: sort_values returns a new frame, so the caller's data stays
    # untouched and the row-wise features below are only materialised once.
    # mergesort keeps ties in input order when time_id is the only sort key.
    sort_keys = ['time_id'] + [
        col for col in ('stock_id', 'seconds_in_bucket') if col in df.columns
    ]
    df = df.sort_values(sort_keys, kind='mergesort')

    # --- Level 1 and Level 2 Price Features ---
    # Mid prices and spreads
    df['mid_price1'] = (df['bid_price1'] + df['ask_price1']) / 2
    df['spread1'] = df['ask_price1'] - df['bid_price1']
    df['mid_price2'] = (df['bid_price2'] + df['ask_price2']) / 2
    df['spread2'] = df['ask_price2'] - df['bid_price2']

    # Relative spreads (spread as a fraction of mid price)
    df['relative_spread1'] = df['spread1'] / df['mid_price1']
    df['relative_spread2'] = df['spread2'] / df['mid_price2']

    # Price differences between levels
    df['bid_price_diff'] = df['bid_price1'] - df['bid_price2']
    df['ask_price_diff'] = df['ask_price2'] - df['ask_price1']
    df['mid_price_diff'] = df['mid_price1'] - df['mid_price2']

    # --- Order Size Features ---
    # Imbalance at each level (difference over sum)
    df['order_imbalance_level1'] = (df['bid_size1'] - df['ask_size1']) / (df['bid_size1'] + df['ask_size1'])
    df['order_imbalance_level2'] = (df['bid_size2'] - df['ask_size2']) / (df['bid_size2'] + df['ask_size2'])

    # Combined sizes and overall order imbalance
    df['total_bid_size'] = df['bid_size1'] + df['bid_size2']
    df['total_ask_size'] = df['ask_size1'] + df['ask_size2']
    df['total_order_imbalance'] = (df['total_bid_size'] - df['total_ask_size']) / (df['total_bid_size'] + df['total_ask_size'])

    # A weighted mid price for level 1 (using sizes as weights)
    df['weighted_mid_price1'] = (df['bid_price1'] * df['bid_size1'] + df['ask_price1'] * df['ask_size1']) / (df['bid_size1'] + df['ask_size1'])

    # --- Additional Derived Features ---
    # Difference between the spreads of the two levels
    df['spread_diff'] = df['spread1'] - df['spread2']

    # --- Rolling (Temporal) Features ---
    group_keys = ['time_id'] + (['stock_id'] if 'stock_id' in df.columns else [])
    grouped = df.groupby(group_keys, sort=False)

    # The frame is already sorted by the group keys, so the grouped rolling
    # result comes back in exactly the frame's row order. Assigning the raw
    # values positionally avoids index alignment, which would be ambiguous
    # when index labels repeat.
    rolling_stats = {}
    for col in ('mid_price1', 'spread1'):
        windows = grouped[col].rolling(window=window)
        rolling_stats[f'rolling_{col}_mean'] = windows.mean().to_numpy()
        rolling_stats[f'rolling_{col}_std'] = windows.std().to_numpy()

    for name, values in rolling_stats.items():
        df[name] = values

    return df

# Apply the feature engineering function
df_features = create_features(df)
# DataFrame.info() prints its report itself and returns None, so wrapping it
# in print() would only append a stray "None" line to the output.
df_features.info()

<class 'pandas.core.frame.DataFrame'>
Index: 167253289 entries, 0 to 962099
Data columns (total 31 columns):
 #   Column                   Dtype  
---  ------                   -----  
 0   time_id                  int64  
 1   seconds_in_bucket        int64  
 2   bid_price1               float64
 3   ask_price1               float64
 4   bid_price2               float64
 5   ask_price2               float64
 6   bid_size1                int64  
 7   ask_size1                int64  
 8   bid_size2                int64  
 9   ask_size2                int64  
 10  stock_id                 int64  
 11  mid_price1               float64
 12  spread1                  float64
 13  mid_price2               float64
 14  spread2                  float64
 15  relative_spread1         float64
 16  relative_spread2         float64
 17  bid_price_diff           float64
 18  ask_price_diff           float64
 19  mid_price_diff           float64
 20  order_imbalance_level1   float64
 21  order_imbala