
 01_feature_engineering.ipynb
 Market Microstructure + Options-Implied Feature Engineering


 Notebook Goals:
 - Collect raw data using OpenBB
 - Store raw + processed data in ArcticDB
 - Engineer microstructure features
 - Engineer options-implied volatility features
 - Validate feature quality
 - Visualize the data with charts and plots, and save them to the reports folder
 - Save processed data for the next notebook


In [None]:
from datetime import datetime, timedelta
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from arcticdb import Arctic
from openbb import obb

import warnings
warnings.filterwarnings("ignore")


In [None]:
# Open the ArcticDB instance addressed by the LMDB URI below.
arctic = Arctic("lmdb://arcticdb")

# Create any libraries that are missing. The existing library names are
# fetched once up front instead of being re-queried on every loop iteration.
existing_libs = set(arctic.list_libraries())
for lib in ["raw_orderbook", "raw_trades", "raw_options", "processed_features"]:
    if lib not in existing_libs:
        arctic.create_library(lib)


In [None]:
# Symbols to collect and the nominal sample window.
# NOTE(review): `start` and `end` are defined here but none of the visible
# data-collection cells pass them to a fetch call -- confirm whether the
# date range is meant to be applied.
tickers = ["AAPL", "MSFT", "SPY"]
start, end = "2023-01-01", "2023-12-31"


In [None]:
orderbook_lib = arctic["raw_orderbook"]

# Fetch one order-book frame per ticker and store it under the ticker symbol.
# NOTE(review): the `start`/`end` window is not passed to the fetch call, and
# `obb.ob.orderbook` should be checked against the installed OpenBB version.
for t in tickers:
    print(f"Collecting orderbook for {t}...")
    df_ob = obb.ob.orderbook(symbol=t, provider="polygon").to_df()
    orderbook_lib.write(t, df_ob)


In [None]:
trades_lib = arctic["raw_trades"]

# Fetch one trades frame per ticker and store it under the ticker symbol.
# NOTE(review): the `start`/`end` window is not passed to the fetch call, and
# `obb.ob.trades` should be checked against the installed OpenBB version.
for t in tickers:
    print(f"Collecting trades for {t}...")
    df_tr = obb.ob.trades(symbol=t, provider="polygon").to_df()
    trades_lib.write(t, df_tr)


In [None]:
options_lib = arctic["raw_options"]

# Fetch one options-chain frame per ticker and store it under the ticker symbol.
# NOTE(review): the `start`/`end` window is not passed to the fetch call;
# confirm that `obb.options.chains` and the "polygon" provider are available
# in the installed OpenBB version.
for t in tickers:
    print(f"Collecting options chain for {t}...")
    df_opt = obb.options.chains(symbol=t, provider="polygon").to_df()
    options_lib.write(t, df_opt)


In [None]:
# Look up both raw libraries again and load the stored AAPL frames.
# Only AAPL is read here even though several tickers were collected above.
orderbook_lib, trades_lib = arctic["raw_orderbook"], arctic["raw_trades"]

ob = orderbook_lib.read("AAPL").data
tr = trades_lib.read("AAPL").data


In [None]:
def compute_obi(df):
    """Add the order-book imbalance as an ``obi`` column.

    ``obi = (bid_size - ask_size) / (bid_size + ask_size)``

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``bid_size`` and ``ask_size`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus ``obi``. The input frame
        is not modified, so re-running a cell that calls this is safe.
        Rows whose total size is zero get ``NaN``.
    """
    total_size = df["bid_size"] + df["ask_size"]
    # Mask a zero denominator so the ratio is NaN rather than +/-inf.
    imbalance = (df["bid_size"] - df["ask_size"]) / total_size.where(total_size != 0)
    return df.assign(obi=imbalance)


In [None]:
def compute_spread_mid(df):
    """Add the quoted spread and mid price as ``spread`` and ``mid`` columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``ask_price`` and ``bid_price`` columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus ``spread``
        (``ask_price - bid_price``) and ``mid`` (their average). The input
        frame is not modified.
    """
    return df.assign(
        spread=df["ask_price"] - df["bid_price"],
        mid=(df["ask_price"] + df["bid_price"]) / 2,
    )


In [None]:
def realized_vol(df, window=60, periods_per_year=252 * 24 * 60):
    """Add log returns of the mid price and their annualised rolling volatility.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``mid`` column.
    window : int, default 60
        Number of rows in the rolling standard-deviation window.
    periods_per_year : float, default ``252 * 24 * 60``
        Number of rows per year used for annualisation. The default keeps
        the original factor, which corresponds to one row per minute around
        the clock on 252 days. NOTE(review): if the rows are one-minute bars
        of a regular US equity session, ``252 * 390`` is the matching value;
        confirm the sampling frequency of the input.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus ``log_ret`` and ``rv``.
        The input frame is not modified.
    """
    mid = df["mid"]
    # log() is undefined for non-positive prices; mask them so they become
    # NaN instead of -inf/inf values that would poison the rolling window.
    log_ret = np.log(mid.where(mid > 0)).diff()
    rv = log_ret.rolling(window).std() * np.sqrt(periods_per_year)
    return df.assign(log_ret=log_ret, rv=rv)


In [None]:
def trade_flow_imbalance(df, window=50):
    """Add tick-rule signed volume and its rolling sum (trade-flow imbalance).

    Each trade is signed by the direction of the price change from the
    previous row: +1 on an uptick, -1 on a downtick. A trade at an unchanged
    price inherits the sign of the last price change, and trades before the
    first price change get sign 0. Previously both of those cases were
    counted as sells, which biased the imbalance downwards.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``price`` and ``size`` columns, in trade order.
    window : int, default 50
        Number of rows summed into ``tfi``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus ``signed_volume`` and
        ``tfi``. The input frame is not modified.
    """
    tick = np.sign(df["price"].diff())
    # Drop zero ticks, then carry the last known direction forward; anything
    # before the first price change has no direction and contributes 0.
    direction = tick.mask(tick == 0).ffill().fillna(0)
    signed_volume = direction * df["size"]
    return df.assign(
        signed_volume=signed_volume,
        tfi=signed_volume.rolling(window).sum(),
    )


In [None]:
# Order-book features, applied in sequence. realized_vol has to run after
# compute_spread_mid because it reads the "mid" column that step adds.
ob = ob.pipe(compute_obi).pipe(compute_spread_mid).pipe(realized_vol)

# Trade-level feature.
tr = tr.pipe(trade_flow_imbalance)


In [None]:
# Look up the raw options library again and load the stored AAPL chain.
# Only AAPL is read here even though several tickers were collected above.
options_lib = arctic["raw_options"]
opt = options_lib.read("AAPL").data


In [None]:
def compute_iv_skew(df, lower=0.1, upper=0.9):
    """Add a per-expiration implied-volatility quantile spread as ``iv_skew``.

    For every expiration the value is
    ``quantile(upper) - quantile(lower)`` of ``implied_volatility``,
    broadcast to all rows of that expiration. This measures the width of
    the IV distribution within an expiration; it carries no sign and does
    not distinguish puts from calls or low strikes from high strikes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``expiration`` and ``implied_volatility`` columns.
    lower, upper : float, default 0.1 and 0.9
        Quantile levels whose difference is taken.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus ``iv_skew``. The input
        frame is not modified.
    """
    iv_by_expiry = df.groupby("expiration")["implied_volatility"]
    quantile_spread = iv_by_expiry.transform(
        lambda x: x.quantile(upper) - x.quantile(lower)
    )
    return df.assign(iv_skew=quantile_spread)


In [None]:
def compute_term_structure(df):
    """Add the IV change between consecutive expirations as ``ts_slope``.

    Within each strike the rows are ordered by ``expiration`` and the
    difference in ``implied_volatility`` to the previous expiration is
    taken. Previously the difference followed whatever row order the frame
    happened to have, so the result depended on the input ordering. The
    first expiration of each strike gets 0.

    The value is a plain difference; it is not divided by the time between
    expirations. Assumes ``expiration`` values sort chronologically
    (datetimes or ISO-formatted strings) -- TODO confirm.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``strike``, ``expiration`` and ``implied_volatility``
        columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus ``ts_slope``, in the
        original row order. The input frame is not modified.
    """
    # Work on a positional copy so that a non-unique index on `df` cannot
    # break the realignment back to the original row order.
    keyed = pd.DataFrame({
        "strike": df["strike"].to_numpy(),
        "expiration": df["expiration"].to_numpy(),
        "iv": df["implied_volatility"].to_numpy(),
    })
    by_expiry = keyed.sort_values(["strike", "expiration"])
    step = by_expiry.groupby("strike")["iv"].transform(
        lambda x: x.diff().fillna(0)
    )
    # sort_index() restores the original row positions.
    return df.assign(ts_slope=step.sort_index().to_numpy())


In [None]:
def compute_vol_of_vol(df, window=20):
    """Add a per-strike rolling standard deviation of IV as ``vol_of_vol``.

    The rolling window runs over the rows of each strike in the order they
    appear in ``df``. NOTE(review): this is only a volatility-of-volatility
    in the time-series sense if those rows are successive observations in
    time -- confirm against the structure of the options frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``strike`` and ``implied_volatility`` columns.
    window : int, default 20
        Number of rows in the rolling window.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the input columns plus ``vol_of_vol``. The
        input frame is not modified.
    """
    rolling_iv_std = df.groupby("strike")["implied_volatility"].transform(
        lambda x: x.rolling(window).std()
    )
    return df.assign(vol_of_vol=rolling_iv_std)


In [None]:
# Options-implied features, applied in sequence to the AAPL chain.
opt = (
    opt
    .pipe(compute_iv_skew)
    .pipe(compute_term_structure)
    .pipe(compute_vol_of_vol)
)


In [None]:
# Attach to every order-book row the latest trade at or before its timestamp.
# direction="backward" is used instead of "nearest" so that a row never
# receives values from a trade that happens after it (look-ahead).
merged = pd.merge_asof(
    ob.sort_index(),
    tr.sort_index(),
    left_index=True,
    right_index=True,
    direction="backward",
)

# Collapse the options frame to one row per calendar day.
# - normalize() floors the timestamps to midnight and keeps a DatetimeIndex
#   (including its timezone, if any), so no date -> datetime round trip is needed.
# - numeric_only=True skips text columns; without it .mean() raises on
#   non-numeric columns in pandas >= 2.0.
opt_daily = opt.groupby(opt.index.normalize()).mean(numeric_only=True)

# NOTE(review): the daily row is stamped at midnight but averages the whole
# day, so intraday rows can still see same-day option data; shift by one day
# if strict point-in-time features are required.
merged = pd.merge_asof(
    merged.sort_index(),
    opt_daily.sort_index(),
    left_index=True,
    right_index=True,
    direction="backward",
)


In [None]:
# Store the merged feature frame under the "AAPL" symbol in the
# processed_features library.
# NOTE(review): only AAPL is written; the other collected tickers are not
# processed in the visible cells.
processed_lib = arctic["processed_features"]
processed_lib.write("AAPL", merged)

print("Saved processed features for AAPL.")


In [None]:
# Pairwise relationships between the engineered AAPL features; rows with any
# missing feature are dropped before plotting.
feature_cols = ["obi", "spread", "rv", "tfi", "iv_skew", "vol_of_vol"]
pair_grid = sns.pairplot(merged[feature_cols].dropna(), diag_kind="kde")
plt.suptitle("AAPL engineered features: pairwise relationships", y=1.02)

# The notebook goals call for charts to be saved into the reports folder.
# NOTE(review): the path is relative to the working directory -- adjust if
# the reports folder lives elsewhere in the project.
REPORTS_DIR = Path("reports")
REPORTS_DIR.mkdir(parents=True, exist_ok=True)
pair_grid.savefig(REPORTS_DIR / "aapl_feature_pairplot.png", dpi=150)

plt.show()
