In [1]:
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from sklearn.linear_model import LinearRegression

# --- CONFIG ---
ROOT = "Data"
ORDER_SIZES = np.arange(1, 1001)  # candidate order sizes x = 1, 2, ..., 1000
TICKERS = ["CRWV", "FROG", "SOUN"]

# Collect all file paths: every .csv directly under ROOT/<ticker>/, visited in
# sorted filename order per ticker, skipping any name that contains "sample"
# or "__MACOSX".
ALL_FILES = []
for ticker in TICKERS:
    folder = os.path.join(ROOT, ticker)
    ALL_FILES.extend(
        os.path.join(folder, fname)
        for fname in sorted(os.listdir(folder))
        if fname.endswith(".csv")
        and not ("sample" in fname or "__MACOSX" in fname)
    )

# --- Store gt(x) for each minute ---
# Maps a minute timestamp to the list of (x, g_t(x)) pairs collected for it.
from datetime import datetime, timedelta

# Pre-populate the 390 expected trading minutes: consecutive one-minute
# steps beginning at 2025-04-04 13:30:00 UTC, each with its own empty list.
start_time = pd.Timestamp("2025-04-04 13:30:00+00:00")
minutes = []
for offset in range(390):
    minutes.append(start_time + pd.Timedelta(minutes=offset))
gt_points_per_minute = {m: [] for m in minutes}

# --- Helper function to compute g_t(x) ---
def estimate_gt(x, prices, sizes):
    """Return the total cost of filling ``x`` units by walking the book.

    Levels are consumed in the order given: each level is taken in full
    until the remaining quantity fits inside one level, which is then
    partially filled at that level's price.

    Parameters
    ----------
    x : number
        Quantity to fill.
    prices : array-like
        Price of each level, in the order the levels are consumed.
    sizes : array-like
        Quantity available at each level, aligned with ``prices``.

    Returns
    -------
    float
        Cumulative cost of the fill, or ``np.nan`` when the book is empty
        or ``x`` exceeds the total quantity available across all levels.
    """
    # Accept lists/tuples as well as ndarrays; element-wise multiplication
    # below requires array semantics.
    prices = np.asarray(prices)
    sizes = np.asarray(sizes)

    # An empty book cannot fill anything; report it the same way as
    # insufficient depth instead of failing on cum_qty[-1].
    if prices.size == 0 or sizes.size == 0:
        return np.nan

    cum_qty = np.cumsum(sizes)
    if x > cum_qty[-1]:
        return np.nan

    # First level whose cumulative quantity reaches x (side='left', so an
    # exact match on a level boundary stays on that level).
    idx = np.searchsorted(cum_qty, x)
    if idx == 0:
        return x * prices[0]

    # Cost of all fully consumed levels plus the partial fill at level idx.
    cum_cost = np.cumsum(prices * sizes)
    prev_qty = cum_qty[idx - 1]
    prev_cost = cum_cost[idx - 1]
    return prev_cost + (x - prev_qty) * prices[idx]

# --- Process Each File ---
# For every CSV, the earliest row of each minute is used as that minute's
# snapshot; (x, g_t(x)) is recorded for each order size the ten ask levels
# (ask_px_00..ask_px_09 / ask_sz_00..ask_sz_09) can fill.
for fpath in tqdm(ALL_FILES, desc="Processing files"):
    try:
        df = pd.read_csv(fpath)
        df['ts_event'] = pd.to_datetime(df['ts_event'], utc=True, format='ISO8601')
        df = df.sort_values("ts_event")
        df['minute'] = df['ts_event'].dt.floor('min')

        # Rows are time-sorted, so head(1) picks the first event per minute.
        minute_snapshots = df.groupby('minute').head(1)

        for _, row in minute_snapshots.iterrows():
            ts_minute = row['ts_event'].floor('min')
            bucket = gt_points_per_minute.get(ts_minute)
            if bucket is None:
                # Minute is not one of the pre-populated keys.
                continue

            prices = np.array([row[f'ask_px_0{lvl}'] for lvl in range(10)])
            sizes = np.array([row[f'ask_sz_0{lvl}'] for lvl in range(10)])

            # Keep only levels where both price and size are present.
            mask = ~(np.isnan(prices) | np.isnan(sizes))
            prices, sizes = prices[mask], sizes[mask]
            if prices.size == 0:
                continue

            for x in ORDER_SIZES:
                gt = estimate_gt(x, prices, sizes)
                if np.isnan(gt):
                    # ORDER_SIZES is increasing, so larger sizes fail as well.
                    break
                bucket.append((x, gt))

    except Exception as e:
        print(f"Error processing {fpath}: {e}")
        continue

# --- Fit 390 Linear Regressions ---
# One least-squares fit of g_t(x) on x per minute (minutes with fewer than
# 10 points are skipped); only the slope of each fit is kept.
minutes_clean = []
betas = []

for minute, xg_list in gt_points_per_minute.items():
    if len(xg_list) >= 10:
        x_vals, g_vals = zip(*xg_list)
        # scikit-learn expects a 2-D feature matrix: one column holding x.
        x_arr = np.array(x_vals).reshape(-1, 1)
        g_arr = np.array(g_vals)
        model = LinearRegression()
        model.fit(x_arr, g_arr)
        slope = model.coef_[0]
        minutes_clean.append(minute)
        betas.append(slope)

# --- Save Output ---
# One row per fitted minute: the minute timestamp and its slope.
out_df = pd.DataFrame({"minute": minutes_clean, "beta": betas})
out_df.to_csv("linearized_impact_coefficients.csv", index=False)


Processing files: 100%|██████████| 63/63 [01:17<00:00,  1.22s/it]
