# Phase 1: Exploration & EDA

## 1.1 Data Ingestion & Sanity Checks


In [None]:
# %%
from python.data_ingestion import load_tick_data
# Load the raw ticks, then fail fast on the two invariants checked below.
# (The checks use the pandas API, so `ticks` is expected to be a DataFrame.)
ticks = load_tick_data("data/market_ticks.parquet")

# Invariant 1: the 'timestamp' column never decreases from row to row.
assert ticks['timestamp'].is_monotonic_increasing, "Timestamps must be sorted"

# Invariant 2: not a single null cell anywhere in the frame.
assert not ticks.isnull().values.any(), "No missing values allowed"


## 1.2 Bar Construction & Visualization


In [None]:
# %%
# Aggregate the ticks into bars; freq="1S" is passed straight to resample_bars.
bars_1s = resample_bars(ticks, freq="1S")

# 'mid' is defined here as the midpoint between each bar's high and low.
bars_1s['mid'] = (bars_1s['high'] + bars_1s['low']) / 2

# Line chart: mid series plotted against the bar index.
plt.figure(figsize=(12, 4))
plt.plot(bars_1s.index, bars_1s['mid'])
plt.title("1-Second Mid-Price Series")
plt.show()

# Histogram of the per-bar 'volume' column, 50 bins.
bars_1s['volume'].hist(bins=50)
plt.title("1-Second Volume Distribution")
plt.show()


## 1.3 Order-Book Imbalance Heatmap


In [None]:
# %%
# Size imbalance per bar: (bid_size - ask_size) / (bid_size + ask_size).
_size_diff = bars_1s['bid_size'] - bars_1s['ask_size']
_size_total = bars_1s['bid_size'] + bars_1s['ask_size']
bars_1s['imbalance'] = _size_diff / _size_total

# Lay the series out as a grid: one row per time-of-day, one column per date.
# Uses `.index.time` / `.index.date`, so the bar index must be datetime-like.
heat = bars_1s.pivot_table(
    values='imbalance',
    index=bars_1s.index.time,
    columns=bars_1s.index.date,
)

# Diverging colour map centred on zero imbalance.
plt.figure(figsize=(10, 6))
sns.heatmap(heat, center=0, cmap='RdBu_r')
plt.title("Order-Book Imbalance Over Time")
plt.show()


# Phase 2: Feature Development

## 2.1 Compute Rolling Features


In [None]:
# %%
# Derive the feature table from the 1-second bars (window=20 is forwarded
# to compute_features unchanged).
feats = compute_features(bars_1s, window=20)

# Guard: every cell of the feature table must hold a value.
assert feats.notnull().values.all(), "Features must be NaN-free"


## 2.2 Correlation & Predictiveness


In [None]:
# %%
# Pairwise correlation between all feature columns, drawn as an annotated grid.
corr = feats.corr()

plt.figure(figsize=(8, 6))
sns.heatmap(corr, cmap="vlag", annot=True, fmt=".2f")
plt.title("Feature Correlation Matrix")
plt.show()

# Scatter of the 'vwap' column against the 'return' column shifted up by one
# row, i.e. each point pairs a row's vwap with the following row's return.
_next_return = feats['return'].shift(-1)
plt.scatter(feats['vwap'], _next_return, s=5, alpha=0.3)
plt.title("VWAP vs. Next-Step Return")
plt.show()


# Phase 3: Model Prototyping

## 3.1 Train Ridge Regression

In [None]:
# %%
# Build the modelling frame: features plus the next row's return as target.
# The shift leaves a missing target on the last row, which dropna() removes
# (together with any other row containing a missing value).
df = feats.copy()
df['future_return'] = feats['return'].shift(-1)
df = df.dropna()

# Fit the ridge model on that frame and report its evaluation metrics.
res = train_ridge(df, target_col='future_return', alpha=1.0)
metrics = evaluate_model(res)
print("Ridge Metrics:", metrics)


## 3.2 XGBoost Classifier & ROC


In [None]:
# %%
from xgboost import XGBClassifier
from sklearn.metrics import roc_curve, auc

# Reuse the split produced by the ridge step. The classifier must be fitted on
# the training partition and scored on the held-out partition; fitting and
# scoring on the same rows would make the ROC curve an in-sample figure.
# NOTE(review): assumes `res` exposes 'X_train' / 'y_train' next to the
# 'X_test' / 'y_test' keys already used here — confirm against train_ridge.
X_train, X_test = res['X_train'], res['X_test']
y_train, y_test = res['y_train'], res['y_test']

# Turn the continuous target into a binary label: 1 when the value is > 0.
clf = XGBClassifier(n_estimators=50, max_depth=3)
clf.fit(X_train, (y_train > 0).astype(int))

# Column 1 of predict_proba is the probability of the positive class.
probs = clf.predict_proba(X_test)[:, 1]
fpr, tpr, _ = roc_curve((y_test > 0).astype(int), probs)

plt.plot(fpr, tpr, label=f"AUC={auc(fpr, tpr):.2f}")
plt.title("XGBoost ROC Curve")
plt.legend()
plt.show()


# Phase 4: OCaml Integration

## 4.1 Backtester Smoke Test


In [None]:
# %%
# Take the first 500 feature rows as a plain array and push them through the
# backtester; the result must contain one entry per input row.
sample_feats = feats.head(500).to_numpy()
pl = run_backtester(sample_feats, initial_cash=1e6, slippage=1e-4)
assert len(pl) == len(sample_feats), "P&L length mismatch"


## 4.2 Python vs. OCaml P&L Comparison


In [None]:
# %%
# Python reference: simple cumulative returns
# NOTE(review): this treats column 0 of sample_feats as the per-step return and
# scales it by 1e6 (the same figure passed as initial_cash to run_backtester
# in the smoke test) — confirm the column order of feats.
py_pl = np.cumsum(sample_feats[:,0] * 1e6)
plt.plot(py_pl, label="Python Ref")
# NOTE(review): pl is cumulated here, which presumes run_backtester returns
# per-step P&L rather than an already-cumulative curve — confirm.
plt.plot(np.cumsum(pl), label="OCaml Backtest")
plt.legend()
plt.title("Cumulative P&L Comparison")
plt.show()


# Phase 5: Reinforcement Learning Training

## 5.1 Train OCaml RL Agent


In [None]:
# %%
# Hyperparameters handed to the RL trainer as a single dict.
rl_config = {
    "episodes": 100,
    "batch_size": 32,
    "gamma": 0.99,
    # add other hyperparameters here
}
history = train_rl_agent(rl_config)

# Tabulate the returned history and draw one subplot per column.
_history_frame = pd.DataFrame(history)
_history_frame.plot(subplots=True, figsize=(10, 6))
plt.suptitle("RL Training Curves")
plt.show()


# Phase 6: GPU-Accelerated Deep Models

## 6.1 Mixed-Precision Training


In [None]:
# %%
# torch is used below for tensor construction and the CUDA probe. Importing it
# in this cell keeps the cell runnable on a top-to-bottom run, since the only
# other torch import in the notebook sits in the later inference cell.
import torch

# Everything the GPU trainer needs, bundled into one dict.
gpu_config = {
    # Feature table converted to a float32 tensor.
    "features": torch.tensor(feats.values, dtype=torch.float32),
    "window": 20,
    "batch_size": 64,
    "epochs": 10,
    "lr": 1e-3,
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    "device": "cuda" if torch.cuda.is_available() else "cpu",
    "log_dir": "runs/gpu_experiment"
}
best_ckpt = train_gpu(gpu_config)
print("Best GPU checkpoint saved at:", best_ckpt)


## 6.2 Inference & Backtest with TorchScript


In [None]:
# %%
import torch
# Load whatever object is stored at best_ckpt. torch.load unpickles the file,
# so it must only be pointed at checkpoints produced by a trusted run.
# NOTE(review): the calls below require the loaded object to be an nn.Module
# (not a state_dict) — confirm what train_gpu actually saves.
# NOTE(review): recent PyTorch releases default torch.load to
# weights_only=True, which refuses fully pickled modules — verify against the
# installed version.
model = torch.load(best_ckpt)
# Put the module in evaluation mode before scripting it.
model.eval()
# Compile the module to TorchScript.
scripted = torch.jit.script(model)
# export and call from OCaml or Python as needed...
