# Data Loading & Quality Checks

In [None]:
# Third-party plotting / numeric stack used by this and later cells
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Project-local data access layer
from src.data_handler import DataHandler

# Load the raw price data through DataHandler
dh = DataHandler(data_dir="../data")
df = dh.load_data("minute_data.parquet")  # or "tick_data.parquet"

# Missing-data overview: draw the boolean null mask as a heatmap
null_mask = df.isnull()
_, missing_ax = plt.subplots(figsize=(12, 4))
sns.heatmap(null_mask, cbar=False, ax=missing_ax)
missing_ax.set_title("Missing Data Heatmap")
plt.show()

# Delegate cleaning to DataHandler.clean_data; the message below describes
# what that method is expected to do (its implementation is not shown here).
cleaned_df = dh.clean_data(df)
print("Data cleaned: missing values filled, outliers clipped.")

# Pair Selection & Cointegration Analysis

In [None]:
# Use PairSelector for cointegration and hedge ratio
from src.pair_selection import PairSelector

pair_prices = cleaned_df[["AAPL", "MSFT"]]
selector = PairSelector(pair_prices)
coint_pairs = selector.find_cointegrated_pairs()
print("Cointegrated pairs:", coint_pairs)

# Step 1 of Engle-Granger: estimate the cointegrating regression and form
# its residual (the spread). This has to happen BEFORE the unit-root test;
# testing the raw difference AAPL - MSFT silently assumes a hedge ratio of 1.
hedge_ratio = selector.calculate_hedge_ratio(cleaned_df["AAPL"], cleaned_df["MSFT"])
residuals = cleaned_df["AAPL"] - hedge_ratio * cleaned_df["MSFT"]

# Step 2 of Engle-Granger: unit-root test on the regression residual,
# plus the Johansen test on the price pair.
# NOTE(review): residual-based tests use Engle-Granger critical values rather
# than plain ADF ones -- confirm how PairSelector.adf_test derives its p-value.
adf_p = selector.adf_test(residuals)
johansen_res = selector.johansen_test(pair_prices)
print("ADF p-value:", adf_p)
print("Johansen test:", johansen_res)

# Visualize cointegrating residuals
plt.figure(figsize=(12, 4))
plt.plot(residuals)
plt.title("Cointegrating Regression Residuals")
plt.show()

# Dynamic Hedge Ratio Visualization

In [None]:
# Time-varying hedge ratio from PairSelector's Kalman-filter routine,
# plotted against the single static hedge ratio computed earlier.
aapl_prices = cleaned_df["AAPL"]
msft_prices = cleaned_df["MSFT"]
kalman_hedge = selector.kalman_filter_hedge_ratio(aapl_prices, msft_prices)

_, hedge_ax = plt.subplots(figsize=(12, 4))
hedge_ax.plot(kalman_hedge, label="Kalman Filter")
# Horizontal reference line at the static estimate
hedge_ax.axhline(hedge_ratio, color='r', linestyle='--', label="Static OLS")
hedge_ax.set_title("Dynamic vs Static Hedge Ratio")
hedge_ax.legend()
plt.show()

# Feature Engineering & Mean-Reversion Diagnostics

In [None]:
# Build spread features (z-score, half-life, Hurst exponent) with FeatureEngineer
from src.features import FeatureEngineer

spread_frame = pd.DataFrame({"spread": residuals})
fe = FeatureEngineer(spread_frame)
# Kept as a single fluent chain so the call sequence is exactly the original's.
(
    fe.add_zscore("spread")
    .add_half_life("spread")
    .add_hurst_exponent("spread")
)
features_df = fe.get_features()
print(features_df.tail())

# Stationarity diagnostics on the spread: ADF and KPSS via PairSelector
adf_p = selector.adf_test(residuals)
kpss_p = selector.kpss_test(residuals)
print(f"ADF p-value: {adf_p:.4f}, KPSS p-value: {kpss_p:.4f}")

# Clustering & Pair Scoring

In [None]:
# Clustering analysis for pair selection
from itertools import combinations

from sklearn.cluster import KMeans

# Per-pair features: mean 60-bar rolling correlation, mean return volatility,
# and the ADF p-value of the raw price difference.
pair_features = []
tickers = cleaned_df.columns
for t1, t2 in combinations(tickers, 2):
    # Use a loop-local selector so the notebook-level `selector` (built for
    # AAPL/MSFT in an earlier cell) is not overwritten by this scan.
    pair_selector = PairSelector(cleaned_df[[t1, t2]])
    corr = cleaned_df[t1].rolling(60).corr(cleaned_df[t2]).mean()
    vol = cleaned_df[[t1, t2]].pct_change().std().mean()
    pval = pair_selector.adf_test(cleaned_df[t1] - cleaned_df[t2])
    pair_features.append([corr, vol, pval])
# reshape keeps a (0, 3) array when there are no pairs, so the column
# indexing below stays valid.
pair_features = np.array(pair_features, dtype=float).reshape(-1, 3)

# KMeans rejects NaN/inf input; drop pairs whose features could not be
# computed (e.g. series shorter than the rolling window).
finite_rows = np.isfinite(pair_features).all(axis=1)
if not finite_rows.all():
    print(f"Dropping {int((~finite_rows).sum())} pair(s) with non-finite features.")
pair_features = pair_features[finite_rows]

# KMeans also requires n_samples >= n_clusters, so cap the cluster count
# at the number of usable pairs.
n_clusters = min(4, len(pair_features))
if n_clusters == 0:
    print("No valid pairs to cluster.")
else:
    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    labels = kmeans.fit_predict(pair_features)

    plt.figure(figsize=(8, 6))
    plt.scatter(pair_features[:, 0], pair_features[:, 2], c=labels, cmap='tab10')
    plt.xlabel("Correlation")
    plt.ylabel("ADF p-value")
    plt.title("Pair Clustering: Correlation vs ADF p-value")
    plt.show()

# Scenario Analysis

In [None]:
# Scenario analysis: re-run pair diagnostics on stressed copies of the data.
# Each entry is (label, DataFrame); the stressed frames are derived from
# cleaned_df by the transformations written out below.
scenarios = [
    ("Normal Market", cleaned_df),
    # Multiplicative Gaussian noise (sigma = 5%) on every value
    ("High Volatility", cleaned_df * (1 + np.random.normal(0, 0.05, cleaned_df.shape))),
    # AAPL scaled down by 20% over the whole sample
    ("Flash Crash", cleaned_df.copy().assign(AAPL=cleaned_df["AAPL"] * (1 - 0.2))),
    # MSFT scaled by a factor ramping linearly from 1.0 to 1.1
    ("Regime Shift", cleaned_df.copy().assign(MSFT=cleaned_df["MSFT"] * (1 + np.linspace(0, 0.1, len(cleaned_df)))))
]

for name, scenario_df in scenarios:
    scenario_prices = scenario_df[["AAPL", "MSFT"]]
    # Loop-local names keep the notebook-level `selector`, `coint_pairs`
    # and `fe` from earlier cells intact.
    scenario_selector = PairSelector(scenario_prices)
    print(f"Scenario: {name}")
    scenario_pairs = scenario_selector.find_cointegrated_pairs()
    print("  Cointegrated pairs:", scenario_pairs)

    # The half-life must come from THIS scenario's spread. Reusing the
    # notebook-level `fe` (built on the unstressed residuals) would print
    # the same half-life for every scenario.
    scenario_hedge = scenario_selector.calculate_hedge_ratio(
        scenario_prices["AAPL"], scenario_prices["MSFT"]
    )
    scenario_spread = scenario_prices["AAPL"] - scenario_hedge * scenario_prices["MSFT"]
    scenario_fe = FeatureEngineer(pd.DataFrame({"spread": scenario_spread}))
    hl = scenario_fe.add_half_life("spread").get_features().iloc[-1]["spread_half_life"]
    print(f"  Half-life: {hl:.2f}")

# Interactive Visualization

In [None]:
# Interactive plotly chart: both price series on the left axis and the
# spread residuals on a secondary right-hand axis.
import plotly.graph_objs as go

time_index = cleaned_df.index
traces = [
    go.Scatter(x=time_index, y=cleaned_df["AAPL"], name="AAPL"),
    go.Scatter(x=time_index, y=cleaned_df["MSFT"], name="MSFT"),
    # yaxis="y2" binds this trace to the secondary axis declared in the layout
    go.Scatter(x=time_index, y=residuals, name="Spread Residuals", yaxis="y2"),
]
fig = go.Figure(data=traces)

layout_options = {
    "title": "Price Series and Spread Residuals",
    "yaxis": {"title": "Price"},
    # Secondary axis is overlaid on the primary one and drawn on the right
    "yaxis2": {"title": "Spread", "overlaying": "y", "side": "right"},
    "legend": {"x": 0, "y": 1},
}
fig.update_layout(**layout_options)
fig.show()