In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
────────────────────────────────────────────────────────────────────────────
Event Study – Final Version (Corrected Output Path)
────────────────────────────────────────────────────────────────────────────
This script is specifically designed to read WIDE-FORMAT event files, where
columns represent event types and values are the corresponding dates.
"""

# ── Imports ────────────────────────────────────────────────────────────── #
import glob
from pathlib import Path
import numpy  as np
import pandas as pd
import statsmodels.api as sm
from scipy.stats import ttest_1samp, t

# ── Tunables & study design ────────────────────────────────────────────── #
# Length (calendar days) of the window used to estimate each model.
EST_WIN_DAYS = 200
# Gap (calendar days) left between the end of the estimation window and the event.
EST_BUF_DAYS = 11
# Half-widths k of the symmetric (-k, +k) windows over which CARs are summed.
EVENT_WINDOWS = [1, 5, 10]
# Widest half-width; defaults to 10 when no windows are configured.
MAX_EVENT_WINDOW = max(EVENT_WINDOWS, default=10)

# Model label -> names of the benchmark return series used as regressors.
# Labels starting with "MM" are treated as single-factor models by the
# processing code further down.
models = dict(
    MM_SPY=["SPY"],
    MM_Gold=["Gold"],
    MM_N100=["Nasdaq100"],
    EM_Gold_SPY=["Gold", "SPY"],
    EM_Gold_Nasdaq=["Gold", "Nasdaq100"],
)

# ── Helper functions ───────────────────────────────────────────────────── #
def read_csv_robustly(path: Path, engine: str = 'c', sep=','):
    """
    Read a CSV file, trying a fixed sequence of encodings until one parses.

    Parameters
    ----------
    path : file to read (passed straight to ``pd.read_csv``).
    engine : pandas parser engine, ``'c'`` (default) or ``'python'``.
    sep : field separator. NOTE: when ``engine == 'python'`` this argument is
        ignored and ``sep=None`` is used so pandas sniffs the delimiter.

    Returns
    -------
    pd.DataFrame parsed with the first encoding that succeeds.

    Raises
    ------
    ValueError
        If every encoding fails with a decoding or parser error. The last
        underlying error is attached as ``__cause__`` so the real reason
        (e.g. a ragged row) is visible in the traceback.
    """
    # The python engine can auto-detect the separator when given None.
    delimiter = None if engine == 'python' else sep

    last_error = None
    # 'latin-1' maps every byte to a character, so it acts as the final
    # decoding fallback; it is tried last on purpose.
    for enc in ('utf-8', 'utf-8-sig', 'gbk', 'gb2312', 'latin-1'):
        try:
            return pd.read_csv(path, encoding=enc, engine=engine, sep=delimiter)
        except (UnicodeDecodeError, UnicodeError, pd.errors.ParserError) as err:
            last_error = err

    raise ValueError(
        f"Failed to read or parse '{path}'. Please check its encoding, structure, and separator."
    ) from last_error

def std_cols(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalise the column labels of *df* in place and return the same frame.

    Labels are lower-cased, spaces and literal dots are removed, and any
    remaining surrounding whitespace is stripped.
    """
    labels = df.columns.str.lower()
    labels = labels.str.replace(" ", "")
    labels = labels.str.replace(".", "", regex=False)
    df.columns = labels.str.strip()
    return df

def load_ret(path: Path) -> pd.Series:
    """
    Load a price CSV and return its daily log-return series.

    The file must have (case-insensitively, after column normalisation)
    'Date' and 'Price' columns. Prices may contain thousands separators.

    Rows are discarded when the date or price cannot be parsed, when the
    price is not strictly positive (its logarithm is undefined), or when an
    earlier row in the file carries the same date (the last one wins).
    A unique date index is required for the series to be combined
    column-wise with the other return series later in the script.

    Parameters
    ----------
    path : Path to the CSV file; its stem becomes the name of the series.

    Returns
    -------
    pd.Series of log-price differences indexed by date (index name 'date'),
    sorted ascending. The first element is NaN.

    Raises
    ------
    ValueError
        If the 'date' or 'price' column is missing, or the file cannot be read.
    """
    df = std_cols(read_csv_robustly(path))  # fast 'c' engine by default
    if {"date", "price"} - set(df.columns):
        raise ValueError(f"'{path}': Must contain 'Date' & 'Price'.")

    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    cleaned_price = df["price"].astype(str).str.replace(",", "").str.strip()
    df["price"] = pd.to_numeric(cleaned_price, errors="coerce")

    df = df.dropna(subset=["date", "price"])
    # log() of a non-positive price would inject -inf/NaN into the returns.
    df = df[df["price"] > 0]
    # Keep one observation per date (the last one listed in the file).
    df = df.drop_duplicates(subset="date", keep="last")

    prices = df.set_index("date").sort_index()["price"]
    return np.log(prices).diff().rename(path.stem)

def addc(x: pd.DataFrame) -> pd.DataFrame:
    """
    Return *x* with an intercept column added via ``sm.add_constant``.

    ``has_constant="add"`` requests the column even when *x* already holds
    a constant-valued column.
    """
    design = sm.add_constant(x, has_constant="add")
    return design

def reg(y: pd.Series, X: pd.DataFrame) -> pd.Series:
    """
    Fit an OLS regression of *y* on *X* plus an intercept.

    Returns the fitted coefficients; callers read the intercept as
    ``params["const"]`` and each slope under its regressor's column name.
    """
    design = addc(X)
    fitted = sm.OLS(y, design).fit()
    return fitted.params

def car(ar: pd.Series, evt: pd.Timestamp, windows=None) -> dict[str, float]:
    """
    Sum abnormal returns over symmetric calendar-day windows around an event.

    Parameters
    ----------
    ar : abnormal-return series indexed by date (sorted ascending).
    evt : event date at the centre of each window.
    windows : iterable of half-widths k in days. Defaults to the module-level
        ``EVENT_WINDOWS`` when omitted, which is the original behaviour.

    Returns
    -------
    dict mapping ``"CAR(-k,+k)"`` to the sum of *ar* over the inclusive
    label range ``[evt - k days, evt + k days]``.
    """
    if windows is None:
        windows = EVENT_WINDOWS

    totals = {}
    for k in windows:
        half_width = pd.Timedelta(days=k)
        # Label-based slicing: both end points are included.
        totals[f"CAR(-{k},+{k})"] = ar.loc[evt - half_width : evt + half_width].sum()
    return totals

# ── 1) Load all data from external folders ─────────────────────────────── #
# --- Benchmark data: one price CSV per benchmark inside ./benchmark ---
print("--- Loading Benchmark Data ---")
BENCHMARK_DIR = Path("./benchmark")
if not BENCHMARK_DIR.is_dir():
    raise FileNotFoundError(f"Benchmark directory '{BENCHMARK_DIR}' not found.")

# Benchmark name -> file name inside BENCHMARK_DIR.
bench_files = {"Gold": "Gold.csv", "Nasdaq100": "Nasdaq100.csv", "SPY": "SPY.csv"}
# Name every series after its dictionary key: the panel is later indexed by
# these keys, so the lookup must not depend on the file stem matching.
bench_ret = {
    name: load_ret(BENCHMARK_DIR / filename).rename(name)
    for name, filename in bench_files.items()
}
print(f"  • Loaded: {', '.join(bench_ret.keys())}")


# --- Crypto assets: every *.csv in ./crypto_data, keyed by file stem ---
print("\n--- Loading Crypto Asset Data ---")
CRYPTO_DATA_DIR = Path("./crypto_data")
if not CRYPTO_DATA_DIR.is_dir():
    raise FileNotFoundError(f"Directory '{CRYPTO_DATA_DIR}' not found.")
# glob() yields files in filesystem order; sort so runs are reproducible.
crypto_files = sorted(CRYPTO_DATA_DIR.glob("*.csv"))
if not crypto_files:
    raise FileNotFoundError(f"No CSV files found in '{CRYPTO_DATA_DIR}'.")
asset_ret = {f.stem: load_ret(f) for f in crypto_files}
# An asset sharing a benchmark's name would create duplicate panel columns
# and make the per-asset column selection ambiguous.
name_clash = sorted(set(asset_ret) & set(bench_ret))
if name_clash:
    raise ValueError(f"Asset names collide with benchmark names: {name_clash}")
print(f"  • Found and loaded {len(asset_ret)} assets.")

# --- Event calendars: wide-format training and test files in ./events ---
print("\n--- Loading Wide-Format Event Calendar Data ---")
EVENTS_DIR = Path("./events")
train_events_file = EVENTS_DIR / "training_set.csv"
test_events_file = EVENTS_DIR / "test_set.csv"

if not EVENTS_DIR.is_dir():
    raise FileNotFoundError(f"Directory '{EVENTS_DIR}' not found.")
if not train_events_file.is_file():
    raise FileNotFoundError(f"Training file '{train_events_file}' not found.")
if not test_events_file.is_file():
    raise FileNotFoundError(f"Test file '{test_events_file}' not found.")

# Event-group label -> DatetimeIndex of event dates; filled further down.
events = {}

def load_wide_events(path: Path, suffix: str) -> dict:
    """
    Load a WIDE-FORMAT event file: one column per event group, dates as values.

    Parameters
    ----------
    path : CSV file to read.
    suffix : text appended to every (normalised) column name to form the
        group label, e.g. "_train" or "_test".

    Returns
    -------
    dict mapping ``f"{column}{suffix}"`` to a ``pd.DatetimeIndex`` of that
    column's event dates. Unparseable or missing cells are dropped, each
    date is truncated to midnight so it can match a daily price index, and
    repeated dates are collapsed so an event is not counted twice. A column
    with no valid dates maps to an empty index.
    """
    local_events = {}
    # The 'python' engine is used for these files; per the original author
    # it copes with ragged columns that make the 'c' engine raise ParserError.
    df = read_csv_robustly(path, engine='python')

    # Clean column names before they are turned into group labels.
    df = std_cols(df)

    for group_name in df.columns:
        dates = pd.to_datetime(df[group_name].dropna(), errors='coerce').dropna()
        # unique() keeps first-seen order, so the file's ordering is preserved.
        local_events[f"{group_name}{suffix}"] = pd.DatetimeIndex(dates).normalize().unique()
    return local_events

# Register the training groups first, then the test groups.
events.update(load_wide_events(train_events_file, "_train"))
print(f"  • Loaded {len(events)} training groups from '{train_events_file.name}'.")
test_events = load_wide_events(test_events_file, "_test")
print(f"  • Loaded {len(test_events)} test groups from '{test_events_file.name}'.")
events.update(test_events)
print(f"  • Total unique event groups to process: {len(events)}")
print("----------------------------------------------------")

# Align every return series on the union of all dates.
raw_panel = pd.concat([*bench_ret.values(), *asset_ret.values()], axis=1).sort_index()
# A date missing from one series (e.g. a weekend for an exchange-traded
# benchmark) means its price did not move, i.e. a return of 0. Forward-filling
# the *returns* would instead repeat the previous day's return on every gap
# day. Cells before a series' first observation stay NaN, exactly the cells
# a forward fill would also have left empty.
panel = raw_panel.fillna(0.0).where(raw_panel.ffill().notna())

# ── 2) Loop through events → run regressions & compute CAR ─────────────── #
# Accumulators: per-event summary rows, per-day abnormal returns around each
# event, and the estimation-window data used for each fitted model.
event_rows, daily_ar_rows, estimation_data_rows = [], [], []

for asset in asset_ret:
    # Only dates on which the asset and every benchmark have a return.
    merged = panel[[asset, *bench_files.keys()]].dropna()
    for grp, dates in events.items():
        for evt in dates:
            if evt not in merged.index:
                print(f"Warning: Event date {evt.date()} for group '{grp}' not in price data for asset '{asset}'. Skipping.")
                continue

            # Estimation window: EST_WIN_DAYS calendar days that end
            # EST_BUF_DAYS calendar days before the event (inclusive slice).
            est_end   = evt - pd.Timedelta(days=EST_BUF_DAYS)
            est_start = est_end - pd.Timedelta(days=EST_WIN_DAYS)
            est       = merged.loc[est_start : est_end]
            if len(est) < 30:
                print(f"Warning: Insufficient data for event {evt.date()} ('{grp}') for asset '{asset}'. Skipping.")
                continue

            evt_label = evt.strftime("%Y-%m-%d")
            max_half_width = pd.Timedelta(days=MAX_EVENT_WINDOW)

            row = {}
            for mdl, facs in models.items():
                params = reg(est[asset], est[facs])
                # Fitted return over the whole sample: intercept plus the
                # beta-weighted factor returns (works for one or many factors).
                pred = params["const"] + (merged[facs] * params[facs]).sum(axis=1)
                ar   = merged[asset] - pred
                cars = car(ar, evt)

                # Daily abnormal returns over the widest event window.
                ar_window_series = ar.loc[evt - max_half_width : evt + max_half_width]
                df_ar_temp = ar_window_series.reset_index(name='AR').rename(columns={'date': 'Date'})
                df_ar_temp = df_ar_temp.assign(
                    RelativeDay = (df_ar_temp['Date'] - evt).dt.days,
                    Asset = asset,
                    EventGroup = grp,
                    EventDate = evt_label,
                    Model = mdl
                )
                daily_ar_rows.append(df_ar_temp)

                # Snapshot of the data the model was estimated on.
                df_est_temp = est[[asset, *facs]].copy().reset_index().rename(columns={'date': 'Date', asset: 'AssetReturn'})
                df_est_temp = df_est_temp.assign(
                    Asset = asset,
                    EventGroup = grp,
                    EventDate = evt_label,
                    Model = mdl
                )
                estimation_data_rows.append(df_est_temp)

                # Column labels are derived from the model definition rather
                # than hard-coded per model, so every entry in `models` is
                # labelled with its own name and factors.
                if mdl.startswith("MM"):
                    fac = facs[0]
                    car_tag = f"MM_{fac}"
                    row[f"{fac}_α"] = params["const"]
                    row[f"{fac}_β"] = params[fac]
                else:
                    car_tag = mdl
                    row[f"{mdl}_α"] = params["const"]
                    for fac in facs:
                        row[f"{mdl}_β_{fac}"] = params[fac]
                for k in EVENT_WINDOWS:
                    row[f"CAR_{car_tag}(-{k},+{k})"] = cars[f"CAR(-{k},+{k})"]

            idx = (asset, grp, evt_label)
            event_rows.append((idx, row))

# ── 3) Convert results to DataFrames & 4) Calculate Group Means ────────── #
if not event_rows:
    print("\nNo events were processed. Ending script.")
else:
    # One row per (asset, event group, event date); columns hold the fitted
    # coefficients and CARs of every model.
    idx_vals, dict_vals = zip(*event_rows)
    df_evt_wide = pd.DataFrame(
        list(dict_vals),
        index=pd.MultiIndex.from_tuples(idx_vals, names=["Asset", "EventGroup", "EventDate"]),
    )

    # Build the column order from `models` so that every configured model is
    # kept by the reindex below. car_tags maps a model to the tag used in
    # its CAR column names and in the "Model" field of the mean table.
    col_seq = []
    car_tags = {}
    for mdl, facs in models.items():
        if mdl.startswith("MM"):
            fac = facs[0]
            car_tags[mdl] = f"MM_{fac}"
            col_seq.extend([f"{fac}_α", f"{fac}_β"])
        else:
            car_tags[mdl] = mdl
            col_seq.append(f"{mdl}_α")
            col_seq.extend(f"{mdl}_β_{fac}" for fac in facs)
        col_seq.extend(f"CAR_{car_tags[mdl]}(-{k},+{k})" for k in EVENT_WINDOWS)
    df_evt_wide = df_evt_wide.reindex(columns=col_seq)

    # Mean CAR per asset x event group x model x window, with a two-sided
    # 95% Student-t confidence interval and a one-sample t-test against 0.
    # Statistics are only computed when at least two events are available.
    mean_cols = ["Asset", "EventGroup", "Model", "Window", "N", "MeanCAR", "95%CI_low", "95%CI_high", "p-value"]
    mean_rows = []
    for (asset, grp), sub in df_evt_wide.groupby(level=["Asset", "EventGroup"]):
        for mdl in models:
            label = car_tags[mdl]
            for k in EVENT_WINDOWS:
                col = f"CAR_{label}(-{k},+{k})"
                if col not in sub:
                    continue
                vals = sub[col].dropna()
                n = len(vals)
                mean = ci_lo = ci_hi = pval = np.nan
                if n >= 2:
                    mean  = vals.mean()
                    se    = vals.std(ddof=1) / np.sqrt(n)
                    tcrit = t.ppf(0.975, n-1)
                    ci_lo, ci_hi = mean - tcrit*se, mean + tcrit*se
                    _, pval = ttest_1samp(vals, 0, nan_policy='omit')
                mean_rows.append({"Asset": asset, "EventGroup": grp, "Model": label, "Window": f"(-{k},+{k})", "N": n, "MeanCAR": mean, "95%CI_low": ci_lo, "95%CI_high": ci_hi, "p-value": pval})

    # Passing the column list keeps set_index working when mean_rows is empty.
    df_mean = pd.DataFrame(mean_rows, columns=mean_cols).set_index(["Asset", "EventGroup", "Model", "Window"]).sort_index()
    df_daily_ar = pd.concat(daily_ar_rows, ignore_index=True) if daily_ar_rows else pd.DataFrame()
    df_estimation = pd.concat(estimation_data_rows, ignore_index=True) if estimation_data_rows else pd.DataFrame()

    # ── 5) CLI output & Save all data artifacts ────────────────────────────── #
    pd.set_option("display.max_columns", None, "display.width", 2000, "display.float_format", "{:.6f}".format)
    print("\n==== Event-level wide table (df_evt_wide) [PREVIEW] ====")
    print(df_evt_wide.head())
    print("\n==== Asset × EventGroup  MeanCAR ± 95 % CI [PREVIEW] ====")
    print(df_mean.head())

    OUTPUT_DIR = Path("./outcome")
    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    print(f"\n==== Saving All Output Files to '{OUTPUT_DIR}' folder ====")

    # The two indexed tables keep their index; the long tables do not have one.
    df_evt_wide.to_csv(OUTPUT_DIR / "event_study_wide_results.csv")
    print(f" -> Saved event_study_wide_results.csv ({len(df_evt_wide)} rows)")
    df_mean.to_csv(OUTPUT_DIR / "event_study_mean_results.csv")
    print(f" -> Saved event_study_mean_results.csv ({len(df_mean)} rows)")
    df_daily_ar.to_csv(OUTPUT_DIR / "event_study_daily_ar.csv", index=False)
    print(f" -> Saved event_study_daily_ar.csv ({len(df_daily_ar)} rows)")
    df_estimation.to_csv(OUTPUT_DIR / "event_study_estimation.csv", index=False)
    print(f" -> Saved event_study_estimation.csv ({len(df_estimation)} rows)")
    print("\nAnalysis complete. All data files saved successfully.")