# Multinomial Logit (LASSO) on S&P 500 Panel Data

This notebook builds a **panel dataset** (all S&P 500 firms together) and classifies each firm's daily return into one of three classes:
- `up`
- `same`
- `down`

using **Multinomial Logistic Regression with L1 regularization (LASSO)**.

## Workflow
1. Load consolidated panel `ConstructionDataset/all_companies_features.csv`.
2. Left join `stock_sectors.csv` and one-hot encode sector.
3. Keep observations from **2024-11-01 to 2025-10-31**.
4. Use **10 months for train/CV** and **last 2 months for out-of-sample validation**.
5. Choose a symmetric threshold around zero based on train percentiles to balance classes.
6. Train multinomial logit with LASSO using `TimeSeriesSplit(n_splits=5)`.
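7. Save metrics, predictions, coefficients, and classification reports to a timestamped `MultinomialLogit/run_<timestamp>/` folder (plots are displayed inline, not written to disk).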


In [1]:
import warnings
warnings.filterwarnings("ignore")

from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    balanced_accuracy_score,
    f1_score,
    classification_report,
    confusion_matrix,
)
from sklearn.model_selection import TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


## 1) Configuration


In [2]:
# --- Paths -------------------------------------------------------------------
# Use the current working directory as project root when it contains the panel
# CSV; otherwise fall back to its parent directory.
_PANEL_RELATIVE = Path("ConstructionDataset") / "all_companies_features.csv"
_cwd = Path.cwd()
PROJECT_ROOT = _cwd if (_cwd / _PANEL_RELATIVE).exists() else _cwd.parent

PANEL_PATH = PROJECT_ROOT / _PANEL_RELATIVE
SECTOR_PATH = PROJECT_ROOT / "stock_sectors.csv"
OUTPUT_BASE = PROJECT_ROOT / "MultinomialLogit"

# Fail early, before any output folder is created, when the panel is missing.
if not PANEL_PATH.exists():
    raise FileNotFoundError(f"Panel dataset not found: {PANEL_PATH}")

# --- Date windows (all bounds inclusive) --------------------------------------
START_DATE, END_DATE = pd.Timestamp("2024-11-01"), pd.Timestamp("2025-10-31")
TRAIN_START, TRAIN_END = pd.Timestamp("2024-11-01"), pd.Timestamp("2025-08-31")
VAL_START, VAL_END = pd.Timestamp("2025-09-01"), pd.Timestamp("2025-10-31")

# --- Features ------------------------------------------------------------------
# Lagged columns share one set of lag lengths; the final list keeps the order
# returns -> sentiment -> sentiment powers -> VIX -> firm ratios -> macro series.
_LAGS = (1, 2, 3, 5, 10, 20)
_RETURN_LAGS = [f"Return_Lag{k}" for k in _LAGS]
_SENTIMENT_LAGS = [f"Sentiment_Lag{k}" for k in _LAGS]
_SENTIMENT_POWERS = ["Sentiment_Lag1_squared", "Sentiment_Lag1_cubic"]
_VIX_LAGS = [f"VIX_Lag{k}" for k in _LAGS]
_FIRM_LEVEL = [
    "roe", "roa", "op_margin", "debt_to_equity", "liquidity_ratio", "current_ratio",
    "free_cf_margin", "revenue_growth", "ocf_to_assets",
]
_MACRO_GDP = ["GDP_GDP_SA_PC_QOQ", "GDP_GDP_SA_PC_YOY", "GDP_GDP_NSA_PC_QOQ", "GDP_GDP_NSA_PC_YOY"]
_MACRO_IPI = ["IPI_IPI", "IPI_IPI_YOY", "IPI_IPI_QOQ", "IPI_IPI_SA", "IPI_IPI_SA_YOY", "IPI_IPI_SA_QOQ"]
_MACRO_UNEMP = [
    "UNEMP_UNRATE", "UNEMP_UNRATE_PC1", "UNEMP_UNRATE_PCH",
    "UNEMP_UNRATENSA", "UNEMP_UNRATENSA_PC1", "UNEMP_UNRATENSA_PCH",
]

NUMERIC_FEATURES = [
    *_RETURN_LAGS,
    *_SENTIMENT_LAGS,
    *_SENTIMENT_POWERS,
    *_VIX_LAGS,
    *_FIRM_LEVEL,
    *_MACRO_GDP,
    *_MACRO_IPI,
    *_MACRO_UNEMP,
]

CATEGORICAL_FEATURES = ["ticker", "sector"]  # ticker FE + sector FE
TARGET_COL = "Return_Status"
RETURN_COL = "Return"

# --- Model selection settings ----------------------------------------------------
N_SPLITS = 5
C_GRID = np.logspace(-2, 1.5, num=8)  # 8 log-spaced values of C from 1e-2 to 10**1.5
MAX_ITER = 3000
RANDOM_STATE = 42

# --- Output folder: one timestamped sub-folder per run ------------------------------
RUN_TAG = f"{datetime.now():%Y%m%d_%H%M%S}"
OUTPUT_DIR = OUTPUT_BASE / f"run_{RUN_TAG}"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

for _label, _value in (
    ("Project root:", PROJECT_ROOT),
    ("Panel path:", PANEL_PATH),
    ("Output dir:", OUTPUT_DIR),
):
    print(_label, _value)


Project root: c:\Users\jorge\OneDrive\Documentos\Data 606\Project
Output dir: c:\Users\jorge\OneDrive\Documentos\Data 606\Project\MultinomialLogit\run_20260212_043813


## 2) Helper functions


In [3]:
def _make_ohe():
    """Build a sparse-output `OneHotEncoder` that ignores unseen categories.

    The encoder is first requested with the `sparse_output` keyword; when the
    constructor rejects it with a `TypeError`, the `sparse` keyword is used
    instead (presumably for older scikit-learn releases — confirm against the
    installed version).
    """
    shared_kwargs = {"handle_unknown": "ignore"}
    try:
        encoder = OneHotEncoder(sparse_output=True, **shared_kwargs)
    except TypeError:
        encoder = OneHotEncoder(sparse=True, **shared_kwargs)
    return encoder


def load_panel_feature_data(data_path: Path, required_cols: list[str]) -> pd.DataFrame:
    """Load the consolidated panel CSV and keep only the columns the model needs.

    Parameters
    ----------
    data_path : Path
        Location of the panel CSV.
    required_cols : list[str]
        Feature columns to keep. `Sentiment_Lag1_squared` and
        `Sentiment_Lag1_cubic` are not read from the file; they are derived
        here from `Sentiment_Lag1`.

    Returns
    -------
    pd.DataFrame
        `Date`, `ticker`, the return column (`RETURN_COL`), `Sentiment_Lag1`,
        the requested raw features and the two derived sentiment powers. Rows
        with a missing ticker or an unparseable date are dropped.

    Raises
    ------
    FileNotFoundError
        If `data_path` does not exist.
    ValueError
        If the CSV lacks any of the needed raw columns.
    """
    if not data_path.exists():
        raise FileNotFoundError(f"Panel dataset not found: {data_path}")

    panel = pd.read_csv(data_path)

    derived_cols = {"Sentiment_Lag1_squared", "Sentiment_Lag1_cubic"}
    # `Sentiment_Lag1` is always needed because the derived powers are always
    # computed below. dict.fromkeys de-duplicates while keeping order, so a
    # repeated name in `required_cols` cannot produce duplicate column labels.
    needed = list(
        dict.fromkeys(
            ["Date", "ticker", RETURN_COL]
            + [c for c in required_cols if c not in derived_cols]
            + ["Sentiment_Lag1"]
        )
    )
    missing = [c for c in needed if c not in panel.columns]
    if missing:
        raise ValueError(f"{data_path.name} missing columns: {missing}")

    panel = panel[needed].copy()

    # Record missing tickers BEFORE the string cast: astype(str) would turn NaN
    # into the literal "nan", which dropna() below could no longer detect.
    ticker_missing = panel["ticker"].isna()
    panel["ticker"] = panel["ticker"].astype(str).str.strip().mask(ticker_missing)

    panel["Sentiment_Lag1"] = pd.to_numeric(panel["Sentiment_Lag1"], errors="coerce")
    panel["Sentiment_Lag1_squared"] = panel["Sentiment_Lag1"] ** 2
    panel["Sentiment_Lag1_cubic"] = panel["Sentiment_Lag1"] ** 3
    # Unparseable dates become NaT and are removed together with missing tickers.
    panel["Date"] = pd.to_datetime(panel["Date"], errors="coerce")

    panel = panel.dropna(subset=["Date", "ticker"])
    return panel


def make_return_status(ret: pd.Series, thr: float) -> pd.Series:
    """Label returns as "up", "down" or "same" using the symmetric band [-thr, thr].

    Values strictly above `thr` are "up", values strictly below `-thr` are
    "down", and values inside the closed band are "same". Entries that match
    none of the conditions (e.g. NaN returns) stay missing. The result is an
    object-dtype Series sharing the index of `ret`.
    """
    above_band = ret > thr
    below_band = ret < -thr
    inside_band = (ret >= -thr) & (ret <= thr)

    status = pd.Series(index=ret.index, dtype="object")
    # Assignment order matches the original: up, down, then same.
    for mask, label in ((above_band, "up"), (below_band, "down"), (inside_band, "same")):
        status[mask] = label
    return status


def choose_balanced_symmetric_threshold(train_returns: pd.Series, percentiles: np.ndarray):
    """Pick the symmetric return threshold that best balances the three classes.

    For every candidate percentile `p`, the threshold is the p-th percentile of
    the absolute (non-missing) returns. Returns are labelled with
    `make_return_status` and the candidate is scored by the sum of squared
    deviations of the class shares from 1/3.

    Returns
    -------
    (best_thr, grid_df)
        `best_thr` is the threshold with the lowest imbalance score (ties are
        broken by the lower percentile); `grid_df` lists every candidate,
        sorted by score and then by percentile.
    """
    class_order = ["down", "same", "up"]
    clean_returns = train_returns.dropna()
    magnitudes = clean_returns.abs()

    def _score_candidate(pct):
        """Build one grid row (threshold, class shares, imbalance) for `pct`."""
        cutoff = np.percentile(magnitudes, pct)
        shares = (
            make_return_status(clean_returns, cutoff)
            .value_counts(normalize=True)
            .reindex(class_order, fill_value=0.0)
        )
        squared_gap = ((shares - (1.0 / 3.0)) ** 2).sum()
        return {
            "percentile": float(pct),
            "threshold": float(cutoff),
            "share_down": float(shares["down"]),
            "share_same": float(shares["same"]),
            "share_up": float(shares["up"]),
            "imbalance_score": float(squared_gap),
        }

    grid_df = (
        pd.DataFrame([_score_candidate(p) for p in percentiles])
        .sort_values(["imbalance_score", "percentile"])
        .reset_index(drop=True)
    )
    return float(grid_df.loc[0, "threshold"]), grid_df


def time_series_splits_by_date(dates: pd.Series, n_splits: int = 5):
    """Yield expanding-window CV folds that are split on unique dates.

    `TimeSeriesSplit` is applied to the sorted unique dates rather than to the
    rows, so all rows sharing a date land on the same side of a fold.

    Yields
    ------
    (fold, tr_idx, te_idx)
        `fold` is a 1-based fold number; `tr_idx` / `te_idx` are positional row
        indices (as returned by `np.where`) into `dates`.
    """
    as_datetime = pd.to_datetime(dates)
    calendar = np.array(sorted(pd.Series(as_datetime.unique())))
    splitter = TimeSeriesSplit(n_splits=n_splits)

    def _row_positions(date_positions):
        """Positions of the rows whose date is among the selected calendar days."""
        selected_days = set(calendar[date_positions])
        return np.where(as_datetime.isin(selected_days))[0]

    fold_number = 0
    for train_days, test_days in splitter.split(calendar):
        fold_number += 1
        train_rows = _row_positions(train_days)
        test_rows = _row_positions(test_days)
        yield fold_number, train_rows, test_rows


def metric_frame(y_true, y_pred, split_name: str) -> pd.DataFrame:
    """Summarise classification quality for one data split as a one-row frame.

    The row holds the split name, accuracy, balanced accuracy, macro and
    weighted F1 (both with `zero_division=0`) and the number of samples.
    """
    f1_options = {"zero_division": 0}
    record = {
        "split": split_name,
        "accuracy": accuracy_score(y_true, y_pred),
        "balanced_accuracy": balanced_accuracy_score(y_true, y_pred),
        "f1_macro": f1_score(y_true, y_pred, average="macro", **f1_options),
        "f1_weighted": f1_score(y_true, y_pred, average="weighted", **f1_options),
        "n_samples": len(y_true),
    }
    return pd.DataFrame([record])


## 3) Build panel + left join sectors


In [4]:
panel = load_panel_feature_data(PANEL_PATH, NUMERIC_FEATURES)

# Normalise the sector lookup the same way the panel tickers are normalised
# (string, stripped) and keep one row per ticker. Without this, a duplicated
# ticker in stock_sectors.csv would silently duplicate panel rows in the left
# join, and padded tickers would fail to match.
sectors = (
    pd.read_csv(SECTOR_PATH)[["ticker", "sector"]]
    .dropna(subset=["ticker"])
    .assign(ticker=lambda s: s["ticker"].astype(str).str.strip())
    .drop_duplicates(subset="ticker", keep="first")
)

# validate="many_to_one" makes pandas verify that the right-hand keys are unique,
# so the join can never change the number of panel rows.
panel = panel.merge(sectors, on="ticker", how="left", validate="many_to_one")

# Restrict to the study window (both bounds inclusive) and order by date/ticker.
panel = panel[(panel["Date"] >= START_DATE) & (panel["Date"] <= END_DATE)].copy()
panel = panel.sort_values(["Date", "ticker"]).reset_index(drop=True)

print("Panel shape:", panel.shape)
print("Tickers:", panel["ticker"].nunique(), "| Sectors:", panel["sector"].nunique())
panel.head()


Panel shape: (125750, 47)
Tickers: 503 | Sectors: 11


Unnamed: 0,Date,Return,Return_Lag1,Return_Lag2,Return_Lag3,Return_Lag5,Return_Lag10,Return_Lag20,Sentiment_Lag1,Sentiment_Lag2,...,IPI_IPI_SA_YOY,IPI_IPI_SA_QOQ,UNEMP_UNRATE,UNEMP_UNRATE_PC1,UNEMP_UNRATE_PCH,UNEMP_UNRATENSA,UNEMP_UNRATENSA_PC1,UNEMP_UNRATENSA_PCH,ticker,sector
0,2024-11-01,0.050341,-0.008974,0.001981,-0.002357,-0.003826,0.010261,0.002906,-0.215,-0.333,...,-0.28793,-0.30036,4.3,2.38095,2.38095,4.5,2.27273,-2.17391,A,Healthcare
1,2024-11-01,-0.01328,-0.018209,-0.015278,0.001157,0.003643,0.012277,0.005007,-0.0011,0.0016,...,-0.28793,-0.30036,4.3,2.38095,2.38095,4.5,2.27273,-2.17391,AAPL,Technology
2,2024-11-01,-0.00157,0.011762,0.063605,-0.001213,-0.009491,0.001538,-0.005935,0.7737,0.0261,...,-0.28793,-0.30036,4.3,2.38095,2.38095,4.5,2.27273,-2.17391,ABBV,Healthcare
3,2024-11-01,0.01239,-0.012528,-0.00929,0.01473,0.013785,0.003161,0.036396,-0.0055,0.0148,...,-0.28793,-0.30036,4.3,2.38095,2.38095,4.5,2.27273,-2.17391,ABNB,Consumer Cyclical
4,2024-11-01,0.046132,-0.009436,0.009259,-0.005874,-0.019907,0.012724,0.003653,0.0557,-0.1955,...,-0.28793,-0.30036,4.3,2.38095,2.38095,4.5,2.27273,-2.17391,ABT,Healthcare


## 4) Time split


In [5]:
# Both windows are inclusive on each end (Series.between defaults to inclusive="both").
in_train_window = panel["Date"].between(TRAIN_START, TRAIN_END)
in_val_window = panel["Date"].between(VAL_START, VAL_END)

train_df = panel.loc[in_train_window].copy()
val_df = panel.loc[in_val_window].copy()

# Report the row count and the first/last date actually present in each split.
for _tag, _part in (("Train rows:", train_df), ("Val rows  :", val_df)):
    print(_tag, len(_part), "|", _part["Date"].min().date(), "to", _part["Date"].max().date())


Train rows: 103618 | 2024-11-01 to 2025-08-29
Val rows  : 22132 | 2025-09-02 to 2025-10-31


## 5) Symmetric threshold selection (train only)


In [6]:
# Candidate cut-offs: the 10th through 45th percentile of |Return| in steps of 1.
# Only training returns are passed in, so the validation window does not
# influence the chosen threshold.
percentile_grid = np.arange(10, 46)
best_thr, thr_grid = choose_balanced_symmetric_threshold(
    train_returns=train_df[RETURN_COL],
    percentiles=percentile_grid,
)
print(f"Selected threshold: ±{best_thr:.6f} ({best_thr*100:.3f}%)")
thr_grid.head(10)


Selected threshold: ±0.006007 (0.601%)


Unnamed: 0,percentile,threshold,share_down,share_same,share_up,imbalance_score
0,33.0,0.006007,0.31565,0.330001,0.35435,0.000765
1,34.0,0.006207,0.311056,0.339999,0.348945,0.000784
2,32.0,0.005809,0.320832,0.320002,0.359165,0.001001
3,35.0,0.006416,0.30598,0.349997,0.344023,0.00114
4,31.0,0.005598,0.32559,0.310004,0.364406,0.00157
5,36.0,0.006633,0.301154,0.360005,0.338841,0.001777
6,30.0,0.005391,0.330618,0.300006,0.369376,0.002417
7,37.0,0.006838,0.296194,0.370003,0.333803,0.002724
8,29.0,0.005195,0.335395,0.289998,0.374607,0.003586
9,38.0,0.00704,0.291474,0.380002,0.328524,0.003953


In [None]:
# thr_grid is ordered by imbalance score (best candidate first). A line plot
# joins points in row order, so sort by percentile first; otherwise the lines
# jump back and forth along the x-axis.
thr_plot = thr_grid.sort_values("percentile")

fig, ax = plt.subplots(figsize=(11, 5))
for _col, _label, _color in (
    ("share_down", "Down share", "#d62728"),
    ("share_same", "Same share", "#1f77b4"),
    ("share_up", "Up share", "#2ca02c"),
):
    ax.plot(thr_plot["percentile"], thr_plot[_col], label=_label, color=_color, linewidth=2)
ax.axhline(1/3, color="gray", linestyle="--", linewidth=1.2, label="Ideal 1/3")

# Row 0 of the score-sorted grid is the selected threshold; mark its percentile.
best_row = thr_grid.iloc[0]
ax.axvline(best_row["percentile"], color="black", linestyle=":", linewidth=1.8)

ax.set_title("Class Balance vs Symmetric Threshold (Train)", fontweight="bold")
ax.set_xlabel("Percentile of |Return|")
ax.set_ylabel("Class share")
ax.grid(alpha=0.25)
ax.legend()
plt.tight_layout()
plt.show()


## 6) Build target and inspect class balance


In [None]:
# Label every panel row with the threshold chosen on the training window.
panel[TARGET_COL] = make_return_status(panel[RETURN_COL], best_thr)

# Re-create both splits so that they carry the new target column.
train_df = panel.loc[panel["Date"].between(TRAIN_START, TRAIN_END)].copy()
val_df = panel.loc[panel["Date"].between(VAL_START, VAL_END)].copy()

# Class counts per split, always reported in the order down / same / up.
_class_order = ["down", "same", "up"]
train_counts, val_counts = (
    _frame[TARGET_COL].value_counts().reindex(_class_order, fill_value=0)
    for _frame in (train_df, val_df)
)

class_balance = pd.DataFrame(
    {
        "train_count": train_counts,
        "train_share": train_counts / train_counts.sum(),
        "val_count": val_counts,
        "val_share": val_counts / val_counts.sum(),
    }
)
display(class_balance)


## 7) Prepare model data and preprocessing pipeline


In [None]:
# Columns carried into the modelling frames. NOTE: RETURN_COL is not part of
# model_cols, so train_model / val_model do not contain the raw return.
_feature_cols = NUMERIC_FEATURES + CATEGORICAL_FEATURES
model_cols = ["Date", TARGET_COL] + _feature_cols

# Only rows without a label are dropped; missing features are imputed later.
train_model = train_df[model_cols].dropna(subset=[TARGET_COL]).copy()
val_model = val_df[model_cols].dropna(subset=[TARGET_COL]).copy()

X_train, y_train = train_model[_feature_cols].copy(), train_model[TARGET_COL].copy()
X_val, y_val = val_model[_feature_cols].copy(), val_model[TARGET_COL].copy()

# Numeric branch: median imputation followed by standardisation.
numeric_branch = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# Categorical branch: most-frequent imputation followed by one-hot encoding.
categorical_branch = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", _make_ohe()),
    ]
)

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_branch, NUMERIC_FEATURES),
        ("cat", categorical_branch, CATEGORICAL_FEATURES),
    ],
    remainder="drop",
)

print("Train:", X_train.shape, "| Validation:", X_val.shape)


## 8) TimeSeries CV (n_splits=5) to choose C


In [None]:
cv_rows = []

# Positional (0..n-1) views of the training data so the fold indices, which are
# row positions, can be used directly with .iloc.
X_train_ord = X_train.reset_index(drop=True)
y_train_ord = y_train.reset_index(drop=True)
dates_train_ord = train_model["Date"].reset_index(drop=True)

# Fold membership depends only on the dates, not on C, so build the folds once
# instead of regenerating them for every candidate C.
cv_folds = list(time_series_splits_by_date(dates_train_ord, n_splits=N_SPLITS))

for C in C_GRID:
    fold_acc = []
    fold_f1 = []

    for fold, tr_idx, te_idx in cv_folds:
        X_tr, X_te = X_train_ord.iloc[tr_idx], X_train_ord.iloc[te_idx]
        y_tr, y_te = y_train_ord.iloc[tr_idx], y_train_ord.iloc[te_idx]

        # NOTE(review): recent scikit-learn releases deprecate `multi_class`;
        # confirm against the installed version before upgrading.
        clf = LogisticRegression(
            penalty="l1",
            C=float(C),
            solver="saga",
            multi_class="multinomial",
            max_iter=MAX_ITER,
            n_jobs=-1,
            random_state=RANDOM_STATE,
        )
        # Preprocessing lives inside the pipeline, so imputation/scaling are
        # re-fitted on each fold's training rows only.
        pipe = Pipeline(steps=[("preprocess", preprocess), ("model", clf)])

        pipe.fit(X_tr, y_tr)
        pred = pipe.predict(X_te)

        acc = accuracy_score(y_te, pred)
        f1m = f1_score(y_te, pred, average="macro", zero_division=0)

        fold_acc.append(acc)
        fold_f1.append(f1m)
        cv_rows.append({"C": float(C), "fold": fold, "accuracy": acc, "f1_macro": f1m})

    print(f"C={C:8.5f} | mean_acc={np.mean(fold_acc):.4f} | mean_f1_macro={np.mean(fold_f1):.4f}")

# One row per (C, fold), then aggregated per C and ranked by mean macro-F1
# (mean accuracy breaks ties); the best candidate ends up in row 0.
cv_detail = pd.DataFrame(cv_rows)
cv_summary = (
    cv_detail.groupby("C", as_index=False)
    .agg(
        mean_accuracy=("accuracy", "mean"),
        std_accuracy=("accuracy", "std"),
        mean_f1_macro=("f1_macro", "mean"),
        std_f1_macro=("f1_macro", "std"),
    )
    .sort_values(["mean_f1_macro", "mean_accuracy"], ascending=False)
    .reset_index(drop=True)
)

best_C = float(cv_summary.loc[0, "C"])
print("\nSelected best C:", best_C)
display(cv_summary)


In [None]:
# cv_summary is ranked by score (best first). Line plots connect points in row
# order, so sort by C before plotting to get a left-to-right curve.
cv_plot = cv_summary.sort_values("C")

fig, ax = plt.subplots(figsize=(9, 4.8))
ax.plot(cv_plot["C"], cv_plot["mean_f1_macro"], marker="o", linewidth=2, label="Mean CV Macro-F1")
ax.plot(cv_plot["C"], cv_plot["mean_accuracy"], marker="s", linewidth=2, label="Mean CV Accuracy")
ax.set_xscale("log")
ax.axvline(best_C, color="black", linestyle=":", linewidth=1.5, label=f"Selected C={best_C:.4g}")
ax.set_title("TimeSeries CV by C", fontweight="bold")
ax.set_xlabel("C (log scale)")
ax.set_ylabel("Score")
ax.grid(alpha=0.25)
ax.legend()
plt.tight_layout()
plt.show()


## 9) Final model fit and evaluation


In [None]:
# Same estimator settings as in cross-validation, now with the selected C.
_final_clf = LogisticRegression(
    penalty="l1",
    C=best_C,
    solver="saga",
    multi_class="multinomial",
    max_iter=MAX_ITER,
    n_jobs=-1,
    random_state=RANDOM_STATE,
)
final_model = Pipeline(steps=[("preprocess", preprocess), ("model", _final_clf)])

# Fit on the full training window; the validation window is only predicted.
final_model.fit(X_train, y_train)

pred_train = final_model.predict(X_train)
pred_val = final_model.predict(X_val)

# One summary row per split, stacked into a single table.
metrics_train = metric_frame(y_train, pred_train, "train")
metrics_val = metric_frame(y_val, pred_val, "validation")
metrics_all = pd.concat([metrics_train, metrics_val], ignore_index=True)
display(metrics_all)

# Per-class precision / recall / F1 as plain text (also written to disk later).
_report_kwargs = {"digits": 4, "zero_division": 0}
report_train = classification_report(y_train, pred_train, **_report_kwargs)
report_val = classification_report(y_val, pred_val, **_report_kwargs)

for _header, _report in (
    ("TRAIN report\n", report_train),
    ("\nVALIDATION report\n", report_val),
):
    print(_header)
    print(_report)


In [None]:
labels = ["down", "same", "up"]
cm = confusion_matrix(y_val, pred_val, labels=labels)
# Row-normalised version: each row (actual class) sums to 1.
cm_norm = cm / cm.sum(axis=1, keepdims=True)

fig, axes = plt.subplots(1, 2, figsize=(12, 4.8))
_ticks = range(len(labels))

# (axis, matrix, title, imshow keyword arguments, cell text format)
_panels = [
    (axes[0], cm, "Validation CM (Counts)", {"cmap": "Blues"}, "{}"),
    (axes[1], cm_norm, "Validation CM (Row %)", {"cmap": "Greens", "vmin": 0, "vmax": 1}, "{:.2%}"),
]

_images = []
for _ax, _matrix, _title, _imshow_kwargs, _cell_fmt in _panels:
    _images.append(_ax.imshow(_matrix, **_imshow_kwargs))
    _ax.set_title(_title, fontweight="bold")
    _ax.set_xticks(_ticks)
    _ax.set_yticks(_ticks)
    _ax.set_xticklabels(labels)
    _ax.set_yticklabels(labels)
    _ax.set_xlabel("Predicted")
    _ax.set_ylabel("Actual")
    # Annotate every cell; rows are actual classes (y), columns predictions (x).
    for (_i, _j), _value in np.ndenumerate(_matrix):
        _ax.text(_j, _i, _cell_fmt.format(_value), ha="center", va="center", color="black")

im0, im1 = _images
fig.colorbar(im0, ax=axes[0], fraction=0.046, pad=0.04)
fig.colorbar(im1, ax=axes[1], fraction=0.046, pad=0.04)
plt.tight_layout()
plt.show()


## 10) LASSO drivers and sector diagnostics


In [None]:
preprocess_fitted = final_model.named_steps["preprocess"]
clf_fitted = final_model.named_steps["model"]

feature_names_out = preprocess_fitted.get_feature_names_out()
class_names = list(clf_fitted.classes_)

# One row per transformed feature, one coefficient column per class.
coef_df = pd.DataFrame(
    clf_fitted.coef_.T,
    index=feature_names_out,
    columns=[f"coef_{c}" for c in class_names],
)
# Rank features by their largest absolute coefficient across the classes.
coef_df["max_abs_coef"] = coef_df.abs().max(axis=1)
coef_df = coef_df.sort_values("max_abs_coef", ascending=False)

display(coef_df.head(25))

top_n = 20
# Reverse so the largest coefficient ends up at the top of the horizontal bars.
plot_df = coef_df.head(top_n).iloc[::-1]
fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(plot_df.index, plot_df["max_abs_coef"], color="#4e79a7")
ax.set_title(f"Top {top_n} Drivers by |Coefficient|", fontweight="bold")
ax.set_xlabel("Max absolute coefficient")
ax.grid(axis="x", alpha=0.2)
plt.tight_layout()
plt.show()

proba_val = final_model.predict_proba(X_val)
proba_cols = [f"p_{c}" for c in class_names]

# val_model only holds model_cols, which do not include RETURN_COL, so selecting
# the return from it raises KeyError. Take the identifying columns from val_df
# instead, aligned on val_model's index so the rows match X_val / pred_val
# one-to-one.
pred_val_df = (
    val_df.loc[val_model.index, ["Date", "ticker", "sector", RETURN_COL, TARGET_COL]]
    .copy()
    .reset_index(drop=True)
)
pred_val_df["pred_status"] = pred_val
pred_val_df = pd.concat([pred_val_df, pd.DataFrame(proba_val, columns=proba_cols)], axis=1)

# Validation accuracy per sector: mean of a per-row "prediction correct" flag.
# dropna=False keeps rows whose sector is missing as their own group.
sector_acc = (
    pred_val_df.assign(_correct=pred_val_df[TARGET_COL] == pred_val_df["pred_status"])
    .groupby("sector", dropna=False)["_correct"]
    .mean()
    .reset_index(name="validation_accuracy")
    .sort_values("validation_accuracy", ascending=False)
)
display(sector_acc)


## 11) Save all outputs to `MultinomialLogit/`


In [None]:
# Tabular outputs: (file name, frame, whether to write the index).
# Only the coefficient table has a meaningful index (the feature names).
_csv_outputs = [
    ("threshold_grid_train_percentiles.csv", thr_grid, False),
    ("cv_detail_by_fold.csv", cv_detail, False),
    ("cv_summary_by_C.csv", cv_summary, False),
    ("metrics_train_validation.csv", metrics_all, False),
    ("lasso_coefficients_full.csv", coef_df, True),
    ("validation_predictions_with_probabilities.csv", pred_val_df, False),
    ("validation_accuracy_by_sector.csv", sector_acc, False),
]
for _file_name, _frame, _keep_index in _csv_outputs:
    _frame.to_csv(OUTPUT_DIR / _file_name, index=_keep_index)

# Confusion matrices are recomputed here so this cell does not depend on the
# plotting cell having been run.
labels = ["down", "same", "up"]
_row_names = [f"actual_{l}" for l in labels]
_col_names = [f"pred_{l}" for l in labels]

cm = confusion_matrix(y_val, pred_val, labels=labels)
cm_df = pd.DataFrame(cm, index=_row_names, columns=_col_names)
cm_df.to_csv(OUTPUT_DIR / "validation_confusion_matrix_counts.csv", index=True)

cm_norm = cm / cm.sum(axis=1, keepdims=True)
cm_norm_df = pd.DataFrame(cm_norm, index=_row_names, columns=_col_names)
cm_norm_df.to_csv(OUTPUT_DIR / "validation_confusion_matrix_rowpct.csv", index=True)

# Plain-text classification reports.
for _file_name, _report in (
    ("classification_report_train.txt", report_train),
    ("classification_report_validation.txt", report_val),
):
    (OUTPUT_DIR / _file_name).write_text(_report, encoding="utf-8")

# Human-readable run summary: configuration, selected hyper-parameters, metrics.
summary_lines = [
    "Multinomial Logit (LASSO) Panel Run Summary",
    f"Run tag: {RUN_TAG}",
    f"Date window: {START_DATE.date()} to {END_DATE.date()}",
    f"Train window: {TRAIN_START.date()} to {TRAIN_END.date()}",
    f"Validation window: {VAL_START.date()} to {VAL_END.date()}",
    f"Selected threshold: ±{best_thr:.6f} ({best_thr*100:.3f}%)",
    f"Selected C: {best_C}",
    "",
    metrics_all.to_string(index=False),
]
(OUTPUT_DIR / "run_summary.txt").write_text("\n".join(summary_lines), encoding="utf-8")

print("Saved outputs to:", OUTPUT_DIR)
for p in sorted(OUTPUT_DIR.glob("*")):
    print(" -", p.name)


## Notes
- This is a panel model with ticker + sector fixed effects (one-hot encoded).
- Threshold is selected only from train data to avoid look-ahead bias.
- Validation is strictly out-of-sample (last 2 months).


## 12) Load Existing Run Results (No Retraining)

Use this section to inspect previously generated output folders (for example `smoke_test_10_tickers_*`, `smoke_test_50_tickers_*`, or full `run_*`) without re-running model training.


In [None]:
# This section is meant to run without the training cells, so it carries its
# own imports.
import re
from datetime import datetime
from pathlib import Path
import pandas as pd

# Reuse OUTPUT_BASE when the configuration cell was run; otherwise look for the
# output folder next to, or one level above, the current working directory.
base = OUTPUT_BASE if 'OUTPUT_BASE' in globals() else Path.cwd() / 'MultinomialLogit'
if not base.exists():
    base = Path.cwd().parent / 'MultinomialLogit'
if not base.exists():
    raise FileNotFoundError(f'Output folder not found: {base}')

folders = sorted(
    p for p in base.iterdir()
    if p.is_dir() and p.name.startswith(('run_', 'smoke_test_'))
)
if not folders:
    raise FileNotFoundError(f'No run folders found in {base}')

for i, p in enumerate(folders, start=1):
    print(f'{i:02d}. {p.name}')


def _run_stamp(path):
    """Sortable YYYYMMDD_HHMMSS stamp for a run folder.

    Uses the timestamp at the end of the folder name when there is one (full
    runs are named run_<YYYYMMDD_HHMMSS>); otherwise falls back to the
    folder's modification time formatted the same way.
    """
    match = re.search(r'(\d{8}_\d{6})$', path.name)
    if match:
        return match.group(1)
    return datetime.fromtimestamp(path.stat().st_mtime).strftime('%Y%m%d_%H%M%S')


# Pick the latest by default; change manually (e.g. folders[0]) if needed.
# A plain name sort would always rank smoke_test_* after run_*, regardless of
# when the run happened, so "latest" is decided by timestamp instead.
selected_run = max(folders, key=_run_stamp)
print('\nSelected run:', selected_run)


In [None]:
def _safe_read_csv(path):
    """Read the CSV at `path`, or return None when the file does not exist."""
    if not path.exists():
        return None
    return pd.read_csv(path)


# Output files that may exist in a run folder; missing ones are skipped silently.
files_to_check = [
    'metrics_train_testing_validation.csv',
    'metrics_train_validation.csv',
    'cv_summary_by_C.csv',
    'threshold_grid_train_percentiles.csv',
    'testing_tscv_fold_metrics.csv',
    'validation_accuracy_by_sector.csv',
]

loaded = {}
for fn in files_to_check:
    df = _safe_read_csv(selected_run / fn)
    if df is None:
        continue
    loaded[fn] = df
    print(f'Loaded: {fn} | shape={df.shape}')

# Show one metrics table: the three-way file when present, else the two-way file.
for _metrics_name in ('metrics_train_testing_validation.csv', 'metrics_train_validation.csv'):
    if _metrics_name in loaded:
        display(loaded[_metrics_name])
        break

if 'cv_summary_by_C.csv' in loaded:
    display(loaded['cv_summary_by_C.csv'])

if 'threshold_grid_train_percentiles.csv' in loaded:
    display(loaded['threshold_grid_train_percentiles.csv'].head(10))
