In [5]:
# ============================================================
# Fama–French 5 Factors (Monthly) — FINAL CLEAN LOADER
# ============================================================

import pandas as pd
from pathlib import Path

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
# Input: directory holding the raw download, and the factor file inside it.
RAW_PATH = Path("/PC/data/raw")
FILE = RAW_PATH.joinpath("F-F_Research_Data_5_Factors_2x3.csv")

# Output: directory for cleaned datasets; created (with parents) if absent.
OUT_PATH = Path("/PC/data/processed")
OUT_PATH.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Load raw CSV (expects a header row that includes a "Date" column)
# ------------------------------------------------------------
ff5 = pd.read_csv(FILE)

# ------------------------------------------------------------
# Keep only monthly rows (YYYYMM)
# ------------------------------------------------------------
# Normalise the date key to a stripped string once, so that whitespace-padded
# keys (e.g. " 200701") are recognised instead of being silently filtered out,
# and so the exact same text is used for both filtering and parsing.
date_key = ff5["Date"].astype(str).str.strip()
is_monthly = date_key.str.fullmatch(r"\d{6}")

# Fail loudly rather than carrying an empty frame forward: every later step
# (index min/max, CSV export) would otherwise run on no data.
if not is_monthly.any():
    raise ValueError(f"No monthly (YYYYMM) rows found in {FILE}")

ff5 = ff5.loc[is_monthly].copy()

# ------------------------------------------------------------
# Parse dates: YYYYMM → month-end
# ------------------------------------------------------------
# "%Y%m" parses to the first day of the month; MonthEnd(0) rolls that forward
# to the last calendar day of the same month.
ff5["Date"] = (
    pd.to_datetime(date_key[is_monthly], format="%Y%m")
    + pd.offsets.MonthEnd(0)
)
ff5 = ff5.set_index("Date").sort_index()

# ------------------------------------------------------------
# Coerce factor columns to numeric (unparseable cells become NaN)
# ------------------------------------------------------------
factor_cols = ["Mkt-RF", "SMB", "HML", "RMW", "CMA", "RF"]

# Coerce all factor columns in one pass; cells that cannot be parsed as
# numbers become NaN instead of raising.
factors_numeric = ff5[factor_cols].apply(pd.to_numeric, errors="coerce")

# ------------------------------------------------------------
# Convert percent → decimal
# ------------------------------------------------------------
ff5[factor_cols] = factors_numeric / 100.0

# ------------------------------------------------------------
# Rename columns (clean, consistent)
# ------------------------------------------------------------
# Only "Mkt-RF" actually changes name; the identity entries are kept so the
# complete output column set is visible in one place.
column_labels = {
    "Mkt-RF": "MKT",
    "SMB": "SMB",
    "HML": "HML",
    "RMW": "RMW",
    "CMA": "CMA",
    "RF": "RF",
}
ff5 = ff5.rename(columns=column_labels)

# ------------------------------------------------------------
# Trim sample (start 2007-01)
# ------------------------------------------------------------
# Label-based slice on the sorted month-end index; the start bound is inclusive.
sample_start = pd.Timestamp("2007-01-31")
ff5 = ff5.loc[sample_start:]

# ------------------------------------------------------------
# Final sanity checks
# ------------------------------------------------------------
# Headline facts about the trimmed sample: first date, last date, row count.
for label, value in (
    ("FF5 start:", ff5.index.min()),
    ("FF5 end  :", ff5.index.max()),
    ("Rows     :", len(ff5)),
):
    print(label, value)

# Column dtypes followed by the first and last rows, each preceded by a
# blank line.
for section in (ff5.dtypes, ff5.head(), ff5.tail()):
    print()
    print(section)

# ------------------------------------------------------------
# Save canonical clean dataset
# ------------------------------------------------------------
# Written with the Date index as the first column; the backtest cell reads
# this file back using "Date" as its index column.
clean_csv_path = OUT_PATH.joinpath("ff5_monthly_clean.csv")
ff5.to_csv(clean_csv_path)


FF5 start: 2007-01-31 00:00:00
FF5 end  : 2025-11-30 00:00:00
Rows     : 227

MKT    float64
SMB    float64
HML    float64
RMW    float64
CMA    float64
RF     float64
dtype: object

               MKT     SMB     HML     RMW     CMA      RF
Date                                                      
2007-01-31  0.0138  0.0002 -0.0068  0.0019  0.0040  0.0044
2007-02-28 -0.0196  0.0133 -0.0008 -0.0052 -0.0066  0.0038
2007-03-31  0.0071  0.0014 -0.0088  0.0055 -0.0067  0.0043
2007-04-30  0.0349 -0.0208 -0.0144  0.0118  0.0101  0.0044
2007-05-31  0.0324  0.0033 -0.0059  0.0147 -0.0126  0.0041

               MKT     SMB     HML     RMW     CMA      RF
Date                                                      
2025-07-31  0.0198 -0.0015 -0.0127 -0.0029 -0.0207  0.0034
2025-08-31  0.0185  0.0488  0.0442 -0.0067  0.0208  0.0038
2025-09-30  0.0339 -0.0218 -0.0105 -0.0203 -0.0222  0.0033
2025-10-31  0.0196 -0.0131 -0.0309 -0.0522 -0.0403  0.0037
2025-11-30 -0.0013  0.0147  0.0376  0.0143  0.006

In [9]:
# ============================================================
# FF5 Factor Backtesting — FINAL (PANDAS-SAFE, FORMATTED)
# ============================================================

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from pathlib import Path

# ------------------------------------------------------------
# Configuration
# ------------------------------------------------------------
# Number of consecutive rows used for each rolling regression.
ROLLING_WINDOW = 36
# Starting value that the cumulative-return (P&L) series are scaled by.
INITIAL_CAPITAL = 1000
# Periods per year; volatility and Sharpe are scaled by its square root.
ANNUALIZATION = 12

# Column used as the regressor / benchmark, and the columns analysed against it.
MARKET = "MKT"
FACTORS = ["SMB", "HML", "RMW", "CMA"]

# Line colour for each series, keyed by column name.
COLORS = dict([
    ("MKT", "#1f77b4"),
    ("SMB", "#ff7f0e"),
    ("HML", "#2ca02c"),
    ("RMW", "#d62728"),
    ("CMA", "#9467bd"),
])

# ------------------------------------------------------------
# Paths
# ------------------------------------------------------------
BASE = Path("/PC/data/processed/Factor Backtesting")

FIGURES = BASE / "figures"
TABLES = BASE / "tables"

# One sub-directory per artefact family; all are created up front so the
# save calls further down never hit a missing directory.
figure_dirs = [
    FIGURES / sub
    for sub in ("pnl", "rolling_beta", "rolling_alpha", "correlations")
]
table_dirs = [TABLES / sub for sub in ("pnl", "regressions", "summary")]

for p in figure_dirs + table_dirs:
    p.mkdir(parents=True, exist_ok=True)

# ------------------------------------------------------------
# Styling
# ------------------------------------------------------------
# Seaborn "white" theme first, then the font sizes on top of it.
sns.set_theme(style="white")
plt.rcParams["font.size"] = 10
plt.rcParams["axes.titlesize"] = 11
plt.rcParams["axes.labelsize"] = 10

def clean_axes(ax):
    """Turn off the grid and hide the top and right spines of ``ax``.

    ``ax`` is modified in place; nothing is returned.
    """
    ax.grid(False)
    for side in ("top", "right"):
        ax.spines[side].set_visible(False)

# ------------------------------------------------------------
# Load data
# ------------------------------------------------------------
clean_csv = "/PC/data/processed/ff5_monthly_clean.csv"
df = pd.read_csv(clean_csv, index_col="Date", parse_dates=["Date"])

# ------------------------------------------------------------
# Build P&L series
# ------------------------------------------------------------
# Compound each column's per-period returns and scale by the starting
# capital, giving one value path per series.
gross_returns = df[[MARKET, *FACTORS]] + 1
pnl = gross_returns.cumprod() * INITIAL_CAPITAL

# ------------------------------------------------------------
# Summary performance table (FORMATTED)
# ------------------------------------------------------------
summary_rows = []
annual_scale = np.sqrt(ANNUALIZATION)

for name in [MARKET, *FACTORS]:
    returns = df[name]
    equity = pnl[name]
    period_sd = returns.std()

    ann_vol = period_sd * annual_scale
    ann_sharpe = (returns.mean() / period_sd) * annual_scale
    # Largest peak-to-trough loss: lowest ratio of value to its running maximum.
    worst_drawdown = (equity / equity.cummax() - 1).min()

    # Percent signs are pre-escaped for LaTeX because the table is written
    # with escape=False.
    summary_rows.append([
        name,
        int(round(equity.iloc[-1], 0)),
        f"{ann_vol * 100:.2f}\\%",
        f"{ann_sharpe:.2f}",
        f"{worst_drawdown * 100:.2f}\\%",
    ])

summary_columns = ["Factor", "Final Value", "Volatility", "Sharpe", "Max Drawdown"]
summary = pd.DataFrame(summary_rows, columns=summary_columns)

summary_tex = TABLES / "summary/performance_summary.tex"
summary.to_latex(summary_tex, index=False, escape=False)

# ------------------------------------------------------------
# Summary regression diagnostics table
# ------------------------------------------------------------
# The design matrix (intercept + market column) is the same for every factor,
# so it is built once outside the loop.
market_design = sm.add_constant(df[MARKET])

reg_summary_rows = []
for name in FACTORS:
    fit = sm.OLS(df[name], market_design).fit()
    coef, tstat = fit.params, fit.tvalues

    reg_summary_rows.append([
        name,
        coef["const"],
        tstat["const"],
        coef[MARKET],
        tstat[MARKET],
        fit.rsquared,
        int(fit.nobs),
    ])

reg_columns = ["Factor", "Alpha", "Alpha t-stat", "Beta", "Beta t-stat", "R2", "N"]
reg_summary = pd.DataFrame(reg_summary_rows, columns=reg_columns)

reg_summary_tex = TABLES / "summary/regression_summary.tex"
reg_summary.to_latex(reg_summary_tex, index=False, float_format="%.4f")

# ------------------------------------------------------------
# Per-factor analysis
# ------------------------------------------------------------
# For every factor: save a P&L figure and table, rolling beta/alpha figures
# from market regressions, and a full-sample regression table.
for f in FACTORS:

    # =======================
    # P&L Plot
    # =======================
    final_value = pnl[f].iloc[-1]

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.plot(pnl[f], color=COLORS[f], lw=2, label=f)
    ax.plot(pnl[MARKET], color=COLORS[MARKET], ls="--", alpha=0.6, label="Market")
    # Title is derived from the configured starting capital so it cannot
    # drift out of sync with INITIAL_CAPITAL.
    ax.set_title(f"{f} — P&L (${INITIAL_CAPITAL} base)")
    # Label the end of the line with the final portfolio value.
    ax.annotate(
        f"${final_value:.0f}",
        xy=(pnl.index[-1], final_value),
        xytext=(6, 0),
        textcoords="offset points",
        color="green",
        va="center"
    )
    ax.legend()
    clean_axes(ax)
    fig.tight_layout()
    fig.savefig(FIGURES / "pnl" / f"pnl_{f}.png", dpi=300)
    # Close by handle so figures do not accumulate across loop iterations.
    plt.close(fig)

    # =======================
    # P&L Table (FORMATTED)
    # =======================
    period_sd = df[f].std()
    annual_scale = np.sqrt(ANNUALIZATION)

    # Percent signs are pre-escaped for LaTeX because escape=False is used.
    pnl_table = pd.DataFrame({
        "Metric": ["Final Value", "Volatility", "Sharpe", "Max Drawdown"],
        "Value": [
            int(round(final_value, 0)),
            f"{period_sd * annual_scale * 100:.2f}\\%",
            f"{(df[f].mean() / period_sd) * annual_scale:.2f}",
            f"{(pnl[f] / pnl[f].cummax() - 1).min() * 100:.2f}\\%"
        ]
    })

    pnl_table.to_latex(
        TABLES / "pnl" / f"pnl_{f}.tex",
        index=False,
        escape=False
    )

    # =======================
    # Rolling Beta & Alpha
    # =======================
    betas, alphas, dates = [], [], []

    # `i` is the exclusive end of each window, so it must run up to and
    # including len(df) for the final window to contain the latest
    # observation. Each estimate is dated by the last row inside its window
    # (i - 1), not by the row that follows it.
    for i in range(ROLLING_WINDOW, len(df) + 1):
        window = slice(i - ROLLING_WINDOW, i)
        y = df[f].iloc[window]
        X = sm.add_constant(df[MARKET].iloc[window])
        res = sm.OLS(y, X).fit()

        betas.append(res.params[MARKET])
        alphas.append(res.params["const"])
        dates.append(df.index[i - 1])

    betas = pd.Series(betas, index=dates)
    alphas = pd.Series(alphas, index=dates)

    for series, name in [(betas, "beta"), (alphas, "alpha")]:
        fig, ax = plt.subplots(figsize=(8, 4))
        ax.plot(series, color=COLORS[f], lw=2)
        # Window length in the title comes from the config constant.
        ax.set_title(f"{f} — Rolling {name.capitalize()} ({ROLLING_WINDOW}m)")
        # Annotate the most recent estimate ("b = …" / "a = …").
        ax.annotate(
            f"{name[0]} = {series.iloc[-1]:.3f}",
            xy=(series.index[-1], series.iloc[-1]),
            xytext=(6, 0),
            textcoords="offset points",
            color="green",
            va="center"
        )
        clean_axes(ax)
        fig.tight_layout()
        fig.savefig(FIGURES / f"rolling_{name}" / f"{f}_rolling_{name}.png", dpi=300)
        plt.close(fig)

    # =======================
    # Regression diagnostics (single-factor)
    # =======================
    # Full-sample OLS of the factor on an intercept and the market column.
    X = sm.add_constant(df[MARKET])
    model = sm.OLS(df[f], X).fit()

    reg = pd.DataFrame({
        "Coefficient": model.params,
        "t-stat": model.tvalues,
        "p-value": model.pvalues
    })

    # Model-level statistics are repeated on every coefficient row.
    reg["R2"] = model.rsquared
    reg["N"] = int(model.nobs)

    reg.to_latex(
        TABLES / "regressions" / f"regression_{f}.tex",
        float_format="%.4f"
    )

# ------------------------------------------------------------
# Correlation matrix
# ------------------------------------------------------------
# Pairwise correlations between the market column and the factor columns.
corr = df[[MARKET, *FACTORS]].corr()

fig, ax = plt.subplots(figsize=(6, 5))
sns.heatmap(data=corr, ax=ax, cmap="coolwarm", annot=True, fmt=".2f")
ax.xaxis.tick_bottom()
clean_axes(ax)
fig.tight_layout()
fig.savefig(FIGURES / "correlations/correlation_matrix.png", dpi=300)
plt.close(fig)

print("✔ Fully fixed factor backtesting complete.")


✔ Fully fixed factor backtesting complete.
