In [None]:
# final_pipeline.ipynb
# Basic imports + plotting style

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path

from hmm_utils import (
    baum_welch_train,
    viterbi_decode,
    count_num_params,
    compute_aic_bic,
)

# Notebook-wide plot defaults: slightly larger figures with a background grid.
plt.rcParams.update(
    {
        "figure.figsize": (10, 4),
        "axes.grid": True,
    }
)

# Input workbook, relative to the notebook's working directory -- adjust to your file name.
DATA_PATH = Path("data", "clean", "hpi_po_summary_cleaned.xlsx")


In [None]:
# Load FHFA HPI and build a clean time series with log returns.

def load_and_preprocess_hpi(filepath):
    """
    Load an HPI workbook and turn it into a time series of log-returns.

    The file is read with ``pd.read_excel`` and is currently assumed to hold
    exactly one series; filtering down to the national, seasonally-adjusted,
    purchase-only index is still a TODO (see the placeholder below).

    Parameters
    ----------
    filepath : str or path-like
        Location of the Excel workbook, passed straight to ``pd.read_excel``.

    Returns
    -------
    pandas.DataFrame
        A copy of the input, ordered by "date" when that column exists and by
        the existing index otherwise, with two added columns:
        "log_hpi" (natural log of "hpi") and "ret" (first difference of
        "log_hpi"). Rows with a NaN "ret" -- always the first row, plus any
        row touching a missing "hpi" -- are dropped and the index is reset.

    Raises
    ------
    ValueError
        If the frame has no "hpi" column, or if any "hpi" value is zero or
        negative (its log-return would be infinite or undefined).
    """
    df_raw = pd.read_excel(filepath)

    # TODO: adapt these column names / filters to the actual FHFA schema.
    # Example placeholder:
    # df = df_raw[df_raw["Level"] == "National"]
    # df = df[df["Seasonally_Adjusted"] == "Yes"]
    # df = df[df["Index_Type"] == "Purchase-Only"]

    # For now, assume the file is already just one series. Work on a copy so
    # the frame returned by read_excel is never modified.
    df = df_raw.copy()

    # Put the rows in chronological order before differencing.
    if "date" in df.columns:
        df["date"] = pd.to_datetime(df["date"])
        df = df.sort_values("date")
    else:
        # No date column: the existing index is treated as the time order.
        # Note that no "date" column is created in this case.
        df = df.sort_index()

    # The HPI level must live in a column called "hpi".
    if "hpi" not in df.columns:
        raise ValueError("Please rename your HPI level column to 'hpi' or update this function.")

    # np.log would quietly turn zero / negative levels into -inf / NaN, and the
    # resulting +-inf returns survive dropna() and poison the EM fit downstream.
    # Fail loudly instead. (NaN levels compare False here and are handled by
    # the dropna below, exactly as before.)
    non_positive = df["hpi"] <= 0
    if non_positive.any():
        raise ValueError(
            f"'hpi' must be strictly positive to compute log-returns; "
            f"found {int(non_positive.sum())} non-positive value(s)."
        )

    # Log-returns: difference of consecutive log levels.
    df["log_hpi"] = np.log(df["hpi"])
    df["ret"] = df["log_hpi"].diff()

    # The first row has no predecessor, so its return is NaN; drop it.
    df = df.dropna(subset=["ret"]).reset_index(drop=True)

    return df


# Run the loader on the configured workbook; `df` is the frame the later
# plotting and training cells read from.
df = load_and_preprocess_hpi(DATA_PATH)
# Bare last expression so the notebook displays the first rows as cell output.
df.head()


In [None]:
# Quick sanity check plots: HPI level on top, its log-returns underneath.

# The loader accepts files without a "date" column (it then orders rows by
# index), so fall back to the row index as the x-axis instead of raising a
# KeyError on df["date"].
time_axis = df["date"] if "date" in df.columns else df.index

fig, axes = plt.subplots(2, 1, figsize=(10, 6), sharex=True)

axes[0].plot(time_axis, df["hpi"])
axes[0].set_title("FHFA HPI Level (National, example)")
axes[0].set_ylabel("Index level")

axes[1].plot(time_axis, df["ret"])
axes[1].set_title("Log Returns of HPI")
axes[1].set_ylabel("log-return")
axes[1].set_xlabel("Time")

plt.tight_layout()
plt.show()

In [None]:
# Helper to train an HMM for a given K and collect metrics.

def train_and_evaluate_hmm_for_k(observations, K, max_iters=100, n_restarts=3, seed=0):
    """
    Fit a K-state HMM to a 1D observation sequence and bundle the diagnostics.

    Steps, all delegated to the imported hmm_utils helpers:
      1. baum_welch_train  -> fitted parameters + log-likelihood trace
      2. count_num_params / compute_aic_bic -> AIC and BIC, using the last
         entry of the trace as the model's log-likelihood
      3. viterbi_decode    -> state path under the fitted parameters

    Parameters
    ----------
    observations : sequence
        The observation series; its length is used as the sample size.
    K : int
        Number of hidden states.
    max_iters, n_restarts, seed
        Forwarded unchanged to baum_welch_train (tolerance is fixed at 1e-6).

    Returns
    -------
    dict
        Keys: "K", "params", "loglik_trace", "final_loglik", "AIC", "BIC",
        "viterbi_path".
    """
    n_obs = len(observations)

    training_options = dict(
        max_iters=max_iters,
        tol=1e-6,
        n_restarts=n_restarts,
        seed=seed,
    )
    fitted_params, ll_history = baum_welch_train(observations, K, **training_options)

    # The last trace entry is taken as the fitted model's log-likelihood.
    last_ll = ll_history[-1]
    aic_value, bic_value = compute_aic_bic(last_ll, count_num_params(K), n_obs)

    decoded_states = viterbi_decode(observations, fitted_params)

    return dict(
        K=K,
        params=fitted_params,
        loglik_trace=ll_history,
        final_loglik=last_ll,
        AIC=aic_value,
        BIC=bic_value,
        viterbi_path=decoded_states,
    )

In [None]:
# Fit one HMM per candidate state count (K = 2, 3, 4), keeping each result keyed by K.

observations = df["ret"].values

# Training settings shared by every K.
shared_fit_settings = dict(max_iters=100, n_restarts=3, seed=42)

results_by_k = {}
for K in (2, 3, 4):
    print(f"Training HMM with K = {K}...")
    results_by_k[K] = res = train_and_evaluate_hmm_for_k(observations, K, **shared_fit_settings)
    print(f"  final log-likelihood: {res['final_loglik']:.2f}, BIC: {res['BIC']:.2f}")

In [None]:
# EM sanity check: log-likelihood per iteration, one curve per K.

convergence_ax = plt.figure().add_subplot()
for n_states, fit in results_by_k.items():
    convergence_ax.plot(fit["loglik_trace"], label=f"K={n_states}")

convergence_ax.set_xlabel("EM iteration")
convergence_ax.set_ylabel("log-likelihood")
convergence_ax.set_title("EM convergence for different K")
convergence_ax.legend()
plt.show()

In [None]:
# Side-by-side comparison of the fitted models: one row per K.

# Scalar metrics copied from each result dict into the table.
metric_keys = ("final_loglik", "AIC", "BIC")

rows = [
    {"K": n_states, **{key: fit[key] for key in metric_keys}}
    for n_states, fit in results_by_k.items()
]

df_k_compare = pd.DataFrame(rows).sort_values("K")
df_k_compare

In [None]:
# Plot the Viterbi path for K=3 as colored background bands behind the returns.

K_target = 3
res_k3 = results_by_k[K_target]
path = res_k3["viterbi_path"]

# The bands are indexed by row position, so the decoded path must line up
# one-to-one with the rows of df.
if len(path) != len(df):
    raise ValueError(
        f"Viterbi path has {len(path)} entries but df has {len(df)} rows; "
        "re-run the training cell on the current data."
    )

# State index -> color. The palette is cycled below so that raising K_target
# above 4 cannot index past the end of the list.
state_colors = ["tab:blue", "tab:orange", "tab:green", "tab:red"]

# The loader accepts files without a "date" column; use the row index as the
# time axis in that case instead of failing on df["date"].
has_dates = "date" in df.columns
x_values = df["date"] if has_dates else df.index
time_axis = list(x_values)

fig, ax1 = plt.subplots(figsize=(10, 4))

ax1.plot(x_values, df["ret"], label="log-return")
ax1.set_xlabel("Time")
ax1.set_ylabel("log-return")
ax1.set_title(f"HMM regimes (K={K_target}) via Viterbi")

if len(path) > 0:
    # Each observation owns the interval up to the NEXT observation, so the
    # bands tile the whole axis whatever the sampling frequency is. (A fixed
    # one-day width leaves near-invisible slivers on monthly/quarterly data.)
    # The last observation reuses the previous spacing.
    if len(time_axis) > 1:
        final_step = time_axis[-1] - time_axis[-2]
    else:
        final_step = pd.Timedelta("1D") if has_dates else 1
    band_edges = time_axis + [time_axis[-1] + final_step]

    # Collapse consecutive identical states into runs: one patch per regime
    # spell instead of one per observation.
    run_starts = [0] + [t for t in range(1, len(path)) if path[t] != path[t - 1]]
    run_ends = run_starts[1:] + [len(path)]

    labelled_states = set()
    for start, end in zip(run_starts, run_ends):
        state = int(path[start])
        ax1.axvspan(
            band_edges[start],
            band_edges[end],
            color=state_colors[state % len(state_colors)],
            alpha=0.2,
            linewidth=0,
            # Only the first band of each state gets a legend entry.
            label="_nolegend_" if state in labelled_states else f"state {state}",
        )
        labelled_states.add(state)

ax1.legend(loc="best")

plt.tight_layout()
plt.show()

In [None]:
# Optional: export the K comparison table and print a short summary for the report.
# (Only the table is written to disk here; no figures are saved by this cell.)

FIG_DIR = Path("figures")
FIG_DIR.mkdir(exist_ok=True)

# Write the comparison table next to where figures would go.
k_compare_csv = FIG_DIR / "k_compare.csv"
df_k_compare.to_csv(k_compare_csv, index=False)

print("K comparison summary:")
print(df_k_compare.to_string(index=False))

for closing_line in (
    "\nYou can now copy numbers from this table into your report and",
    "use the plots above as figures in the Results section.",
):
    print(closing_line)