# 02 — Model Experiments & Selection (Ticker-Agnostic)

Trains and evaluates:
- ARIMA (statsmodels)
- Prophet (prophet)
- LSTM (tensorflow/keras)

Logs metrics → `data/logs/{TICKER}_experiments.json`.

Selects the model with the lowest validation RMSE, archives any existing `models/latest/{TICKER}` → `models/archived/{TICKER}/{timestamp}/`,
then saves:
- `models/latest/{TICKER}/model.pkl`
- `models/latest/{TICKER}/metadata.json`

In [None]:
# Parameters (injected by backend/runner)
import os

# Normalise first, then validate: stripping before the emptiness check means a
# whitespace-only value (e.g. TICKER=" ") is rejected instead of silently
# becoming an empty ticker.
TICKER = os.environ.get("TICKER", "").strip().upper()
if not TICKER:
    raise ValueError("TICKER env var is required (e.g., set TICKER=AAPL)")

# The ticker is interpolated into data/model paths, so refuse anything that
# could escape those directories (path separators or "."/"..").
if TICKER in {".", ".."} or any(sep in TICKER for sep in ("/", "\\")):
    raise ValueError(f"TICKER must be a plain symbol, not a path: {TICKER!r}")

print("TICKER:", TICKER)

In [None]:
import json
import pickle
import shutil
from datetime import datetime, timezone
from pathlib import Path

import numpy as np
import pandas as pd

# --- Paths (all resolved relative to the current working directory) ---------
ROOT = Path(".").resolve()
DATA_DIR = ROOT / "data"
PROC_PATH = DATA_DIR / "processed" / f"{TICKER}.csv"
LOG_DIR = DATA_DIR / "logs"
EXP_LOG_PATH = LOG_DIR / f"{TICKER}_experiments.json"

MODELS_DIR = ROOT / "models"
LATEST_DIR = MODELS_DIR / "latest" / TICKER
ARCHIVE_BASE = MODELS_DIR / "archived" / TICKER

# Fail fast on a missing dataset *before* creating output directories, so a
# mistyped ticker does not leave empty log/model folders behind.
if not PROC_PATH.exists():
    raise FileNotFoundError(f"Processed dataset not found: {PROC_PATH}")

for d in (LOG_DIR, MODELS_DIR, LATEST_DIR, ARCHIVE_BASE):
    d.mkdir(parents=True, exist_ok=True)

# --- Load the processed dataset in chronological order ----------------------
df = pd.read_csv(PROC_PATH, parse_dates=["date"])
df = df.sort_values("date").reset_index(drop=True)

# Prefer the adjusted close when the dataset provides it.
close_col = "adj_close" if "adj_close" in df.columns else "close"
if close_col not in df.columns:
    raise ValueError(f"Expected close column not found in processed data. Found: {list(df.columns)}")

if "split" not in df.columns:
    raise ValueError(f"Expected 'split' column not found in processed data. Found: {list(df.columns)}")

train_df = df[df["split"] == "train"].copy()
val_df = df[df["split"] == "val"].copy()

# Every model below is fit on train and scored on val; an empty split would
# only surface later as NaN metrics or opaque library errors.
if train_df.empty or val_df.empty:
    raise ValueError(
        "Processed data must contain non-empty 'train' and 'val' splits "
        f"(train={len(train_df)}, val={len(val_df)})"
    )

y_train = train_df[close_col].astype(float).values
y_val = val_df[close_col].astype(float).values

# ISO dates of the full dataset range, recorded in the log and model metadata.
date_min = df["date"].min().date().isoformat()
date_max = df["date"].max().date().isoformat()

# Notebook display: quick sanity check of split sizes and the target column.
len(train_df), len(val_df), close_col

In [None]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two array-likes as a float.

    Both inputs are converted to float NumPy arrays before the element-wise
    difference is taken.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    squared_error = np.square(actual - predicted)
    return float(np.sqrt(squared_error.mean()))

def mape(y_true, y_pred, eps=1e-8):
    """Return the mean absolute percentage error as a fraction (not x100).

    The absolute true values are floored at `eps` so that zeros in `y_true`
    do not cause a division by zero.
    """
    actual = np.asarray(y_true, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    scale = np.maximum(np.abs(actual), eps)
    relative_error = np.abs(actual - predicted) / scale
    return float(relative_error.mean())

def safe_json(obj):
    """Convert NumPy values to plain Python objects that `json` can encode.

    Intended for use as the ``default=`` hook of ``json.dumps``:

    - NumPy arrays become (nested) lists via ``tolist()``.
    - Any NumPy scalar (floating, integer, bool, ...) becomes the matching
      Python scalar via ``item()``.
    - Every other object is returned unchanged.
    """
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    # np.generic is the common base of all NumPy scalar types, so this also
    # covers np.bool_, which the floating/integer check alone would miss.
    if isinstance(obj, np.generic):
        return obj.item()
    return obj

## 1) ARIMA (grid over small orders)

In [None]:
# Placeholder outcome; replaced by either the "ok" or the "error" record below.
arima_result = {"model": "ARIMA", "status": "skipped"}
try:
    from itertools import product

    from statsmodels.tsa.arima.model import ARIMA

    best = None
    best_fit = None
    # Small (p, d, q) grid, kept light for automation. `product` walks the
    # orders in the same sequence as three nested loops would.
    for p, d, q in product([0, 1, 2, 3], [0, 1], [0, 1, 2]):
        try:
            fit = ARIMA(y_train, order=(p, d, q)).fit()
            pred = fit.forecast(steps=len(y_val))
            score = rmse(y_val, pred)
            score_mape = mape(y_val, pred)
        except Exception:
            # Deliberate best-effort: an order that fails to fit or forecast
            # is skipped so the remaining candidates can still compete.
            continue

        # A NaN/inf score must never become the incumbent: every later
        # `score < best["rmse"]` comparison against NaN would be False and
        # the broken candidate would win the whole grid search.
        if not np.isfinite(score):
            continue

        if best is None or score < best["rmse"]:
            best = {
                "order": (p, d, q),
                "rmse": score,
                "mape": score_mape,
            }
            best_fit = fit

    if best_fit is None:
        raise RuntimeError("ARIMA grid search failed for all orders")

    arima_result = {
        "model": "ARIMA",
        "status": "ok",
        **best,
    }
    # Bundle the fitted results object with its order for later pickling.
    arima_artifact = {"type": "ARIMA", "order": best["order"], "fit": best_fit}
except Exception as e:
    # Any failure (including a missing statsmodels install) is recorded in
    # the result instead of aborting the notebook.
    arima_result = {"model": "ARIMA", "status": "error", "error": str(e)}
    arima_artifact = None

arima_result

## 2) Prophet

Requires the `prophet` package. If it is not installed, Prophet is recorded with status `error` in the experiment log.

In [None]:
# Placeholder outcome; overwritten by the success or failure branch below.
prophet_result = {"model": "Prophet", "status": "skipped"}
try:
    from prophet import Prophet

    # Rename the date/target columns to the `ds`/`y` names handed to Prophet.
    prophet_columns = {"date": "ds", close_col: "y"}
    p_train = train_df[["date", close_col]].rename(columns=prophet_columns)
    p_val = val_df[["date", close_col]].rename(columns=prophet_columns)

    m = Prophet(
        daily_seasonality=False,
        weekly_seasonality=True,
        yearly_seasonality=True,
    )
    m.fit(p_train)

    # Predict exactly on the validation timestamps.
    future = p_val[["ds"]]
    forecast = m.predict(future)
    pred = forecast["yhat"].values

    y_actual = p_val["y"].values
    prophet_result = dict(
        model="Prophet",
        status="ok",
        rmse=rmse(y_actual, pred),
        mape=mape(y_actual, pred),
    )
    prophet_artifact = dict(type="Prophet", model=m)
except Exception as e:
    # Includes the case where the `prophet` package is not installed.
    prophet_result = dict(model="Prophet", status="error", error=str(e))
    prophet_artifact = None

prophet_result

## 3) LSTM (simple)

Creates supervised windows on close price, trains a small LSTM, and forecasts the validation horizon.

In [None]:
# Placeholder outcome; replaced by either the "ok" or the "error" record below.
lstm_result = {"model": "LSTM", "status": "skipped"}
try:
    import tensorflow as tf
    from tensorflow import keras

    window = 30   # number of past closes fed to the network per sample
    epochs = 10   # single source of truth: used for training AND logged below

    # Without at least one full (window -> next value) sample the training
    # set is empty; fail with a clear message instead of an opaque Keras error.
    if len(y_train) <= window:
        raise ValueError(
            f"LSTM needs more than {window} training rows to build windows; got {len(y_train)}"
        )

    # normalize using train stats only (no leakage from the validation split)
    mu = float(np.mean(y_train))
    sigma = float(np.std(y_train) + 1e-8)  # epsilon guards a constant series
    y_train_s = (y_train - mu) / sigma
    y_val_s = (y_val - mu) / sigma

    def make_windows(series, window):
        """Build supervised samples: `window` consecutive values -> the next value.

        Returns X with shape (n_samples, window, 1) and Y with shape
        (n_samples,), both float32, where n_samples = len(series) - window.
        """
        X, Y = [], []
        for i in range(len(series) - window):
            X.append(series[i:i+window])
            Y.append(series[i+window])
        X = np.asarray(X, dtype=np.float32)[..., None]
        Y = np.asarray(Y, dtype=np.float32)
        return X, Y

    Xtr, Ytr = make_windows(y_train_s, window)
    # train small, deterministic-ish
    tf.random.set_seed(42)

    model = keras.Sequential([
        keras.layers.Input(shape=(window, 1)),
        keras.layers.LSTM(32),
        keras.layers.Dense(1),
    ])
    model.compile(optimizer=keras.optimizers.Adam(1e-3), loss="mse")
    model.fit(Xtr, Ytr, epochs=epochs, batch_size=32, verbose=0)

    # recursive forecast over validation horizon: each prediction is appended
    # to the history and becomes an input for the following step
    history = list(y_train_s[-window:])
    preds_s = []
    for _ in range(len(y_val_s)):
        x = np.asarray(history[-window:], dtype=np.float32)[None, :, None]
        yhat = float(model.predict(x, verbose=0).ravel()[0])
        preds_s.append(yhat)
        history.append(yhat)

    # undo the normalization so metrics are in price units
    preds = np.asarray(preds_s) * sigma + mu

    lstm_result = {
        "model": "LSTM",
        "status": "ok",
        "rmse": rmse(y_val, preds),
        "mape": mape(y_val, preds),
        "window": window,
        "epochs": epochs,
    }

    # store picklable bundle: architecture JSON + weights + scaling constants
    lstm_artifact = {
        "type": "LSTM",
        "model_json": model.to_json(),
        "weights": model.get_weights(),
        "mu": mu,
        "sigma": sigma,
        "window": window,
    }
except Exception as e:
    # Any failure (including a missing tensorflow install) is recorded in
    # the result instead of aborting the notebook.
    lstm_result = {"model": "LSTM", "status": "error", "error": str(e)}
    lstm_artifact = None

lstm_result

## 4) Log experiments + Select best

In [None]:
results = [arima_result, prophet_result, lstm_result]

# Eligible candidates finished successfully AND produced a finite RMSE.
# A NaN/inf score cannot be ordered meaningfully, so letting it through could
# crown a broken model as "best".
ok = [
    r for r in results
    if r.get("status") == "ok" and np.isfinite(r.get("rmse", float("nan")))
]
if not ok:
    raise RuntimeError(f"No models trained successfully: {results}")

# Lowest validation RMSE wins; on ties the earliest entry in `results` is kept.
best = min(ok, key=lambda r: r["rmse"])
best_type = best["model"]

experiment_log = {
    "ticker": TICKER,
    # Timezone-aware "now" (utcnow() is deprecated); tzinfo is dropped so the
    # serialized form stays "<naive ISO timestamp>Z".
    "generated_at_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "data_range": {"min": date_min, "max": date_max},
    "n_train": int(len(train_df)),
    "n_val": int(len(val_df)),
    "target": close_col,
    "results": results,  # all outcomes, including errored models
    "best": best,
}

# Overwrites any previous log for this ticker.
EXP_LOG_PATH.write_text(json.dumps(experiment_log, indent=2, default=safe_json))
print("Saved experiments log:", EXP_LOG_PATH)
print("Best:", best_type, "RMSE:", best["rmse"])

## 5) Archive previous model + Save new best

In [None]:
def archive_latest_if_exists(latest_dir: Path, archive_base: Path):
    """Move a non-empty `latest_dir` into a timestamped folder under `archive_base`.

    After the call `latest_dir` always exists and is empty if it was archived.

    Args:
        latest_dir: Directory holding the currently deployed model files.
        archive_base: Parent directory for archived snapshots; created on demand.

    Returns:
        The archive destination as a string, or None when `latest_dir` was
        missing or empty and nothing had to be archived.
    """
    if latest_dir.exists() and any(latest_dir.iterdir()):
        ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
        archive_base.mkdir(parents=True, exist_ok=True)

        # Timestamps have one-second resolution. Never delete an existing
        # archive on collision; pick a free "<ts>_<n>" sibling instead.
        dest = archive_base / ts
        suffix = 1
        while dest.exists():
            dest = archive_base / f"{ts}_{suffix}"
            suffix += 1

        shutil.move(str(latest_dir), str(dest))
        # The move removed latest_dir itself; recreate it for the new model.
        latest_dir.mkdir(parents=True, exist_ok=True)
        return str(dest)

    latest_dir.mkdir(parents=True, exist_ok=True)
    return None

# Resolve and serialize the winning artifact BEFORE touching the filesystem:
# if anything here fails, the currently deployed model stays in place instead
# of being archived away and leaving `latest/` empty.
artifact = None
if best_type == "ARIMA":
    artifact = arima_artifact
elif best_type == "Prophet":
    artifact = prophet_artifact
elif best_type == "LSTM":
    artifact = lstm_artifact
else:
    raise ValueError(f"Unknown best model type: {best_type}")

if artifact is None:
    raise RuntimeError(f"No artifact available for best model: {best_type}")

# Pickle in memory first so an unpicklable object is detected up front.
payload = pickle.dumps(artifact)

archived_to = archive_latest_if_exists(LATEST_DIR, ARCHIVE_BASE)
print("Archived to:", archived_to)

model_pkl = LATEST_DIR / "model.pkl"
meta_json = LATEST_DIR / "metadata.json"

with model_pkl.open("wb") as f:
    f.write(payload)

metadata = {
    "ticker": TICKER,
    "model_type": best_type,
    # Timezone-aware "now" (utcnow() is deprecated); tzinfo is dropped so the
    # serialized form stays "<naive ISO timestamp>Z".
    "trained_at_utc": datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z",
    "metrics": {"rmse": best["rmse"], "mape": best["mape"]},
    "data_range": {"min": date_min, "max": date_max},
    "target": close_col,
    "archived_previous_to": archived_to,  # None when nothing was archived
}

meta_json.write_text(json.dumps(metadata, indent=2, default=safe_json))

print("Saved:", model_pkl)
print("Saved:", meta_json)