In [9]:
# Cell 0 — make `src/` importable in this notebook

import sys
from pathlib import Path
import importlib


# If this notebook lives in <ROOT>/notebooks, go to project root; else keep cwd
ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
SRC = ROOT / "src"

# Ensure src package structure exists (idempotent)
SRC.mkdir(parents=True, exist_ok=True)
(SRC / "__init__.py").touch(exist_ok=True)
(SRC / "models").mkdir(parents=True, exist_ok=True)
(SRC / "models" / "__init__.py").touch(exist_ok=True)

# Ensure src/utils.py exists
(SRC / "utils.py").touch(exist_ok=True)
(SRC / "config.py").touch(exist_ok=True)
(SRC / "cv.py").touch(exist_ok=True)
(SRC / "window.py").touch(exist_ok=True)
(SRC / "scaling.py").touch(exist_ok=True)
(SRC / "metrics.py").touch(exist_ok=True)
(SRC / "models" / "lstm.py").touch(exist_ok=True)

# Put project ROOT (the directory that CONTAINS 'src') on sys.path
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

# Clear stale caches
importlib.invalidate_caches()

print("ROOT:", ROOT)
print("SRC exists:", SRC.exists())
print("sys.path[0]:", sys.path[0])


ROOT: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo
SRC exists: True
sys.path[0]: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo


In [10]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from importlib import reload


# Verify src/utils.py contains the required functions
assert (SRC / "utils.py").exists(), "utils.py is missing in src directory"

from src.utils import seed_all, get_logger
from src.config import DATA_INTERIM, OUTPUTS, MODELS_DIR
from src.cv import make_blocked_folds, mask_for_range
from src.window import make_windows
from src.scaling import PerFoldScaler
from src.metrics import rmse
from src.models.lstm import build_lstm
from src.features import add_target_lags



seed_all(42)
log = get_logger("LSTM-BL")
OUTPUTS.mkdir(parents=True, exist_ok=True)
MODELS_DIR.mkdir(parents=True, exist_ok=True)


LOOKBACK = 168  # start with 168; we'll try 72 and 336 later
BATCH = 64
EPOCHS = 50
PATIENCE = 8
LR = 1e-3
UNITS = 64
N_LAYERS = 2
DROPOUT = 0.2
REC_DROPOUT = 0.0
CLIPNORM = 1.0

TARGET     = "pm2_5"
LAGS       = (1, 24, 168)    # NEW: autoregressive features
MAX_LAG    = max(LAGS)       # used to trim early rows lacking lag values


## 2. Load data, feature list, folds

In [11]:
# Load interim set and keep train split
df = pd.read_csv(DATA_INTERIM / "clean.csv", parse_dates=["datetime"])
train_df = df.query("split=='train'").reset_index(drop=True)

# Add target lags ONCE globally, then we’ll slice per fold
train_df = add_target_lags(train_df, target=TARGET, lags=LAGS)

# Build feature list (exclude time, ids, and raw target)
EXCLUDE = {"datetime", "split", "pm2.5", "pm2_5", "No"}
feature_cols = [c for c in train_df.columns if c not in EXCLUDE]
print("n_features:", len(feature_cols))
print("first 10 features:", feature_cols[:10])

# Folds (blocked, using 30-day validation windows)
folds = make_blocked_folds(train_df, "datetime", n_folds=3, val_days=30)
for f in folds:
    print(f"{f.name}: TRAIN {f.train_start} → {f.train_end} | VAL {f.val_start} → {f.val_end}")


n_features: 20
first 10 features: ['DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'cbwd_NW', 'cbwd_SE', 'cbwd_cv', 'hour']
fold1: TRAIN 2010-01-02 00:00:00 → 2013-06-03 02:00:00 | VAL 2013-06-03 03:00:00 → 2013-07-02 03:00:00
fold2: TRAIN 2010-01-02 00:00:00 → 2013-05-04 02:00:00 | VAL 2013-05-04 03:00:00 → 2013-06-02 03:00:00
fold3: TRAIN 2010-01-02 00:00:00 → 2013-04-04 02:00:00 | VAL 2013-04-04 03:00:00 → 2013-05-03 03:00:00


### 3. Per-fold training loop

In [12]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

cv_rows = []
histories = {}

for f in folds:
    # ---- slice fold windows
    tr_mask = (train_df["datetime"] >= f.train_start) & (train_df["datetime"] <= f.train_end)
    va_mask = (train_df["datetime"] >= f.val_start)   & (train_df["datetime"] <= f.val_end)

    tr = train_df.loc[tr_mask].copy()
    va = train_df.loc[va_mask].copy()

    # ---- trim initial rows lacking lag values
    tr = tr.iloc[MAX_LAG:].reset_index(drop=True)
    va = va.iloc[MAX_LAG:].reset_index(drop=True)

    # ---- targets (log1p on train; val true kept in original scale for scoring)
    y_tr      = np.log1p(tr[TARGET].values.astype("float32"))
    y_va_true = va[TARGET].values.astype("float32")

    # ---- scale features per-fold on training slice only
    scaler = PerFoldScaler()
    X_tr = scaler.fit_transform(tr[feature_cols])
    X_va = scaler.transform(va[feature_cols])

    # ---- windowing (predict y_t from [t-L..t-1])
    Xw_tr, yw_tr = make_windows(X_tr, y_tr, lookback=LOOKBACK)
    # for val, we supply dummy y just to get shape; callbacks will use real val y below
    Xw_va, _     = make_windows(X_va, np.log1p(y_va_true), lookback=LOOKBACK)

    # ---- build model
    model = build_lstm(
        input_len=LOOKBACK, n_features=X_tr.shape[1],
        units=UNITS, n_layers=N_LAYERS,
        dropout=DROPOUT, recurrent_dropout=REC_DROPOUT,
        clipnorm=CLIPNORM, lr=LR
    )

    # ---- callbacks: use the fold's TRUE validation window
    ckpt_path = MODELS_DIR / f"lstm_lb{LOOKBACK}_u{UNITS}_L{N_LAYERS}_{f.name}.keras"
    callbacks = [
        EarlyStopping(monitor="val_loss", patience=PATIENCE, restore_best_weights=True),
        ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=max(2, PATIENCE//2), min_lr=1e-5),
        ModelCheckpoint(filepath=str(ckpt_path), monitor="val_loss", save_best_only=True)
    ]
    y_va_log = np.log1p(y_va_true)[LOOKBACK:]  # align with Xw_va

    history = model.fit(
        Xw_tr, yw_tr,
        validation_data=(Xw_va, y_va_log),
        epochs=EPOCHS,
        batch_size=BATCH,
        callbacks=callbacks,
        verbose=1,
        shuffle=False,  # time windows shouldn't be shuffled
    )
    histories[f.name] = history.history

    # ---- predict & score on validation
    y_va_pred_log = model.predict(Xw_va, verbose=0).reshape(-1)
    y_va_pred     = np.expm1(y_va_pred_log)
    y_va_true_aligned = y_va_true[LOOKBACK:]  # align lengths

    fold_rmse = rmse(y_va_true_aligned, y_va_pred)
    print(f"{f.name} RMSE: {fold_rmse:.3f}")

    # also keep the path where best model was saved
    cv_rows.append({
        "fold": f.name,
        "rmse": fold_rmse,
        "model_path": str(ckpt_path)
    })

# ---- summary
cv_df = pd.DataFrame(cv_rows)
cv_mean = cv_df["rmse"].mean()
cv_std  = cv_df["rmse"].std(ddof=1)
print(f"CV RMSE mean±std: {cv_mean:.3f} ± {cv_std:.3f}")
display(cv_df)


Epoch 1/50
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 54ms/step - loss: 0.9125 - val_loss: 0.2224 - learning_rate: 0.0010
Epoch 2/50
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 51ms/step - loss: 0.3443 - val_loss: 0.1830 - learning_rate: 0.0010
Epoch 3/50
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 51ms/step - loss: 0.3123 - val_loss: 0.1653 - learning_rate: 0.0010
Epoch 4/50
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 54ms/step - loss: 0.2957 - val_loss: 0.1459 - learning_rate: 0.0010
Epoch 5/50
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 57ms/step - loss: 0.2851 - val_loss: 0.1402 - learning_rate: 0.0010
Epoch 6/50
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 73ms/step - loss: 0.2727 - val_loss: 0.1479 - learning_rate: 0.0010
Epoch 7/50
[1m434/434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 89ms/step - loss: 0.2618 - val_loss: 0.1515 - 

Unnamed: 0,fold,rmse,model_path
0,fold1,45.604717,/Users/testsolutions/Documents/school/year3/te...
1,fold2,19.607052,/Users/testsolutions/Documents/school/year3/te...
2,fold3,19.147961,/Users/testsolutions/Documents/school/year3/te...


### 4. Log experiment to experiments/experiments.csv

In [13]:
from datetime import datetime, timezone

exp_row = {
    "timestamp": datetime.now(timezone.utc).isoformat(),  # timezone-aware
    "model": "LSTM",
    "lookback": LOOKBACK,
    "features": len(feature_cols),
    "layers": N_LAYERS,
    "units": UNITS,
    "dropout": DROPOUT,
    "recurrent_dropout": REC_DROPOUT,
    "optimizer": "Adam",
    "lr": LR,
    "batch": BATCH,
    "epochs": EPOCHS,
    "clipnorm": CLIPNORM,
    "target_transform": "log1p",
    "lags": str(LAGS),
    "cv_rmse_mean": round(float(cv_mean), 6),
    "cv_rmse_std": round(float(cv_std), 6),
    "notes": "LSTM + target lags + true fold validation + checkpoint"
}

exp_path = OUTPUTS / "experiments.csv"
exp_path.parent.mkdir(parents=True, exist_ok=True)
if exp_path.exists():
    pd.concat([pd.read_csv(exp_path), pd.DataFrame([exp_row])], ignore_index=True).to_csv(exp_path, index=False)
else:
    pd.DataFrame([exp_row]).to_csv(exp_path, index=False)

print("Logged:", exp_path)
display(pd.read_csv(exp_path).tail())


Logged: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/experiments.csv


Unnamed: 0,timestamp,model,lookback,features,layers,units,dropout,recurrent_dropout,optimizer,lr,batch,epochs,clipnorm,target_transform,lags,cv_rmse_mean,cv_rmse_std,notes
0,2025-09-21T17:00:48.504357+00:00,LSTM,168,20,2,64,0.2,0.0,Adam,0.001,64,50,1.0,log1p,"(1, 24, 168)",28.11991,15.144027,LSTM + target lags + true fold validation + ch...
