In [1]:
# Make project root importable (same Cell 0 you’ve been using)
import sys, importlib
from pathlib import Path



# Resolve the repository root: when the kernel was started inside a folder
# named "notebooks", the root is its parent; otherwise it is the working dir.
_here = Path.cwd()
ROOT = _here.parent if _here.name == "notebooks" else _here
SRC = ROOT / "src"

# Scaffold the `src` package (idempotent): create the directories first ...
for _pkg_dir in (SRC, SRC / "models"):
    _pkg_dir.mkdir(parents=True, exist_ok=True)

# ... then make sure every expected file exists. `touch(exist_ok=True)` only
# creates missing files (as empty files); it never rewrites existing content.
_stub_files = (
    ("__init__.py",),
    ("models", "__init__.py"),
    ("utils.py",),
    ("config.py",),
    ("cv.py",),
    ("window.py",),
    ("scaling.py",),
    ("metrics.py",),
    ("models", "lstm.py"),
)
for _parts in _stub_files:
    SRC.joinpath(*_parts).touch(exist_ok=True)

# Put the project root at the front of the import path (once) so that
# `import src...` resolves, and drop stale import-system caches.
_root_str = str(ROOT)
if _root_str not in sys.path:
    sys.path.insert(0, _root_str)
importlib.invalidate_caches()

print("ROOT:", ROOT)


ROOT: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo


In [2]:
import json
import numpy as np
import pandas as pd
from pathlib import Path
from datetime import datetime, timezone

from src.utils import seed_all, get_logger
from src.config import DATA_INTERIM, DATA_RAW, OUTPUTS, MODELS_DIR, SUBMISSIONS_DIR
from src.features import add_target_lags
from src.inference import final_fit_windows, rollout_forecast_test
from src.models.lstm import build_lstm

# Reproducibility + logging for this run.
seed_all(42)
log = get_logger("FINALFIT")

# Make sure every output location used by this notebook exists.
for _out_dir in (OUTPUTS, MODELS_DIR, SUBMISSIONS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# === Choose the BEST LSTM hyperparams from your experiments ===
# Window length; update if a better run used 72 or 336.
LOOKBACK = 168
# Hidden units per layer; update if 128 was better.
UNITS = 64
N_LAYERS = 2
# Dropout rate; update if 0.1 was better.
DROPOUT = 0.2
REC_DROPOUT = 0.0

# Optimisation settings.
LR = 1e-3
CLIPNORM = 1.0
BATCH = 64
EPOCHS = 60
PATIENCE = 10

# Forecast target and the lag offsets derived from it.
TARGET = "pm2_5"
LAGS = (1, 24, 168)
# Largest lag offset in LAGS.
MAX_LAG = max(LAGS)


In [3]:
# Load the cleaned dataset and separate rows by the value of the `split` column.
df = pd.read_csv(DATA_INTERIM / "clean.csv", parse_dates=["datetime"])
train_df = df.query("split=='train'").reset_index(drop=True)
test_df  = df.query("split=='test'").reset_index(drop=True)

# Fail early with a readable message instead of deep inside windowing/rollout.
assert len(train_df) > 0, "clean.csv contains no rows with split == 'train'."
assert len(test_df) > 0, "clean.csv contains no rows with split == 'test'."

# Add target lags to BOTH frames with the same settings.
# NOTE(review): the test-side lag values are expected to be filled in during
# the rollout step (rollout_forecast_test) — confirm against that helper.
train_df = add_target_lags(train_df, target=TARGET, lags=LAGS)
test_df  = add_target_lags(test_df,  target=TARGET, lags=LAGS)

# Feature list (same contract as training notebooks; exclude noise 'No').
# TARGET is excluded explicitly so that changing it in the config cell can
# never leak the target column into the model inputs.
EXCLUDE = {"datetime", "split", "pm2.5", "pm2_5", "No", TARGET}
feature_cols = [c for c in train_df.columns if c not in EXCLUDE]

# Every selected feature must also be available in the test frame.
_missing_in_test = [c for c in feature_cols if c not in test_df.columns]
assert not _missing_in_test, f"Feature columns missing from test_df: {_missing_in_test}"

print("n_features:", len(feature_cols))


n_features: 20


In [4]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# Training windows, their targets, and the scaler object returned with them.
Xw_tr, yw_tr, scaler = final_fit_windows(
    df_train=train_df, feature_cols=feature_cols, target_col=TARGET,
    lookback=LOOKBACK, max_lag=MAX_LAG,
)

# Model: the feature count is read from the last axis of the window array.
_lstm_kwargs = dict(
    input_len=LOOKBACK,
    n_features=Xw_tr.shape[2],
    units=UNITS,
    n_layers=N_LAYERS,
    dropout=DROPOUT,
    recurrent_dropout=REC_DROPOUT,
    clipnorm=CLIPNORM,
    lr=LR,
)
model = build_lstm(**_lstm_kwargs)

# Both saved artefacts share one name stem that encodes the key hyperparams.
_run_tag = f"lstm_FULL_lb{LOOKBACK}_u{UNITS}_L{N_LAYERS}"
ckpt_path = MODELS_DIR / f"{_run_tag}.keras"

# All three callbacks watch the validation loss of the internal split below.
_early_stop = EarlyStopping(
    monitor="val_loss", patience=PATIENCE, restore_best_weights=True
)
_lr_schedule = ReduceLROnPlateau(
    monitor="val_loss", factor=0.5, patience=max(3, PATIENCE//2), min_lr=1e-5
)
_checkpoint = ModelCheckpoint(
    filepath=str(ckpt_path), monitor="val_loss", save_best_only=True
)
callbacks = [_early_stop, _lr_schedule, _checkpoint]

# validation_split carves a small slice off the training windows purely so the
# callbacks have a val_loss to monitor; sample order is kept (shuffle=False).
_fit_kwargs = dict(
    validation_split=0.1,
    epochs=EPOCHS,
    batch_size=BATCH,
    callbacks=callbacks,
    verbose=1,
    shuffle=False,
)
history = model.fit(Xw_tr, yw_tr, **_fit_kwargs)

# Persist the in-memory model next to the checkpoint. EarlyStopping was created
# with restore_best_weights=True, so this is intended to hold the best weights.
final_model_path = MODELS_DIR / f"{_run_tag}_final.keras"
model.save(final_model_path)
print("Saved:", final_model_path)


Epoch 1/60
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 58ms/step - loss: 0.9810 - val_loss: 0.2474 - learning_rate: 0.0010
Epoch 2/60
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 55ms/step - loss: 0.3545 - val_loss: 0.1994 - learning_rate: 0.0010
Epoch 3/60
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 53ms/step - loss: 0.3203 - val_loss: 0.1792 - learning_rate: 0.0010
Epoch 4/60
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 56ms/step - loss: 0.3056 - val_loss: 0.1773 - learning_rate: 0.0010
Epoch 5/60
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 54ms/step - loss: 0.2912 - val_loss: 0.1655 - learning_rate: 0.0010
Epoch 6/60
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 62ms/step - loss: 0.2700 - val_loss: 0.1685 - learning_rate: 0.0010
Epoch 7/60
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 65ms/step - loss: 0.2583 - val_loss: 0.1607 - 

In [5]:
# Forecast the whole test horizon with the fitted model and scaler.
_rollout_kwargs = dict(
    model=model,
    df_train=train_df,
    df_test=test_df,
    feature_cols=feature_cols,
    target_col=TARGET,
    lags=LAGS,
    lookback=LOOKBACK,
    scaler=scaler,
)
y_test_pred = rollout_forecast_test(**_rollout_kwargs)

# Enforce non-negativity: values below zero are raised to zero.
# Only a lower bound is applied here; no upper cap is placed on spikes.
y_test_pred = np.maximum(y_test_pred, 0)

print("Test preds shape:", y_test_pred.shape)
print("First 5 preds:", y_test_pred[:5])


Test preds shape: (13148,)
First 5 preds: [18.99878  19.658453 21.492937 22.690647 23.372854]


In [8]:
from pathlib import Path
from datetime import datetime, timezone
import numpy as np
import pandas as pd

# --- 0) Ensure we have y_test_pred in memory and that it matches test_df ---
assert 'y_test_pred' in globals(), "y_test_pred not found. Run the rollout cell first."
assert len(y_test_pred) == len(test_df), (
    f"y_test_pred has {len(y_test_pred)} values but test_df has {len(test_df)} rows; "
    "re-run the rollout cell."
)

# --- 1) Save predictions to a *new* file (for provenance) ---
# The UTC timestamp (second resolution) keeps each run's artefacts distinct.
PRED_DIR = OUTPUTS / "predictions"
PRED_DIR.mkdir(parents=True, exist_ok=True)
ts = datetime.now(timezone.utc).strftime("%Y%m%dT%H%M%SZ")
pred_path = PRED_DIR / f"preds_lstm_lb{LOOKBACK}_u{UNITS}_L{N_LAYERS}_{ts}.npy"
np.save(pred_path, y_test_pred)
print("Saved raw preds:", pred_path)

# --- 2) Load the official sample (Path-safe) ---
# Path(...) accepts both str and Path, so DATA_RAW may be either type.
# Candidates are tried in order; the first existing file wins.
candidates = [
    Path(DATA_RAW) / "sample_submission.csv",
    Path("sample_submission.csv"),
    Path.cwd().parent / "sample_submission.csv",
]
sample_path = next((p for p in candidates if p.exists()), None)
assert sample_path is not None, "Could not find sample_submission.csv"
sample_df = pd.read_csv(sample_path)
print("Loaded sample submission:", sample_path)

# --- 3) Align predictions to sample order (No-preferred; else time key) ---
sub = sample_df.copy()
# Target column in the sample: a known spelling if present, else the last column.
target_col_name = next(
    (c for c in ("pm2.5", "pm2_5") if c in sub.columns),
    sub.columns[-1],
)

if ("No" in test_df.columns) and ("No" in sub.columns):
    # Both frames carry the row identifier "No": join on it directly.
    test_keys = test_df["No"]
    sample_keys = sub["No"]
else:
    # Otherwise fall back to the first time-like column found in the sample.
    sample_time_col = next(
        (c for c in ["row ID", "row_id", "datetime", "timestamp", "date_time"] if c in sub.columns),
        None,
    )
    assert sample_time_col is not None, "No alignable time-like key in sample."
    test_keys = pd.to_datetime(test_df["datetime"])
    sample_keys = pd.to_datetime(sub[sample_time_col])

# A key -> prediction dict keeps only the last value for a repeated key, which
# would silently assign the wrong prediction; refuse to continue in that case.
assert not test_keys.duplicated().any(), "Duplicate alignment keys in test_df."
pred_map = pd.Series(y_test_pred, index=test_keys).to_dict()
# Sample rows without a matching key become NaN and are caught in step 4.
sub[target_col_name] = [pred_map.get(k, np.nan) for k in sample_keys]

# --- 4) Sanity checks ---
assert sub[target_col_name].isna().sum() == 0, "Alignment produced NaNs."
assert np.isfinite(sub[target_col_name]).all(), "Non-finite predictions found."
assert (sub[target_col_name] >= 0).all(), "Negative predictions found."

# --- 5) Write a NEW submission with a unique name (no overwrite) ---
SUBMISSIONS_DIR.mkdir(parents=True, exist_ok=True)
sub_path = SUBMISSIONS_DIR / f"submission_lstm_lb{LOOKBACK}_u{UNITS}_L{N_LAYERS}_aligned_{ts}.csv"
sub.to_csv(sub_path, index=False)
print("Wrote NEW aligned submission:", sub_path)
display(sub.head())


Saved raw preds: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/predictions/preds_lstm_lb168_u64_L2_20250921T182112Z.npy
Loaded sample submission: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/data/raw/sample_submission.csv
Wrote NEW aligned submission: /Users/testsolutions/Documents/school/year3/term2/time-series-forecasting/tsf-repo/outputs/submissions/submission_lstm_lb168_u64_L2_aligned_20250921T182112Z.csv


Unnamed: 0,row ID,pm2.5
0,2013-07-02 4:00:00,18.998779
1,2013-07-02 5:00:00,19.658453
2,2013-07-02 6:00:00,21.492937
3,2013-07-02 7:00:00,22.690647
4,2013-07-02 8:00:00,23.372854
