In [None]:
import os
# Default each thread-count environment variable to "1". setdefault leaves
# any value the caller already exported untouched. This runs before numpy /
# torch are imported further down.
for _thread_var in (
    "OMP_NUM_THREADS",
    "MKL_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "NUMEXPR_NUM_THREADS",
    "VECLIB_MAXIMUM_THREADS",
):
    os.environ.setdefault(_thread_var, "1")


import random
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score, balanced_accuracy_score, f1_score,
    precision_score, recall_score, roc_auc_score,
    confusion_matrix, roc_curve, auc,
    mean_absolute_error, mean_squared_error
)


# Resolve the project root: when the working directory is a folder named
# "notebooks", step up one level; otherwise use the working directory itself.
_launch_dir = Path.cwd()
PROJECT_ROOT = _launch_dir.parent if _launch_dir.name == "notebooks" else _launch_dir

print("Project root:", PROJECT_ROOT)


# Restrict PyTorch to a single intra-op thread and, when still possible, a
# single inter-op thread.
torch.set_num_threads(1)
try:
    torch.set_num_interop_threads(1)
except RuntimeError as exc:
    # Best effort: PyTorch rejects this call once inter-op parallel work has
    # started or the value was already set (e.g. when this cell is re-run in
    # the same kernel). Report it instead of hiding it, then carry on.
    print("[Warn] Could not set torch inter-op threads:", exc)

def set_seed(seed: int = 42):
    """Seed Python's, NumPy's and PyTorch's global random generators with ``seed``."""
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

# Seed every RNG once up front, then pin all computation to the CPU.
set_seed(42)
device = torch.device("cpu")
print("Device:", device)
print("Torch:", torch.__version__)

# Default size for every figure created below.
plt.rcParams["figure.figsize"] = (7, 4)

# Output folders for saved weights and poster figures; created if missing.
OUT_MODEL_DIR = PROJECT_ROOT.joinpath("reports", "models")
OUT_FIG_DIR = PROJECT_ROOT.joinpath("reports", "figures", "poster")
for _out_dir in (OUT_MODEL_DIR, OUT_FIG_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)


# Location of the engineered feature table, relative to the project root.
FEATURES_PATH = PROJECT_ROOT.joinpath("data", "processed", "features.csv")
print("[Step] Reading:", FEATURES_PATH.resolve())

# Fail early with an actionable message instead of a pandas traceback.
if not FEATURES_PATH.exists():
    raise FileNotFoundError(
        f"Cannot find {FEATURES_PATH}. "
        "Please ensure data/processed/features.csv exists."
    )

# The first CSV column becomes the frame's index.
df = pd.read_csv(FEATURES_PATH, index_col=0)
print("[Step] CSV loaded. Shape:", df.shape)


# Order rows by their index parsed as datetimes. The splits further down are
# positional, so row order decides which rows land in train / val / test.
try:
    _parsed_index = pd.to_datetime(df.index, errors="coerce")
except (ValueError, TypeError, OverflowError) as exc:
    print("[Warn] Could not parse the index as datetimes:", exc)
    _parsed_index = None

if _parsed_index is not None and not _parsed_index.isna().any():
    df.index = _parsed_index
    # A stable sort keeps rows that share a timestamp in their file order.
    df = df.sort_index(kind="mergesort")
else:
    # errors="coerce" turns unparseable labels into NaT. Sorting on those
    # would reorder rows arbitrarily, so leave the frame exactly as loaded.
    if _parsed_index is not None:
        print(f"[Warn] {int(_parsed_index.isna().sum())} index value(s) are not valid datetimes.")
    print("[Warn] Keeping the original row order; assuming the file is already chronological.")

# Both target columns must be present before anything else is derived.
REQ = {"next_ret_up", "next_log_ret"}
missing = REQ - set(df.columns)
if missing:
    raise ValueError(
        f"Missing required columns: {missing}. "
        f"Expected at least {REQ}. Existing columns: {list(df.columns)[:10]} ..."
    )

# Targets as float32 arrays: classification label and regression value.
y_cls = df["next_ret_up"].astype(np.float32).to_numpy()
y_reg = df["next_log_ret"].astype(np.float32).to_numpy()

# Every non-target column is a model input, kept in its original order.
feature_cols = [col for col in df.columns if col not in REQ]
X_raw = df.loc[:, feature_cols].astype(np.float32).to_numpy()

n, d = X_raw.shape
print("[Step] Features:", d, "| Total rows:", n)
print("[Step] Any NaN in raw X?:", np.isnan(X_raw).any())
print("[Step] Any inf in raw X?:", np.isinf(X_raw).any())

# Sequence length (rows per window) and positional split points:
# first 70% of rows -> train, next 15% -> validation, remainder -> test.
WINDOW = 30
TRAIN_END = int(n * 0.70)
VAL_END   = int(n * 0.85)

print("[Step] Split idx:", TRAIN_END, VAL_END, n)

if TRAIN_END <= WINDOW:
    raise ValueError(f"Train split too small for WINDOW={WINDOW}. "
                     f"Need TRAIN_END > WINDOW, got TRAIN_END={TRAIN_END}.")

X_train_raw = X_raw[:TRAIN_END]

imputer = SimpleImputer(strategy="median")
scaler  = StandardScaler()

# Fit imputation medians and scaling statistics on the training rows only,
# so no validation/test information enters the preprocessing.
X_train_imp = imputer.fit_transform(X_train_raw)
X_train_sc  = scaler.fit_transform(X_train_imp)

# SimpleImputer discards columns that have no observed value in the fitting
# data (their statistic is NaN), so the transformed matrix can have fewer
# columns than feature_cols. Record which features survive so column
# positions in X_all_sc can still be mapped back to names.
_kept_mask = ~np.isnan(imputer.statistics_)
kept_feature_cols = [c for c, keep in zip(feature_cols, _kept_mask) if keep]
dropped_feature_cols = [c for c, keep in zip(feature_cols, _kept_mask) if not keep]
if dropped_feature_cols:
    print("[Warn] Imputer dropped features with no observed training values:",
          dropped_feature_cols)

# Apply the train-fitted transforms to every row.
X_all_sc = scaler.transform(imputer.transform(X_raw)).astype(np.float32)
print("[Step] Preprocess done. X_all_sc shape:", X_all_sc.shape)

# Each sample is identified by the index t of its last row; t starts at
# WINDOW - 1 so a full window of history is always available.
all_t = np.arange(WINDOW - 1, n)
t_train = all_t[all_t < TRAIN_END]
t_val   = all_t[(all_t >= TRAIN_END) & (all_t < VAL_END)]
t_test  = all_t[all_t >= VAL_END]
print("[Step] Seq sample counts — train/val/test:", len(t_train), len(t_val), len(t_test))

if len(t_train) == 0 or len(t_val) == 0 or len(t_test) == 0:
    raise ValueError("One of the splits has zero sequence samples. "
                     "Try reducing WINDOW or check your dataset length/splits.")


class WindowDataset(Dataset):
    """Sliding-window view over a 2-D feature matrix.

    Sample ``i`` is the pair ``(x, y)`` where ``x`` holds the ``window``
    consecutive rows of ``X_sc`` ending at row ``t_indices[i]`` (inclusive)
    and ``y`` is ``y[t_indices[i]]`` as a float32 scalar tensor.

    Args:
        X_sc: Feature matrix, one row per time step.
        y: Target values, indexed by the same row numbers as ``X_sc``.
        t_indices: Row index of the last step of each window.
        window: Number of rows per window; must be at least 1.
        task: Stored label such as "cls" or "reg". Both tasks produce the
            same tensor types, so it does not alter the returned sample.

    Raises:
        ValueError: If ``window`` is smaller than 1, or any index lacks a
            full window of history or lies beyond the end of the data.
    """

    def __init__(self, X_sc: np.ndarray, y: np.ndarray, t_indices: np.ndarray, window: int, task: str):
        if window < 1:
            raise ValueError(f"window must be >= 1, got {window}.")

        idx = np.asarray(t_indices)
        if idx.size:
            lowest, highest = int(idx.min()), int(idx.max())
            limit = min(len(X_sc), len(y))
            # An index below window - 1 would make the slice start negative,
            # which silently yields an empty or wrong window rather than an error.
            if lowest < window - 1 or highest >= limit:
                raise ValueError(
                    f"t_indices must lie in [{window - 1}, {limit - 1}]; "
                    f"got min={lowest}, max={highest}."
                )

        self.X = X_sc
        self.y = y
        self.t = t_indices
        self.window = window
        self.task = task  # "cls" or "reg"

    def __len__(self):
        return len(self.t)

    def __getitem__(self, i):
        t = int(self.t[i])
        x = self.X[t - self.window + 1 : t + 1]  # (window, n_features)
        target = torch.tensor(self.y[t], dtype=torch.float32)
        return torch.from_numpy(x), target

BATCH_SIZE = 64

# One windowed dataset per (task, split); splits are ordered train/val/test.
_split_indices = (t_train, t_val, t_test)
train_cls_ds, val_cls_ds, test_cls_ds = [
    WindowDataset(X_all_sc, y_cls, idx, WINDOW, task="cls") for idx in _split_indices
]
train_reg_ds, val_reg_ds, test_reg_ds = [
    WindowDataset(X_all_sc, y_reg, idx, WINDOW, task="reg") for idx in _split_indices
]

# Only the training loaders shuffle; validation and test keep their order.
_shuffle_flags = (True, False, False)
train_cls_loader, val_cls_loader, test_cls_loader = [
    DataLoader(ds, batch_size=BATCH_SIZE, shuffle=flag, num_workers=0)
    for ds, flag in zip((train_cls_ds, val_cls_ds, test_cls_ds), _shuffle_flags)
]
train_reg_loader, val_reg_loader, test_reg_loader = [
    DataLoader(ds, batch_size=BATCH_SIZE, shuffle=flag, num_workers=0)
    for ds, flag in zip((train_reg_ds, val_reg_ds, test_reg_ds), _shuffle_flags)
]

print("[Step] DataLoaders ready.")


class LSTMBackbone(nn.Module):
    """Stacked LSTM encoder that reduces a sequence to one hidden vector.

    The input is batch-first. ``forward`` returns the final hidden state of
    the last LSTM layer, one vector of size ``hidden_dim`` per sequence.
    """

    def __init__(self, input_dim: int, hidden_dim: int = 64, num_layers: int = 2, dropout: float = 0.2):
        super().__init__()
        # Dropout is only passed through for stacked LSTMs; a single-layer
        # LSTM is built with 0.0.
        between_layer_dropout = 0.0 if num_layers <= 1 else dropout
        self.lstm = nn.LSTM(
            batch_first=True,
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            dropout=between_layer_dropout,
        )

    def forward(self, x):
        _, (final_hidden, _) = self.lstm(x)
        # final_hidden has one entry per layer; keep the last layer's state.
        return final_hidden[-1]

class LSTMClassifier(nn.Module):
    """LSTM encoder followed by a linear head producing one logit per sequence.

    The output is a raw logit with the trailing size-1 dimension removed;
    no sigmoid is applied inside the model.
    """

    def __init__(self, input_dim: int, hidden_dim: int = 64, num_layers: int = 2, dropout: float = 0.2):
        super().__init__()
        self.backbone = LSTMBackbone(
            input_dim, hidden_dim=hidden_dim, num_layers=num_layers, dropout=dropout
        )
        self.head = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        encoded = self.backbone(x)
        return self.head(encoded).squeeze(-1)

class LSTMRegressor(nn.Module):
    """LSTM encoder followed by a linear head producing one value per sequence.

    The output is the head's prediction with the trailing size-1 dimension
    removed.
    """

    def __init__(self, input_dim: int, hidden_dim: int = 64, num_layers: int = 2, dropout: float = 0.2):
        super().__init__()
        self.backbone = LSTMBackbone(
            input_dim, hidden_dim=hidden_dim, num_layers=num_layers, dropout=dropout
        )
        self.head = nn.Linear(in_features=hidden_dim, out_features=1)

    def forward(self, x):
        encoded = self.backbone(x)
        return self.head(encoded).squeeze(-1)

# Model input width is taken from the preprocessed matrix itself, which is
# the array the datasets actually slice.
INPUT_DIM = X_all_sc.shape[-1]
print("[Step] INPUT_DIM:", INPUT_DIM)


def train_with_early_stopping(
    model: nn.Module,
    train_loader: DataLoader,
    val_loader: DataLoader,
    criterion,
    optimizer,
    max_epochs: int = 20,
    patience: int = 5,
    task: str = "cls"
):
    """Train ``model`` and restore the weights with the lowest validation loss.

    Each epoch runs one optimisation pass over ``train_loader`` and one
    gradient-free pass over ``val_loader``. Training stops early once the
    validation loss has failed to improve by more than 1e-6 for ``patience``
    consecutive epochs. The best-scoring weights are loaded back into
    ``model`` before returning, and the model is left in eval mode.

    Args:
        model: Network to train, updated in place.
        train_loader: Batches of ``(inputs, targets)`` used for optimisation.
        val_loader: Batches of ``(inputs, targets)`` used for model selection.
        criterion: Callable ``criterion(outputs, targets)`` returning a scalar
            loss tensor. Epoch losses weight each batch by its size, which
            assumes the criterion returns a per-batch mean.
        optimizer: Optimizer already bound to ``model``'s parameters.
        max_epochs: Upper bound on the number of epochs.
        patience: Non-improving epochs tolerated before stopping.
        task: Accepted for call-site symmetry; not used by the loop.

    Returns:
        dict with keys "train_loss" and "val_loss", each a list holding one
        mean loss per completed epoch.
    """
    # Run batches on the device the model lives on rather than relying on a
    # module-level global; fall back to CPU for a model with no parameters.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device("cpu")

    def _epoch_mean_loss(loader, training):
        """Run one pass over ``loader`` and return the sample-weighted mean loss."""
        model.train(training)
        loss_sum = 0.0
        n_seen = 0
        with torch.set_grad_enabled(training):
            for xb, yb in loader:
                xb = xb.to(run_device)
                yb = yb.to(run_device)

                if training:
                    optimizer.zero_grad()
                loss = criterion(model(xb), yb)
                if training:
                    loss.backward()
                    optimizer.step()

                batch_size = xb.size(0)
                loss_sum += loss.item() * batch_size
                n_seen += batch_size
        # max(..., 1) keeps an empty loader from dividing by zero.
        return loss_sum / max(n_seen, 1)

    history = {"train_loss": [], "val_loss": []}
    best_val = float("inf")
    best_state = None
    bad = 0

    for epoch in range(1, max_epochs + 1):
        train_loss = _epoch_mean_loss(train_loader, training=True)
        val_loss = _epoch_mean_loss(val_loader, training=False)

        history["train_loss"].append(train_loss)
        history["val_loss"].append(val_loss)

        print(f"Epoch {epoch:02d} | train_loss={train_loss:.5f} | val_loss={val_loss:.5f}")

        # A NaN validation loss never compares as smaller, so it counts as
        # "no improvement".
        if val_loss < best_val - 1e-6:
            best_val = val_loss
            # Snapshot a copy; state_dict() tensors alias the live weights.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            bad = 0
        else:
            bad += 1
            if bad >= patience:
                print(f"[EarlyStop] No improvement for {patience} epochs.")
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    return history

def plot_loss_curve(history, title, out_path: Path):
    """Plot training and validation loss per epoch and save the figure.

    Args:
        history: Mapping with "train_loss" and "val_loss" sequences, one
            value per epoch.
        title: Title drawn above the axes.
        out_path: Destination image file; missing parent folders are created.
    """
    fig, ax = plt.subplots()
    try:
        ax.plot(history["train_loss"], label="train")
        ax.plot(history["val_loss"], label="val")
        ax.set_title(title)
        ax.set_xlabel("epoch")
        ax.set_ylabel("loss")
        ax.legend()
        fig.tight_layout()
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, dpi=200)
    finally:
        # Release the figure even when plotting or saving fails, so a failed
        # call does not leave it registered with pyplot.
        plt.close(fig)


print("\n[Step] === Training LSTM CLASSIFIER ===")

# Binary classifier: the model emits raw logits, so the loss applies the
# sigmoid internally.
cls_model = LSTMClassifier(INPUT_DIM, hidden_dim=64, num_layers=2, dropout=0.2)
cls_model = cls_model.to(device)
cls_criterion = nn.BCEWithLogitsLoss()
cls_optimizer = torch.optim.Adam(cls_model.parameters(), lr=1e-3)

cls_hist = train_with_early_stopping(
    model=cls_model,
    train_loader=train_cls_loader,
    val_loader=val_cls_loader,
    criterion=cls_criterion,
    optimizer=cls_optimizer,
    max_epochs=20,
    patience=5,
    task="cls",
)

# Persist the loss curve and the weights held by the model after training.
plot_loss_curve(cls_hist, "LSTM Classifier Loss", OUT_FIG_DIR / "lstm_cls_loss.png")
torch.save(cls_model.state_dict(), OUT_MODEL_DIR / "lstm_clf.pt")
print("[Saved] ", OUT_MODEL_DIR / "lstm_clf.pt")

# Score the test windows: collect predicted probabilities and true labels.
cls_model.eval()
all_probs = []
all_y = []
with torch.no_grad():
    for xb, yb in test_cls_loader:
        xb = xb.to(device)
        logits = cls_model(xb).cpu()
        probs = torch.sigmoid(logits)
        all_probs.append(probs.numpy())
        all_y.append(yb.numpy())

y_true = np.concatenate(all_y).astype(int)
y_prob = np.concatenate(all_probs)
y_pred = (y_prob >= 0.5).astype(int)  # fixed 0.5 decision threshold

acc  = accuracy_score(y_true, y_pred)
bacc = balanced_accuracy_score(y_true, y_pred)
# zero_division=0 on all three keeps a degenerate "never predicts 1" model
# from emitting warnings and reports the metric as 0 instead.
f1   = f1_score(y_true, y_pred, zero_division=0)
prec = precision_score(y_true, y_pred, zero_division=0)
rec  = recall_score(y_true, y_pred, zero_division=0)

try:
    auc_score = roc_auc_score(y_true, y_prob)
except ValueError as exc:
    # ROC AUC is undefined when the labels contain a single class.
    print("[Warn] ROC AUC undefined:", exc)
    auc_score = float("nan")

print(f"[TEST-CLS] acc={acc:.3f} bacc={bacc:.3f} f1={f1:.3f} prec={prec:.3f} rec={rec:.3f} auc={auc_score:.3f}")

# Confusion matrix on the test split. labels=[0, 1] fixes the matrix at 2x2
# even when the labels or predictions contain only one of the classes.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
fig, ax = plt.subplots()
im = ax.imshow(cm)
ax.set_title("LSTM Confusion Matrix (TEST)")
ax.set_xlabel("Pred")
ax.set_ylabel("True")
# One tick per class so the axes read 0 / 1 rather than fractional values.
ax.set_xticks([0, 1])
ax.set_yticks([0, 1])
# Write each count into its cell; imshow puts columns on the x axis.
for (i, j), v in np.ndenumerate(cm):
    ax.text(j, i, str(v), ha="center", va="center")
fig.tight_layout()
fig.savefig(OUT_FIG_DIR / "cm_lstm_test.png", dpi=200)
plt.close(fig)

# ROC curve on the test split. Any failure is reported and the plot is
# skipped rather than aborting the run.
try:
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)

    fig, ax = plt.subplots()
    ax.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
    # Dashed diagonal from (0, 0) to (1, 1) as a reference line.
    ax.plot([0, 1], [0, 1], linestyle="--")
    ax.set(title="LSTM ROC (TEST)", xlabel="FPR", ylabel="TPR")
    ax.legend()

    fig.tight_layout()
    fig.savefig(OUT_FIG_DIR / "roc_lstm_test.png", dpi=200)
    plt.close(fig)
except Exception as e:
    print("[Warn] ROC plot skipped:", e)


print("\n[Step] === Training LSTM REGRESSOR ===")

# Regressor on the continuous target, trained with mean-squared error.
reg_model = LSTMRegressor(INPUT_DIM, hidden_dim=64, num_layers=2, dropout=0.2)
reg_model = reg_model.to(device)
reg_criterion = nn.MSELoss()
reg_optimizer = torch.optim.Adam(reg_model.parameters(), lr=1e-3)

reg_hist = train_with_early_stopping(
    model=reg_model,
    train_loader=train_reg_loader,
    val_loader=val_reg_loader,
    criterion=reg_criterion,
    optimizer=reg_optimizer,
    max_epochs=20,
    patience=5,
    task="reg",
)

# Persist the loss curve and the weights held by the model after training.
plot_loss_curve(reg_hist, "LSTM Regressor Loss", OUT_FIG_DIR / "lstm_reg_loss.png")
torch.save(reg_model.state_dict(), OUT_MODEL_DIR / "lstm_reg.pt")
print("[Saved] ", OUT_MODEL_DIR / "lstm_reg.pt")

# Predict on the test windows without tracking gradients, keeping each
# batch's predictions paired with its targets.
reg_model.eval()
with torch.no_grad():
    _reg_batches = [
        (reg_model(xb.to(device)).cpu().numpy(), yb.numpy())
        for xb, yb in test_reg_loader
    ]
all_pred = [batch_pred for batch_pred, _ in _reg_batches]
all_true = [batch_true for _, batch_true in _reg_batches]

y_true_r = np.concatenate(all_true).astype(np.float32)
y_pred_r = np.concatenate(all_pred).astype(np.float32)

# RMSE is computed as the square root of the MSE.
mae = mean_absolute_error(y_true_r, y_pred_r)
rmse = float(np.sqrt(mean_squared_error(y_true_r, y_pred_r)))
print(f"[TEST-REG] mae={mae:.6f} rmse={rmse:.6f}")

# Overlay predicted and true values across the test samples.
fig, ax = plt.subplots(figsize=(10, 4))
for _series, _label in ((y_true_r, "True"), (y_pred_r, "Pred")):
    ax.plot(_series, label=_label, linewidth=1)
ax.set(title="Next-day log-return — LSTM (TEST)")
ax.legend()
fig.tight_layout()
fig.savefig(OUT_FIG_DIR / "lstm_reg_pred_vs_true_test.png", dpi=200)
plt.close(fig)

# Histogram of the residuals (true minus predicted).
resid = np.subtract(y_true_r, y_pred_r)
fig, ax = plt.subplots()
ax.hist(resid, bins=40)
ax.set(title="Residuals — LSTM Regression (TEST)", xlabel="residual")
fig.tight_layout()
fig.savefig(OUT_FIG_DIR / "lstm_reg_residual_hist_test.png", dpi=200)
plt.close(fig)

# Final summary of where the artifacts were written.
print("\n DONE. Outputs:")
print("  Models:", OUT_MODEL_DIR / "lstm_clf.pt", OUT_MODEL_DIR / "lstm_reg.pt")
print("  Figures saved under:", OUT_FIG_DIR.resolve())


Project root: c:\Users\15173\Desktop\stock-ts-forecast
Device: cpu
Torch: 2.9.0+cpu
[Step] Reading: C:\Users\15173\Desktop\stock-ts-forecast\data\processed\features.csv
[Step] CSV loaded. Shape: (3483, 24)
[Step] Features: 22 | Total rows: 3483
[Step] Any NaN in raw X?: True
[Step] Any inf in raw X?: False
[Step] Split idx: 2438 2960 3483
[Step] Preprocess done. X_all_sc shape: (3483, 19)
[Step] Seq sample counts — train/val/test: 2409 522 523
[Step] DataLoaders ready.
[Step] INPUT_DIM: 19

[Step] === Training LSTM CLASSIFIER ===




Epoch 01 | train_loss=0.69365 | val_loss=0.69404
Epoch 02 | train_loss=0.69213 | val_loss=0.69228
Epoch 03 | train_loss=0.69187 | val_loss=0.69274
Epoch 04 | train_loss=0.69106 | val_loss=0.69298
Epoch 05 | train_loss=0.69115 | val_loss=0.69149
Epoch 06 | train_loss=0.68980 | val_loss=0.68925
Epoch 07 | train_loss=0.68871 | val_loss=0.68916
Epoch 08 | train_loss=0.68799 | val_loss=0.68642
Epoch 09 | train_loss=0.68533 | val_loss=0.68798
Epoch 10 | train_loss=0.68462 | val_loss=0.69051
Epoch 11 | train_loss=0.68068 | val_loss=0.68576
Epoch 12 | train_loss=0.67999 | val_loss=0.69754
Epoch 13 | train_loss=0.67523 | val_loss=0.68880
Epoch 14 | train_loss=0.66988 | val_loss=0.70135
Epoch 15 | train_loss=0.66912 | val_loss=0.71147
Epoch 16 | train_loss=0.66314 | val_loss=0.69377
[EarlyStop] No improvement for 5 epochs.
[Saved]  c:\Users\15173\Desktop\stock-ts-forecast\reports\models\lstm_clf.pt
[TEST-CLS] acc=0.444 bacc=0.450 f1=0.426 prec=0.486 rec=0.379 auc=0.444

[Step] === Training LSTM 