<a href="https://colab.research.google.com/github/AHubanov/LSTMPractice/blob/main/lstm_nextstep_two_stage_sched.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# LSTM next-step forecasting (train=85%, val=15%, без test split, с Google Drive и проверкой)

In [1]:
import os
import shutil
import random
import time
from typing import List, Tuple

import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
import joblib

In [2]:



# Mount Google Drive when running inside Colab. Outside Colab the package is
# missing, which is expected: the notebook then falls back to local files.
try:
    from google.colab import drive
except ImportError:
    print("google.colab is not available; skipping Google Drive mount.")
else:
    try:
        drive.mount('/content/drive')
    except Exception as mount_error:
        # Best effort: continue with local data, but report why the mount failed
        # instead of swallowing it silently (Ctrl-C now propagates as well).
        print(f"Google Drive mount failed: {mount_error}")

# Seed every random number generator used by the notebook.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Input / output locations on Drive; both can be overridden through environment variables.
DRIVE_DATA_DIR = os.environ.get("DRIVE_DATA_DIR", "/content/drive/MyDrive/")
DRIVE_OUTPUT_DIR = os.environ.get("DRIVE_OUTPUT_DIR", "/content/drive/MyDrive/LSTM/artifacts")

# Copy the training parquet from Drive into a local folder when it exists there.
os.makedirs("datasets", exist_ok=True)
src = os.path.join(DRIVE_DATA_DIR, "predict/train.parquet")
if os.path.exists(src):
    shutil.copy(src, "datasets/train.parquet")
# Prefer the local copy; otherwise expect train.parquet in the working directory.
PARQUET_PATH = "datasets/train.parquet" if os.path.exists("datasets/train.parquet") else "train.parquet"




Mounted at /content/drive


In [None]:
# Second attempt to mount Google Drive; the setup cell above already tries the same call.
# NOTE(review): the saved output of this cell is a MessageError, so this looks like a leftover retry.
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [3]:
WINDOW = 100  # number of past steps fed to the model for each next-step target
BATCH_SIZE = 192  # batch size of the train and validation DataLoaders
EPOCHS = 200  # upper bound on the number of training epochs
LR = 0.8e-3  # initial AdamW learning rate
MIN_LR = 0.0001  # training may stop once the scheduler has lowered the LR to this value
HIDDEN_SIZE = 192  # LSTM hidden width
NUM_LAYERS = 2  # number of stacked LSTM layers
DROPOUT = 0.4  # dropout passed to the LSTM and to the prediction head
PATIENCE = 20  # epochs without validation improvement before early stopping

# Train on the GPU when one is visible, otherwise on the CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

In [4]:
# Load the full dataset and make sure the bookkeeping columns are present.
df = pd.read_parquet(PARQUET_PATH)
required_cols = {"seq_ix", "step_in_seq", "need_prediction"}
missing = required_cols - set(df.columns)
if missing:
    raise ValueError(f"Нет колонок: {missing}")

# Every other column is treated as a feature. Names are ordered by their integer
# value, so each feature column name must be parseable by int().
feature_cols = [c for c in df.columns if c not in ["seq_ix", "step_in_seq", "need_prediction"]]
feature_cols = sorted(feature_cols, key=lambda x: int(x))

df = df.sort_values(["seq_ix", "step_in_seq"]).reset_index(drop=True)
# Shuffled 85/15 split of the sequence ids.
# NOTE(review): train_seq_ids / val_seq_ids are not used by the split below,
# which cuts every sequence by position instead.
all_seq = df["seq_ix"].unique()
rng = np.random.default_rng(SEED)
rng.shuffle(all_seq)
n_total = len(all_seq)
n_train = int(0.85 * n_total)
train_seq_ids = all_seq[:n_train]
val_seq_ids   = all_seq[n_train:]

# Per-row helpers: 0-based position inside the sequence, sequence length, the
# position where the last 15% starts, and that position moved back by WINDOW
# rows (never below 0). They are added to df itself and only dropped from the
# train_seq / val_seq frames.
df["_pos"] = df.groupby("seq_ix").cumcount()
df["_len"] = df.groupby("seq_ix")["_pos"].transform("max") + 1
df["_split"] = (0.85 * df["_len"]).astype(int)
df["_val_start"] = (df["_split"] - WINDOW).clip(lower=0)

# The first 85% of each sequence are training rows. Validation rows start WINDOW
# rows before the split (overlapping the training rows), presumably so that the
# first validation target has a full input window.
train_mask = df["_pos"] < df["_split"]
val_mask = df["_pos"] >= df["_val_start"]

train_seq = df.loc[train_mask].drop(columns=["_pos", "_len", "_split", "_val_start"]).reset_index(drop=True)

val_seq = df.loc[val_mask].copy()
# Overwrite need_prediction: True only for rows at or after the split,
# False for the leading context rows.
val_seq["need_prediction"] = val_seq["_pos"] >= val_seq["_split"]
val_seq = val_seq.drop(columns=["_pos", "_len", "_split", "_val_start"]).reset_index(drop=True)

In [None]:

# df = pd.read_parquet(PARQUET_PATH)
# required_cols = {"seq_ix", "step_in_seq", "need_prediction"}
# missing = required_cols - set(df.columns)
# if missing:
#     raise ValueError(f"Нет колонок: {missing}")
# feature_cols = [c for c in df.columns if c not in ["seq_ix", "step_in_seq", "need_prediction"]]
# feature_cols = sorted(feature_cols, key=lambda x: int(x))
# df = df.sort_values(["seq_ix", "step_in_seq"]).reset_index(drop=True)
# all_seq = df["seq_ix"].unique()
# rng = np.random.default_rng(SEED)
# rng.shuffle(all_seq)
# n_total = len(all_seq)
# n_train = int(0.85 * n_total)
# train_seq = all_seq[:n_train]
# val_seq   = all_seq[n_train:]


In [5]:

def subset_by_seq(d: pd.DataFrame, seq_ids):
    """Return an independent copy of the rows of ``d`` whose ``seq_ix`` is in ``seq_ids``."""
    belongs = d["seq_ix"].isin(seq_ids)
    selected = d.loc[belongs]
    return selected.copy()

# Sequence-level split: keep whole sequences, selected by id.
# `train_seq` / `val_seq` are DataFrames (the position-based split), not id
# collections, so passing them to `isin` does not select rows by sequence id;
# the shuffled id arrays are `train_seq_ids` / `val_seq_ids`.
df_train_raw = subset_by_seq(df, train_seq_ids)
df_val_raw   = subset_by_seq(df, val_seq_ids)


In [6]:
from typing import List, Tuple

# Use the position-based (within-sequence) train / validation frames.
# This replaces any df_train_raw / df_val_raw assigned by the previous cell.
df_train_raw = train_seq.copy()
df_val_raw   = val_seq.copy()

def build_windows(df_seq: pd.DataFrame, window: int) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Cut one sequence into (input window, next-step target) pairs.

    For every row index ``t >= window`` the pair is
    ``(features[t - window:t], features[t])``. A pair is kept only when
    ``step_in_seq`` increases by exactly 1 across all ``window + 1`` rows
    involved, i.e. there is no gap inside the window or before the target.

    Args:
        df_seq: Rows of a single sequence, already ordered by ``step_in_seq``.
        window: Number of input rows per pair.

    Returns:
        List of ``(X, y)`` tuples; ``X`` has ``window`` rows, ``y`` is one row.
        Feature columns are taken from the module-level ``feature_cols``.
    """
    values = df_seq[feature_cols].values
    step_ids = df_seq["step_in_seq"].values
    pairs = []
    for target in range(window, len(df_seq)):
        start = target - window
        # Steps of the window rows plus the target row must be consecutive.
        increments = np.diff(step_ids[start:target + 1])
        if (increments == 1).all():
            pairs.append((values[start:target], values[target]))
    return pairs

def build_all_windows(df_raw: pd.DataFrame, window: int) -> List[Tuple[np.ndarray, np.ndarray]]:
    """Build (window, target) pairs for every sequence in ``df_raw``.

    Rows are grouped by ``seq_ix``; each group is ordered by ``step_in_seq``
    and handed to ``build_windows``. The per-sequence results are concatenated
    in group order.
    """
    collected = []
    for _, group in df_raw.groupby("seq_ix"):
        ordered = group.sort_values("step_in_seq")
        collected += build_windows(ordered, window)
    return collected

# Materialise every (WINDOW-row input, next-row target) pair for both splits.
train_pairs = build_all_windows(df_train_raw, WINDOW)
val_pairs   = build_all_windows(df_val_raw, WINDOW)

print(f"Windows: train={len(train_pairs)}, val={len(val_pairs)}")


Windows: train=387750, val=77550


In [7]:

# Fit the StandardScaler on training data only: all window rows plus all target rows.
# Windows are concatenated row-wise, so a row that belongs to several windows
# is counted once per window it appears in.
X_train_stack = np.concatenate([p[0] for p in train_pairs], axis=0)
y_train_stack = np.stack([p[1] for p in train_pairs], axis=0)
all_train_for_scaler = np.vstack([X_train_stack, y_train_stack])
scaler = StandardScaler()
scaler.fit(all_train_for_scaler)

def scale_pairs(pairs):
    """Apply the fitted module-level ``scaler`` to every (window, target) pair.

    Args:
        pairs: Iterable of ``(X, y)`` where ``X`` is a 2D window and ``y`` a 1D target row.

    Returns:
        Two lists of equal length: scaled windows and scaled 1D targets.
    """
    scaled_windows, scaled_targets = [], []
    for window_rows, target_row in pairs:
        scaled_windows.append(scaler.transform(window_rows))
        # The scaler works on 2D input: lift the target to one row, then drop that axis again.
        target_2d = target_row.reshape(1, -1)
        scaled_targets.append(scaler.transform(target_2d).squeeze(0))
    return scaled_windows, scaled_targets

# Scale both splits with the scaler fitted on the training windows.
X_train, y_train = scale_pairs(train_pairs)
X_val,   y_val   = scale_pairs(val_pairs)


In [8]:

class WindowDataset(Dataset):
    """In-memory dataset of (input window, target) pairs stored as float32 tensors."""

    def __init__(self, X_list, y_list):
        """Convert every window in ``X_list`` and every target in ``y_list`` to a float32 tensor."""
        as_float32 = lambda array: torch.tensor(array, dtype=torch.float32)
        self.X = list(map(as_float32, X_list))
        self.y = list(map(as_float32, y_list))

    def __len__(self):
        """Number of stored pairs (taken from the window list)."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(window, target)`` tensors at position ``idx``."""
        window, target = self.X[idx], self.y[idx]
        return window, target

train_ds = WindowDataset(X_train, y_train)
val_ds   = WindowDataset(X_val, y_val)

# Training batches are reshuffled every epoch; validation keeps a fixed order.
# drop_last=False keeps the final, possibly smaller, batch in both loaders.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, drop_last=False)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, drop_last=False)

# The model predicts the full feature vector of the next step, so input and output widths match.
input_size = len(feature_cols)
output_size = len(feature_cols)


In [9]:

class LSTMModel(nn.Module):
    """Unidirectional LSTM encoder followed by an MLP head that predicts the next step.

    The head reads only the LSTM output at the last time step:
    LayerNorm -> Linear(d, 2d) -> GELU -> Dropout -> Linear(2d, output_size).
    """

    def __init__(self, input_size, hidden_size=256, num_layers=2, dropout=0.18, output_size=31):
        """Build the layers and apply the custom weight initialisation.

        Args:
            input_size: Number of features per time step.
            hidden_size: LSTM hidden width ``d``.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout inside the head and, when ``num_layers > 1``, in the LSTM.
            output_size: Number of predicted values.
        """
        super().__init__()
        # With a single layer the LSTM is created with dropout 0.
        lstm_dropout = 0.0 if num_layers <= 1 else dropout
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=lstm_dropout,
            batch_first=True,
            bidirectional=False,
        )
        width = hidden_size
        head_layers = [
            nn.LayerNorm(width),
            nn.Linear(width, width * 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(width * 2, output_size),
        ]
        self.head = nn.Sequential(*head_layers)
        self._init_weights()

    def _init_weights(self):
        """Orthogonal recurrent weights, Xavier input/linear weights, zero biases."""
        lstm_rules = (
            ("weight_hh", nn.init.orthogonal_),
            ("weight_ih", nn.init.xavier_uniform_),
            ("bias", nn.init.zeros_),
        )
        for name, param in self.lstm.named_parameters():
            # The first matching rule wins, mirroring an if/elif chain.
            for tag, init_fn in lstm_rules:
                if tag in name:
                    init_fn(param)
                    break
        for layer in self.head.modules():
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map a batch-first window batch to one prediction per sample.

        The head is applied to the LSTM output at the last time step only.
        """
        outputs, _ = self.lstm(x)
        last_step = outputs[:, -1, :]
        return self.head(last_step)


In [10]:
import torch

def _weighted_mse(y_pred, y_true):
    d = y_pred.shape[-1]
    w = torch.ones(d, device=y_pred.device)
    k = 5 if d >= 5 else d
    if k > 0:
        w[:k] = 1.5
    while w.dim() < y_pred.dim():
        w = w.unsqueeze(0)
    e = (y_pred - y_true) ** 2
    return (e * w).mean()

# Stage-1 model, built from the hyper-parameter constants and moved to DEVICE.
model = LSTMModel(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    output_size=output_size
).to(DEVICE)

optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
criterion = _weighted_mse

# Early-stopping bookkeeping: best validation loss, its weights, epochs since the last improvement.
best_val = float("inf")
best_state = None
no_improve = 0


In [11]:
import os, re, pickle, copy, time, torch, torch.nn as nn
from tqdm import tqdm

# Base paths for training artifacts. The training loop derives numbered file
# names (<stem>_NNN.<ext>) from these stems and writes them into the same directory.
MODEL_PATH = "/content/artifacts/common.pt"
SCALER_PATH = "/content/artifacts/common.pkl"
CKPT_PATH = "/content/artifacts/best.ckpt"

def _stem_and_dir(p):
    d = os.path.dirname(p)
    b = os.path.basename(p)
    stem, ext = os.path.splitext(b)
    return d if d else ".", stem

# All artifacts are written to the directory of MODEL_PATH; only the stems of the other two paths are used.
ART_DIR, MODEL_STEM = _stem_and_dir(MODEL_PATH)
_, SCALER_STEM = _stem_and_dir(SCALER_PATH)
_, CKPT_STEM = _stem_and_dir(CKPT_PATH)

def _next_index(d, stem):
    if not os.path.exists(d):
        return 1
    mx = 0
    for f in os.listdir(d):
        m = re.fullmatch(rf"{re.escape(stem)}_(\d{{3}})\.(pt|pkl|ckpt)", f)
        if m:
            mx = max(mx, int(m.group(1)))
    return mx + 1

def latest_ckpt(d, stem, fallback):
    """Return the path of the highest-numbered ``<stem>_<digits>.ckpt`` in ``d``.

    Args:
        d: Directory to scan (may not exist).
        stem: File stem of the numbered checkpoints.
        fallback: Path returned when no numbered checkpoint is found and this file exists.

    Returns:
        Path of the newest numbered checkpoint, else ``fallback`` if that file
        exists, else ``None``.

    Three or more digits are accepted so that checkpoints numbered 1000 and
    above are still found.
    """
    newest_path = None
    newest_index = -1
    if os.path.exists(d):
        pattern = re.compile(rf"{re.escape(stem)}_(\d{{3,}})\.ckpt")
        for filename in os.listdir(d):
            m = pattern.fullmatch(filename)
            if m and int(m.group(1)) > newest_index:
                newest_index = int(m.group(1))
                newest_path = os.path.join(d, filename)
    if newest_path is None and os.path.exists(fallback):
        return fallback
    return newest_path

# Bookkeeping for the best epoch seen so far.
best_val = float("inf")
best_state = None
best_optim_state = None  # not used by the loop below
best_sched_state = None  # not used by the loop below
no_improve = 0

os.makedirs(ART_DIR, exist_ok=True)

# Multiply the LR by 0.85 when the validation loss plateaus (scheduler patience of 1).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.85, patience=1)
prev_lr = optimizer.param_groups[0]["lr"]

epoch_bar = tqdm(range(1, EPOCHS + 1), desc="EP", position=0, leave=True)
# try/finally: the best weights are restored however the loop ends (all epochs
# done, early stopping, an error or a manual interrupt). Without it an
# interrupted run leaves the weights of the last, possibly worse, epoch in
# `model`, and the saving cell would export those.
try:
    for epoch in epoch_bar:
        # ---- training pass: sample-weighted mean of the batch losses ----
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb = xb.to(DEVICE)
            yb = yb.to(DEVICE)
            optimizer.zero_grad()
            yhat = model(xb)
            loss = criterion(yhat, yb)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_loss += loss.item() * xb.size(0)
        train_loss /= len(train_ds)

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(DEVICE)
                yb = yb.to(DEVICE)
                yhat = model(xb)
                loss = criterion(yhat, yb)
                val_loss += loss.item() * xb.size(0)
        val_loss /= len(val_ds) if len(val_ds) > 0 else 1

        scheduler.step(val_loss)
        current_lr = optimizer.param_groups[0]['lr']

        # When the scheduler has just lowered the LR, roll model / optimizer /
        # scheduler back to the latest saved checkpoint and continue from there
        # with the new, lower LR.
        if current_lr < prev_lr:
            ckpt_path = latest_ckpt(ART_DIR, CKPT_STEM, CKPT_PATH)
            if ckpt_path is not None and os.path.exists(ckpt_path):
                ckpt = torch.load(ckpt_path, map_location="cpu")
                if "model" in ckpt:
                    model.load_state_dict(ckpt["model"])
                if "optimizer" in ckpt and ckpt["optimizer"] is not None:
                    optimizer.load_state_dict(ckpt["optimizer"])
                # The restored optimizer state carries the old LR; re-apply the reduced one.
                for g in optimizer.param_groups:
                    g["lr"] = current_lr
                if "scheduler" in ckpt and ckpt["scheduler"] is not None:
                    try:
                        scheduler.load_state_dict(ckpt["scheduler"])
                    except Exception:
                        pass
                no_improve = 0
            prev_lr = current_lr

        epoch_bar.set_postfix(train_MSE=f"{train_loss:.6f}", val_MSE=f"{val_loss:.6f}", lr=f"{current_lr:.6f}")

        if val_loss + 1e-9 < best_val:
            # New best epoch: keep a CPU copy of the weights and write numbered artifacts.
            best_val = val_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}

            idx = _next_index(ART_DIR, MODEL_STEM)
            model_path_i = os.path.join(ART_DIR, f"{MODEL_STEM}_{idx:03d}.pt")
            scaler_path_i = os.path.join(ART_DIR, f"{SCALER_STEM}_{idx:03d}.pkl")
            ckpt_path_i = os.path.join(ART_DIR, f"{CKPT_STEM}_{idx:03d}.ckpt")

            torch.save(best_state, model_path_i)
            # The scaler is saved next to the weights; skipped if no `scaler` is defined.
            try:
                with open(scaler_path_i, "wb") as f:
                    pickle.dump(scaler, f)
            except NameError:
                pass

            # Start from the previous checkpoint's contents so any extra keys it holds are carried over.
            prev_meta = {}
            prev_ckpt_path = latest_ckpt(ART_DIR, CKPT_STEM, CKPT_PATH)
            if prev_ckpt_path is not None and os.path.exists(prev_ckpt_path):
                try:
                    prev_meta = torch.load(prev_ckpt_path, map_location="cpu")
                except Exception:
                    prev_meta = {}

            ckpt = dict(prev_meta)
            ckpt.update({
                "model": best_state,
                "optimizer": copy.deepcopy(optimizer.state_dict()),
                "scheduler": copy.deepcopy(scheduler.state_dict()),
                "best_val": best_val,
                "epoch": epoch,
            })
            if hasattr(model, "hidden_size"):
                ckpt["hidden_size"] = model.hidden_size
            if hasattr(model, "config"):
                ckpt["config"] = model.config

            torch.save(ckpt, ckpt_path_i)
            print(f"✅ Saved: {idx:03d}")
            no_improve = 0
        else:
            no_improve += 1
            # Stop after PATIENCE epochs without improvement or once the LR has decayed to MIN_LR.
            if no_improve >= PATIENCE or current_lr <= MIN_LR:
                print("Early stopping.")
                break
finally:
    if best_state is not None:
        model.load_state_dict(best_state)
model.to(DEVICE)

EP:   0%|          | 1/200 [02:16<7:33:19, 136.68s/it, lr=0.000800, train_MSE=0.732642, val_MSE=0.693151]

✅ Saved: 001


EP:   1%|          | 2/200 [04:32<7:29:36, 136.25s/it, lr=0.000800, train_MSE=0.692483, val_MSE=0.683246]

✅ Saved: 002


EP:   2%|▏         | 3/200 [06:48<7:26:51, 136.10s/it, lr=0.000800, train_MSE=0.683118, val_MSE=0.679133]

✅ Saved: 003


EP:   2%|▏         | 4/200 [09:04<7:24:20, 136.03s/it, lr=0.000800, train_MSE=0.677337, val_MSE=0.676086]

✅ Saved: 004


EP:   2%|▎         | 5/200 [11:20<7:22:08, 136.04s/it, lr=0.000800, train_MSE=0.672701, val_MSE=0.674832]

✅ Saved: 005


EP:   3%|▎         | 6/200 [13:36<7:19:44, 136.00s/it, lr=0.000800, train_MSE=0.669609, val_MSE=0.671433]

✅ Saved: 006


EP:   4%|▎         | 7/200 [15:52<7:17:20, 135.96s/it, lr=0.000800, train_MSE=0.666209, val_MSE=0.669610]

✅ Saved: 007


EP:   4%|▍         | 8/200 [18:08<7:15:00, 135.94s/it, lr=0.000800, train_MSE=0.663111, val_MSE=0.669368]

✅ Saved: 008


EP:   4%|▍         | 9/200 [20:24<7:12:35, 135.89s/it, lr=0.000800, train_MSE=0.660017, val_MSE=0.667733]

✅ Saved: 009


EP:   6%|▌         | 12/200 [27:11<7:05:41, 135.86s/it, lr=0.000680, train_MSE=0.654983, val_MSE=0.667307]

✅ Saved: 010


EP:   8%|▊         | 17/200 [38:30<6:54:24, 135.87s/it, lr=0.000491, train_MSE=0.647862, val_MSE=0.665909]

✅ Saved: 011


EP:  14%|█▍        | 28/200 [1:03:38<6:30:56, 136.37s/it, lr=0.000218, train_MSE=0.639083, val_MSE=0.665988]


KeyboardInterrupt: 

In [12]:

# Export the weights currently held by `model` together with the hyper-parameters
# needed to rebuild the network, plus the fitted scaler.
os.makedirs("artifacts", exist_ok=True)
torch.save({
    "state_dict": model.state_dict(),
    "hidden_size": HIDDEN_SIZE,
    "num_layers": NUM_LAYERS,
    "dropout": DROPOUT,
    "output_size": output_size,
    "window": WINDOW,
    "feature_cols": feature_cols
}, "artifacts/lstm_nextstep.pt")
joblib.dump(scaler, "artifacts/standard_scaler.pkl")
print("Сохранено: artifacts/lstm_nextstep.pt, artifacts/standard_scaler.pkl")
# Best-effort copy to Google Drive: a failure is printed, not raised.
try:
    os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)
    shutil.copy("artifacts/lstm_nextstep.pt", os.path.join(DRIVE_OUTPUT_DIR, "lstm_nextstep.pt"))
    shutil.copy("artifacts/standard_scaler.pkl", os.path.join(DRIVE_OUTPUT_DIR, "standard_scaler.pkl"))
    print("Скопировано в Google Drive")
except Exception as e:
    print(e)


Сохранено: artifacts/lstm_nextstep.pt, artifacts/standard_scaler.pkl
Скопировано в Google Drive


In [None]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from dataclasses import dataclass
from tqdm.auto import tqdm
from typing import List
from sklearn.metrics import r2_score
import torch
from torch import nn
import joblib

# Evaluation settings. These rebind PARQUET_PATH, MODEL_PATH, SCALER_PATH, SEED and
# DEVICE, which earlier cells also define.
PARQUET_PATH = "/content/datasets/train.parquet"
MODEL_PATH = "/content/artifacts/lstm_nextstep.pt"
SCALER_PATH = "/content/artifacts/standard_scaler.pkl"
FRAC_SEQ = 1  # fraction of eligible sequences to score (1 = all of them)
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

@dataclass
class DataPoint:
    """One dataset row as handed to the model by the scorer."""
    seq_ix: int  # identifier of the sequence the row belongs to
    step_in_seq: int  # step of the row inside its sequence
    need_prediction: bool  # whether the scorer requires a prediction after this row
    state: np.ndarray  # feature values of the row

class ScorerStepByStep:
    """Replays a dataset row by row through a model and scores next-step predictions with R²."""

    def __init__(self, dataset_path: str, frac_seq: float, seed: int, min_len: int):
        """Load the parquet file and pick the sequences to score.

        Args:
            dataset_path: Parquet file whose first three columns are assumed to be
                ``seq_ix``, ``step_in_seq`` and ``need_prediction``; the remaining
                columns are treated as features.
            frac_seq: Fraction of the eligible sequences to keep (at least one is kept).
            seed: Seed of the random sequence selection.
            min_len: Sequences with fewer rows are skipped.

        Raises:
            ValueError: If no sequence has at least ``min_len`` rows.
        """
        df = pd.read_parquet(dataset_path).sort_values(["seq_ix","step_in_seq"])
        groups = [g for _, g in df.groupby("seq_ix", sort=False) if len(g) >= min_len]
        n = len(groups)
        if n == 0:
            # Fail with an explicit message instead of the sampling error numpy would raise.
            raise ValueError(f"no sequence in {dataset_path} has at least {min_len} rows")
        rng = np.random.default_rng(seed)
        k = max(1, int(round(n * frac_seq)))
        idx = rng.choice(n, size=k, replace=False)
        subset = pd.concat([groups[i] for i in sorted(idx)], axis=0).reset_index(drop=True)
        self.dataset = subset
        self.dim = self.dataset.shape[1] - 3
        self.features = self.dataset.columns[3:]

    def score(self, model) -> dict:
        """Feed every row to ``model.predict`` and compare each prediction with the next row.

        A prediction returned for row ``t`` is scored against the state of row
        ``t + 1`` only when both rows belong to the same sequence. A prediction
        made on the last row of a sequence is dropped, not compared with the
        first row of the following sequence.

        Returns:
            Dict with one R² value per feature column plus ``"mean_r2"``;
            ``{"mean_r2": nan}`` when no prediction was collected.

        Raises:
            RuntimeError: If the model returns ``None`` for a row that needs a prediction.
        """
        predictions, targets = [], []
        next_prediction = None
        prev_seq_ix = None
        for row in tqdm(self.dataset.values):
            seq_ix, step_in_seq, need_prediction = row[:3]
            new_state = row[3:]
            if next_prediction is not None and seq_ix == prev_seq_ix:
                predictions.append(next_prediction)
                targets.append(new_state)
            prev_seq_ix = seq_ix
            dp = DataPoint(seq_ix, step_in_seq, need_prediction, new_state)
            next_prediction = model.predict(dp)
            if need_prediction and next_prediction is None:
                raise RuntimeError("predict returned None when need_prediction==1")
        if len(predictions) == 0:
            return {"mean_r2": np.nan}
        P = np.asarray(predictions, dtype=np.float32)
        T = np.asarray(targets, dtype=np.float32)
        scores = {f: r2_score(T[:, i], P[:, i]) for i, f in enumerate(self.features)}
        scores["mean_r2"] = float(np.nanmean(list(scores.values())))
        return scores

def _infer_num_layers_from_state_dict(sd):
    n = 0
    while f"lstm.weight_ih_l{n}" in sd:
        n += 1
    return max(n, 1)

def _infer_hidden_input_from_state_dict(sd):
    w_ih = sd.get("lstm.weight_ih_l0", None)
    if w_ih is None:
        return None, None
    hidden_size = w_ih.shape[0] // 4
    input_size = w_ih.shape[1]
    return hidden_size, input_size

class PredictionModel:
    def __init__(self, ckpt_path: str, scaler_path: str, device: str):
        self.device = device
        raw = torch.load(ckpt_path, map_location=device)
        if isinstance(raw, dict) and "state_dict" in raw:
            meta = raw
            state_dict = raw["state_dict"]
        elif isinstance(raw, dict):
            meta = {}
            state_dict = raw
        else:
            meta = {}
            state_dict = raw
        self.scaler = joblib.load(scaler_path)
        if hasattr(self.scaler, "n_features_in_"):
            nfeat = int(self.scaler.n_features_in_)
        elif hasattr(self.scaler, "mean_"):
            nfeat = int(self.scaler.mean_.shape[0])
        else:
            nfeat = None
        hidden_sd, input_sd = _infer_hidden_input_from_state_dict(state_dict)
        num_layers_sd = _infer_num_layers_from_state_dict(state_dict)
        input_size = int(meta.get("input_size", input_sd if input_sd is not None else nfeat))
        output_size = int(meta.get("output_size", nfeat))
        hidden_size = int(meta.get("hidden_size", hidden_sd if hidden_sd is not None else 256))
        num_layers = int(meta.get("num_layers", num_layers_sd))
        dropout = float(meta.get("dropout", 0.18))
        self.window = int(meta.get("window", 50))
        self.model = LSTMModel(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            output_size=output_size
        ).to(device)
        self.model.load_state_dict(state_dict, strict=True)
        self.model.eval()
        self.current_seq_ix = None
        self.sequence_history: List[np.ndarray] = []
        self.maxlen = self.window
        self.expected_dim = output_size
    def _reset_if_new_sequence(self, seq_ix: int):
        if self.current_seq_ix != seq_ix:
            self.current_seq_ix = seq_ix
            self.sequence_history = []
    def _append_state(self, state: np.ndarray):
        self.sequence_history.append(np.asarray(state, dtype=np.float32))
        if len(self.sequence_history) > self.maxlen:
            self.sequence_history = self.sequence_history[-self.maxlen:]
    @torch.no_grad()
    def _predict_from_window(self, window_np: np.ndarray) -> np.ndarray:
        window_scaled = self.scaler.transform(window_np)
        x = torch.from_numpy(window_scaled[None, ...]).to(self.device).float()
        yhat_scaled = self.model(x).detach().cpu().numpy()[0]
        yhat = self.scaler.inverse_transform(yhat_scaled.reshape(1, -1))[0]
        return yhat.astype(np.float32)
    def predict(self, data_point: DataPoint) -> np.ndarray | None:
        seq_ix = int(data_point.seq_ix)
        need_pred = int(data_point.need_prediction)
        self._reset_if_new_sequence(seq_ix)
        s = np.asarray(data_point.state, dtype=np.float32)
        if s.shape[0] != self.expected_dim:
            return None
        self._append_state(s)
        if need_pred != 1 or len(self.sequence_history) < self.maxlen:
            return None
        window_np = np.stack(self.sequence_history[-self.maxlen:], axis=0)
        return self._predict_from_window(window_np)

# The evaluation is deliberately restricted to CUDA; it aborts on a CPU-only runtime.
assert DEVICE=="cuda", "Нужен GPU: включи среду с CUDA (в Colab: Runtime → Change runtime type → GPU)."
# NOTE: `model` is rebound here from the trained nn.Module to the PredictionModel wrapper.
model = PredictionModel(MODEL_PATH, SCALER_PATH, DEVICE)
# Only sequences with at least window + 1 rows are scored.
scorer = ScorerStepByStep(PARQUET_PATH, frac_seq=FRAC_SEQ, seed=SEED, min_len=model.window+1)
print(f"Device: {DEVICE}, window={model.window}, rows={len(scorer.dataset)}")
res = scorer.score(model)
print(f"Mean R²: {res['mean_r2']:.6f}")
# Show the per-feature score of the first five feature columns only.
feats = list(scorer.features[:5])
for f in feats:
    print(f"{f}: {res[f]:.6f}")
print(f"Total features: {len(scorer.features)}")

Device: cuda, window=100, rows=517000


  0%|          | 0/517000 [00:00<?, ?it/s]

In [13]:
import torch
import pickle
import gzip

try:
    import joblib
except Exception:
    joblib = None

# Fine-tuning stage: start from the weights / scaler written with index 009.
MODEL_PATH = "/content/artifacts/common_009.pt"
SCALER_PATH = "/content/artifacts/common_009.pkl"

# Stage-2 overrides of the global learning rate and early-stopping patience.
LR = 5e-4
PATIENCE = 5

model = LSTMModel(
    input_size=input_size,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    dropout=DROPOUT,
    output_size=output_size
).to(DEVICE)

ckpt = torch.load(MODEL_PATH, map_location=DEVICE)
# Accept a wrapped checkpoint ("state_dict" / "model_state_dict") or a bare state dict.
state_dict = ckpt.get("state_dict", ckpt.get("model_state_dict", ckpt))
# Strip wrapper prefixes only when they really are prefixes. str.replace would
# also rewrite a "module." / "model." that occurs in the middle of a key.
state_dict = {k.removeprefix("module."): v for k, v in state_dict.items()}
state_dict = {k.removeprefix("model."): v for k, v in state_dict.items()}
# strict=False tolerates key mismatches; report them so that a checkpoint that
# does not match the architecture is not loaded silently.
load_result = model.load_state_dict(state_dict, strict=False)
if load_result.missing_keys or load_result.unexpected_keys:
    print(
        f"WARNING: checkpoint {MODEL_PATH} does not fully match the model: "
        f"missing={list(load_result.missing_keys)}, unexpected={list(load_result.unexpected_keys)}"
    )
model.eval()

def load_artifact(path, map_location=None):
    """Load a serialized object from ``path``, trying several formats in turn.

    Order of attempts: joblib (when available), ``torch.load``, plain pickle,
    gzip-compressed pickle. The first loader that succeeds wins; failures of
    the earlier loaders are ignored.

    NOTE: every loader here can unpickle arbitrary objects; only use this on
    trusted files.

    Raises:
        RuntimeError: If all loaders fail; the message carries the error of the last (gzip) attempt.
    """
    def _plain_pickle():
        with open(path, "rb") as f:
            return pickle.load(f)

    attempts = []
    if joblib is not None:
        attempts.append(lambda: joblib.load(path))
    attempts.append(lambda: torch.load(path, map_location=map_location))
    attempts.append(_plain_pickle)

    for attempt in attempts:
        try:
            return attempt()
        except Exception:
            continue

    # Last resort: gzip-compressed pickle. Only this failure is surfaced.
    try:
        with gzip.open(path, "rb") as f:
            return pickle.load(f)
    except Exception as e:
        raise RuntimeError(f"Не удалось загрузить артефакт {path}: {e}")

# Replace the in-memory scaler with the one stored next to the checkpoint being fine-tuned.
scaler = load_artifact(SCALER_PATH, map_location=DEVICE)

# N_IDX = 1
# N_WEIGHT = 5.0

# def _weighted_mse(y_pred, y_true):
#     d = y_pred.shape[-1]
#     w = torch.ones(d, device=y_pred.device)
#     if 0 <= N_IDX < d:
#         w[N_IDX] = N_WEIGHT
#     while w.dim() < y_pred.dim():
#         w = w.unsqueeze(0)
#     e = (y_pred - y_true) ** 2
#     return (e * w).mean()

def weighted_mse_adaptive_focus(y_pred, y_true, k=5, max_bias=3.0, eps=1e-8):
    """Squared-error loss with per-feature weights derived from the current batch.

    Steps:
      1. Per-feature mean squared error over axis 0 of the batch.
      2. Weight = that error relative to its mean over features, limited to [0.5, 2.0].
      3. The first ``k`` features are additionally multiplied by ``max_bias``.
      4. Final weights are capped at 3.0 and applied to the element-wise squared error.

    The weights are computed from ``y_pred`` without detaching, so gradients
    also flow through them.

    Returns:
        Scalar tensor: mean of the weighted squared errors.
    """
    squared_error = (y_pred - y_true).pow(2)
    per_feature = squared_error.mean(dim=0, keepdim=True)
    relative = per_feature / (per_feature.mean() + eps)
    weights = torch.clamp(relative, min=0.5, max=2.0)
    focus = torch.ones_like(weights)
    if k > 0:
        focus[..., :k] = max_bias
    weights = torch.clamp(weights * focus, max=3.0)
    return (squared_error * weights).mean()



# Stage 2 uses the adaptive loss and a fresh AdamW optimizer with the stage-2 LR.
# Validation losses of this stage are computed with this criterion, not with _weighted_mse.
criterion = weighted_mse_adaptive_focus
optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
# Reset the early-stopping bookkeeping for the new stage.
best_val = float('inf')
best_state = None
no_improve = 0

In [18]:
# Multiply the LR by 0.7 when the validation loss plateaus (scheduler patience of 1).
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.7, patience=1)
print("=== Model and Training Parameters ===")
for k, v in {
    "WINDOW": WINDOW,
    "BATCH_SIZE": BATCH_SIZE,
    "EPOCHS": EPOCHS,
    "LR": LR,
    "HIDDEN_SIZE": HIDDEN_SIZE,
    "NUM_LAYERS": NUM_LAYERS,
    "DROPOUT": DROPOUT,
    "PATIENCE": PATIENCE,
    "DEVICE": DEVICE,
}.items():
    print(f"{k:>12}: {v}")

epoch_bar = tqdm(range(1, EPOCHS + 1), desc="Epochs")
# try/finally: the best weights are restored however the loop ends (all epochs
# done, early stopping, an error or a manual interrupt), so the saving cell
# never exports the weights of a later, worse epoch.
try:
    for epoch in epoch_bar:
        t0 = time.time()
        # ---- training pass: sample-weighted mean of the batch losses ----
        model.train()
        train_loss = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            optimizer.zero_grad()
            yhat = model(xb)
            loss = criterion(yhat, yb)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            train_loss += loss.item() * xb.size(0)
        train_loss /= len(train_ds)

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for xb, yb in val_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                yhat = model(xb)
                loss = criterion(yhat, yb)
                val_loss += loss.item() * xb.size(0)
        val_loss /= len(val_ds) if len(val_ds) > 0 else 1

        dt = time.time() - t0
        print(f"Epoch {epoch:3d} | train MSE: {train_loss:.6f} | val MSE: {val_loss:.6f} | time {dt:.1f}s")
        scheduler.step(val_loss)

        if val_loss + 1e-9 < best_val:
            # New best epoch: keep a CPU copy of the weights.
            best_val = val_loss
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= PATIENCE:
                print("Early stopping.")
                break
finally:
    if best_state is not None:
        model.load_state_dict(best_state)
model.to(DEVICE)

=== Model and Training Parameters ===
      WINDOW: 100
  BATCH_SIZE: 192
      EPOCHS: 200
          LR: 0.0005
 HIDDEN_SIZE: 192
  NUM_LAYERS: 2
     DROPOUT: 0.4
    PATIENCE: 5
      DEVICE: cuda


Epochs:   0%|          | 1/200 [02:16<7:31:08, 136.02s/it]

Epoch   1 | train MSE: 0.799804 | val MSE: 0.847319 | time 136.0s


Epochs:   1%|          | 2/200 [04:31<7:28:39, 135.96s/it]

Epoch   2 | train MSE: 0.795431 | val MSE: 0.848596 | time 135.9s


Epochs:   2%|▏         | 3/200 [06:47<7:26:16, 135.92s/it]

Epoch   3 | train MSE: 0.790971 | val MSE: 0.847486 | time 135.9s


Epochs:   2%|▏         | 4/200 [09:03<7:23:52, 135.88s/it]

Epoch   4 | train MSE: 0.782749 | val MSE: 0.851550 | time 135.8s


Epochs:   2%|▎         | 5/200 [11:19<7:21:36, 135.88s/it]

Epoch   5 | train MSE: 0.778067 | val MSE: 0.859431 | time 135.9s


Epochs:   2%|▎         | 5/200 [13:35<8:49:57, 163.06s/it]

Epoch   6 | train MSE: 0.770740 | val MSE: 0.858098 | time 135.8s
Early stopping.





LSTMModel(
  (lstm): LSTM(32, 192, num_layers=2, batch_first=True, dropout=0.4)
  (head): Sequential(
    (0): LayerNorm((192,), eps=1e-05, elementwise_affine=True)
    (1): Linear(in_features=192, out_features=384, bias=True)
    (2): GELU(approximate='none')
    (3): Dropout(p=0.4, inplace=False)
    (4): Linear(in_features=384, out_features=32, bias=True)
  )
)

In [19]:
# `model` may be the raw network or a wrapper exposing it as `.model`; save the network's weights either way.
net = getattr(model, "model", model)
os.makedirs("artifacts", exist_ok=True)
# Same file names as the stage-1 export, so this overwrites those artifacts.
torch.save({
    "state_dict": net.state_dict(),
    "hidden_size": HIDDEN_SIZE,
    "num_layers": NUM_LAYERS,
    "dropout": DROPOUT,
    "output_size": output_size,
    "window": WINDOW,
    "feature_cols": feature_cols
}, "artifacts/lstm_nextstep.pt")
joblib.dump(scaler, "artifacts/standard_scaler.pkl")
print("Сохранено: artifacts/lstm_nextstep.pt, artifacts/standard_scaler.pkl")
# Best-effort copy to Google Drive: a failure is printed, not raised.
try:
    os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)
    shutil.copy("artifacts/lstm_nextstep.pt", os.path.join(DRIVE_OUTPUT_DIR, "lstm_nextstep.pt"))
    shutil.copy("artifacts/standard_scaler.pkl", os.path.join(DRIVE_OUTPUT_DIR, "standard_scaler.pkl"))
    print("Скопировано в Google Drive")
except Exception as e:
    print(e)

Сохранено: artifacts/lstm_nextstep.pt, artifacts/standard_scaler.pkl
Скопировано в Google Drive


In [20]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from dataclasses import dataclass
from tqdm.auto import tqdm
from typing import List
from sklearn.metrics import r2_score
import torch
from torch import nn
import joblib

# Evaluation settings. These rebind PARQUET_PATH, MODEL_PATH, SCALER_PATH, SEED and
# DEVICE, which earlier cells also define.
PARQUET_PATH = "/content/datasets/train.parquet"
MODEL_PATH = "/content/artifacts/lstm_nextstep.pt"
SCALER_PATH = "/content/artifacts/standard_scaler.pkl"
FRAC_SEQ = 1  # fraction of eligible sequences to score (1 = all of them)
SEED = 42
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

@dataclass
class DataPoint:
    """One dataset row as handed to the model by the scorer."""
    seq_ix: int  # identifier of the sequence the row belongs to
    step_in_seq: int  # step of the row inside its sequence
    need_prediction: bool  # whether the scorer requires a prediction after this row
    state: np.ndarray  # feature values of the row

class ScorerStepByStep:
    """Replays a dataset row by row through a model and scores next-step predictions with R²."""

    def __init__(self, dataset_path: str, frac_seq: float, seed: int, min_len: int):
        """Load the parquet file and pick the sequences to score.

        Args:
            dataset_path: Parquet file whose first three columns are assumed to be
                ``seq_ix``, ``step_in_seq`` and ``need_prediction``; the remaining
                columns are treated as features.
            frac_seq: Fraction of the eligible sequences to keep (at least one is kept).
            seed: Seed of the random sequence selection.
            min_len: Sequences with fewer rows are skipped.

        Raises:
            ValueError: If no sequence has at least ``min_len`` rows.
        """
        df = pd.read_parquet(dataset_path).sort_values(["seq_ix","step_in_seq"])
        groups = [g for _, g in df.groupby("seq_ix", sort=False) if len(g) >= min_len]
        n = len(groups)
        if n == 0:
            # Fail with an explicit message instead of the sampling error numpy would raise.
            raise ValueError(f"no sequence in {dataset_path} has at least {min_len} rows")
        rng = np.random.default_rng(seed)
        k = max(1, int(round(n * frac_seq)))
        idx = rng.choice(n, size=k, replace=False)
        subset = pd.concat([groups[i] for i in sorted(idx)], axis=0).reset_index(drop=True)
        self.dataset = subset
        self.dim = self.dataset.shape[1] - 3
        self.features = self.dataset.columns[3:]

    def score(self, model) -> dict:
        """Feed every row to ``model.predict`` and compare each prediction with the next row.

        A prediction returned for row ``t`` is scored against the state of row
        ``t + 1`` only when both rows belong to the same sequence. A prediction
        made on the last row of a sequence is dropped, not compared with the
        first row of the following sequence.

        Returns:
            Dict with one R² value per feature column plus ``"mean_r2"``;
            ``{"mean_r2": nan}`` when no prediction was collected.

        Raises:
            RuntimeError: If the model returns ``None`` for a row that needs a prediction.
        """
        predictions, targets = [], []
        next_prediction = None
        prev_seq_ix = None
        for row in tqdm(self.dataset.values):
            seq_ix, step_in_seq, need_prediction = row[:3]
            new_state = row[3:]
            if next_prediction is not None and seq_ix == prev_seq_ix:
                predictions.append(next_prediction)
                targets.append(new_state)
            prev_seq_ix = seq_ix
            dp = DataPoint(seq_ix, step_in_seq, need_prediction, new_state)
            next_prediction = model.predict(dp)
            if need_prediction and next_prediction is None:
                raise RuntimeError("predict returned None when need_prediction==1")
        if len(predictions) == 0:
            return {"mean_r2": np.nan}
        P = np.asarray(predictions, dtype=np.float32)
        T = np.asarray(targets, dtype=np.float32)
        scores = {f: r2_score(T[:, i], P[:, i]) for i, f in enumerate(self.features)}
        scores["mean_r2"] = float(np.nanmean(list(scores.values())))
        return scores

class LSTMModel(nn.Module):
    """Unidirectional LSTM encoder followed by an MLP head that predicts the next step.

    The head reads only the LSTM output at the last time step:
    LayerNorm -> Linear(d, 2d) -> GELU -> Dropout -> Linear(2d, output_size).
    """

    def __init__(self, input_size, hidden_size=256, num_layers=2, dropout=0.18, output_size=31):
        """Build the layers and apply the custom weight initialisation.

        Args:
            input_size: Number of features per time step.
            hidden_size: LSTM hidden width ``d``.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout inside the head and, when ``num_layers > 1``, in the LSTM.
            output_size: Number of predicted values.
        """
        super().__init__()
        # With a single layer the LSTM is created with dropout 0.
        lstm_dropout = 0.0 if num_layers <= 1 else dropout
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=lstm_dropout,
            batch_first=True,
            bidirectional=False,
        )
        width = hidden_size
        head_layers = [
            nn.LayerNorm(width),
            nn.Linear(width, width * 2),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(width * 2, output_size),
        ]
        self.head = nn.Sequential(*head_layers)
        self._init_weights()

    def _init_weights(self):
        """Orthogonal recurrent weights, Xavier input/linear weights, zero biases."""
        lstm_rules = (
            ("weight_hh", nn.init.orthogonal_),
            ("weight_ih", nn.init.xavier_uniform_),
            ("bias", nn.init.zeros_),
        )
        for name, param in self.lstm.named_parameters():
            # The first matching rule wins, mirroring an if/elif chain.
            for tag, init_fn in lstm_rules:
                if tag in name:
                    init_fn(param)
                    break
        for layer in self.head.modules():
            if not isinstance(layer, nn.Linear):
                continue
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                nn.init.zeros_(layer.bias)

    def forward(self, x):
        """Map a batch-first window batch to one prediction per sample.

        The head is applied to the LSTM output at the last time step only.
        """
        outputs, _ = self.lstm(x)
        last_step = outputs[:, -1, :]
        return self.head(last_step)

def _infer_num_layers_from_state_dict(sd):
    n = 0
    while f"lstm.weight_ih_l{n}" in sd:
        n += 1
    return max(n, 1)

def _infer_hidden_input_from_state_dict(sd):
    w_ih = sd.get("lstm.weight_ih_l0", None)
    if w_ih is None:
        return None, None
    hidden_size = w_ih.shape[0] // 4
    input_size = w_ih.shape[1]
    return hidden_size, input_size

class PredictionModel:
    def __init__(self, ckpt_path: str, scaler_path: str, device: str):
        self.device = device
        raw = torch.load(ckpt_path, map_location=device)
        if isinstance(raw, dict) and "state_dict" in raw:
            meta = raw
            state_dict = raw["state_dict"]
        elif isinstance(raw, dict):
            meta = {}
            state_dict = raw
        else:
            meta = {}
            state_dict = raw
        self.scaler = joblib.load(scaler_path)
        if hasattr(self.scaler, "n_features_in_"):
            nfeat = int(self.scaler.n_features_in_)
        elif hasattr(self.scaler, "mean_"):
            nfeat = int(self.scaler.mean_.shape[0])
        else:
            nfeat = None
        hidden_sd, input_sd = _infer_hidden_input_from_state_dict(state_dict)
        num_layers_sd = _infer_num_layers_from_state_dict(state_dict)
        input_size = int(meta.get("input_size", input_sd if input_sd is not None else nfeat))
        output_size = int(meta.get("output_size", nfeat))
        hidden_size = int(meta.get("hidden_size", hidden_sd if hidden_sd is not None else 256))
        num_layers = int(meta.get("num_layers", num_layers_sd))
        dropout = float(meta.get("dropout", 0.18))
        self.window = int(meta.get("window", 50))
        self.model = LSTMModel(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            dropout=dropout,
            output_size=output_size
        ).to(device)
        self.model.load_state_dict(state_dict, strict=True)
        self.model.eval()
        self.current_seq_ix = None
        self.sequence_history: List[np.ndarray] = []
        self.maxlen = self.window
        self.expected_dim = output_size
    def _reset_if_new_sequence(self, seq_ix: int):
        if self.current_seq_ix != seq_ix:
            self.current_seq_ix = seq_ix
            self.sequence_history = []
    def _append_state(self, state: np.ndarray):
        self.sequence_history.append(np.asarray(state, dtype=np.float32))
        if len(self.sequence_history) > self.maxlen:
            self.sequence_history = self.sequence_history[-self.maxlen:]
    @torch.no_grad()
    def _predict_from_window(self, window_np: np.ndarray) -> np.ndarray:
        window_scaled = self.scaler.transform(window_np)
        x = torch.from_numpy(window_scaled[None, ...]).to(self.device).float()
        yhat_scaled = self.model(x).detach().cpu().numpy()[0]
        yhat = self.scaler.inverse_transform(yhat_scaled.reshape(1, -1))[0]
        return yhat.astype(np.float32)
    def predict(self, data_point: DataPoint) -> np.ndarray | None:
        seq_ix = int(data_point.seq_ix)
        need_pred = int(data_point.need_prediction)
        self._reset_if_new_sequence(seq_ix)
        s = np.asarray(data_point.state, dtype=np.float32)
        if s.shape[0] != self.expected_dim:
            return None
        self._append_state(s)
        if need_pred != 1 or len(self.sequence_history) < self.maxlen:
            return None
        window_np = np.stack(self.sequence_history[-self.maxlen:], axis=0)
        return self._predict_from_window(window_np)

# The evaluation is deliberately restricted to CUDA; it aborts on a CPU-only runtime.
assert DEVICE=="cuda", "Нужен GPU: включи среду с CUDA (в Colab: Runtime → Change runtime type → GPU)."
# NOTE: `model` is rebound here from the trained nn.Module to the PredictionModel wrapper.
model = PredictionModel(MODEL_PATH, SCALER_PATH, DEVICE)
# Only sequences with at least window + 1 rows are scored.
scorer = ScorerStepByStep(PARQUET_PATH, frac_seq=FRAC_SEQ, seed=SEED, min_len=model.window+1)
print(f"Device: {DEVICE}, window={model.window}, rows={len(scorer.dataset)}")
res = scorer.score(model)
print(f"Mean R²: {res['mean_r2']:.6f}")
# Show the per-feature score of the first five feature columns only.
feats = list(scorer.features[:5])
for f in feats:
    print(f"{f}: {res[f]:.6f}")
print(f"Total features: {len(scorer.features)}")

Device: cuda, window=100, rows=517000


  0%|          | 0/517000 [00:00<?, ?it/s]

KeyboardInterrupt: 