In [31]:
import os
import random
import numpy as np
import pandas as pd
from dataclasses import dataclass

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [32]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(42)

In [33]:
class CFG:
    path_train_x: str = "train_x.csv"
    path_train_y: str = "train_y.csv"
    path_test_x: str  = "test_x.csv"

    test_size: float = 0.2
    batch_size: int = 256
    epochs: int = 100
    lr: float = 1.5e-3
    weight_decay: float = 1e-4
    pct_start: float = 0.2
    max_grad_norm: float = 1.0
    patience: int = 20
    seed: int = 42

cfg = CFG()
set_seed(cfg.seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cpu


Загрузка данных

In [21]:
X_train = pd.read_csv(cfg.path_train_x, index_col=0)
y_train = pd.read_csv(cfg.path_train_y, index_col=0)['year']
X_test  = pd.read_csv(cfg.path_test_x).set_index("id")

assert X_train.shape[1] == X_test.shape[1], "train/test feature mismatch"

X_train.head()


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,80,81,82,83,84,85,86,87,88,89
133081,37.47518,-14.34414,40.54872,-9.10171,16.77175,-17.77113,-16.50156,-4.09543,2.49723,-0.46428,...,104.03557,-115.62803,-13.7966,31.60436,28.95927,-25.93164,67.6467,-25.76691,-81.90373,-61.48682
111484,48.17393,-7.02208,-30.36086,-2.41924,2.15406,-8.44502,-1.68191,-8.71434,-7.83802,-5.58019,...,11.58664,24.5895,-36.95682,0.73922,-0.0633,53.75838,-81.0533,8.42811,12.08694,-1.91676
448402,36.58141,26.03203,-4.92774,35.7162,8.5308,3.73167,-7.98443,-7.43976,-1.69797,10.75028,...,23.6119,-366.07968,-62.38201,113.48188,4.72741,181.64459,-134.46216,8.50795,94.15573,-8.47276
254414,41.15615,-17.77029,-32.30961,-21.03778,12.8033,-13.48031,-3.14951,-7.62647,-4.48901,-4.29075,...,25.11398,-79.64532,-77.08169,38.88094,28.52025,24.17783,-86.62542,-1.19418,-74.73449,-17.2813
272580,40.54855,78.77563,-23.29877,98.60192,-30.11496,26.9422,-8.87771,-3.2328,-1.04841,31.69655,...,-9.89388,-252.61021,118.93768,-155.8739,51.85666,-365.15815,59.34936,52.47311,99.00695,-10.1884


Трейн/валидация + масштабирование
Разделение train на обучающую и валидационную выборки (train_test_split). Масштабирование признаков X стандартным скейлером (StandardScaler) и отдельное масштабирование целевой переменной y

In [34]:
X_tr_df, X_val_df, y_tr_ser, y_val_ser = train_test_split(
    X_train, y_train, test_size=cfg.test_size, random_state=cfg.seed
)

# scalers
x_scaler = StandardScaler()
y_scaler = StandardScaler()

X_tr = x_scaler.fit_transform(X_tr_df.values)
X_val = x_scaler.transform(X_val_df.values)
X_te  = x_scaler.transform(X_test.values)

y_tr = y_scaler.fit_transform(y_tr_ser.values.reshape(-1, 1)).ravel()
y_val = y_scaler.transform(y_val_ser.values.reshape(-1, 1)).ravel()

Датасет для PyTorch
Определяется класс SongsDS (наследует torch.utils.data.Dataset): возвращает пары (X, y) для батчевой загрузки. Это обёртка над numpy-массивами/тензорами для использования с DataLoader.

In [35]:
class SongsDS(Dataset):
    def __init__(self, X: np.ndarray, y: np.ndarray | None = None):
        self.X = X.astype(np.float32)
        self.y = None if y is None else y.astype(np.float32)

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        if self.y is None:
            return torch.from_numpy(self.X[i])
        return torch.from_numpy(self.X[i]), torch.from_numpy(np.array(self.y[i]))

ds_tr  = SongsDS(X_tr, y_tr)
ds_val = SongsDS(X_val, y_val)
ds_te  = SongsDS(X_te, None)

dl_tr  = DataLoader(ds_tr,  batch_size=cfg.batch_size, shuffle=True,  num_workers=0, drop_last=False)
dl_val = DataLoader(ds_val, batch_size=cfg.batch_size, shuffle=False, num_workers=0, drop_last=False)
dl_te  = DataLoader(ds_te,  batch_size=cfg.batch_size, shuffle=False, num_workers=0, drop_last=False)



Архитектура модели (Residual MLP)
ResidualBlock — блок с пропуском (skip-connection) и нелинейностью. ResidualMLP — многослойная полносвязная сеть с несколькими такими блоками, на выходе один нейрон (регрессия года).

In [36]:
class ResidualBlock(nn.Module):
    def __init__(self, dim: int, p: float = 0.1):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, dim),
            nn.GELU(),
            nn.Dropout(p),
            nn.Linear(dim, dim),
            nn.LayerNorm(dim),
        )
    def forward(self, x):
        return x + self.net(x)

class ResidualMLP(nn.Module):
    def __init__(self, in_dim: int):
        super().__init__()
        widths = [in_dim, 256, 128, 64]
        layers = []
        layers += [nn.Linear(widths[0], widths[1]), nn.LayerNorm(widths[1]), nn.GELU(), nn.Dropout(0.15)]
        layers += [ResidualBlock(widths[1], p=0.10)]
        layers += [nn.Linear(widths[1], widths[2]), nn.LayerNorm(widths[2]), nn.GELU(), nn.Dropout(0.15)]
        layers += [ResidualBlock(widths[2], p=0.10)]
        layers += [nn.Linear(widths[2], widths[3]), nn.LayerNorm(widths[3]), nn.GELU(), nn.Dropout(0.10)]
        layers += [nn.Linear(widths[3], 1)]
        self.model = nn.Sequential(*layers)
        self._init_weights_xavier()

    def _init_weights_xavier(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight)
                nn.init.zeros_(m.bias)

    def forward(self, x):
        return self.model(x).squeeze(-1)

Оптимизатор SAM
Реализация SAM (Sharpness-Aware Minimization) как обёртки над базовым оптимизатором (AdamW): первый проход делает «выпуклую» шаговую оценку по направлению увеличения лосса, второй — обычный шаг базового оптимизатора. Это помогает находить более плоские минимумы и лучше обобщать.

In [None]:
class SAM(optim.Optimizer):
    def __init__(self, params, base_optimizer, rho=0.05, adaptive=True, **kwargs):
        assert rho >= 0.0
        if not isinstance(base_optimizer, type):
            raise ValueError("base_optimizer должен быть классом, а не инстансом.")
        self.base_optimizer = base_optimizer(params, **kwargs)
        self.rho = rho
        self.adaptive = adaptive
        defaults = dict(rho=rho, adaptive=adaptive, **kwargs)
        super().__init__(self.base_optimizer.param_groups, defaults)

    @torch.no_grad()
    def first_step(self, zero_grad: bool = True):
        eps = 1e-12
        for group in self.base_optimizer.param_groups:
            scale = self.rho / (self._grad_norm(group) + eps)
            for p in group['params']:
                if p.grad is None:
                    continue
                e_w = (torch.pow(p, 2) if self.adaptive else 1.0) * p.grad * scale
                self.state[p]['e_w'] = e_w
                p.add_(e_w)
        if zero_grad:
            self.base_optimizer.zero_grad(set_to_none=True)

    @torch.no_grad()
    def second_step(self, zero_grad: bool = True):
        for group in self.base_optimizer.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                p.sub_(self.state[p]['e_w'])
        self.base_optimizer.step()
        if zero_grad:
            self.base_optimizer.zero_grad(set_to_none=True)

    def step(self):
        raise NotImplementedError("SAM: вызывай first_step/second_step явно.")

    @staticmethod
    def _grad_norm(group):
        norms = []
        for p in group['params']:
            if p.grad is None:
                continue
            g = p.grad.coalesce().values() if p.grad.is_sparse else p.grad
            scale = torch.abs(p) if group.get('adaptive', True) else 1.0
            norms.append(torch.norm(scale * g))
        if not norms:
            # на случай пустых градиентов
            device = group['params'][0].device
            return torch.tensor(0., device=device)
        return torch.norm(torch.stack(norms))

Оценка MSE в реальном масштабе
evaluate_mse_real_scale считает MSE в исходном масштабе целевой: предсказания модели из валидатора преобразуются обратно через y_scaler.inverse_transform, затем mean_squared_error(y_true, y_pred).

In [38]:
def evaluate_mse_real_scale(model: nn.Module, dl: DataLoader, y_scaler: StandardScaler) -> float:
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in dl:
            xb = xb.to(device)
            yb = yb.to(device)
            yhat = model(xb).detach().cpu().numpy()
            yb_np = yb.detach().cpu().numpy()
            yhat_real = y_scaler.inverse_transform(yhat.reshape(-1, 1)).ravel()
            y_real    = y_scaler.inverse_transform(yb_np.reshape(-1, 1)).ravel()
            preds.append(yhat_real)
            trues.append(y_real)
    preds = np.concatenate(preds)
    trues = np.concatenate(trues)
    return mean_squared_error(trues, preds)

Ранняя остановка
Класс EarlyStopper: ранняя остановка по метрике валидации с параметрами patience и min_delta. Если улучшений нет заданное число эпох, обучение прерывается, запоминается лучшая метрика.

In [39]:
class EarlyStopper:
    def __init__(self, patience: int = 20, min_delta: float = 0.0):
        self.patience = patience
        self.min_delta = min_delta
        self.best = np.inf
        self.count = 0
        self.best_state = None

    def step(self, value, model):
        if value < self.best - self.min_delta:
            self.best = value
            self.count = 0
            self.best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
            return False
        else:
            self.count += 1
            return self.count > self.patience

    def load_best(self, model):
        if self.best_state is not None:
            model.load_state_dict(self.best_state)

Функция обучения
train_model(...): создаёт ResidualMLP, лосс MSE, выбирает оптимизатор (AdamW или SAM) и планировщик OneCycleLR. Цикл обучения: train-шаги, step шедулера, оценка на валидации (в реальном масштабе), логирование истории, ранняя остановка, сохранение лучшей модели. Возвращает (best_val_mse, history, best_model).

In [None]:
def train_model(optimizer_name: str = "AdamW", epochs: int = 100,
                sam_rho: float = 0.05, sam_adaptive: bool = True):

    model = ResidualMLP(in_dim=X_tr.shape[1]).to(device)
    criterion = nn.MSELoss()

    if optimizer_name == "AdamW":
        optimizer = optim.AdamW(model.parameters(), lr=cfg.lr, weight_decay=cfg.weight_decay)
        scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=cfg.lr, pct_start=cfg.pct_start,
            steps_per_epoch=len(dl_tr), epochs=epochs,
            div_factor=10.0, final_div_factor=1e3,
        )
    elif optimizer_name == "SAM":
        optimizer = SAM(
            model.parameters(),
            base_optimizer=optim.AdamW,
            lr=cfg.lr, weight_decay=cfg.weight_decay,
            rho=sam_rho, adaptive=sam_adaptive,
        )
        scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer.base_optimizer,        
            max_lr=cfg.lr, pct_start=cfg.pct_start,
            steps_per_epoch=len(dl_tr), epochs=epochs,
            div_factor=10.0, final_div_factor=1e3,
        )
    else:
        raise ValueError("optimizer_name должен быть 'AdamW' или 'SAM'")

    history = []
    early = EarlyStopper(patience=cfg.patience, min_delta=0.0)

    for epoch in range(1, epochs + 1):
        model.train()
        running = 0.0

        for xb, yb in dl_tr:
            xb = xb.to(device)
            yb = yb.to(device)

            if optimizer_name == "AdamW":
                optimizer.zero_grad(set_to_none=True)
                yhat = model(xb)
                loss = criterion(yhat, yb)
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
                optimizer.step()

            else:  # SAM
                # step 1
                optimizer.base_optimizer.zero_grad(set_to_none=True)
                yhat = model(xb)
                loss = criterion(yhat, yb)
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
                optimizer.first_step(zero_grad=True)

                # step 2
                yhat2 = model(xb)
                loss2 = criterion(yhat2, yb)
                loss2.backward()
                nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
                optimizer.second_step(zero_grad=True)

            running += loss.item() * xb.size(0)
            scheduler.step()

        train_loss = running / len(ds_tr)
        val_mse = evaluate_mse_real_scale(model, dl_val, y_scaler)
        history.append({"epoch": epoch, "train_mse_z": train_loss, "val_mse_year": val_mse})

        if epoch % 10 == 0 or epoch == 1:
            print(f"[{optimizer_name}] Epoch {epoch:03d}/{epochs} | train(z): {train_loss:.4f} | val_MSE(year): {val_mse:.3f}")

        if early.step(val_mse, model):
            print(f"[{optimizer_name}] Early stop @ {epoch}, best val_MSE: {early.best:.3f}")
            break

    early.load_best(model)
    best_val = early.best
    return best_val, history, model

Запуск экспериментов и сабмит
Две серии обучения: с AdamW и c SAM (заданы rho, adaptive). Сравнение лучших MSE, выбор лучшей модели. Затем инференс на тесте, обратное масштабирование y, формирование submission.csv (id, year).

In [None]:
val_mse_adam, hist_adam, model_adam = train_model("AdamW", epochs=cfg.epochs)
val_mse_sam,  hist_sam,  model_sam  = train_model("SAM",   epochs=cfg.epochs, sam_rho=0.05, sam_adaptive=True)

print("\n==============================")
print(f"AdamW   val_MSE: {val_mse_adam:.3f}")
print(f"SAM     val_MSE: {val_mse_sam:.3f}")
print("==============================")

# inference (на лучшей модели по валидации)
best_model = model_sam if val_mse_sam <= val_mse_adam else model_adam
best_model.eval()

preds_test = []
with torch.no_grad():
    for xb in dl_te:
        xb = xb.to(device)
        yhat = best_model(xb).detach().cpu().numpy()
        preds_test.append(yhat)

preds_test = np.concatenate(preds_test)
preds_year = y_scaler.inverse_transform(preds_test.reshape(-1, 1)).ravel()

submission = pd.DataFrame({
    "id": X_test.index,
    "year": preds_year.round().astype(int) 
})
submission.to_csv("submission.csv", index=False)
print("Saved -> submission.csv")

[AdamW] Epoch 001/100 | train(z): 1.1672 | val_MSE(year): 104.055
[AdamW] Epoch 010/100 | train(z): 0.5893 | val_MSE(year): 85.300
[AdamW] Epoch 020/100 | train(z): 0.3533 | val_MSE(year): 102.533
[AdamW] Early stop @ 28, best val_MSE: 84.313
[SAM] Epoch 001/100 | train(z): 1.2102 | val_MSE(year): 106.432
[SAM] Epoch 010/100 | train(z): 0.5899 | val_MSE(year): 86.045
[SAM] Epoch 020/100 | train(z): 0.3492 | val_MSE(year): 93.790
[SAM] Early stop @ 28, best val_MSE: 81.841

AdamW   val_MSE: 84.313
SAM     val_MSE: 81.841
Saved -> submission.csv
