# HW08-09: PyTorch MLP — регуляризация и оптимизация обучения

**Датасет:** CIFAR-10 (Вариант C)  
**Курс:** AIE DPO 2025

## 2.3.1. Импорты, seed и устройство

In [None]:
import os
import json
import csv
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, random_split

import torchvision
import torchvision.transforms as transforms

import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt

print(f"torch:       {torch.__version__}")
print(f"torchvision: {torchvision.__version__}")

In [None]:
# Seed the NumPy and PyTorch global RNGs so repeated runs start identically.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the GPU when PyTorch can see one, otherwise stay on the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
print(f"Устройство: {device}")

# Problem dimensions and the batch size shared by every DataLoader.
NUM_CLASSES = 10
BATCH_SIZE = 64
INPUT_SIZE = 3 * 32 * 32  # 3 channels x 32 x 32 pixels = 3072 features

# Output folders for the checkpoint, tables and figures; the parent is
# created first because mkdir() is called without parents=True.
ARTIFACTS_DIR = Path('artifacts')
FIGURES_DIR = ARTIFACTS_DIR / 'figures'
for _folder in (ARTIFACTS_DIR, FIGURES_DIR):
    _folder.mkdir(exist_ok=True)
print(f"artifacts → {ARTIFACTS_DIR.resolve()}")

## 2.3.2. Данные и DataLoader

In [None]:
# Per-channel normalisation with mean=0.5 and std=0.5, i.e. (x - 0.5) / 0.5.
# NOTE(review): these are generic constants, not the empirical CIFAR-10
# channel statistics; with ToTensor() output in [0, 1] they rescale pixels
# to roughly [-1, 1] (the sanity check in the next cell prints the range).
_half = (0.5, 0.5, 0.5)
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=_half, std=_half),
])

# Both splits share the same root folder and preprocessing; download=True
# asks torchvision to fetch the data into ./data if needed.
_cifar_kwargs = dict(root='./data', download=True, transform=transform)
full_train_dataset = torchvision.datasets.CIFAR10(train=True, **_cifar_kwargs)
test_dataset = torchvision.datasets.CIFAR10(train=False, **_cifar_kwargs)

print(f"Train (полный): {len(full_train_dataset)}, Test: {len(test_dataset)}")
print(f"Классы: {full_train_dataset.classes}")

In [None]:
# Reproducible 80/20 train/validation split of the official training set.
n_total = len(full_train_dataset)
n_train = int(0.8 * n_total)
n_val = n_total - n_train

# A dedicated seeded generator keeps the split independent of the global RNG.
generator = torch.Generator().manual_seed(SEED)
train_dataset, val_dataset = random_split(
    full_train_dataset, lengths=[n_train, n_val], generator=generator,
)

# Only the training loader shuffles, and it owns a separate seeded generator.
_shuffle_rng = torch.Generator().manual_seed(SEED)
train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True, generator=_shuffle_rng,
)
_eval_kwargs = dict(batch_size=BATCH_SIZE, shuffle=False)
val_loader = DataLoader(val_dataset, **_eval_kwargs)
test_loader = DataLoader(test_dataset, **_eval_kwargs)

print(f"Train: {len(train_dataset)}, Val: {len(val_dataset)}, Test: {len(test_dataset)}")

# Sanity check: pull one training batch and show its shapes and value range.
# This draws from the training loader's generator, exactly like a real epoch.
x, y = next(iter(train_loader))
print("\nSanity check:")
print("\n".join([
    f"  x.shape = {x.shape}",
    f"  y.shape = {y.shape}",
    f"  x range = [{x.min():.3f}, {x.max():.3f}]",
    f"  y unique = {y.unique().tolist()}",
]))

## 2.3.3. Модель MLP

In [None]:
class MLP(nn.Module):
    """Configurable multi-layer perceptron.

    Layout: Flatten -> [Linear -> [BatchNorm1d] -> ReLU -> [Dropout]] x N -> Linear,
    where N is ``len(hidden_sizes)``.

    Args:
        input_size: number of features after flattening one sample.
        hidden_sizes: widths of the hidden layers; ``None`` means ``[512, 256]``.
        num_classes: size of the output (logit) layer.
        dropout_p: dropout probability; a Dropout layer is added only if > 0.
        use_batchnorm: insert BatchNorm1d between each Linear and its ReLU.
    """

    def __init__(self, input_size=3072, hidden_sizes=None, num_classes=10,
                 dropout_p=0.0, use_batchnorm=False):
        super().__init__()
        widths = [512, 256] if hidden_sizes is None else list(hidden_sizes)
        # Input width of every hidden layer: the flattened input first, then
        # the width of the preceding hidden layer.
        fan_ins = [input_size] + widths[:-1]

        modules = [nn.Flatten()]
        for fan_in, width in zip(fan_ins, widths):
            stage = [nn.Linear(fan_in, width)]
            if use_batchnorm:
                stage.append(nn.BatchNorm1d(width))
            stage.append(nn.ReLU())
            if dropout_p > 0.0:
                stage.append(nn.Dropout(dropout_p))
            modules.extend(stage)

        # Classification head; it reads the raw input when there are no hidden layers.
        head_in = widths[-1] if widths else input_size
        modules.append(nn.Linear(head_in, num_classes))
        self.net = nn.Sequential(*modules)

    def forward(self, x):
        """Return the class logits for a batch ``x``."""
        return self.net(x)


# Smoke-test the forward pass on a dummy batch of four all-zero images,
# then drop the temporaries so they do not linger in the notebook namespace.
_probe_model = MLP(input_size=INPUT_SIZE)
_probe_batch = torch.zeros(4, 3, 32, 32)
_probe_out = _probe_model(_probe_batch)
print(f"MLP forward: {_probe_batch.shape} → {_probe_out.shape}")
del _probe_model, _probe_batch, _probe_out

## Утилиты обучения

In [None]:
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Puts the model in training mode and performs one optimizer step per batch.

    Returns:
        ``(mean_loss, accuracy)``: the batch losses weighted by batch size and
        divided by the number of samples seen, and the fraction of samples
        whose arg-max logit equals the target.
    """
    model.train()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    for inputs, targets in loader:
        inputs = inputs.to(device)
        targets = targets.to(device)
        batch_size = inputs.size(0)

        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by its size so a smaller final batch does not
        # skew the epoch average (assumes the criterion returns a batch mean).
        loss_sum += batch_loss.item() * batch_size
        n_correct += (outputs.argmax(1) == targets).sum().item()
        n_seen += batch_size
    return loss_sum / n_seen, n_correct / n_seen


def evaluate(model, loader, criterion, device):
    """Measure loss and accuracy on ``loader`` without updating the model.

    Switches the model to eval mode and disables gradient tracking.

    Returns:
        ``(mean_loss, accuracy)`` averaged per sample over the whole loader.
    """
    model.eval()
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)
            batch_size = inputs.size(0)

            outputs = model(inputs)
            batch_loss = criterion(outputs, targets)

            # Same size-weighted averaging as in training.
            loss_sum += batch_loss.item() * batch_size
            n_correct += (outputs.argmax(1) == targets).sum().item()
            n_seen += batch_size
    return loss_sum / n_seen, n_correct / n_seen


class EarlyStopping:
    """Signal a stop once validation accuracy has not improved for ``patience`` epochs.

    An epoch counts as an improvement only when its accuracy exceeds the best
    value so far by more than ``min_delta``. On every improvement a copy of
    the model's ``state_dict`` is kept so the best weights can be restored.
    """

    def __init__(self, patience=5, min_delta=1e-4):
        self.patience = patience
        self.min_delta = min_delta
        self.best_val_acc = -float('inf')
        self.counter = 0            # consecutive epochs without improvement
        self.best_state = None      # snapshot of the best weights seen so far
        self.stopped_epoch = None   # set when step() first returns True

    def step(self, val_acc, model, epoch):
        """Record one epoch's validation accuracy; return True to stop training."""
        improved = val_acc > self.best_val_acc + self.min_delta
        if not improved:
            self.counter += 1
        else:
            self.best_val_acc = val_acc
            self.counter = 0
            # Clone each tensor: state_dict() values alias the live parameters.
            self.best_state = {
                name: tensor.clone()
                for name, tensor in model.state_dict().items()
            }

        out_of_patience = self.counter >= self.patience
        if out_of_patience:
            self.stopped_epoch = epoch
        return out_of_patience

    def restore_best(self, model):
        """Load the best snapshot into ``model``; do nothing if none was taken."""
        if not self.best_state:
            return
        model.load_state_dict(self.best_state)


def run_experiment(model, train_loader, val_loader, optimizer, criterion, device,
                   max_epochs=15, early_stopping=None, tag=''):
    """Train ``model`` for up to ``max_epochs`` epochs and record the metrics.

    Each epoch runs one training pass and one validation pass, appends the
    four metrics to the history and prints a progress line prefixed by ``tag``.

    Args:
        model, optimizer, criterion, device: forwarded to ``train_one_epoch``
            and ``evaluate``.
        train_loader, val_loader: data for the training / validation passes.
        max_epochs: upper bound on the number of epochs.
        early_stopping: optional stopper exposing ``step(val_acc, model, epoch)``,
            ``restore_best(model)`` and ``best_val_acc``. When given, the model
            always ends up holding the best snapshot the stopper recorded.
        tag: label used in the progress output.

    Returns:
        dict with the per-epoch lists ``train_loss``, ``val_loss``,
        ``train_acc`` and ``val_acc``.
    """
    history = {'train_loss': [], 'val_loss': [], 'train_acc': [], 'val_acc': []}
    for epoch in range(1, max_epochs + 1):
        tl, ta = train_one_epoch(model, train_loader, criterion, optimizer, device)
        vl, va = evaluate(model, val_loader, criterion, device)
        history['train_loss'].append(tl)
        history['val_loss'].append(vl)
        history['train_acc'].append(ta)
        history['val_acc'].append(va)
        print(f"[{tag}] {epoch:02d}/{max_epochs}  "
              f"train_loss={tl:.4f} train_acc={ta:.4f} | "
              f"val_loss={vl:.4f} val_acc={va:.4f}")
        if early_stopping is not None and early_stopping.step(va, model, epoch):
            print(f"  >>> EarlyStopping на эпохе {epoch}, "
                  f"best_val_acc={early_stopping.best_val_acc:.4f}")
            break

    # Restore the best snapshot whether or not the stop was triggered.
    # Previously this happened only on an early stop, so a run that reached
    # max_epochs kept its last-epoch weights although callers report (and
    # save) the model as the best one.
    if early_stopping is not None:
        early_stopping.restore_best(model)
    return history

print("Утилиты готовы.")

---
## 3.1. Часть A (S08): Регуляризация (E1–E4)

In [None]:
# Loss function and epoch budget shared by the Part A experiments.
criterion = nn.CrossEntropyLoss()
MAX_EPOCHS_A = 15

# ── E1: baseline MLP without any regularisation ──────────────────────────────
print("=" * 60)
print("E1: Base MLP (no Dropout, no BatchNorm)")
print("=" * 60)

# Re-seed right before building the model so its initial weights are the
# same every time this cell is run.
torch.manual_seed(SEED)
model_e1 = MLP(input_size=INPUT_SIZE, dropout_p=0.0, use_batchnorm=False)
model_e1 = model_e1.to(device)
optimizer_e1 = optim.Adam(model_e1.parameters(), lr=1e-3)
history_e1 = run_experiment(
    model_e1, train_loader, val_loader, optimizer_e1, criterion, device,
    max_epochs=MAX_EPOCHS_A, tag='E1',
)

# Best validation accuracy and the validation loss of that same epoch
# (max() keeps the first epoch on ties).
best_val_acc_e1, best_val_loss_e1 = max(
    zip(history_e1['val_acc'], history_e1['val_loss']),
    key=lambda pair: pair[0],
)
print(f"\n[E1] best val_acc = {best_val_acc_e1:.4f}")

In [None]:
# ── E2: MLP with Dropout ─────────────────────────────────────────────────────
print("=" * 60)
print("E2: MLP + Dropout(p=0.3)")
print("=" * 60)

# Same seed as E1, set right before the model is built.
torch.manual_seed(SEED)
model_e2 = MLP(input_size=INPUT_SIZE, dropout_p=0.3, use_batchnorm=False)
model_e2 = model_e2.to(device)
optimizer_e2 = optim.Adam(model_e2.parameters(), lr=1e-3)
history_e2 = run_experiment(
    model_e2, train_loader, val_loader, optimizer_e2, criterion, device,
    max_epochs=MAX_EPOCHS_A, tag='E2',
)

# Best validation accuracy and the validation loss of that same epoch.
best_val_acc_e2, best_val_loss_e2 = max(
    zip(history_e2['val_acc'], history_e2['val_loss']),
    key=lambda pair: pair[0],
)
print(f"\n[E2] best val_acc = {best_val_acc_e2:.4f}")

In [None]:
# ── E3: MLP with BatchNorm ───────────────────────────────────────────────────
print("=" * 60)
print("E3: MLP + BatchNorm")
print("=" * 60)

# Same seed as E1/E2, set right before the model is built.
torch.manual_seed(SEED)
model_e3 = MLP(input_size=INPUT_SIZE, dropout_p=0.0, use_batchnorm=True)
model_e3 = model_e3.to(device)
optimizer_e3 = optim.Adam(model_e3.parameters(), lr=1e-3)
history_e3 = run_experiment(
    model_e3, train_loader, val_loader, optimizer_e3, criterion, device,
    max_epochs=MAX_EPOCHS_A, tag='E3',
)

# Best validation accuracy and the validation loss of that same epoch.
best_val_acc_e3, best_val_loss_e3 = max(
    zip(history_e3['val_acc'], history_e3['val_loss']),
    key=lambda pair: pair[0],
)
print(f"\n[E3] best val_acc = {best_val_acc_e3:.4f}")

In [None]:
# ── E4: the better of E2/E3, retrained with EarlyStopping ───────────────────
# Ties go to E3 (BatchNorm).
if best_val_acc_e3 >= best_val_acc_e2:
    best_base_tag = 'E3'
    e4_dropout_p = 0.0
    e4_batchnorm = True
else:
    best_base_tag = 'E2'
    e4_dropout_p = 0.3
    e4_batchnorm = False

print(f"Лучший между E2/E3: {best_base_tag}")
print("=" * 60)
print(f"E4: {best_base_tag} + EarlyStopping(patience=5)")
print("=" * 60)
torch.manual_seed(SEED)
model_e4 = MLP(input_size=INPUT_SIZE, dropout_p=e4_dropout_p,
               use_batchnorm=e4_batchnorm).to(device)
es = EarlyStopping(patience=5)
history_e4 = run_experiment(
    model_e4, train_loader, val_loader,
    optim.Adam(model_e4.parameters(), lr=1e-3),
    criterion, device, max_epochs=30, early_stopping=es, tag='E4')

# Make sure model_e4 holds the best snapshot before it is saved, including
# the case where patience never ran out and training used all 30 epochs.
# Loading the snapshot a second time is harmless.
es.restore_best(model_e4)

best_val_acc_e4 = es.best_val_acc
# Take the loss from the epoch the stopper actually snapshotted. That epoch
# can differ from max(history) when a later epoch improved by less than
# min_delta, so indexing by max() could report a loss from other weights.
best_val_loss_e4 = history_e4['val_loss'][history_e4['val_acc'].index(best_val_acc_e4)]
epochs_e4 = len(history_e4['val_acc'])
# stopped_epoch stays None when no early stop happened; training then ended
# on the last epoch that was run.
stopped_at_e4 = es.stopped_epoch if es.stopped_epoch is not None else epochs_e4
print(f"\n[E4] best val_acc = {best_val_acc_e4:.4f}, остановлен на эпохе {stopped_at_e4}")

torch.save(model_e4.state_dict(), str(ARTIFACTS_DIR / 'best_model.pt'))
print("Сохранено: artifacts/best_model.pt")

---
## 3.2. Часть B (S09): LR, оптимизаторы, weight decay (O1–O3)

In [None]:
# ── O1: learning rate far too large ──────────────────────────────────────────
print("=" * 60)
print("O1: Adam, lr=0.1 (слишком большой)")
print("=" * 60)
torch.manual_seed(SEED)
# Same architecture as E4 so only the optimiser settings differ.
model_o1 = MLP(input_size=INPUT_SIZE, dropout_p=e4_dropout_p,
               use_batchnorm=e4_batchnorm).to(device)
history_o1 = run_experiment(
    model_o1, train_loader, val_loader,
    optim.Adam(model_o1.parameters(), lr=0.1),
    criterion, device, max_epochs=8, tag='O1')
best_val_acc_o1 = max(history_o1['val_acc'])
# Report the loss of the best-accuracy epoch, the same definition the E1-E3
# and O3 rows use. min(val_loss) could come from a different epoch and is
# order-dependent if a diverged run produces NaN losses.
best_val_loss_o1 = history_o1['val_loss'][history_o1['val_acc'].index(best_val_acc_o1)]
print(f"\n[O1] best val_acc = {best_val_acc_o1:.4f}")

In [None]:
# ── O2: learning rate far too small ──────────────────────────────────────────
print("=" * 60)
print("O2: Adam, lr=1e-5 (слишком маленький)")
print("=" * 60)
torch.manual_seed(SEED)
# Same architecture as E4 so only the optimiser settings differ.
model_o2 = MLP(input_size=INPUT_SIZE, dropout_p=e4_dropout_p,
               use_batchnorm=e4_batchnorm).to(device)
history_o2 = run_experiment(
    model_o2, train_loader, val_loader,
    optim.Adam(model_o2.parameters(), lr=1e-5),
    criterion, device, max_epochs=8, tag='O2')
best_val_acc_o2 = max(history_o2['val_acc'])
# Report the loss of the best-accuracy epoch, the same definition the E1-E3
# and O3 rows use, rather than the minimum loss over all epochs.
best_val_loss_o2 = history_o2['val_loss'][history_o2['val_acc'].index(best_val_acc_o2)]
print(f"\n[O2] best val_acc = {best_val_acc_o2:.4f}")

In [None]:
# ── O3: SGD with momentum and weight decay ───────────────────────────────────
print("=" * 60)
print("O3: SGD, momentum=0.9, weight_decay=1e-4, lr=0.01")
print("=" * 60)

torch.manual_seed(SEED)
# Same architecture as E4 so only the optimiser differs.
model_o3 = MLP(
    input_size=INPUT_SIZE, dropout_p=e4_dropout_p, use_batchnorm=e4_batchnorm,
)
model_o3 = model_o3.to(device)
optimizer_o3 = optim.SGD(
    model_o3.parameters(), lr=0.01, momentum=0.9, weight_decay=1e-4,
)
history_o3 = run_experiment(
    model_o3, train_loader, val_loader, optimizer_o3, criterion, device,
    max_epochs=15, tag='O3',
)

# Best validation accuracy and the validation loss of that same epoch.
best_val_acc_o3, best_val_loss_o3 = max(
    zip(history_o3['val_acc'], history_o3['val_loss']),
    key=lambda pair: pair[0],
)
epochs_o3 = len(history_o3['val_acc'])
print(f"\n[O3] best val_acc = {best_val_acc_o3:.4f}")

---
## 4. Артефакты

In [None]:
# ── runs.csv ──────────────────────────────────────────────────────────────────
DATASET = 'CIFAR10'
# Regulariser label for the runs that reuse the E4 architecture.
bn_tag = '+BN' if e4_batchnorm else 'Dropout(0.3)'

# Column order of runs.csv; every row dict is built in this same order.
fieldnames = ['experiment_id', 'dataset', 'seed', 'model_summary', 'optimizer',
              'lr', 'momentum', 'weight_decay', 'epochs_trained',
              'best_val_accuracy', 'best_val_loss']


def _make_row(experiment_id, model_summary, optimizer, lr, epochs_trained,
              best_acc, best_loss, momentum='', weight_decay=0):
    """Build one runs.csv record; both metrics are rounded to 4 decimals."""
    values = (experiment_id, DATASET, SEED, model_summary, optimizer, lr,
              momentum, weight_decay, epochs_trained,
              round(best_acc, 4), round(best_loss, 4))
    return dict(zip(fieldnames, values))


rows = [
    _make_row('E1', '512-256, ReLU, no-Dropout, no-BN', 'Adam', 1e-3,
              len(history_e1['val_acc']), best_val_acc_e1, best_val_loss_e1),
    _make_row('E2', '512-256, ReLU, Dropout(0.3), no-BN', 'Adam', 1e-3,
              len(history_e2['val_acc']), best_val_acc_e2, best_val_loss_e2),
    _make_row('E3', '512-256, ReLU, no-Dropout, BatchNorm', 'Adam', 1e-3,
              len(history_e3['val_acc']), best_val_acc_e3, best_val_loss_e3),
    _make_row('E4', f'512-256, ReLU, {bn_tag}, EarlyStopping(p=5)', 'Adam', 1e-3,
              epochs_e4, best_val_acc_e4, best_val_loss_e4),
    _make_row('O1', f'512-256, ReLU, {bn_tag}', 'Adam', 0.1,
              len(history_o1['val_acc']), best_val_acc_o1, best_val_loss_o1),
    _make_row('O2', f'512-256, ReLU, {bn_tag}', 'Adam', 1e-5,
              len(history_o2['val_acc']), best_val_acc_o2, best_val_loss_o2),
    _make_row('O3', f'512-256, ReLU, {bn_tag}', 'SGD', 0.01,
              epochs_o3, best_val_acc_o3, best_val_loss_o3,
              momentum=0.9, weight_decay=1e-4),
]

# newline='' lets the csv module control line endings itself.
with open(ARTIFACTS_DIR / 'runs.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(rows)

print("Сохранено: artifacts/runs.csv")

# Compact console view of the same table.
print(f"\n{'ID':<4} {'opt':<5} {'lr':<8} {'wd':<8} {'ep':<4} {'val_acc':<9} val_loss")
print('-' * 50)
_row_format = ("{experiment_id:<4} {optimizer:<5} {lr:<8} "
               "{weight_decay!s:<8} {epochs_trained:<4} "
               "{best_val_accuracy:<9} {best_val_loss}")
for r in rows:
    print(_row_format.format(**r))

In [None]:
# ── best_config.json ──────────────────────────────────────────────────────────
# Describe the E4 run: architecture, training settings and validation results.
_model_cfg = {
    'class': 'MLP',
    'input_size': INPUT_SIZE,
    'hidden_sizes': [512, 256],
    'num_classes': NUM_CLASSES,
    'activation': 'ReLU',
    'dropout_p': e4_dropout_p,
    'use_batchnorm': e4_batchnorm,
}
_training_cfg = {
    'optimizer': 'Adam',
    'lr': 1e-3,
    'weight_decay': 0,
    'batch_size': BATCH_SIZE,
    'max_epochs': 30,
    'early_stopping_patience': 5,
    'epochs_trained': epochs_e4,
}
_results_cfg = {
    'best_val_accuracy': round(best_val_acc_e4, 4),
    'best_val_loss': round(best_val_loss_e4, 4),
}
best_config = {
    'dataset': DATASET,
    'seed': SEED,
    'model': _model_cfg,
    'training': _training_cfg,
    'results': _results_cfg,
}

# Serialise once and reuse the text for both the file and the console echo.
_best_config_text = json.dumps(best_config, indent=2)
with open(ARTIFACTS_DIR / 'best_config.json', 'w') as f:
    f.write(_best_config_text)
print("Сохранено: artifacts/best_config.json")
print(_best_config_text)

## 5. Графики

In [None]:
# ── curves_best.png ───────────────────────────────────────────────────────────
# Two panels for the E4 run: loss on the left, accuracy on the right,
# each showing the train and validation curves over epochs (1-based).
ep4 = range(1, len(history_e4['train_loss']) + 1)
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
for _axis, _metric, _ylabel in ((ax1, 'loss', 'Loss'), (ax2, 'acc', 'Accuracy')):
    _axis.plot(ep4, history_e4[f'train_{_metric}'], 'o-', ms=3, label='train')
    _axis.plot(ep4, history_e4[f'val_{_metric}'], 's-', ms=3, label='val')
    _axis.set(xlabel='Epoch', ylabel=_ylabel,
              title=f'E4 ({best_base_tag}+EarlyStopping): {_ylabel}')
    _axis.legend()
    _axis.grid(alpha=0.3)
plt.tight_layout()
_curves_best_path = FIGURES_DIR / 'curves_best.png'
plt.savefig(str(_curves_best_path), dpi=120, bbox_inches='tight')
plt.show()
print("Сохранено: artifacts/figures/curves_best.png")

In [None]:
# ── curves_lr_extremes.png ────────────────────────────────────────────────────
# Side-by-side loss curves for the two deliberately bad learning rates.
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
_panels = zip(
    axes,
    (history_o1, history_o2),
    ('O1: Adam lr=0.1 (слишком большой)', 'O2: Adam lr=1e-5 (слишком маленький)'),
)
for ax, hist, title in _panels:
    ep = range(1, len(hist['train_loss']) + 1)
    for _split, _style in (('train', 'o-'), ('val', 's-')):
        ax.plot(ep, hist[f'{_split}_loss'], _style, ms=4, label=_split)
    ax.set(xlabel='Epoch', ylabel='Loss', title=title)
    ax.legend()
    ax.grid(alpha=0.3)
plt.tight_layout()
_curves_lr_path = FIGURES_DIR / 'curves_lr_extremes.png'
plt.savefig(str(_curves_lr_path), dpi=120, bbox_inches='tight')
plt.show()
print("Сохранено: artifacts/figures/curves_lr_extremes.png")

---
## 6. Финальная оценка на test (один раз)

In [None]:
# Reload the saved E4 checkpoint and score it on the held-out test split.
# This is the only place in the notebook where test_loader is evaluated.
_checkpoint_path = ARTIFACTS_DIR / 'best_model.pt'
_checkpoint_state = torch.load(str(_checkpoint_path), map_location=device)
model_e4.load_state_dict(_checkpoint_state)
test_loss, test_acc = evaluate(model_e4, test_loader, criterion, device)

_rule = "=" * 60
print(_rule)
print("Финальная оценка E4 на test")
print(_rule)
print(f"test_loss     = {test_loss:.4f}")
print(f"test_accuracy = {test_acc:.4f}")
print("\nОценка test-выборки выполнена ОДИН РАЗ.")