**Использование псевдоразметки. ДЗ.**

In [1]:
import argparse
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.autograd import Variable
import random
import numpy as np

In [2]:
# Seed every random number generator the notebook relies on so that the data
# split and the training runs can be repeated.
SEED = 123

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

Начнем с загрузки датасета. Речевые данные (и модели, обучаемые на них) очень тяжелые, поэтому мы обойдемся чем-нибудь попроще.

In [3]:
# Both splits use the same preprocessing: convert the image to a tensor and
# normalize it with mean 0.1307 and std 0.3081.
mnist_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.1307,), (0.3081,)),
])

# Only the train split requests a download; the test split reads from the
# same './data' root.
train_dataset = datasets.MNIST(
    './data', train=True, download=True, transform=mnist_transform)
test_dataset = datasets.MNIST(
    './data', train=False, transform=mnist_transform)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



In [4]:
# Sanity check: number of samples in the train and test splits.
len(train_dataset), len(test_dataset)

(60000, 10000)

Итак, трейн состоит из 60000 картинок цифр. Для того, чтобы получше увидеть эффект от псевдолейблов, мы оставим только 100 этих картинок в качестве размеченных данных. Остальные 59900 будут в качестве неразмеченных. 

На масштабах 100 записей могут проявиться неприятные эффекты, если какие-то из классов не будут достаточно хорошо представлены. Чтобы этого избежать, будем аккуратно семплировать. Самый простой вариант - просто случайно разделять, пока не получится удачное разбиение.

Для начала определим удачность разбиения. Будем считать размеченный датасет хорошим, если из 100 примеров в нем есть хотя бы по 8 представителей каждого класса. Напишите функцию, которая делает такую проверку.

In [5]:
...

def check_dataset(dataset, labels_num=10, min_samples_num=8):
  """Check that every class is sufficiently represented in ``dataset``.

  Args:
    dataset: iterable of items whose second element (``item[1]``) is the
      class label.
    labels_num: number of distinct labels that must be present.
    min_samples_num: minimum number of samples required for each label
      that occurs in the dataset.

  Returns:
    True if at least ``labels_num`` distinct labels occur and each of them
    occurs at least ``min_samples_num`` times; False otherwise.
  """
  labels = np.array([item[1] for item in dataset])
  # Count each distinct label once instead of re-counting it for every sample.
  unique_labels, counts = np.unique(labels, return_counts=True)
  if len(unique_labels) < labels_num:
    return False
  return bool(np.all(counts >= min_samples_num))

In [6]:
# Rejection sampling: keep drawing random 100 / 59900 splits until the labeled
# part passes check_dataset, counting how many draws were rejected.
sampling_iteration = 0
labeled_train_dataset, unlabeled_train_dataset = torch.utils.data.random_split(
    train_dataset, [100, 59900])
while not check_dataset(labeled_train_dataset):
    sampling_iteration += 1
    labeled_train_dataset, unlabeled_train_dataset = torch.utils.data.random_split(
        train_dataset, [100, 59900])
print(f'Split the dataset after {sampling_iteration} resamplings')

Split the dataset after 32 resamplings


In [7]:
# Evaluation loader: fixed order.
test_loader = torch.utils.data.DataLoader(
    dataset=test_dataset, batch_size=64, shuffle=False)
# The 100 labeled samples are reshuffled every epoch.
labeled_train_loader = torch.utils.data.DataLoader(
    dataset=labeled_train_dataset, batch_size=64, shuffle=True)
# Unlabeled samples are read in dataset order.
unlabeled_train_loader = torch.utils.data.DataLoader(
    dataset=unlabeled_train_dataset, batch_size=64, shuffle=False)

Теперь, когда мы получили данные, определим архитектуру сети. Возьмем простую сверточную сетку с dropout'ом.

In [8]:
class Net(nn.Module):
    """Small convolutional classifier for 28x28 single-channel images.

    Two 5x5 convolutions (each followed by 2x2 max-pooling and ReLU) feed a
    two-layer dense head. ``forward`` returns a ``(N, 10)`` tensor of
    unnormalized scores; callers apply ``log_softmax`` themselves.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 20, kernel_size=5)
        self.conv2 = nn.Conv2d(20, 40, kernel_size=5)
        # Channel-wise dropout for the 4-D convolutional feature maps.
        self.dropout = nn.Dropout2d(p=0.5)
        # Element-wise dropout for the 2-D dense activations. Passing 2-D
        # input to Dropout2d is deprecated in PyTorch, so the dense layer
        # gets its own module. Dropout has no parameters, so state_dict
        # keys are unchanged.
        self.fc_dropout = nn.Dropout(p=0.5)
        self.fc1 = nn.Linear(640, 150)
        self.fc2 = nn.Linear(150, 10)
        # Not used by forward(); kept so the attribute stays available.
        self.log_softmax = nn.LogSoftmax(dim=1)

    def forward(self, x):
        """Compute class scores for a batch reshaped to ``(N, 1, 28, 28)``."""
        x = x.view(-1, 1, 28, 28)
        x = F.relu(F.max_pool2d(self.conv1(x), 2))                # (N, 20, 12, 12)
        x = F.relu(F.max_pool2d(self.dropout(self.conv2(x)), 2))  # (N, 40, 4, 4)
        x = x.view(-1, 640)                                       # 40 * 4 * 4 = 640
        x = F.relu(self.fc1(x))
        x = self.fc_dropout(x)
        # NOTE(review): ReLU on the output scores clamps negative values to
        # zero before the caller's log_softmax; kept as-is so the recorded
        # results stay comparable -- consider returning self.fc2(x) directly.
        x = F.relu(self.fc2(x))
        return x

Опишем вспомогательные функции.

In [9]:
def train(epoch_idx, model, optimizer, train_loader, loss_func=F.nll_loss):
    """Run one training epoch over ``train_loader``.

    Args:
        epoch_idx: epoch number; accepted for symmetry with ``test`` and not
            used inside the function.
        model: network returning unnormalized class scores; must have at
            least one parameter (its device is used for the batches).
        optimizer: optimizer over ``model``'s parameters.
        train_loader: iterable of ``(x, target)`` batches.
        loss_func: loss applied to ``(log_softmax(model(x)), target)``.
    """
    model.train()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    for x, target in train_loader:
        x, target = x.to(device), target.to(device)
        optimizer.zero_grad()
        output = F.log_softmax(model(x), dim=1)
        loss = loss_func(output, target)
        loss.backward()
        optimizer.step()

In [10]:
def test(epoch_idx, model, test_loader, need_explicit_softmax=False):
    model.eval()
    test_loss = 0.0
    correct = 0
    with torch.no_grad():
        for x, target in test_loader:
            x, target = x.cuda(), target.cuda()
            output = F.log_softmax(model(x), dim=1)
            test_loss += F.nll_loss(output, target, size_average=False).item()
            pred = output.data.max(1, keepdim=True)[1]
            correct += pred.eq(target.data.view_as(pred)).long().cpu().sum()

    test_loss /= len(test_loader.dataset)
    print('Epoch {}: Average loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)'.format(
        epoch_idx, test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))

In [11]:
def predict(model, loader):
    """Return the model's raw outputs for every batch of ``loader``.

    Args:
        model: network to evaluate; must have at least one parameter (its
            device is used for the batches).
        loader: iterable of ``(x, _)`` batches; the second element is ignored.

    Returns:
        The per-batch outputs concatenated along the first dimension.
    """
    model.eval()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    with torch.no_grad():
        result = [model(x.to(device)) for x, _ in loader]
    return torch.cat(result)

Создадим модель и обучим ее на нашем размеченном датасете.

In [12]:
# Baseline: a freshly initialized network on the GPU, optimized with plain SGD.
model = Net().to('cuda')
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [13]:
from tqdm import tqdm

# Train the baseline on the labeled subset for 400 epochs, reporting test
# metrics on every 10th epoch (starting with epoch 0).
for i in tqdm(range(400)):
    train(i, model, optimizer, labeled_train_loader)
    if i % 10:
        continue
    test(i, model, test_loader)

  2%|▏         | 6/400 [00:08<07:00,  1.07s/it]

Epoch 0: Average loss: 2.2969, Accuracy: 985/10000 (10%)


  5%|▍         | 19/400 [00:10<01:55,  3.31it/s]

Epoch 10: Average loss: 1.7321, Accuracy: 5573/10000 (56%)


  7%|▋         | 27/400 [00:12<01:30,  4.13it/s]

Epoch 20: Average loss: 0.7752, Accuracy: 7474/10000 (75%)


  9%|▉         | 36/400 [00:14<01:14,  4.87it/s]

Epoch 30: Average loss: 0.5837, Accuracy: 8025/10000 (80%)


 12%|█▏        | 49/400 [00:16<00:57,  6.07it/s]

Epoch 40: Average loss: 0.7819, Accuracy: 7384/10000 (74%)


 14%|█▍        | 57/400 [00:18<01:01,  5.54it/s]

Epoch 50: Average loss: 0.5545, Accuracy: 8304/10000 (83%)


 16%|█▋        | 66/400 [00:20<01:00,  5.55it/s]

Epoch 60: Average loss: 0.5442, Accuracy: 8506/10000 (85%)


 19%|█▉        | 76/400 [00:22<00:55,  5.80it/s]

Epoch 70: Average loss: 0.5475, Accuracy: 8538/10000 (85%)


 22%|██▏       | 89/400 [00:24<00:49,  6.34it/s]

Epoch 80: Average loss: 0.5836, Accuracy: 8569/10000 (86%)


 24%|██▍       | 97/400 [00:26<00:52,  5.81it/s]

Epoch 90: Average loss: 0.6261, Accuracy: 8607/10000 (86%)


 26%|██▋       | 105/400 [00:28<00:54,  5.40it/s]

Epoch 100: Average loss: 0.6011, Accuracy: 8585/10000 (86%)


 30%|██▉       | 119/400 [00:30<00:42,  6.60it/s]

Epoch 110: Average loss: 0.6617, Accuracy: 8601/10000 (86%)


 32%|███▏      | 127/400 [00:32<00:46,  5.85it/s]

Epoch 120: Average loss: 0.6722, Accuracy: 8559/10000 (86%)


 34%|███▍      | 136/400 [00:33<00:45,  5.79it/s]

Epoch 130: Average loss: 0.6765, Accuracy: 8634/10000 (86%)


 36%|███▋      | 146/400 [00:35<00:42,  6.02it/s]

Epoch 140: Average loss: 0.7477, Accuracy: 8546/10000 (85%)


 39%|███▉      | 156/400 [00:37<00:40,  6.05it/s]

Epoch 150: Average loss: 0.6773, Accuracy: 8666/10000 (87%)


 42%|████▏     | 166/400 [00:39<00:38,  6.09it/s]

Epoch 160: Average loss: 0.6992, Accuracy: 8501/10000 (85%)


 44%|████▍     | 176/400 [00:41<00:36,  6.13it/s]

Epoch 170: Average loss: 0.7007, Accuracy: 8601/10000 (86%)


 46%|████▋     | 186/400 [00:43<00:34,  6.19it/s]

Epoch 180: Average loss: 0.6862, Accuracy: 8636/10000 (86%)


 49%|████▉     | 196/400 [00:45<00:35,  5.81it/s]

Epoch 190: Average loss: 0.7642, Accuracy: 8566/10000 (86%)


 52%|█████▏    | 206/400 [00:47<00:32,  6.03it/s]

Epoch 200: Average loss: 0.6916, Accuracy: 8701/10000 (87%)


 54%|█████▍    | 216/400 [00:49<00:29,  6.17it/s]

Epoch 210: Average loss: 0.7204, Accuracy: 8663/10000 (87%)


 57%|█████▋    | 229/400 [00:51<00:25,  6.62it/s]

Epoch 220: Average loss: 0.7713, Accuracy: 8558/10000 (86%)


 59%|█████▉    | 237/400 [00:53<00:28,  5.70it/s]

Epoch 230: Average loss: 0.7074, Accuracy: 8721/10000 (87%)


 62%|██████▏   | 246/400 [00:55<00:27,  5.68it/s]

Epoch 240: Average loss: 0.8491, Accuracy: 8568/10000 (86%)


 64%|██████▍   | 255/400 [00:57<00:25,  5.60it/s]

Epoch 250: Average loss: 0.7942, Accuracy: 8588/10000 (86%)


 67%|██████▋   | 267/400 [00:59<00:21,  6.14it/s]

Epoch 260: Average loss: 0.7971, Accuracy: 8569/10000 (86%)


 69%|██████▉   | 276/400 [01:01<00:20,  5.98it/s]

Epoch 270: Average loss: 0.8196, Accuracy: 8598/10000 (86%)


 71%|███████▏  | 285/400 [01:03<00:22,  5.13it/s]

Epoch 280: Average loss: 0.8827, Accuracy: 8529/10000 (85%)


 74%|███████▍  | 297/400 [01:05<00:18,  5.44it/s]

Epoch 290: Average loss: 0.7915, Accuracy: 8622/10000 (86%)


 76%|███████▋  | 306/400 [01:07<00:16,  5.64it/s]

Epoch 300: Average loss: 0.8875, Accuracy: 8512/10000 (85%)


 79%|███████▉  | 316/400 [01:09<00:13,  6.06it/s]

Epoch 310: Average loss: 0.7732, Accuracy: 8578/10000 (86%)


 82%|████████▏ | 326/400 [01:11<00:11,  6.20it/s]

Epoch 320: Average loss: 0.7570, Accuracy: 8641/10000 (86%)


 84%|████████▍ | 336/400 [01:13<00:10,  6.30it/s]

Epoch 330: Average loss: 0.7673, Accuracy: 8646/10000 (86%)


 86%|████████▋ | 346/400 [01:15<00:08,  6.27it/s]

Epoch 340: Average loss: 0.7885, Accuracy: 8650/10000 (86%)


 89%|████████▉ | 356/400 [01:16<00:06,  6.33it/s]

Epoch 350: Average loss: 0.8160, Accuracy: 8635/10000 (86%)


 92%|█████████▏| 367/400 [01:18<00:04,  6.70it/s]

Epoch 360: Average loss: 0.8165, Accuracy: 8647/10000 (86%)


 94%|█████████▍| 376/400 [01:20<00:03,  6.23it/s]

Epoch 370: Average loss: 0.7781, Accuracy: 8694/10000 (87%)


 96%|█████████▋| 386/400 [01:22<00:02,  6.36it/s]

Epoch 380: Average loss: 0.7721, Accuracy: 8708/10000 (87%)


100%|██████████| 400/400 [01:24<00:00,  4.74it/s]

Epoch 390: Average loss: 0.8648, Accuracy: 8614/10000 (86%)





Теперь попробуем побить этот результат с помощью псевдолейблов. Напишем функцию, которая принимает модель и возвращает DataLoader с хард-лейблами, и запустим обучение.

In [14]:
def predict_batch(model, batch, soft=False):
  """Label one batch with ``model`` in evaluation mode.

  Args:
    model: network returning unnormalized class scores; must have at least
      one parameter (its device is used for the batch).
    batch: input tensor for ``model``.
    soft: if False, return the index of the highest score per sample;
      if True, return the raw (unnormalized) scores themselves.

  Returns:
    A tensor on the model's device that carries no autograd history.
  """
  model.eval()
  # Run on whatever device the model lives on instead of assuming CUDA.
  device = next(model.parameters()).device
  # Inference only: skip building an autograd graph for every batch.
  with torch.no_grad():
    logits = model(batch.to(device))
  if soft:
    return logits
  return logits.argmax(dim=1)

In [15]:
def get_pseudo_loader(model):
    """Build a shuffled loader of unlabeled samples with hard pseudo-labels.

    Every sample of the global ``unlabeled_train_dataset`` is labeled with
    ``predict_batch(model, ...)`` (the predicted class index).

    Args:
        model: network used to produce the pseudo-labels.

    Returns:
        A ``DataLoader`` (batch size 64, shuffled) over
        ``(sample, pseudo_label)`` pairs.
    """
    dataset_samples = []
    dataset_labels = []
    loader = torch.utils.data.DataLoader(
        unlabeled_train_dataset,
        batch_size=1024
    )
    # Single pass: the loader is not shuffled, so samples arrive in dataset
    # order and can be stored next to their predictions instead of iterating
    # the whole dataset a second time. The true labels are ignored.
    for batch, _ in tqdm(loader):
        labels_batch = predict_batch(model, batch).detach().cpu()
        dataset_samples.extend(batch)
        dataset_labels.extend(labels_batch)

    dataset = list(zip(dataset_samples, dataset_labels))

    return torch.utils.data.DataLoader(
        dataset, batch_size=64, shuffle=True
        )

In [16]:
# Start the hard pseudo-label run from a copy of the trained baseline weights,
# with its own SGD optimizer.
model_hard = Net().to('cuda')
model_hard.load_state_dict(model.state_dict())
optimizer_hard = optim.SGD(model_hard.parameters(), lr=0.1)

In [17]:
# Pseudo-labels are produced once, by the baseline model, and reused for all
# 10 epochs.
hard_labeled_loader = get_pseudo_loader(model)

for i in tqdm(range(10)):
    # Each epoch: first the pseudo-labeled data, then the labeled data.
    for loader in (hard_labeled_loader, labeled_train_loader):
        train(i, model_hard, optimizer_hard, loader)
    test(i, model_hard, test_loader)

100%|██████████| 59/59 [00:10<00:00,  5.73it/s]
 10%|█         | 1/10 [00:04<00:37,  4.19s/it]

Epoch 0: Average loss: 0.5022, Accuracy: 8742/10000 (87%)


 20%|██        | 2/10 [00:08<00:33,  4.17s/it]

Epoch 1: Average loss: 0.6198, Accuracy: 8523/10000 (85%)


 30%|███       | 3/10 [00:12<00:28,  4.13s/it]

Epoch 2: Average loss: 0.6050, Accuracy: 8627/10000 (86%)


 40%|████      | 4/10 [00:17<00:26,  4.41s/it]

Epoch 3: Average loss: 0.5581, Accuracy: 8712/10000 (87%)


 50%|█████     | 5/10 [00:21<00:21,  4.36s/it]

Epoch 4: Average loss: 0.5604, Accuracy: 8693/10000 (87%)


 60%|██████    | 6/10 [00:25<00:17,  4.30s/it]

Epoch 5: Average loss: 0.6458, Accuracy: 8588/10000 (86%)


 70%|███████   | 7/10 [00:29<00:12,  4.26s/it]

Epoch 6: Average loss: 0.6200, Accuracy: 8639/10000 (86%)


 80%|████████  | 8/10 [00:34<00:08,  4.22s/it]

Epoch 7: Average loss: 0.6087, Accuracy: 8651/10000 (87%)


 90%|█████████ | 9/10 [00:38<00:04,  4.20s/it]

Epoch 8: Average loss: 0.6507, Accuracy: 8681/10000 (87%)


100%|██████████| 10/10 [00:42<00:00,  4.23s/it]

Epoch 9: Average loss: 0.6688, Accuracy: 8648/10000 (86%)





**Итеративная псевдоразметка.**

Мы уже видим небольшое улучшение, но можно пойти дальше.

In [18]:
# Iterative run: again start from a copy of the trained baseline weights.
model_hard_iter = Net().to('cuda')
model_hard_iter.load_state_dict(model.state_dict())
optimizer_hard_iter = optim.SGD(model_hard_iter.parameters(), lr=0.1)

In [19]:
# Iterative pseudo-labeling: the pseudo-labels are regenerated by the current
# model at the start of every epoch.
for i in range(20):
    hard_labeled_loader = get_pseudo_loader(model_hard_iter)
    # First the pseudo-labeled data, then the labeled data.
    for loader in (hard_labeled_loader, labeled_train_loader):
        train(i, model_hard_iter, optimizer_hard_iter, loader)
    test(i, model_hard_iter, test_loader)

100%|██████████| 59/59 [00:09<00:00,  5.97it/s]


Epoch 0: Average loss: 0.5071, Accuracy: 8701/10000 (87%)


100%|██████████| 59/59 [00:10<00:00,  5.90it/s]


Epoch 1: Average loss: 0.4840, Accuracy: 8890/10000 (89%)


100%|██████████| 59/59 [00:09<00:00,  6.06it/s]


Epoch 2: Average loss: 0.5850, Accuracy: 8856/10000 (89%)


100%|██████████| 59/59 [00:10<00:00,  5.59it/s]


Epoch 3: Average loss: 0.5299, Accuracy: 8912/10000 (89%)


100%|██████████| 59/59 [00:09<00:00,  6.15it/s]


Epoch 4: Average loss: 0.5311, Accuracy: 8923/10000 (89%)


100%|██████████| 59/59 [00:09<00:00,  6.08it/s]


Epoch 5: Average loss: 0.4486, Accuracy: 9047/10000 (90%)


100%|██████████| 59/59 [00:09<00:00,  5.92it/s]


Epoch 6: Average loss: 0.4448, Accuracy: 9098/10000 (91%)


100%|██████████| 59/59 [00:09<00:00,  5.96it/s]


Epoch 7: Average loss: 0.4717, Accuracy: 9086/10000 (91%)


100%|██████████| 59/59 [00:10<00:00,  5.55it/s]


Epoch 8: Average loss: 0.4268, Accuracy: 9125/10000 (91%)


100%|██████████| 59/59 [00:09<00:00,  6.03it/s]


Epoch 9: Average loss: 0.4345, Accuracy: 9142/10000 (91%)


100%|██████████| 59/59 [00:09<00:00,  6.13it/s]


Epoch 10: Average loss: 0.4142, Accuracy: 9200/10000 (92%)


100%|██████████| 59/59 [00:09<00:00,  6.03it/s]


Epoch 11: Average loss: 0.4382, Accuracy: 9196/10000 (92%)


100%|██████████| 59/59 [00:09<00:00,  6.11it/s]


Epoch 12: Average loss: 0.5127, Accuracy: 9164/10000 (92%)


100%|██████████| 59/59 [00:10<00:00,  5.59it/s]


Epoch 13: Average loss: 0.4487, Accuracy: 9213/10000 (92%)


100%|██████████| 59/59 [00:09<00:00,  6.11it/s]


Epoch 14: Average loss: 0.4077, Accuracy: 9250/10000 (92%)


100%|██████████| 59/59 [00:09<00:00,  6.03it/s]


Epoch 15: Average loss: 0.4150, Accuracy: 9281/10000 (93%)


100%|██████████| 59/59 [00:09<00:00,  5.98it/s]


Epoch 16: Average loss: 0.3989, Accuracy: 9286/10000 (93%)


100%|██████████| 59/59 [00:09<00:00,  6.09it/s]


Epoch 17: Average loss: 0.4008, Accuracy: 9275/10000 (93%)


100%|██████████| 59/59 [00:09<00:00,  6.07it/s]


Epoch 18: Average loss: 0.3642, Accuracy: 9287/10000 (93%)


100%|██████████| 59/59 [00:09<00:00,  6.00it/s]


Epoch 19: Average loss: 0.4058, Accuracy: 9285/10000 (93%)


**Оценивание.**

В предыдущем пункте нужно получить accuracy 91% или выше (5 баллов).

Следующие шаги:

Модифицировать функцию `get_pseudo_loader`, чтобы она могла возвращать софт-лейблы (+1 балл).

Правильно запустить обучение - в качестве лосса используем KL-дивергенцию. Получить accuracy 90% или выше. (+3 балла).

Интуитивно кажется, что модель не должна ничему учиться, т.к. ее выход будет полностью совпадать с софт-лейблами. Напишите (текстом), почему тем не менее удается сильно выиграть относительно бейзлайна. (+1 балл).

In [75]:
# Soft-label run: start from a copy of the trained baseline weights.
model_soft_iter = Net().to('cuda')
model_soft_iter.load_state_dict(model.state_dict())
optimizer_soft_iter = optim.SGD(model_soft_iter.parameters(), lr=0.1)

In [76]:
def get_pseudo_loader(model, soft=False):
    """Build a shuffled loader of unlabeled samples with pseudo-labels.

    Every sample of the global ``unlabeled_train_dataset`` is labeled with
    ``predict_batch(model, ..., soft=soft)``.

    Args:
        model: network used to produce the pseudo-labels.
        soft: if False, labels are predicted class indices; if True, labels
            are the model's raw (unnormalized) score vectors.

    Returns:
        A ``DataLoader`` (batch size 64, shuffled) over
        ``(sample, pseudo_label)`` pairs.
    """
    dataset_samples = []
    dataset_labels = []
    loader = torch.utils.data.DataLoader(
        unlabeled_train_dataset,
        batch_size=256
    )
    # Single pass: the loader is not shuffled, so samples arrive in dataset
    # order and can be stored next to their predictions instead of iterating
    # the whole dataset a second time. The true labels are ignored.
    for batch, _ in tqdm(loader):
        labels_batch = predict_batch(model, batch, soft=soft).detach().cpu()
        dataset_samples.extend(batch)
        dataset_labels.extend(labels_batch)

    dataset = list(zip(dataset_samples, dataset_labels))

    return torch.utils.data.DataLoader(
        dataset, batch_size=64, shuffle=True
        )

In [77]:
def train_with_kd(epoch_idx, model, optimizer, train_loader):
    """Run one epoch of training against soft targets with a KL-divergence loss.

    Args:
        epoch_idx: epoch number; not used inside the function.
        model: network returning unnormalized class scores; must have at
            least one parameter (its device is used for the batches).
        optimizer: optimizer over ``model``'s parameters.
        train_loader: iterable of ``(x, target)`` batches where ``target``
            holds unnormalized score vectors, one row per sample.
    """
    model.train()
    # Run on whatever device the model lives on instead of assuming CUDA.
    device = next(model.parameters()).device
    loss_func = torch.nn.KLDivLoss(reduction='batchmean', log_target=True)
    for x, target in train_loader:
        x, target = x.to(device), target.to(device)
        optimizer.zero_grad()
        pred = F.log_softmax(model(x), dim=1)
        # Targets arrive as raw scores; log_target=True expects log-probabilities.
        target = F.log_softmax(target, dim=1)
        loss = loss_func(pred, target)
        loss.backward()
        optimizer.step()


In [78]:
# Iterative soft pseudo-labeling: every epoch the current model re-labels the
# unlabeled data, then trains on it with the KL loss and on the labeled data
# with the default loss.
for i in range(20):
    soft_labeled_loader = get_pseudo_loader(model_soft_iter, soft=True)
    train_with_kd(
        i, model_soft_iter, optimizer_soft_iter, soft_labeled_loader)
    train(
        i, model_soft_iter, optimizer_soft_iter, labeled_train_loader)
    test(i, model_soft_iter, test_loader)

100%|██████████| 234/234 [00:09<00:00, 24.45it/s]


Epoch 0: Average loss: 0.4816, Accuracy: 8675/10000 (87%)


100%|██████████| 234/234 [00:09<00:00, 25.09it/s]


Epoch 1: Average loss: 0.4206, Accuracy: 8737/10000 (87%)


100%|██████████| 234/234 [00:09<00:00, 24.74it/s]


Epoch 2: Average loss: 0.3955, Accuracy: 8795/10000 (88%)


100%|██████████| 234/234 [00:09<00:00, 24.93it/s]


Epoch 3: Average loss: 0.3709, Accuracy: 8872/10000 (89%)


100%|██████████| 234/234 [00:10<00:00, 22.39it/s]


Epoch 4: Average loss: 0.3457, Accuracy: 8939/10000 (89%)


100%|██████████| 234/234 [00:09<00:00, 24.15it/s]


Epoch 5: Average loss: 0.3390, Accuracy: 8950/10000 (90%)


100%|██████████| 234/234 [00:13<00:00, 16.97it/s]


Epoch 6: Average loss: 0.3420, Accuracy: 8947/10000 (89%)


100%|██████████| 234/234 [00:12<00:00, 18.98it/s]


Epoch 7: Average loss: 0.3107, Accuracy: 9058/10000 (91%)


100%|██████████| 234/234 [00:12<00:00, 19.29it/s]


Epoch 8: Average loss: 0.3015, Accuracy: 9080/10000 (91%)


100%|██████████| 234/234 [00:10<00:00, 22.42it/s]


Epoch 9: Average loss: 0.2926, Accuracy: 9135/10000 (91%)


 22%|██▏       | 51/234 [00:02<00:08, 20.81it/s]


KeyboardInterrupt: ignored

На самом деле выход модели не будет совпадать с софт-лейблами. Главная, как мне кажется, причина — дропаут, который вносит неопределенность в поведение модели: софт-лейблы считаются в режиме `eval()` (дропаут выключен), а при обучении модель работает в режиме `train()` (дропаут включен). За счет этого выход модели отличается от таргета. Таким образом, мы оптимизируем уже ненулевой лосс, и это дает профит.

Но даже при нулевом дропауте обучение все равно будет идти. Я проверял: на первых нескольких батчах значения одинаковые (на самом деле я смотрел только на `output.mean()`), но постепенно таргет и аутпут начинают расходиться. Поначалу расхождение небольшое, но после обновлений модели по этому, пусть и небольшому, лоссу оно растет, и модель учится. Кроме того, софт-лейблы считаются один раз в начале эпохи и дальше не меняются, а сама модель обновляется после каждого батча, поэтому таргеты отстают от текущей модели.
Думаю, что изначальное расхождение может быть связано с численными ошибками в вычислениях.