# Нейронные сети
__Суммарное количество баллов: 10__

__Решение отправлять на `ml.course.practice@gmail.com`__

__Тема письма: `[ML][HW06] <ФИ>`, где вместо `<ФИ>` указаны фамилия и имя__

Для начала вам предстоит реализовать свой собственный backpropagation и протестировать его на реальных данных, а затем научиться обучать нейронные сети при помощи библиотеки `PyTorch` и использовать это умение для классификации классического набора данных CIFAR10.

In [None]:
import numpy as np
import copy
from sklearn.datasets import make_blobs, make_moons

In [None]:
# Seed NumPy's global random generator so the random draws below are repeatable.
np.random.seed(9)

### Задание 1 (3 балла)
Нейронные сети состоят из слоев, поэтому для начала понадобится реализовать их. Пока нам понадобятся только три:

`Linear` - полносвязный слой, в котором `y = Wx + b`, где `y` - выход, `x` - вход, `W` - матрица весов, а `b` - смещение. 

`ReLU` - слой, соответствующий функции активации `y = max(0, x)`.

`Softmax` - слой, соответствующий функции активации [softmax](https://ru.wikipedia.org/wiki/Softmax)


#### Методы
`forward(X)` - возвращает предсказанные значения для `X`. `X` может быть как вектором, так и батчем

`backward(d)` - считает градиент при помощи обратного распространения ошибки. Возвращает новое значение `d`

`update(alpha)` - обновляет веса (если необходимо) с заданной скоростью обучения

In [None]:
class Module:
    """Common interface shared by every layer of the network.

    Concrete layers override ``forward`` and ``backward``. ``update`` does
    nothing here, so layers without trainable parameters can inherit it.
    """

    def forward(self, x):
        """Compute the layer output for ``x``; must be overridden."""
        raise NotImplementedError()

    def backward(self, d):
        """Propagate the incoming gradient ``d``; must be overridden."""
        raise NotImplementedError()

    def update(self, alpha):
        """Apply a parameter step with learning rate ``alpha`` (no-op here)."""
        return None
        
class Linear(Module):
    """Fully connected layer computing ``y = x @ W + b``.

    ``weight`` has shape ``(in_features, out_features)`` and ``b`` has shape
    ``(out_features,)``. The input may be a batch ``(n, in_features)`` or a
    single vector ``(in_features,)``.
    """

    def __init__(self, in_features, out_features):
        """Create the layer with normally distributed weights and zero bias."""
        self.in_features = in_features
        self.out_features = out_features
        # Weights are drawn from N(0, 0.5^2) using NumPy's global generator.
        self.weight = np.random.normal(scale=0.5, size=(in_features, out_features))
        self.b = np.zeros(out_features)

    def forward(self, x):
        """Return ``x @ weight + b`` and cache the input for ``backward``."""
        # A private copy protects the cached input from later caller-side edits.
        self.X = np.copy(x)
        self.z = x @ self.weight + self.b
        return self.z

    def backward(self, d):
        """Store parameter gradients for ``d`` and return the input gradient.

        ``d`` is the gradient with respect to this layer's output and has the
        same shape as the value returned by ``forward``. The gradients are
        summed over the batch and stored in ``W_grad`` and ``b_grad``.
        """
        # Promote a single vector to a one-row batch so that the weight
        # gradient is the outer product x^T d (in_features x out_features)
        # and the bias gradient keeps one entry per output unit. For 2-D
        # inputs np.atleast_2d is a no-op.
        x2d = np.atleast_2d(self.X)
        d2d = np.atleast_2d(d)
        self.W_grad = x2d.T @ d2d
        self.b_grad = np.sum(d2d, axis=0)
        # The returned gradient keeps the dimensionality of ``d``.
        return d @ self.weight.T

    def update(self, alpha):
        """Take one gradient-descent step of size ``alpha`` on ``weight`` and ``b``."""
        self.weight -= alpha * self.W_grad
        self.b -= alpha * self.b_grad
        
class ReLU(Module):
    """Rectified linear activation, ``y = max(0, x)`` applied element-wise."""

    def __init__(self):
        # Nothing to set up: the negative-input mask is recorded by forward().
        pass

    def forward(self, x):
        """Return a copy of ``x`` with negative entries replaced by zero."""
        self.neg = x < 0
        out = np.array(x, copy=True)
        out[self.neg] = 0
        return out

    def backward(self, d):
        """Return a copy of ``d`` zeroed where the forward input was negative."""
        grad = np.array(d, copy=True)
        grad[self.neg] = 0
        return grad
        
class Softmax(Module):
    """Softmax activation fused with the cross-entropy gradient.

    ``forward`` turns scores into probabilities. ``backward`` receives the
    integer class labels (not an upstream gradient) and returns the gradient
    of the mean cross-entropy loss with respect to the scores.
    """

    def __init__(self):
        # No parameters; forward() stores the probabilities for backward().
        pass

    def forward(self, x):
        """Return class probabilities for a batch ``(n, k)`` or a vector ``(k,)``."""
        x = np.asarray(x)
        # A single vector is normalised over its only axis, a batch per row.
        axis = 0 if x.ndim == 1 else 1
        # Subtracting the maximum leaves the result unchanged but avoids
        # overflow in exp for large scores.
        shifted = np.exp(x - np.max(x, axis=axis, keepdims=True))
        self.prob = shifted / np.sum(shifted, axis=axis, keepdims=True)
        return self.prob

    def backward(self, d):
        """Return ``(prob - one_hot(d)) / n`` for the labels ``d``.

        ``d`` holds one integer class label per row of the last forward
        input (a single label for a vector input). ``n`` is the batch size.
        """
        # Work on a copy: self.prob is the very array forward() handed to the
        # caller, so editing it in place would corrupt their probabilities.
        grad = np.array(self.prob, copy=True)
        if grad.ndim == 1:
            grad[d] -= 1
            return grad
        n = len(grad)
        grad[np.arange(n), d] -= 1
        return grad / n
        
        

### Задание 2 (2 балла)
Теперь сделаем саму нейронную сеть.

#### Методы
`fit(X, y)` - обучает нейронную сеть заданное число эпох. В каждой эпохе необходимо использовать [cross-entropy loss](https://ml-cheatsheet.readthedocs.io/en/latest/loss_functions.html#cross-entropy) для обучения, а также производить обновления не по одному элементу, а используя батчи.

`predict_proba(X)` - предсказывает вероятности классов для элементов `X`

#### Параметры конструктора
`modules` - список, состоящий из ранее реализованных модулей и описывающий слои нейронной сети. В конец необходимо добавить `Softmax`

`epochs` - количество эпох обучения

`alpha` - скорость обучения

In [None]:
class MLPClassifier:
    """Feed-forward classifier trained with mini-batch gradient descent.

    Parameters
    ----------
    modules : list
        Layers applied in order. A ``Softmax`` layer is appended to a copy of
        this list, so the caller's list is left untouched.
    epochs : int
        Number of passes over the training data.
    alpha : float
        Learning rate handed to every layer's ``update``.
    batch : int
        Mini-batch size.
    """

    def __init__(self, modules, epochs=40, alpha=0.01, batch=10):
        # Copy before appending so the argument is not mutated.
        self.modules = list(modules)
        self.modules.append(Softmax())
        self.epochs = epochs
        self.alpha = alpha
        self.batch = batch

    def fit(self, X, y):
        """Train on samples ``X`` (one row each) with integer labels ``y``.

        Every epoch reshuffles the data and performs one update per
        mini-batch. The cross-entropy of the last epoch is printed once
        training is finished; nothing is printed if no epoch produced a cost.

        Raises
        ------
        ValueError
            If ``self.batch`` is smaller than 1.
        """
        if self.batch < 1:
            raise ValueError("batch must be a positive integer")
        X = np.asarray(X)
        y = np.asarray(y)
        n = X.shape[0]
        batch = self.batch
        cost = None
        for _ in range(self.epochs):
            order = np.random.permutation(n)
            X_perm = X[order, :]
            y_perm = y[order]
            probs = None
            for start in range(0, n, batch):
                stop = start + batch
                out = self.forward(X_perm[start:stop, :])
                if probs is None:
                    # Size the buffer from the network output rather than
                    # from the labels, which may not contain every class.
                    probs = np.zeros((n, out.shape[1]))
                probs[start:stop, :] = out
                self.backward(y_perm[start:stop])
            if probs is not None:
                cost = self.cross_entropy_loss(probs, y_perm)

        if cost is not None:
            print('cost:', cost)

    def forward(self, X):
        """Run ``X`` through every layer and return the final output."""
        a = X
        for mod in self.modules:
            a = mod.forward(a)
        return a

    def backward(self, y):
        """Back-propagate from the labels ``y`` and update each layer."""
        d = y
        for mod in reversed(self.modules):
            # The gradient for the previous layer is computed before this
            # layer's parameters change.
            d = mod.backward(d)
            # update() is part of the Module interface and is a no-op for
            # layers without parameters.
            mod.update(self.alpha)

    def cross_entropy_loss(self, probs, y):
        """Return the mean negative log-probability of the true labels ``y``."""
        n = probs.shape[0]
        picked = probs[np.arange(n), y]
        # Clip away exact zeros so a confident mistake gives a large finite
        # loss instead of infinity.
        losses = -np.log(np.clip(picked, 1e-12, None))
        return np.sum(losses) / n

    def predict_proba(self, X):
        """Return the class probabilities produced by the network for ``X``."""
        return self.forward(X)

    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        p = self.predict_proba(X)
        return np.argmax(p, axis=1)


In [None]:
# Smoke test: fit a small network on 50 random 4-D points with a hand-made rule.
p = MLPClassifier(
    [
        Linear(4, 64),
        ReLU(),
        Linear(64, 64),
        ReLU(),
        Linear(64, 2),
    ],
    epochs=100,
    alpha=0.025,
)

X = np.random.randn(50, 4)
# Label 0 when either inequality holds, otherwise label 1.
y = [0 if (row[0] > row[2] ** 2 or row[3] ** 3 > 0.5) else 1 for row in X]
p.fit(X, y)
print(p.predict(X))
# Share of training points whose prediction matches the label.
print((p.predict(X) == y).mean())

### Задание 3 (2 балла)
Протестируем наше решение на синтетических данных. Необходимо подобрать гиперпараметры, при которых качество полученных классификаторов будет достаточным.

#### Оценка
Accuracy на первом датасете больше 0.85 - +1 балл

Accuracy на втором датасете больше 0.85 - +1 балл

In [None]:
# Two-moons data: independent train and test samples drawn the same way.
X, y = make_moons(400, noise=0.075)
X_test, y_test = make_moons(400, noise=0.075)

best_acc = 0

# Candidate layer widths; the first entry is the input size, the last the class count.
layer = [
    [X.shape[1], 2, 3, 4, 5, 2],
    [X.shape[1], 4, 8, 4, 2],
    [X.shape[1], 64, 64, 2],
    [X.shape[1], 32, 32, 2],
    [X.shape[1], 4, 4, 4, 2],
]
for size in layer:
    modules = []
    last_pair = len(size) - 2
    # Linear layers for consecutive widths, with ReLU after all but the last.
    for idx, (n_in, n_out) in enumerate(zip(size, size[1:])):
        modules.append(Linear(n_in, n_out))
        if idx < last_pair:
            modules.append(ReLU())
    p = MLPClassifier(modules, epochs=50, alpha=0.2)
    print('----------')
    p.fit(X, y)
    acc = (p.predict(X_test) == y_test).mean()

    print(acc, 'размер слоев', size)
    best_acc = max(acc, best_acc)
print("Accuracy", best_acc)



In [None]:
# Train one 2-64-64-2 network on the current X, y and score it on the test split.
p = MLPClassifier(
    [
        Linear(2, 64),
        ReLU(),
        Linear(64, 64),
        ReLU(),
        Linear(64, 2),
    ],
    epochs=50,
    alpha=0.2,
)
p.fit(X, y)
acc = (p.predict(X_test) == y_test).mean()
print("Accuracy", acc)

In [None]:
# Three Gaussian blobs: independent train and test samples around the same centers.
X, y = make_blobs(400, 2, centers=[[0, 0], [2.5, 2.5], [-2.5, 3]])
X_test, y_test = make_blobs(400, 2, centers=[[0, 0], [2.5, 2.5], [-2.5, 3]])
best_acc = 0
# Candidate layer widths; the first entry is the input size, the last the class count.
layer = [
    [X.shape[1], 2, 3, 4, 5, 3],
    [X.shape[1], 4, 8, 4, 3],
    [X.shape[1], 64, 64, 3],
    [X.shape[1], 32, 32, 3],
    [X.shape[1], 4, 4, 4, 3],
]
for size in layer:
    modules = []
    last_pair = len(size) - 2

    # Linear layers for consecutive widths, with ReLU after all but the last.
    for idx, (n_in, n_out) in enumerate(zip(size, size[1:])):
        modules.append(Linear(n_in, n_out))
        if idx < last_pair:
            modules.append(ReLU())
    p = MLPClassifier(modules, epochs=100, alpha=0.2)
    print('----------')
    p.fit(X, y)
    acc = (p.predict(X_test) == y_test).mean()

    print(acc, 'размер слоев', size)

    best_acc = max(acc, best_acc)
print("Accuracy", best_acc)



In [None]:
# Train one 2-64-64-3 network on the current X, y and score it on the test split.
p = MLPClassifier(
    [
        Linear(2, 64),
        ReLU(),
        Linear(64, 64),
        ReLU(),
        Linear(64, 3),
    ],
    epochs=50,
    alpha=0.2,
)
p.fit(X, y)
acc = (p.predict(X_test) == y_test).mean()
print("Accuracy", acc)

## PyTorch

Для выполнения следующего задания понадобится PyTorch. [Инструкция по установке](https://pytorch.org/get-started/locally/)

Если у вас нет GPU, то можно использовать [Google Colab](https://colab.research.google.com/)

In [1]:
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
import torch
from tqdm import tqdm
from torch import nn
import torch.nn.functional as F
import matplotlib.pyplot as plt

In [2]:
# Report whether PyTorch can use a CUDA device in this session.
torch.cuda.is_available()

True

In [3]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Images are converted to tensors; no other preprocessing is applied.
t = transforms.ToTensor()

# Training split: reshuffled on every pass, served in batches of 1024.
cifar_train = datasets.CIFAR10(
    "datasets/cifar10",
    download=True,
    train=True,
    transform=t,
)
train_loader = DataLoader(
    cifar_train,
    batch_size=1024,
    shuffle=True,
    pin_memory=torch.cuda.is_available(),
)
# Test split: same batch size, fixed order.
cifar_test = datasets.CIFAR10(
    "datasets/cifar10",
    download=True,
    train=False,
    transform=t,
)
test_loader = DataLoader(
    cifar_test,
    batch_size=1024,
    shuffle=False,
    pin_memory=torch.cuda.is_available(),
)

Files already downloaded and verified
Files already downloaded and verified


In [12]:
def outputSize(in_size, kernel_size, stride, padding):
    """Return the output length of a sliding window along one dimension.

    Computes ``floor((in_size - kernel_size + 2 * padding) / stride) + 1``,
    the formula for an undilated convolution or pooling window.

    Floor division keeps the arithmetic exact for integer arguments of any
    magnitude; the previous float division lost precision for very large
    sizes and rounded toward zero instead of down.
    """
    span = in_size - kernel_size + 2 * padding
    # int() keeps the return type an int even if a float argument was given.
    return int(span // stride) + 1

### Задание 4 (3 балла)
А теперь поработаем с настоящими нейронными сетями и настоящими данными. Необходимо реализовать сверточную нейронную сеть, которая будет классифицировать изображения из датасета CIFAR10. Имплементируйте класс `Model` и функцию `calculate_loss`. 

Обратите внимание, что `Model` должна считать в конце `softmax`, т.к. мы решаем задачу классификации. Соответственно, функция `calculate_loss` считает cross-entropy.

Для успешного выполнения задания необходимо, чтобы `accuracy`, `mean precision` и `mean recall` были больше 0.5

__Можно пользоваться всем содержимым библиотеки PyTorch.__

In [6]:
class Model(nn.Module):
    """Small CNN mapping 3x32x32 images to 10 unnormalised class scores.

    Two 3x3 convolutions, each followed by ReLU and 2x2 max pooling, feed
    three fully connected layers. The output is raw scores (logits); no
    softmax is applied inside the model.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 8, 3)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 16, 3)
        # For a 32x32 input: 32 -> 30 -> 15 -> 13 -> 6 per side, 16 channels.
        self.linear1 = nn.Linear(16 * 6 * 6, 128)
        self.linear2 = nn.Linear(128, 64)
        self.linear3 = nn.Linear(64, 10)

    def forward(self, x):
        """Return a ``(batch, 10)`` tensor of class scores for ``x``."""
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten everything except the batch axis. Unlike view(-1, 576),
        # this never regroups features across samples, so an input of the
        # wrong spatial size fails at linear1 instead of silently producing
        # a different batch size.
        x = torch.flatten(x, 1)
        x = F.relu(self.linear1(x))
        x = F.relu(self.linear2(x))
        return self.linear3(x)
        
# Shared loss object: cross-entropy computed from the model's raw scores.
criterion = nn.CrossEntropyLoss()


def calculate_loss(X, y, model):
    """Return the cross-entropy between ``model(X)`` and the targets ``y``."""
    scores = model(X)
    return criterion(scores, y)

Теперь обучим нашу модель. Для этого используем ранее созданные batch loader'ы.

In [7]:
def train(model, epochs = 100):
    """Train ``model`` with Adam and track the mean loss of every epoch.

    Each epoch makes one optimisation pass over ``train_loader`` and then one
    evaluation pass over ``test_loader``. Losses are averaged per sample, so
    a smaller final batch is weighted correctly.

    Returns a pair ``(train_losses, test_losses)`` with one value per epoch.
    Raises ``ZeroDivisionError`` if a loader yields no samples.
    """
    optimizer = torch.optim.Adam(model.parameters())
    train_losses = []
    test_losses = []
    # Remember the caller's mode so it can be restored after training.
    was_training = model.training
    for i in range(epochs):
        # Train
        model.train()
        loss_sum = 0
        elements = 0
        for X, y in train_loader:
            X = X.to(device)
            y = y.to(device)
            loss = calculate_loss(X, y, model)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            loss_sum += loss.item() * len(X)
            elements += len(X)
        train_losses.append(loss_sum / elements)
        # Test: no parameter updates happen here, so gradient tracking is
        # switched off to avoid building autograd graphs for every batch.
        model.eval()
        loss_sum = 0
        elements = 0
        with torch.no_grad():
            for X, y in test_loader:
                X = X.to(device)
                y = y.to(device)
                loss = calculate_loss(X, y, model)
                loss_sum += loss.item() * len(X)
                elements += len(X)
        test_losses.append(loss_sum / elements)
        print("Epoch", i, "| Train loss", train_losses[-1], "| Test loss", test_losses[-1])
    model.train(was_training)
    return train_losses, test_losses

In [8]:
# Build the CNN on the selected device and train it with train()'s default epoch count.
model = Model().to(device)
# train() returns the per-epoch train and test loss histories.
train_l, test_l = train(model)

tensor([[ 0.0323, -0.0323, -0.0848,  ..., -0.0114,  0.0340,  0.0602],
        [ 0.0316, -0.0331, -0.0808,  ..., -0.0125,  0.0314,  0.0521],
        [ 0.0309, -0.0314, -0.0768,  ..., -0.0130,  0.0310,  0.0549],
        ...,
        [ 0.0355, -0.0330, -0.0703,  ..., -0.0165,  0.0309,  0.0635],
        [ 0.0269, -0.0295, -0.0897,  ..., -0.0095,  0.0357,  0.0469],
        [ 0.0294, -0.0265, -0.0775,  ..., -0.0158,  0.0364,  0.0551]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([6, 9, 4,  ..., 7, 5, 7], device='cuda:0')
tensor([[ 0.0355, -0.0107, -0.0536,  ...,  0.0156,  0.0372,  0.0830],
        [ 0.0293, -0.0145, -0.0408,  ...,  0.0150,  0.0352,  0.0746],
        [ 0.0277, -0.0212, -0.0633,  ...,  0.0093,  0.0331,  0.0742],
        ...,
        [ 0.0315, -0.0195, -0.0652,  ...,  0.0031,  0.0352,  0.0800],
        [ 0.0272, -0.0194, -0.0680,  ...,  0.0091,  0.0341,  0.0755],
        [ 0.0308, -0.0152, -0.0555,  ...,  0.0109,  0.0361,  0.0730]],
       device='cuda:0', grad_fn=<

tensor([[-0.1669, -0.0433,  0.0158,  ...,  0.1539, -0.0898,  0.0493],
        [-0.1813, -0.0990, -0.0149,  ...,  0.0903,  0.0210, -0.0477],
        [-0.1151, -0.0999,  0.0440,  ...,  0.1596,  0.0120,  0.0532],
        ...,
        [-0.2529, -0.0705, -0.0311,  ...,  0.0662, -0.0354, -0.0889],
        [-0.0467, -0.1244,  0.0072,  ...,  0.1167,  0.0253,  0.0345],
        [ 0.0888, -0.0206,  0.1046,  ...,  0.2488,  0.1124,  0.3309]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([4, 0, 2,  ..., 1, 8, 8], device='cuda:0')
tensor([[ 2.4461e-01,  1.1730e-01,  1.1488e-01,  ...,  1.1412e-01,
          2.2028e-01,  4.7261e-01],
        [-1.8716e-01, -2.3215e-02, -1.2196e-02,  ...,  4.5422e-02,
         -2.4063e-02,  1.2734e-02],
        [-2.4645e-01, -1.4418e-01,  4.4057e-04,  ...,  1.4961e-01,
         -1.3868e-01, -4.8292e-02],
        ...,
        [-2.8791e-01,  3.3607e-02, -1.2342e-02,  ...,  1.1525e-01,
         -1.7493e-01,  6.3215e-02],
        [-2.8866e-01, -7.2637e-02, -5.1535

          2.4810e+00,  2.0165e+00]], device='cuda:0', grad_fn=<AddmmBackward>) tensor([9, 0, 5,  ..., 4, 6, 8], device='cuda:0')
tensor([[-0.2852, -0.7913,  0.3903,  ...,  0.5523, -0.4501, -1.4712],
        [ 0.1070,  0.5992,  0.0931,  ...,  0.1554, -0.0389,  0.3810],
        [-1.0658,  0.3799,  0.1545,  ...,  0.1583, -0.9027, -0.3528],
        ...,
        [ 0.6662, -0.3527,  0.4049,  ...,  0.6015,  0.0285, -0.2847],
        [ 1.3612, -0.2191,  0.4041,  ...,  0.3634,  0.6195, -0.0782],
        [ 0.0586, -0.2518,  0.4512,  ...,  0.4857, -0.3810, -0.6132]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([5, 9, 2,  ..., 0, 7, 4], device='cuda:0')
tensor([[-1.7943, -0.5434,  0.6188,  ...,  0.9422, -1.8645, -1.2202],
        [-1.7999, -0.2399,  0.5230,  ...,  0.8048, -1.7842, -1.1272],
        [-1.1186, -0.9031,  0.5957,  ...,  0.8193, -1.2815, -1.4520],
        ...,
        [-0.1392, -0.0509,  0.2992,  ...,  0.4191, -0.3562, -0.2201],
        [-0.1515,  0.5303,  0.0994,  ..., -0.

       device='cuda:0', grad_fn=<AddmmBackward>) tensor([3, 5, 9,  ..., 4, 0, 3], device='cuda:0')
tensor([[ 0.8569, -0.2904,  0.5438,  ..., -0.2900,  0.8031, -1.2034],
        [ 0.6519,  0.5731,  0.0532,  ..., -0.0461,  0.4315,  0.4980],
        [-0.8608, -0.1189,  0.5455,  ...,  0.3183, -0.9624, -0.7256],
        ...,
        [ 1.9331,  0.8053, -0.1183,  ..., -1.0147,  2.4199,  0.5603],
        [-2.6605, -0.2669,  0.9201,  ...,  1.2883, -3.0460, -0.8664],
        [ 1.0401,  0.0634,  0.1532,  ..., -0.0761,  0.8359,  0.2255]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([8, 8, 2, 4, 3, 9, 7, 2, 1, 2, 9, 8, 7, 7, 6, 3, 2, 3, 2, 9, 2, 6, 0, 4,
        9, 2, 7, 4, 7, 1, 2, 7, 1, 4, 1, 5, 3, 3, 7, 9, 2, 1, 9, 9, 7, 6, 7, 9,
        9, 7, 3, 0, 4, 7, 0, 0, 1, 9, 6, 2, 0, 3, 1, 2, 5, 4, 2, 2, 4, 9, 8, 7,
        1, 4, 2, 5, 0, 4, 6, 7, 6, 8, 3, 9, 5, 8, 1, 1, 9, 4, 9, 6, 4, 6, 2, 9,
        9, 3, 2, 2, 7, 0, 1, 6, 7, 2, 6, 1, 7, 8, 5, 2, 5, 1, 6, 8, 6, 7, 8, 4,
        3, 4, 2, 8

       device='cuda:0', grad_fn=<AddmmBackward>) tensor([6, 9, 3, 9, 8, 7, 7, 1, 6, 5, 3, 1, 3, 1, 2, 7, 1, 8, 2, 0, 9, 7, 9, 8,
        8, 6, 7, 3, 7, 1, 3, 9, 0, 9, 3, 6, 7, 2, 7, 3, 0, 5, 9, 7, 5, 5, 0, 6,
        5, 1, 8, 2, 7, 5, 9, 0, 0, 0, 8, 8, 7, 3, 7, 8, 9, 3, 7, 9, 7, 8, 7, 9,
        8, 5, 4, 8, 3, 7, 6, 3, 8, 2, 1, 9, 5, 7, 3, 9, 5, 5, 8, 7, 3, 5, 3, 5,
        9, 7, 6, 7, 3, 6, 4, 3, 9, 4, 2, 1, 9, 6, 0, 2, 6, 7, 4, 7, 9, 0, 7, 4,
        3, 5, 3, 1, 1, 2, 6, 8, 2, 1, 7, 8, 5, 9, 6, 1, 1, 5, 0, 6, 0, 9, 2, 6,
        5, 8, 9, 5, 5, 6, 2, 9, 1, 5, 8, 8, 7, 1, 7, 3, 5, 4, 9, 7, 5, 2, 9, 9,
        4, 7, 4, 1, 3, 8, 7, 9, 0, 4, 5, 7, 5, 2, 8, 7, 6, 9, 6, 9, 3, 8, 5, 6,
        6, 9, 5, 7, 8, 0, 5, 0, 7, 4, 8, 2, 5, 1, 3, 2, 2, 6, 2, 1, 7, 4, 6, 3,
        1, 3, 7, 2, 1, 3, 7, 0, 8, 4, 4, 5, 7, 9, 5, 4, 3, 9, 6, 8, 2, 3, 3, 1,
        6, 1, 7, 0, 3, 4, 2, 9, 4, 5, 8, 2, 7, 0, 9, 6, 8, 0, 8, 2, 8, 5, 7, 7,
        2, 2, 0, 0, 0, 7, 4, 1, 6, 6, 8, 8, 9, 0, 9, 0, 1, 3, 3, 0, 9, 

tensor([[-1.0429, -0.0259,  0.5416,  ...,  0.3564, -1.4239, -0.9296],
        [ 0.9739,  1.0508, -0.3962,  ...,  0.5049,  0.2310,  1.7992],
        [ 0.3315,  1.4142, -0.8821,  ..., -0.1300,  0.4333,  1.9049],
        ...,
        [-0.3991,  0.0506,  0.1948,  ...,  0.8953, -1.5093,  0.4297],
        [ 2.7477, -1.4023,  1.0067,  ...,  0.1642,  1.2123, -1.9011],
        [-0.5953,  0.3975,  0.0331,  ...,  0.5140, -1.0267,  0.7358]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([7, 0, 0,  ..., 1, 5, 3], device='cuda:0')
tensor([[ 0.3380,  0.4642,  0.1779,  ..., -0.0255, -0.0325, -0.2151],
        [-0.3810, -0.5440,  0.7399,  ...,  0.1958, -0.6749, -1.6929],
        [ 0.8970,  0.1822, -0.0319,  ...,  1.0327, -0.5425,  1.2266],
        ...,
        [-2.7785, -0.0064,  0.6342,  ...,  0.8158, -2.8377, -0.6399],
        [ 0.3571, -0.4776,  0.5888,  ...,  0.1397, -0.0451, -1.0645],
        [ 2.9594, -0.3515,  0.3766,  ..., -0.6412,  2.3619, -1.1207]],
       device='cuda:0', grad_fn=<

tensor([[ 0.7972,  0.5543,  0.0649,  ...,  0.4961,  0.0923,  0.7854],
        [-0.6985,  0.4852,  0.3412,  ...,  1.0411, -1.5666, -0.1560],
        [-1.6561, -0.5817,  1.0746,  ...,  1.0931, -2.1508, -0.6143],
        ...,
        [-0.6905, -0.3846,  0.7907,  ...,  0.6866, -1.1257, -0.8481],
        [-0.8143,  0.0237,  0.4690,  ...,  0.6486, -1.1297, -0.1530],
        [-0.2128, -0.5068,  1.0875,  ...,  0.4388, -0.3044, -0.7562]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([9, 3, 4,  ..., 3, 4, 4], device='cuda:0')
tensor([[-1.0968e+00,  4.0567e-02,  4.7547e-01,  ...,  1.2828e+00,
         -2.1469e+00,  6.1463e-01],
        [ 6.0407e-01, -3.2508e-01,  1.0215e+00,  ...,  2.4586e-03,
          3.7203e-01, -1.1252e+00],
        [-2.3134e+00, -4.4658e-01,  1.0628e+00,  ...,  7.0199e-01,
         -2.2216e+00, -8.3999e-01],
        ...,
        [-8.3905e-01, -6.9823e-04,  4.3555e-01,  ...,  7.8868e-01,
         -1.1021e+00,  1.7806e-01],
        [ 1.9202e+00, -1.4319e-01,  9.9702

tensor([[-0.5589,  1.2981, -0.7341,  ..., -0.2558,  0.1181,  1.6666],
        [ 0.7038, -1.9385,  1.4189,  ..., -0.0826,  0.5985, -1.7715],
        [-2.2450, -1.1981,  1.2329,  ...,  0.7247, -2.2782, -1.5299],
        ...,
        [ 0.4013,  1.3449, -0.8118,  ..., -0.4774,  0.9611,  1.1935],
        [-0.9730, -2.5895,  2.0986,  ...,  0.8603, -1.5875, -2.4611],
        [-0.8436, -0.8036,  0.6821,  ...,  0.1570, -0.7322, -1.8440]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([8, 6, 2,  ..., 1, 2, 5], device='cuda:0')
tensor([[-2.0446, -0.1963,  0.3803,  ...,  1.1051, -2.6549, -0.7865],
        [-1.3351, -0.5063,  0.6546,  ...,  0.3220, -1.3953, -0.5918],
        [-2.3149,  0.5232,  0.0212,  ...,  0.4875, -2.4009,  0.4053],
        ...,
        [ 2.3715,  2.3336, -1.1537,  ..., -1.8727,  3.5674,  2.0588],
        [ 1.7522, -0.6910,  0.3968,  ...,  0.1654,  1.3392, -0.2131],
        [-0.3771, -0.3995,  0.4817,  ...,  0.9278, -1.5643,  0.0575]],
       device='cuda:0', grad_fn=<

tensor([[ 1.8077,  1.2294, -0.6555,  ..., -1.0734,  2.5741,  1.2566],
        [-2.8259, -1.0467,  1.2468,  ...,  1.0176, -2.8016, -1.4431],
        [ 2.5080,  1.8208, -1.1053,  ..., -1.4280,  3.6428,  1.9379],
        ...,
        [-0.7122, -1.1356,  1.2567,  ...,  1.6604, -1.8677, -0.9735],
        [ 1.3159, -1.2022,  0.9376,  ...,  0.4906,  0.5675, -0.8119],
        [-1.3039, -0.1468,  0.5029,  ...,  0.2912, -0.8997, -1.4875]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([8, 3, 8,  ..., 4, 0, 7], device='cuda:0')
tensor([[-1.2426,  1.0272, -0.2207,  ...,  0.0608, -0.8917, -0.5924],
        [-2.4465, -0.6567,  0.6505,  ...,  1.2773, -2.3394, -1.2107],
        [-0.3184,  3.0532, -1.2955,  ..., -0.5973,  0.3194,  1.0792],
        ...,
        [-0.9272, -0.2033,  0.4521,  ...,  0.9526, -1.3592, -0.9425],
        [-1.1956, -0.7936,  1.0276,  ...,  0.3796, -0.8647, -2.2048],
        [-0.0853,  0.3753,  0.0223,  ...,  0.4916, -0.8033, -0.6908]],
       device='cuda:0', grad_fn=<

tensor([[-0.9007,  1.6072, -0.6291,  ...,  0.3374, -1.0942,  1.0044],
        [-2.2344,  0.7388, -0.2986,  ...,  0.7918, -2.2596,  0.4817],
        [ 0.9012,  0.2905,  0.3189,  ..., -0.6922,  0.9123, -0.5472],
        ...,
        [-0.6619, -2.1136,  1.9397,  ...,  0.6212, -1.0720, -1.8298],
        [ 0.1147, -0.8044,  1.0593,  ..., -0.2801,  0.3891, -0.6904],
        [ 1.6419,  1.5625, -0.7707,  ..., -1.2880,  2.6643,  2.0614]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([3, 5, 4,  ..., 3, 2, 0], device='cuda:0')
tensor([[ 0.4934,  1.7879, -1.0449,  ...,  0.1858, -0.0647,  1.1568],
        [ 0.7651,  0.2151,  0.5348,  ...,  0.0426,  0.1597, -0.7979],
        [-0.5104, -2.1184,  1.8868,  ...,  0.5799, -1.2474, -2.0368],
        ...,
        [-1.4793, -1.9976,  1.8858,  ...,  1.7437, -2.7514, -1.9721],
        [-0.5045, -0.2929,  0.6670,  ...,  0.8389, -1.2352, -0.5970],
        [-1.1852,  0.7069, -0.0048,  ...,  1.1345, -2.0187,  0.0693]],
       device='cuda:0', grad_fn=<

tensor([[ 0.6459,  1.1927, -1.0167,  ..., -0.7906,  1.1480,  1.6202],
        [ 0.4479,  1.8370, -1.2510,  ..., -1.0574,  1.3676,  1.1416],
        [-0.3404,  1.4156, -0.5737,  ..., -0.0209, -0.5265, -0.0256],
        ...,
        [ 2.8336,  0.8423, -0.2459,  ..., -1.3744,  2.8475,  1.1919],
        [ 2.4618, -0.1526,  0.2793,  ..., -0.1172,  1.7439,  0.3851],
        [ 1.1946, -2.5110,  2.0085,  ..., -0.7391,  1.2123, -2.0244]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([9, 1, 3,  ..., 8, 0, 8], device='cuda:0')
tensor([[ 0.1665,  1.4964, -1.3237,  ..., -0.3685,  0.0900,  0.5039],
        [ 1.1924, -0.1485,  0.2230,  ...,  0.3609,  0.2237, -0.4012],
        [-1.5092,  0.2926, -0.2096,  ...,  0.5046, -1.4805,  0.6649],
        ...,
        [-2.4211,  0.6040, -0.8743,  ...,  1.5117, -2.9860,  0.1152],
        [-2.0621,  0.8071, -0.9910,  ...,  0.3514, -1.2744,  0.7230],
        [ 1.5754,  0.5113, -0.2503,  ...,  0.0426,  1.2051,  0.0192]],
       device='cuda:0', grad_fn=<

tensor([[-0.7578,  1.9586, -1.5259,  ...,  0.4974, -0.6576,  2.6990],
        [-1.0318, -3.0072,  2.2637,  ...,  0.6951, -1.4173, -2.4031],
        [-1.9793, -0.0484, -0.0319,  ...,  0.5260, -1.9109,  0.3963],
        ...,
        [ 0.4665, -0.7233,  0.7484,  ...,  0.9739, -0.6848, -1.1773],
        [-1.0820, -1.0781,  0.9390,  ...,  0.8606, -1.3480, -0.6867],
        [-0.8019, -2.1091,  1.6822,  ...,  0.9206, -1.5948, -2.0173]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([1, 4, 6,  ..., 4, 4, 2], device='cuda:0')
tensor([[-0.2549, -2.4852,  1.8109,  ...,  0.2915, -0.4699, -2.8993],
        [ 1.6701, -0.4331,  0.6885,  ..., -0.1131,  0.5622, -0.5615],
        [ 0.4848, -0.3586,  0.8075,  ...,  0.1235, -0.4858, -1.1222],
        ...,
        [ 1.3239,  1.3346, -0.3605,  ..., -0.5228,  1.1727,  0.9697],
        [-1.4247, -1.5418,  1.2568,  ...,  0.8031, -1.7615, -1.3323],
        [ 1.9206, -0.9776,  1.1175,  ...,  1.1597,  0.4969, -0.7150]],
       device='cuda:0', grad_fn=<

tensor([[-1.4573, -0.6727,  0.7098,  ...,  0.0334, -1.4831, -1.3731],
        [ 2.4660,  2.0895, -1.3500,  ..., -2.3739,  3.9885,  2.2182],
        [ 1.0823,  0.9175, -0.8120,  ..., -0.9371,  2.1448,  0.9248],
        ...,
        [-1.5618, -2.3858,  1.6649,  ...,  0.0439, -1.2963, -2.5220],
        [ 2.9128, -1.9111,  1.1819,  ..., -0.1668,  1.8893, -1.6726],
        [ 2.1751,  1.6575, -0.9152,  ..., -1.2603,  2.5359,  1.1421]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([3, 8, 8,  ..., 1, 0, 0], device='cuda:0')
tensor([[-1.1443, -2.9926,  1.8000,  ...,  1.2069, -2.0029, -2.7000],
        [-1.1181, -2.7030,  1.3284,  ...,  1.5752, -1.5918, -3.2426],
        [-0.9807, -1.9327,  1.5692,  ...,  1.1415, -1.8484, -1.6546],
        ...,
        [-0.6990,  1.7611, -1.4940,  ...,  0.1809, -0.7342,  1.8470],
        [-1.3963, -0.9786,  0.2708,  ...,  1.1307, -1.9822, -1.8945],
        [-1.8734, -1.9109,  1.0995,  ...,  0.9028, -2.2701, -1.7869]],
       device='cuda:0', grad_fn=<

tensor([[ 1.5738,  3.5315, -2.0109,  ..., -1.3620,  2.3128,  1.6989],
        [ 0.5691,  1.2771, -1.3227,  ..., -0.4760,  1.0414,  2.7110],
        [ 0.3731,  1.8617, -1.6755,  ..., -0.3903,  1.0773,  2.2025],
        ...,
        [ 0.2217, -0.6400,  0.5870,  ..., -0.6093,  0.6544, -0.7327],
        [ 0.7087,  2.2930, -1.5349,  ..., -0.3219,  0.9296,  0.9985],
        [ 3.4398,  1.1537, -0.4560,  ..., -0.8461,  3.1429,  1.1662]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([1, 9, 8,  ..., 5, 7, 0], device='cuda:0')
tensor([[ 1.8184, -0.0258,  0.2540,  ..., -2.0513,  2.6591,  0.1265],
        [-2.0724,  0.6301, -0.4527,  ...,  0.6477, -2.1457,  0.4352],
        [-0.6065, -0.7569,  0.5360,  ..., -1.4865,  0.5755,  0.2136],
        ...,
        [-1.3763,  0.0110, -0.4964,  ...,  0.8291, -1.6307, -0.1639],
        [-1.9855,  0.8687, -0.8871,  ...,  0.5716, -2.0051, -0.3966],
        [-1.0410, -0.7849,  0.6886,  ...,  0.0906, -1.0414, -0.8249]],
       device='cuda:0', grad_fn=<

tensor([[-0.6264,  1.3457, -1.5545,  ...,  0.3216, -0.4440,  2.4250],
        [-1.0003, -3.5146,  2.3079,  ..., -0.1322, -0.9821, -2.5218],
        [-1.3041, -1.4611,  0.9049,  ...,  0.7563, -1.6870, -0.4031],
        ...,
        [ 1.2662,  1.0723, -0.9531,  ..., -1.1806,  2.1258,  2.1760],
        [-0.4423, -2.0532,  0.7379,  ...,  0.3597, -0.4237, -2.6225],
        [-0.9441, -5.4402,  3.5265,  ...,  0.7936, -1.8187, -3.7248]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([1, 2, 6,  ..., 9, 5, 2], device='cuda:0')
tensor([[-1.3031,  3.7464, -2.7689,  ..., -0.3340, -0.9394,  2.4159],
        [-1.8239, -4.6301,  3.2254,  ...,  0.7415, -2.4800, -2.9243],
        [-1.7281, -3.5181,  2.4305,  ...,  2.1848, -3.4732, -3.2413],
        ...,
        [ 0.8431,  3.6903, -2.4881,  ..., -0.4308,  0.0526,  2.2245],
        [-0.6577, -0.3816, -0.0773,  ..., -0.2062, -0.8671, -1.1246],
        [-0.6505,  1.7875, -1.9084,  ...,  1.1790, -1.9538,  2.0937]],
       device='cuda:0', grad_fn=<

tensor([[ 0.5385,  2.0081, -1.8567,  ..., -1.2023,  1.7248,  3.2448],
        [-0.2788,  2.4167, -1.8831,  ..., -1.2713,  0.6882,  2.9154],
        [-0.4604,  1.7131, -0.7920,  ..., -0.6320, -1.0615,  0.8268],
        ...,
        [-1.2365, -0.5981, -0.6813,  ...,  2.3631, -2.0634,  0.0197],
        [-0.5850, -1.6531,  0.8108,  ...,  2.2953, -1.8974, -0.0572],
        [ 1.6628,  0.6115, -0.0148,  ..., -1.6485,  1.9274, -1.0699]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([9, 1, 1,  ..., 3, 0, 0], device='cuda:0')
tensor([[ 2.4047,  1.5688, -0.8042,  ..., -2.1995,  2.6815,  1.5418],
        [ 4.2695, -0.5714,  1.6738,  ..., -0.9302,  3.5350, -0.9522],
        [-0.3082,  0.2716,  0.3644,  ...,  0.2239, -1.3216,  0.0162],
        ...,
        [-1.7479, -2.0340,  1.5487,  ...,  0.8953, -2.5292, -1.9168],
        [-0.4314, -2.8313,  2.1949,  ...,  0.2710, -0.8038, -1.9262],
        [-1.0666, -2.2397,  2.0038,  ...,  2.3160, -3.1443, -2.7637]],
       device='cuda:0', grad_fn=<

tensor([[-2.5946, -0.4332,  0.0985,  ...,  1.1860, -2.8811, -1.0336],
        [ 3.5039,  1.0640, -0.1887,  ..., -1.6166,  3.2712,  0.2780],
        [ 0.9171, -2.6419,  2.5559,  ..., -0.4682, -0.1589, -3.0541],
        ...,
        [ 1.6945, -0.4393,  0.3667,  ...,  1.9700,  0.0701, -0.1435],
        [-0.6368,  0.4374, -0.1265,  ...,  1.2403, -2.1716,  0.1144],
        [ 0.9412, -0.9962,  0.2561,  ..., -0.6902,  0.9188,  0.3615]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([3, 8, 0,  ..., 9, 5, 4], device='cuda:0')
tensor([[ 0.4887,  3.4510, -1.9689,  ..., -0.5436,  0.3961,  1.6789],
        [ 1.5575,  0.3419, -0.2392,  ..., -0.7157,  1.7807, -0.0584],
        [-1.3128, -1.4378,  1.2561,  ...,  0.0784, -1.4106, -1.0083],
        ...,
        [-0.9861, -0.3669,  0.2433,  ..., -0.9247, -0.7150, -0.3296],
        [-0.6124, -1.5202,  1.3466,  ...,  2.3594, -2.9135, -2.2159],
        [ 3.1860,  0.3889,  0.0565,  ..., -0.4558,  2.5973,  0.2156]],
       device='cuda:0', grad_fn=<

tensor([[-0.1688,  0.8103, -0.6063,  ..., -0.7168,  0.6636,  0.3570],
        [ 1.8084,  1.4063, -1.3733,  ...,  2.2835, -0.7086,  0.5846],
        [-2.3937, -1.2865,  1.0534,  ...,  1.1595, -3.3671, -0.8258],
        ...,
        [-2.4465, -2.4443,  1.4357,  ...,  2.1427, -3.0286, -2.3108],
        [-0.4257,  0.6857, -0.5347,  ..., -0.0332,  0.1136, -0.4583],
        [-0.6648, -1.7679,  0.9675,  ...,  2.5852, -1.9788, -0.7106]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([6, 9, 3, 9, 8, 7, 7, 1, 6, 5, 3, 1, 3, 1, 2, 7, 1, 8, 2, 0, 9, 7, 9, 8,
        8, 6, 7, 3, 7, 1, 3, 9, 0, 9, 3, 6, 7, 2, 7, 3, 0, 5, 9, 7, 5, 5, 0, 6,
        5, 1, 8, 2, 7, 5, 9, 0, 0, 0, 8, 8, 7, 3, 7, 8, 9, 3, 7, 9, 7, 8, 7, 9,
        8, 5, 4, 8, 3, 7, 6, 3, 8, 2, 1, 9, 5, 7, 3, 9, 5, 5, 8, 7, 3, 5, 3, 5,
        9, 7, 6, 7, 3, 6, 4, 3, 9, 4, 2, 1, 9, 6, 0, 2, 6, 7, 4, 7, 9, 0, 7, 4,
        3, 5, 3, 1, 1, 2, 6, 8, 2, 1, 7, 8, 5, 9, 6, 1, 1, 5, 0, 6, 0, 9, 2, 6,
        5, 8, 9, 5, 5, 6, 2, 9, 1, 5,

tensor([[ 3.8303,  2.8619, -1.3089,  ..., -1.9398,  4.0524,  0.4623],
        [-0.0070, -3.9079,  2.9519,  ...,  0.3676, -1.1157, -2.9919],
        [-1.6362, -1.0247,  0.1226,  ...,  1.6915, -2.3771, -0.8280],
        ...,
        [ 1.1408, -1.8225,  0.9830,  ..., -0.9117, -0.0149, -0.8084],
        [-0.0952,  1.7414, -0.4722,  ..., -1.8620,  0.0196,  0.9706],
        [ 1.7048,  0.3718, -0.1889,  ..., -2.1759,  2.3499,  0.5654]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([8, 2, 5,  ..., 4, 1, 8], device='cuda:0')
tensor([[ 0.3249,  0.4368, -0.8908,  ..., -2.1427,  1.7569,  2.1235],
        [-0.8452,  2.8918, -1.7114,  ...,  0.4043, -1.6184,  1.1627],
        [-2.1965,  0.1793, -0.0932,  ...,  0.8685, -3.0273,  0.3817],
        ...,
        [-0.2984,  0.6523, -0.1995,  ...,  1.1462, -1.5828,  0.3955],
        [ 3.5495,  3.5113, -2.2296,  ..., -3.6266,  4.8207,  3.0638],
        [ 0.1911, -0.6135,  0.6108,  ...,  1.4060, -1.1262, -2.1353]],
       device='cuda:0', grad_fn=<

tensor([[ 2.5520e+00,  4.5873e-01, -5.1161e-01,  ..., -1.6643e+00,
          2.5673e+00,  3.4198e-01],
        [-1.9755e+00, -1.6606e+00,  8.7594e-01,  ...,  1.5441e-01,
         -2.9203e+00, -6.0819e-01],
        [ 3.7047e+00, -2.5805e+00,  1.6216e+00,  ...,  5.8708e-01,
          1.3000e+00, -1.7345e+00],
        ...,
        [ 2.4024e+00, -5.5333e-01,  4.8378e-01,  ...,  1.1355e+00,
          1.1434e+00, -1.2732e-01],
        [-2.0660e+00, -7.1521e-01,  7.6860e-02,  ..., -5.8452e-01,
         -1.2876e+00,  4.2747e-01],
        [ 1.0002e+00, -5.3219e-01,  2.6171e-01,  ...,  1.9859e+00,
          2.6526e-03,  8.8252e-02]], device='cuda:0', grad_fn=<AddmmBackward>) tensor([8, 6, 0,  ..., 2, 6, 8], device='cuda:0')
tensor([[-1.9688e+00, -2.0130e+00,  1.1430e+00,  ...,  1.0106e-01,
         -2.3126e+00, -1.3989e+00],
        [ 1.7399e+00, -1.1290e+00,  4.5607e-01,  ..., -2.3685e+00,
          2.0503e+00, -1.6162e-01],
        [ 1.4212e+00, -3.0225e+00,  2.3722e+00,  ...,  1.0836e+00,
   

tensor([[-0.8583, -0.6293,  0.5333,  ..., -0.0711, -1.4057, -0.4166],
        [-1.3498, -1.4368,  0.9916,  ...,  0.7285, -2.3861, -1.4338],
        [-0.7212, -0.6860,  0.1216,  ...,  0.2674, -0.4412, -0.1098],
        ...,
        [ 0.1893, -1.3942,  0.5571,  ...,  0.6330, -1.2975, -0.8402],
        [-0.6531, -2.1307,  0.5918,  ...,  2.3989, -2.2195, -0.1383],
        [-0.5660, -1.1619,  0.9735,  ..., -0.8148, -0.2193, -1.4297]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([4, 6, 5,  ..., 6, 7, 6], device='cuda:0')
tensor([[ 2.6711,  0.9728, -0.7915,  ..., -4.1509,  3.6988,  1.4760],
        [-0.5959,  0.1003, -0.0770,  ...,  1.0712, -1.3796,  0.4194],
        [ 0.7092, -0.3045, -0.0445,  ..., -1.6755,  1.3491,  0.2534],
        ...,
        [ 4.2602, -0.2144,  0.9696,  ..., -1.0268,  2.7944, -0.7769],
        [-1.8834, -2.5117,  1.1992,  ...,  0.9658, -2.7707, -0.9411],
        [ 0.0602, -0.7783,  0.1950,  ...,  1.1004, -0.6077, -0.8150]],
       device='cuda:0', grad_fn=<

tensor([[-0.8780,  2.1407, -1.3322,  ..., -0.7954, -0.4680,  0.5794],
        [-2.7494, -1.6584,  0.3714,  ...,  1.5273, -2.7751, -0.9759],
        [ 0.2798,  3.7527, -2.1280,  ...,  0.5668, -0.5887,  1.6016],
        ...,
        [-0.8827, -0.3510, -0.3819,  ...,  0.8097, -0.8893, -0.7533],
        [-1.6671, -1.2565,  0.4530,  ...,  0.3522, -1.3860, -1.9288],
        [ 0.9623,  0.9033, -0.7408,  ..., -0.4748,  1.0028, -1.0756]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([8, 3, 1,  ..., 3, 3, 8], device='cuda:0')
tensor([[ 2.1097,  0.1168,  0.7083,  ..., -1.8362,  2.6610, -0.7933],
        [-1.6806, -1.2719,  0.6583,  ...,  0.7383, -2.5609, -0.8118],
        [ 1.2227,  1.0119, -1.1234,  ..., -0.1064,  1.5298,  1.4748],
        ...,
        [-1.3171, -1.5955,  0.4115,  ...,  2.5159, -1.9717, -1.5303],
        [ 0.1681, -2.4971,  2.2299,  ..., -0.9178,  0.1042, -2.2635],
        [ 0.4225, -2.7282,  2.0197,  ...,  1.5927, -1.4460, -2.0447]],
       device='cuda:0', grad_fn=<

tensor([[-2.2874, -3.6878,  2.2232,  ...,  1.5640, -3.0892, -2.6258],
        [-1.6495,  0.0496,  0.2410,  ..., -0.0140, -2.3130, -0.1041],
        [-0.8505, -1.0992,  0.3594,  ..., -0.9265, -0.3913, -1.3315],
        ...,
        [-1.1919, -2.7280,  1.8630,  ...,  1.3638, -2.4437, -2.0424],
        [-1.4391,  0.7052, -0.5901,  ...,  1.6119, -2.3534,  1.0440],
        [-1.2374, -1.0811,  0.7732,  ..., -0.7944, -1.1968, -0.4250]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([5, 6, 4,  ..., 2, 1, 9], device='cuda:0')
tensor([[ 0.2859,  1.9627, -1.8312,  ..., -1.2018,  0.5921,  2.0241],
        [-1.4659, -0.8340,  0.5678,  ..., -0.8435, -1.5628, -0.3492],
        [ 1.5323,  2.5184, -1.1252,  ..., -1.2102,  1.1257,  0.6235],
        ...,
        [ 3.9866,  4.0624, -2.7401,  ..., -4.2153,  5.4633,  3.4874],
        [ 1.5067,  0.0273,  0.0741,  ..., -0.0896,  0.7061, -0.5540],
        [-1.2288,  1.1950, -1.2307,  ..., -1.3755, -0.7789,  1.7141]],
       device='cuda:0', grad_fn=<

tensor([[-7.4068e-02,  1.0622e+00, -1.5554e-01,  ...,  8.0656e-01,
         -8.6658e-01, -4.4484e-03],
        [ 1.3937e+00, -7.6181e-01,  4.1021e-02,  ..., -8.6740e-01,
          4.6457e-01, -4.3354e-01],
        [-4.4799e-01, -2.5197e+00,  1.4135e+00,  ...,  6.1189e-01,
         -2.1069e+00, -9.4411e-01],
        ...,
        [-7.9655e-01, -3.1550e+00,  2.0098e+00,  ..., -3.0281e-01,
         -1.6984e+00, -2.3738e+00],
        [-1.1375e+00, -4.4260e+00,  2.3192e+00,  ...,  1.8703e+00,
         -3.1096e+00, -2.8170e+00],
        [ 1.6768e-01, -3.2425e+00,  2.4868e+00,  ..., -3.1484e-03,
         -5.3383e-01, -2.0874e+00]], device='cuda:0', grad_fn=<AddmmBackward>) tensor([7, 4, 3,  ..., 3, 5, 4], device='cuda:0')
tensor([[ 0.5827, -3.5833,  2.5508,  ...,  4.5128, -2.8841, -3.2278],
        [ 0.9709,  1.0980, -1.4145,  ...,  0.1222,  0.0483,  2.1052],
        [-0.9160, -3.1252,  2.6273,  ...,  0.8884, -1.9887, -2.8488],
        ...,
        [-1.2951, -1.9608,  0.3946,  ...,  1.0965, -2

tensor([[-1.7872, -1.4918, -0.1010,  ..., -1.0881, -2.1799, -0.5253],
        [-1.6112, -1.7857,  0.8761,  ...,  0.1659, -3.5574, -2.0091],
        [ 0.2304,  0.6902, -1.4364,  ...,  1.0068, -0.5463,  1.7132],
        ...,
        [-0.4611,  1.1913, -0.9442,  ...,  0.0875, -1.3074,  2.0627],
        [-1.3555, -1.3731,  0.3406,  ...,  0.6177, -0.8668, -1.2810],
        [-1.2135, -4.1297,  2.5415,  ...,  2.4501, -3.4301, -2.7407]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([6, 3, 9,  ..., 9, 2, 5], device='cuda:0')
tensor([[-1.5843,  0.4061, -0.5575,  ..., -1.1453, -1.9108,  0.3059],
        [-1.3180, -1.2871,  0.9033,  ..., -0.0470, -0.8970, -1.4159],
        [ 0.1557,  2.2332, -2.4207,  ..., -2.2088,  0.6692,  2.5057],
        ...,
        [-1.8613, -2.2769,  1.0386,  ..., -0.3019, -1.0919, -1.4025],
        [ 1.4187, -2.7698,  1.6390,  ...,  2.1073, -1.9832, -1.8322],
        [-1.0153, -2.4462,  1.9878,  ...,  3.4837, -3.2340, -1.7105]],
       device='cuda:0', grad_fn=<

tensor([[-1.6913, -0.6883,  0.8505,  ..., -1.5840, -1.1550, -1.2773],
        [ 2.9531,  2.9698, -1.5620,  ..., -3.4110,  4.7989,  2.7632],
        [ 1.4954,  2.2521, -0.9861,  ..., -2.5456,  3.0124,  2.0067],
        ...,
        [-1.0428, -1.6956,  1.4121,  ..., -1.1173, -0.7235, -2.3327],
        [ 3.4568, -0.7274,  0.5582,  ..., -0.8074,  1.9252, -1.1134],
        [ 2.2412,  2.3906, -0.6030,  ..., -2.6016,  2.6507,  1.5391]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([3, 8, 8,  ..., 1, 0, 0], device='cuda:0')
tensor([[-1.0028, -3.4929,  2.1565,  ...,  1.4031, -2.5937, -2.8894],
        [-1.8300, -3.9778,  1.7185,  ...,  3.5457, -3.0889, -2.8073],
        [-0.3135, -1.3973,  0.7689,  ...,  0.8503, -0.8948, -1.2638],
        ...,
        [-1.0499,  1.9988, -1.5243,  ..., -0.1497, -1.4477,  2.2988],
        [-1.4917, -0.1859, -0.5645,  ...,  0.6398, -1.8116, -0.9938],
        [-1.7276, -2.0037,  1.1960,  ...,  1.2008, -2.7053, -1.8869]],
       device='cuda:0', grad_fn=<

tensor([[ 1.2646,  1.8368, -1.1676,  ...,  0.0890,  1.8837,  1.6545],
        [-0.7530,  0.5934, -0.4694,  ..., -0.8401, -0.3059,  0.6505],
        [ 1.7265,  0.0370,  0.3393,  ..., -2.1334,  2.3822,  0.0166],
        ...,
        [-0.8503,  2.1727, -2.6082,  ..., -1.9184, -0.1812,  3.3834],
        [-1.9264, -3.1944,  1.7648,  ...,  1.0368, -2.7618, -2.5358],
        [ 0.1298,  1.4391, -1.7585,  ...,  0.4604,  0.1170,  2.6757]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([9, 8, 8,  ..., 9, 6, 1], device='cuda:0')
tensor([[-1.5638, -1.0071,  1.0913,  ..., -0.8479, -1.1880, -1.0577],
        [ 0.7699,  4.0343, -1.9019,  ..., -1.7217,  1.1369,  2.1163],
        [-0.9041,  1.0243, -0.5356,  ..., -0.6565, -1.0419,  0.1611],
        ...,
        [ 1.9787,  0.2392, -0.5130,  ..., -0.4603,  1.6133,  0.7647],
        [-1.2206,  2.5590, -1.9043,  ..., -0.3113, -1.1213,  3.1808],
        [-1.0029, -1.5092,  0.7554,  ...,  0.0548, -0.9757, -0.2707]],
       device='cuda:0', grad_fn=<

tensor([[ 0.3578,  1.7730, -0.6561,  ...,  2.5981, -1.4223,  1.0252],
        [-1.7251, -2.2883,  1.1796,  ...,  0.9278, -2.7218, -1.8564],
        [-1.1662, -2.3771,  1.7911,  ...,  1.5597, -3.2691, -1.7483],
        ...,
        [-1.0362, -4.1395,  2.4238,  ...,  3.3230, -3.1932, -1.8082],
        [-1.6762, -0.3158, -0.2108,  ...,  0.3633, -1.9504,  1.1250],
        [-0.0622, -2.2887,  0.9241,  ...,  0.3259, -0.0880, -0.5041]],
       device='cuda:0', grad_fn=<AddmmBackward>) tensor([1, 2, 5,  ..., 7, 9, 4], device='cuda:0')
tensor([[-1.9159, -5.0065,  2.5139,  ...,  2.4085, -2.7876, -3.6940],
        [-1.0167, -4.7298,  3.1753,  ...,  2.2172, -3.3642, -2.7002],
        [-1.3518,  0.5255, -1.6010,  ..., -0.6012, -0.4346,  2.9205],
        ...,
        [-0.2485,  2.8394, -2.5602,  ..., -0.2196,  0.1604,  3.7828],
        [ 0.4078, -3.6399,  2.7348,  ..., -0.2023, -1.0491, -2.0530],
        [-1.6617, -1.3896,  1.0282,  ...,  0.6303, -1.7958, -1.5917]],
       device='cuda:0', grad_fn=<

KeyboardInterrupt: 

Построим график функции потерь

In [None]:
# Draw the recorded train and test loss histories on one shared figure;
# the x axis is simply the position of each value in its list.
plt.figure(figsize=(12, 6))
for loss_curve, curve_label in ((train_l, "train"), (test_l, "test")):
    steps = range(len(loss_curve))
    plt.plot(steps, loss_curve, label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.legend()
plt.tight_layout()
plt.show()

И, наконец, посчитаем метрики

In [None]:
# Evaluate the model on the test loader: overall accuracy plus per-class
# precision and recall for the 10 classes.
# NOTE(review): the model is used as-is; if it contains dropout or batch-norm
# layers, confirm it has been switched to eval mode before running this cell.
num_classes = 10
# confusion[r, p] = number of samples whose real class is r and whose
# predicted class is p.
confusion = np.zeros((num_classes, num_classes))
accuracy = 0
ctn = 0
for X, y in test_loader:
    X = X.to(device)
    y = y.to(device)
    with torch.no_grad():
        y_pred = model(X).max(dim=1)[1]
    # Encode every (real, predicted) pair as a single flat index and count all
    # pairs with one bincount call instead of looping over samples in Python
    # once per class.
    pair_index = (y * num_classes + y_pred).cpu()
    pair_counts = torch.bincount(pair_index, minlength=num_classes * num_classes)
    confusion += pair_counts.reshape(num_classes, num_classes).numpy()

    accuracy += torch.sum(y_pred == y).item()
    ctn += len(y)

# Derive the one-vs-rest counts for each class from the confusion matrix.
true_positive = np.diag(confusion).copy()
false_positive = confusion.sum(axis=0) - true_positive
false_negative = confusion.sum(axis=1) - true_positive
true_negative = confusion.sum() - true_positive - false_positive - false_negative

predicted_total = true_positive + false_positive
real_total = true_positive + false_negative
# A class that was never predicted (or never occurs) would give 0/0 = NaN and
# turn the mean into NaN as well; report 0 for such classes instead.
precision = np.divide(
    true_positive,
    predicted_total,
    out=np.zeros(num_classes),
    where=predicted_total > 0,
)
recall = np.divide(
    true_positive,
    real_total,
    out=np.zeros(num_classes),
    where=real_total > 0,
)

print("Overall accuracy", accuracy / ctn)
print("Precision", precision)
print("Recall", recall)
print("Mean Precision", np.mean(precision))
print("Mean Recall", np.mean(recall))