# PyTorch and Neural Networks 2

[YuJa recording of lecture](https://uci.yuja.com/V/Video?v=4417722&node=14870050&a=1646074732&autoplay=1)

Topics mentioned at the board (not in this notebook):
* Importance of using activation functions to break linearity.
* Common choices of activation functions: sigmoid and relu.
* Concept of *one hot encoding*.

In [None]:
from tqdm.std import tqdm, trange
from tqdm import notebook
notebook.tqdm = tqdm
notebook.trange = trange

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

In [None]:
# Load the data
training_data = datasets.MNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

test_data = datasets.MNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

Second YouTube video on *Neural Networks* from 3Blue1Brown.  This video is on *gradient descent*.  Recommended clips:
* 0:25-1:24
* 3:18-4:05
* 5:15-7:50

<iframe width="560" height="315" src="https://www.youtube.com/embed/IHZwWFHWa-w" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe>

This is what we finished with on Monday:

In [None]:
class ThreeBlue(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.layers = nn.Sequential(
            nn.Linear(784,10)
        )

    def forward(self,x):
        y = self.flatten(x)
        z = self.layers(y)
        return z

We instantiate an object in this class as follows.

In [1]:
wed = ThreeBlue()

NameError: name 'ThreeBlue' is not defined

In [None]:
class ThreeBlue(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.layers = nn.Sequential(
            nn.Linear(784,16),
            nn.Sigmoid(),
            nn.Linear(16,16),
            nn.Sigmoid(),
            nn.Linear(16,10),
            nn.Sigmoid()
        )

    def forward(self,x):
        x = x/255
        y = self.flatten(x)
        z = self.layers(y)
        return z

In [None]:
wed = ThreeBlue()

In [None]:
for p in wed.parameters():
    print(p.shape)

torch.Size([16, 784])
torch.Size([16])
torch.Size([16, 16])
torch.Size([16])
torch.Size([10, 16])
torch.Size([10])


In [None]:
for p in wed.parameters():
    print(p.numel())

12544
16
256
16
160
10


In [None]:
sum([p.numel() for p in wed.parameters()])

13002

In [None]:
sum(p.numel() for p in wed.parameters())

13002

In [None]:
wed

ThreeBlue(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=16, bias=True)
    (1): Linear(in_features=16, out_features=16, bias=True)
    (2): Linear(in_features=16, out_features=10, bias=True)
    (3): Sigmoid()
  )
)

In [None]:
wed.layers

Sequential(
  (0): Linear(in_features=784, out_features=16, bias=True)
  (1): Linear(in_features=16, out_features=16, bias=True)
  (2): Linear(in_features=16, out_features=10, bias=True)
  (3): Sigmoid()
)

In [None]:
wed.layers[2].weight.shape

torch.Size([10, 16])

In [None]:
wed.layers[2].bias.shape

torch.Size([10])

In [None]:
wed(training_data.data.to(torch.float))

tensor([[-55.2283, -56.4610,  54.5960,  ...,  -3.8436,  57.0099,  69.1157],
        [-33.1065, -24.6902,  22.6003,  ...,  10.9671,  21.9622,  97.5362],
        [-61.3301,  -8.5647, -27.5800,  ..., -41.4074,   2.2047,  66.7116],
        ...,
        [-40.8012, -53.5293, -59.4709,  ..., -31.5671, -49.3599,  82.7872],
        [ 10.5128,  -4.7770,  14.8668,  ...,   5.6984,  58.1751,   7.1557],
        [ -7.9452, -10.8654, -14.2437,  ...,  27.7169,  49.4617,  40.3107]],
       grad_fn=<AddmmBackward0>)

In [None]:
wed(training_data.data/255)

tensor([[-0.1932, -0.2259,  0.2297,  ..., -0.0067,  0.2211,  0.2962],
        [-0.1065, -0.1013,  0.1042,  ...,  0.0514,  0.0836,  0.4076],
        [-0.2171, -0.0381, -0.0926,  ..., -0.1540,  0.0061,  0.2868],
        ...,
        [-0.1366, -0.2144, -0.2176,  ..., -0.1154, -0.1961,  0.3498],
        [ 0.0646, -0.0232,  0.0739,  ...,  0.0307,  0.2256,  0.0532],
        [-0.0078, -0.0471, -0.0403,  ...,  0.1171,  0.1915,  0.1832]],
       grad_fn=<AddmmBackward0>)

In [None]:
training_data.data.shape

torch.Size([60000, 28, 28])

In [None]:
(training_data.data/255).shape

torch.Size([60000, 28, 28])

In [None]:
wed(training_data.data/255).shape

torch.Size([60000, 10])

In [None]:
wed(training_data.data)[:3]

tensor([[0.5288, 0.5101, 0.4396, 0.4913, 0.4751, 0.5183, 0.4840, 0.4438, 0.4579,
         0.5159],
        [0.5285, 0.4803, 0.4635, 0.4609, 0.3703, 0.4872, 0.4798, 0.4920, 0.4342,
         0.5217],
        [0.4437, 0.4366, 0.4977, 0.4573, 0.5392, 0.5174, 0.5090, 0.5097, 0.3944,
         0.5027]], grad_fn=<SliceBackward0>)

In [None]:
y_pred = wed(training_data.data)

In [None]:
training_data.targets[:3]

tensor([5, 0, 4])

In [None]:
from torch.nn.functional import one_hot

In [None]:
one_hot(training_data.targets[:3], num_classes=10).to(torch.float)

tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

In [None]:
y_true = one_hot(training_data.targets, num_classes=10).to(torch.float)

In [None]:
y_true.shape

torch.Size([60000, 10])

In [None]:
loss_fn = nn.MSELoss()

In [None]:
loss_fn(y_pred, y_true)

tensor(0.3006, grad_fn=<MseLossBackward0>)

In [None]:
optimizer = torch.optim.SGD(wed.parameters(), lr=0.1)

In [None]:
for p in wed.parameters():
    print(p.grad)

None
None
None
None
None
None


In [None]:
loss = loss_fn(y_pred, y_true)

In [None]:
for p in wed.parameters():
    print(p.grad)

None
None
None
None
None
None


In [None]:
loss.backward()

In [None]:
for p in wed.parameters():
    print(p.grad)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([ 3.1897e-04, -8.1853e-04, -1.6588e-04, -1.8961e-04, -8.2379e-05,
         8.7279e-04, -2.6662e-04,  1.0267e-04,  3.3947e-04,  5.7145e-04,
         5.8615e-04, -6.6343e-05,  1.0942e-03,  4.5065e-04,  4.0523e-04,
        -7.4472e-05])
tensor([[ 0.0005,  0.0004,  0.0004,  0.0005,  0.0004,  0.0004,  0.0004,  0.0005,
          0.0004,  0.0004,  0.0004,  0.0005,  0.0004,  0.0004,  0.0004,  0.0004],
        [-0.0014, -0.0014, -0.0014, -0.0015, -0.0013, -0.0013, -0.0013, -0.0015,
         -0.0014, -0.0014, -0.0014, -0.0016, -0.0013, -0.0012, -0.0015, -0.0014],
        [ 0.0006,  0.0006,  0.0006,  0.0007,  0.0005,  0.0006,  0.0006,  0.0006,
          0.0006,  0.0006,  0.0006,  0.0007,  0.0006,  0.0005,  0.0006,  0.0006],
        [ 0.0038

In [None]:
optimizer.step()

In [None]:
wed(training_data.data)[:3]

tensor([[0.5898, 0.5361, 0.5328, 0.7045, 0.4790, 0.4328, 0.5631, 0.4541, 0.4985,
         0.6916],
        [0.5903, 0.5355, 0.5326, 0.7043, 0.4787, 0.4321, 0.5617, 0.4532, 0.4974,
         0.6917],
        [0.5899, 0.5350, 0.5340, 0.7043, 0.4788, 0.4327, 0.5630, 0.4531, 0.4975,
         0.6917]], grad_fn=<SliceBackward0>)

In [None]:
epochs = 10

for i in range(epochs):
    y_true = one_hot(training_data.targets, num_classes=10).to(torch.float)
    y_pred = wed(training_data.data)
    loss = loss_fn(y_true,y_pred)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    print(loss)

tensor(0.2980, grad_fn=<MseLossBackward0>)
tensor(0.2955, grad_fn=<MseLossBackward0>)
tensor(0.2930, grad_fn=<MseLossBackward0>)
tensor(0.2905, grad_fn=<MseLossBackward0>)
tensor(0.2881, grad_fn=<MseLossBackward0>)
tensor(0.2856, grad_fn=<MseLossBackward0>)
tensor(0.2832, grad_fn=<MseLossBackward0>)
tensor(0.2808, grad_fn=<MseLossBackward0>)
tensor(0.2785, grad_fn=<MseLossBackward0>)
tensor(0.2762, grad_fn=<MseLossBackward0>)


In [None]:
epochs = 100

for i in range(epochs):
    y_true = one_hot(training_data.targets, num_classes=10).to(torch.float)
    y_pred = wed(training_data.data)
    loss = loss_fn(y_true,y_pred)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i%2 == 0:
        print(loss)

tensor(0.1072, grad_fn=<MseLossBackward0>)
tensor(0.1069, grad_fn=<MseLossBackward0>)
tensor(0.1066, grad_fn=<MseLossBackward0>)
tensor(0.1064, grad_fn=<MseLossBackward0>)
tensor(0.1061, grad_fn=<MseLossBackward0>)
tensor(0.1058, grad_fn=<MseLossBackward0>)
tensor(0.1056, grad_fn=<MseLossBackward0>)
tensor(0.1053, grad_fn=<MseLossBackward0>)
tensor(0.1050, grad_fn=<MseLossBackward0>)
tensor(0.1048, grad_fn=<MseLossBackward0>)
tensor(0.1045, grad_fn=<MseLossBackward0>)
tensor(0.1043, grad_fn=<MseLossBackward0>)
tensor(0.1041, grad_fn=<MseLossBackward0>)
tensor(0.1038, grad_fn=<MseLossBackward0>)
tensor(0.1036, grad_fn=<MseLossBackward0>)
tensor(0.1034, grad_fn=<MseLossBackward0>)
tensor(0.1032, grad_fn=<MseLossBackward0>)
tensor(0.1030, grad_fn=<MseLossBackward0>)
tensor(0.1028, grad_fn=<MseLossBackward0>)
tensor(0.1026, grad_fn=<MseLossBackward0>)
tensor(0.1024, grad_fn=<MseLossBackward0>)
tensor(0.1022, grad_fn=<MseLossBackward0>)
tensor(0.1020, grad_fn=<MseLossBackward0>)
tensor(0.10

In [None]:
optimizer = torch.optim.SGD(wed.parameters(), lr=5000)

In [None]:
wed = ThreeBlue()

In [None]:
epochs = 100

for i in range(epochs):
    y_true = one_hot(training_data.targets, num_classes=10).to(torch.float)
    y_pred = wed(training_data.data)
    loss = loss_fn(y_true,y_pred)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    if i%2 == 0:
        print(loss)

tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.2433, grad_fn=<MseLossBackward0>)
tensor(0.24

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=7aba66cf-2a09-47de-a62a-78713e9a79bd' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>