In [1]:
# Import the plain-text tqdm implementation and install it in place of the
# notebook versions, so anything that looks up `tqdm.notebook.tqdm` or
# `tqdm.notebook.trange` after this point gets the console progress bar.
# NOTE(review): presumably done so the dataset download progress bar does not
# need the notebook widget backend — confirm.
from tqdm.std import tqdm, trange
from tqdm import notebook
notebook.tqdm = tqdm
notebook.trange = trange

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
from torchvision import datasets
from torchvision.transforms import ToTensor
from torch.utils.data import DataLoader

In [2]:
# Load the data: the MNIST train and test splits, each with its own ToTensor()
# transform. Both splits use the same storage root and download flag, so those
# two arguments are written once.
mnist_common = {"root": "data", "download": True}

training_data = datasets.MNIST(train=True, transform=ToTensor(), **mnist_common)
test_data = datasets.MNIST(train=False, transform=ToTensor(), **mnist_common)

In [3]:
class ThreeBlue(nn.Module):
    """Single-layer baseline: flatten the input, then apply one Linear(784, 10).

    No activation follows the linear layer, so ``forward`` returns its raw
    outputs.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Wrapped in a Sequential so the layer stays reachable as `self.layers[0]`.
        only_layer = nn.Linear(784, 10)
        self.layers = nn.Sequential(only_layer)

    def forward(self, x):
        """Flatten ``x`` and return 10 values per sample."""
        return self.layers(self.flatten(x))

In [4]:
# Instantiate the single-layer (784 -> 10) model defined in the cell above.
# `wed` is rebound further down once ThreeBlue is redefined.
wed = ThreeBlue()

In [5]:
class ThreeBlue(nn.Module):
    """Small fully connected classifier: 784 -> 16 -> 16 -> 10.

    Every Linear layer, including the last, is followed by a Sigmoid, so each
    of the 10 outputs lies strictly between 0 and 1.

    ``forward`` divides its input by 255 before flattening, so it expects raw
    0-255 pixel intensities. Inputs that were already rescaled (for example by
    the ``ToTensor()`` transform attached to the datasets) would be scaled a
    second time.
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

        # Consecutive pairs of widths give each Linear layer's (in, out) sizes.
        widths = (784, 16, 16, 10)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.Sigmoid())
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Scale ``x`` by 1/255, flatten it, and return 10 sigmoid outputs per sample."""
        scaled = x / 255
        return self.layers(self.flatten(scaled))

In [6]:
# Rebind `wed` to an instance of the redefined ThreeBlue (three Linear layers).
wed = ThreeBlue()

In [7]:
# Show the shape of every learnable tensor: a weight matrix followed by a
# bias vector for each Linear layer.
for p in wed.parameters():
    print(p.shape)

torch.Size([16, 784])
torch.Size([16])
torch.Size([16, 16])
torch.Size([16])
torch.Size([10, 16])
torch.Size([10])


In [8]:
# Number of scalar values held by each parameter tensor.
for p in wed.parameters():
    print(p.numel())

12544
16
256
16
160
10


In [9]:
# Total number of learnable scalars in the model (list-comprehension form).
sum([p.numel() for p in wed.parameters()])

13002

In [10]:
# Same total, written as a generator expression so no intermediate list is built.
sum(p.numel() for p in wed.parameters())

13002

In [11]:
# Display the module's repr: the flatten step plus the layer stack.
wed

ThreeBlue(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (layers): Sequential(
    (0): Linear(in_features=784, out_features=16, bias=True)
    (1): Sigmoid()
    (2): Linear(in_features=16, out_features=16, bias=True)
    (3): Sigmoid()
    (4): Linear(in_features=16, out_features=10, bias=True)
    (5): Sigmoid()
  )
)

In [12]:
# The nn.Sequential container on its own.
wed.layers

Sequential(
  (0): Linear(in_features=784, out_features=16, bias=True)
  (1): Sigmoid()
  (2): Linear(in_features=16, out_features=16, bias=True)
  (3): Sigmoid()
  (4): Linear(in_features=16, out_features=10, bias=True)
  (5): Sigmoid()
)

In [13]:
# Sequential supports integer indexing; index 2 is the middle Linear(16, 16).
wed.layers[2]

Linear(in_features=16, out_features=16, bias=True)

In [14]:
# Weight matrix of the middle layer, laid out as (out_features, in_features).
wed.layers[2].weight.shape

torch.Size([16, 16])

In [15]:
# Bias vector of the middle layer: one value per output feature.
wed.layers[2].bias.shape

torch.Size([16])

In [16]:
# Forward pass of the untrained model over the whole training set, showing
# only the first three rows. The model's forward() does the /255 scaling.
# NOTE(review): `training_data.data` is presumably the raw stored image tensor
# that bypasses the ToTensor() transform — confirm.
wed(training_data.data)[:3]

tensor([[0.5944, 0.4146, 0.3592, 0.4494, 0.4882, 0.4373, 0.4342, 0.6071, 0.4790,
         0.4979],
        [0.5941, 0.4140, 0.3608, 0.4500, 0.4895, 0.4370, 0.4352, 0.6085, 0.4780,
         0.4987],
        [0.5956, 0.4160, 0.3602, 0.4501, 0.4876, 0.4370, 0.4341, 0.6052, 0.4817,
         0.4979]], grad_fn=<SliceBackward0>)

In [17]:
# Keep the full-batch predictions; the loss and backward() cells below use them.
y_pred = wed(training_data.data)

In [18]:
# First three labels, stored as integer class indices.
training_data.targets[:3]

tensor([5, 0, 4])

In [19]:
from torch.nn.functional import one_hot

In [20]:
# Turn class indices into one-hot rows of length 10, cast to float so they can
# be compared with the model's floating-point outputs.
one_hot(training_data.targets[:3], num_classes=10).to(torch.float)

tensor([[0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])

In [21]:
# One-hot float targets for the entire training set.
y_true = one_hot(training_data.targets, num_classes=10).to(torch.float)

In [22]:
# Sanity check: one row per training sample, one column per class.
y_true.shape

torch.Size([60000, 10])

In [23]:
# Mean-squared-error criterion used to compare predictions with one-hot targets.
loss_fn = nn.MSELoss()

In [24]:
# Loss of the untrained model (prediction first, target second).
loss_fn(y_pred, y_true)

tensor(0.2370, grad_fn=<MseLossBackward0>)

In [25]:
# Plain SGD over all of the model's parameters with learning rate 0.1.
optimizer = torch.optim.SGD(wed.parameters(), lr=0.1)

In [26]:
# Nothing has been backpropagated yet, so every .grad is still None.
for p in wed.parameters():
    print(p.grad)

None
None
None
None
None
None


In [27]:
# Computing the loss alone does not populate .grad (checked in the next cell).
loss = loss_fn(y_pred, y_true)

In [28]:
# Still None: gradients only appear once backward() has been called.
for p in wed.parameters():
    print(p.grad)

None
None
None
None
None
None


In [29]:
# Backpropagate the loss; this fills .grad on the model's parameters.
loss.backward()

In [30]:
# After backward(), each parameter carries a gradient tensor of its own shape.
for p in wed.parameters():
    print(p.grad)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]])
tensor([ 1.9265e-04,  3.3771e-04, -2.6521e-04, -4.0215e-04,  3.1495e-04,
        -5.7406e-05,  5.0352e-04, -3.9258e-04, -7.1039e-05,  3.1419e-05,
         4.4699e-05,  3.7495e-05, -1.6951e-04, -3.8638e-04,  4.2303e-04,
         2.2844e-04])
tensor([[ 0.0002,  0.0002,  0.0002,  0.0002,  0.0002,  0.0002,  0.0002,  0.0002,
          0.0002,  0.0002,  0.0002,  0.0002,  0.0002,  0.0002,  0.0002,  0.0002],
        [ 0.0011,  0.0014,  0.0011,  0.0011,  0.0011,  0.0012,  0.0012,  0.0011,
          0.0012,  0.0011,  0.0012,  0.0011,  0.0012,  0.0012,  0.0011,  0.0012],
        [ 0.0003,  0.0003,  0.0003,  0.0003,  0.0003,  0.0003,  0.0003,  0.0003,
          0.0003,  0.0003,  0.0003,  0.0003,  0.0003,  0.0003,  0.0003,  0.0003],
        [-0.0004

In [31]:
# Apply one SGD update using the gradients computed by backward() above.
optimizer.step()

In [32]:
# Re-run the forward pass to see how the first three outputs moved after one step.
wed(training_data.data)[:3]

tensor([[0.5912, 0.4126, 0.3577, 0.4470, 0.4855, 0.4350, 0.4320, 0.6040, 0.4763,
         0.4952],
        [0.5909, 0.4120, 0.3594, 0.4476, 0.4868, 0.4348, 0.4330, 0.6054, 0.4754,
         0.4960],
        [0.5924, 0.4140, 0.3587, 0.4477, 0.4849, 0.4347, 0.4319, 0.6020, 0.4791,
         0.4952]], grad_fn=<SliceBackward0>)

In [33]:
epochs = 10

# The one-hot targets do not change between epochs, so build them once
# instead of on every iteration.
y_true = one_hot(training_data.targets, num_classes=10).to(torch.float)

# Full-batch gradient descent: each iteration uses the entire training set.
for i in range(epochs):
    y_pred = wed(training_data.data)
    # nn.MSELoss expects (input, target): prediction first, matching the
    # earlier loss cells. MSE is symmetric, so the value is unchanged.
    loss = loss_fn(y_pred, y_true)
    # Clear gradients left over from the previous backward() before accumulating new ones.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # The printed loss was computed before this iteration's parameter update.
    print(loss)

tensor(0.2351, grad_fn=<MseLossBackward0>)
tensor(0.2332, grad_fn=<MseLossBackward0>)


tensor(0.2313, grad_fn=<MseLossBackward0>)
tensor(0.2294, grad_fn=<MseLossBackward0>)


tensor(0.2276, grad_fn=<MseLossBackward0>)
tensor(0.2258, grad_fn=<MseLossBackward0>)


tensor(0.2240, grad_fn=<MseLossBackward0>)
tensor(0.2222, grad_fn=<MseLossBackward0>)


tensor(0.2205, grad_fn=<MseLossBackward0>)
tensor(0.2188, grad_fn=<MseLossBackward0>)


In [34]:
epochs = 100

# The one-hot targets do not change between epochs, so build them once
# instead of on every iteration.
y_true = one_hot(training_data.targets, num_classes=10).to(torch.float)

# Full-batch gradient descent, continuing from the current weights of `wed`.
for i in range(epochs):
    y_pred = wed(training_data.data)
    # nn.MSELoss expects (input, target): prediction first, matching the
    # earlier loss cells. MSE is symmetric, so the value is unchanged.
    loss = loss_fn(y_pred, y_true)
    # Clear gradients left over from the previous backward() before accumulating new ones.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # Report every second epoch to keep the output short.
    if i%2 == 0:
        print(loss)

tensor(0.2171, grad_fn=<MseLossBackward0>)


tensor(0.2138, grad_fn=<MseLossBackward0>)


tensor(0.2105, grad_fn=<MseLossBackward0>)


tensor(0.2074, grad_fn=<MseLossBackward0>)


tensor(0.2043, grad_fn=<MseLossBackward0>)


tensor(0.2013, grad_fn=<MseLossBackward0>)


tensor(0.1984, grad_fn=<MseLossBackward0>)


tensor(0.1956, grad_fn=<MseLossBackward0>)


tensor(0.1929, grad_fn=<MseLossBackward0>)


tensor(0.1902, grad_fn=<MseLossBackward0>)


tensor(0.1876, grad_fn=<MseLossBackward0>)


tensor(0.1851, grad_fn=<MseLossBackward0>)


tensor(0.1826, grad_fn=<MseLossBackward0>)


tensor(0.1803, grad_fn=<MseLossBackward0>)


tensor(0.1780, grad_fn=<MseLossBackward0>)


tensor(0.1757, grad_fn=<MseLossBackward0>)


tensor(0.1735, grad_fn=<MseLossBackward0>)


tensor(0.1714, grad_fn=<MseLossBackward0>)


tensor(0.1694, grad_fn=<MseLossBackward0>)


tensor(0.1674, grad_fn=<MseLossBackward0>)


tensor(0.1654, grad_fn=<MseLossBackward0>)


tensor(0.1635, grad_fn=<MseLossBackward0>)


tensor(0.1617, grad_fn=<MseLossBackward0>)


tensor(0.1599, grad_fn=<MseLossBackward0>)


tensor(0.1582, grad_fn=<MseLossBackward0>)


tensor(0.1565, grad_fn=<MseLossBackward0>)


tensor(0.1549, grad_fn=<MseLossBackward0>)


tensor(0.1533, grad_fn=<MseLossBackward0>)


tensor(0.1518, grad_fn=<MseLossBackward0>)


tensor(0.1503, grad_fn=<MseLossBackward0>)


tensor(0.1489, grad_fn=<MseLossBackward0>)


tensor(0.1475, grad_fn=<MseLossBackward0>)


tensor(0.1461, grad_fn=<MseLossBackward0>)


tensor(0.1448, grad_fn=<MseLossBackward0>)


tensor(0.1435, grad_fn=<MseLossBackward0>)


tensor(0.1423, grad_fn=<MseLossBackward0>)


tensor(0.1410, grad_fn=<MseLossBackward0>)


tensor(0.1399, grad_fn=<MseLossBackward0>)


tensor(0.1387, grad_fn=<MseLossBackward0>)


tensor(0.1376, grad_fn=<MseLossBackward0>)


tensor(0.1365, grad_fn=<MseLossBackward0>)


tensor(0.1355, grad_fn=<MseLossBackward0>)


tensor(0.1345, grad_fn=<MseLossBackward0>)


tensor(0.1335, grad_fn=<MseLossBackward0>)


tensor(0.1325, grad_fn=<MseLossBackward0>)


tensor(0.1316, grad_fn=<MseLossBackward0>)


tensor(0.1306, grad_fn=<MseLossBackward0>)


tensor(0.1298, grad_fn=<MseLossBackward0>)


tensor(0.1289, grad_fn=<MseLossBackward0>)


tensor(0.1281, grad_fn=<MseLossBackward0>)


In [35]:
# Start over with a freshly initialised model; the trained weights are discarded.
wed = ThreeBlue()

In [36]:
# New optimizer bound to the new model's parameters, with a learning rate of
# 500 instead of the 0.1 used earlier.
# NOTE(review): presumably a deliberate demonstration of an oversized step
# size — confirm intent.
optimizer = torch.optim.SGD(wed.parameters(), lr=500)

In [37]:
epochs = 10

# The one-hot targets do not change between epochs, so build them once
# instead of on every iteration.
y_true = one_hot(training_data.targets, num_classes=10).to(torch.float)

# Full-batch gradient descent on the re-initialised model with the current optimizer.
for i in range(epochs):
    y_pred = wed(training_data.data)
    # nn.MSELoss expects (input, target): prediction first, matching the
    # earlier loss cells. MSE is symmetric, so the value is unchanged.
    loss = loss_fn(y_pred, y_true)
    # Clear gradients left over from the previous backward() before accumulating new ones.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    # The printed loss was computed before this iteration's parameter update.
    print(loss)

tensor(0.2423, grad_fn=<MseLossBackward0>)
tensor(0.1000, grad_fn=<MseLossBackward0>)


tensor(0.1000, grad_fn=<MseLossBackward0>)
tensor(0.1000, grad_fn=<MseLossBackward0>)


tensor(0.1000, grad_fn=<MseLossBackward0>)
tensor(0.1000, grad_fn=<MseLossBackward0>)


tensor(0.1000, grad_fn=<MseLossBackward0>)
tensor(0.1000, grad_fn=<MseLossBackward0>)


tensor(0.1000, grad_fn=<MseLossBackward0>)
tensor(0.1000, grad_fn=<MseLossBackward0>)
