In [70]:
import torch
from torch import nn
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [24]:
# Fix PyTorch's global RNG so that weight initialisation is repeatable across runs.
torch.manual_seed(seed=0)

<torch._C.Generator at 0x1747e3843b0>

In [6]:
# Pick the best available backend: CUDA GPU first, then Apple-silicon MPS, else CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cpu device


In [7]:
# MNIST training split, stored under the relative "data" directory (download requested);
# every image is passed through ToTensor() before being handed to the model.
training_data = datasets.MNIST("data", train=True, transform=ToTensor(), download=True)

In [8]:
# MNIST held-out split (train=False), same storage directory and transform as the training split.
test_data = datasets.MNIST("data", train=False, transform=ToTensor(), download=True)

In [44]:
batch_size = 64

# Wrap both splits in loaders that yield mini-batches of `batch_size` samples.
train_dataloader = DataLoader(dataset=training_data, batch_size=batch_size)
test_dataloader = DataLoader(dataset=test_data, batch_size=batch_size)

# Peek at the first test batch only, to confirm the tensor shapes and the label dtype.
for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

Shape of X [N, C, H, W]: torch.Size([64, 1, 28, 28])
Shape of y: torch.Size([64]) torch.int64


In [9]:
# Define model
class NeuralNetwork(nn.Module):
    """Single-hidden-layer perceptron producing raw class logits.

    The input is flattened from dimension 1 onward, passed through
    ``Linear -> ReLU -> Linear``, and returned without a softmax (the
    notebook pairs it with ``nn.CrossEntropyLoss``, which expects logits).

    Args:
        in_features: Number of values per sample after flattening.
            Defaults to ``28 * 28``, one 28x28 single-channel image.
        hidden_features: Width of the hidden layer. Defaults to 18.
        num_classes: Number of output logits. Defaults to 10.
    """

    def __init__(self, in_features=28 * 28, hidden_features=18, num_classes=10):
        super().__init__()
        self.flatten = nn.Flatten()
        # Attribute name and layer order are kept so state-dict keys
        # ("linear_relu_stack.0.weight", ...) stay the same as before.
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    def forward(self, x):
        """Return logits of shape ``(batch, num_classes)`` for a batch ``x``."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

In [10]:
def train(dataloader, model, loss_fn, optimizer):
    """Run one pass over ``dataloader``, updating ``model`` after every batch.

    Each batch is moved to the notebook-level ``device``, the loss is
    back-propagated, the optimizer takes a step, and gradients are cleared.
    The batch loss and a running sample count are printed every 100 batches.
    """
    size = len(dataloader.dataset)
    model.train()
    for step, (inputs, targets) in enumerate(dataloader):
        inputs = inputs.to(device)
        targets = targets.to(device)

        # Forward pass and loss for this mini-batch
        batch_loss = loss_fn(model(inputs), targets)

        # Backward pass, parameter update, then reset gradients for the next batch
        batch_loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if step % 100 != 0:
            continue
        loss = batch_loss.item()
        current = (step + 1) * len(inputs)
        print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

In [11]:
def test(dataloader, model, loss_fn):
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    test_loss, correct = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()

    test_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")

In [94]:
# Re-seed right before construction so the initial weights are the same on every run.
torch.manual_seed(seed=0)
model = NeuralNetwork()
model.to(device)  # nn.Module.to moves the parameters in place
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=18, bias=True)
    (1): ReLU()
    (2): Linear(in_features=18, out_features=10, bias=True)
  )
)


In [95]:
# Cross-entropy on the raw logits, optimised with plain SGD at a learning rate of 0.001.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)


In [96]:
# One full pass over the training loader.
train(dataloader=train_dataloader, model=model, loss_fn=loss_fn, optimizer=optimizer)

loss: 2.311446  [   64/60000]
loss: 2.318978  [ 6464/60000]
loss: 2.292953  [12864/60000]
loss: 2.208490  [19264/60000]
loss: 2.216908  [25664/60000]
loss: 2.212726  [32064/60000]
loss: 2.156218  [38464/60000]
loss: 2.183332  [44864/60000]
loss: 2.104259  [51264/60000]
loss: 2.085171  [57664/60000]


In [56]:
# Report accuracy and average loss on the held-out loader.
test(dataloader=test_dataloader, model=model, loss_fn=loss_fn)

Test Error: 
 Accuracy: 34.2%, Avg loss: 2.087204 



In [57]:
# `state_dict()` hands back tensors that share storage with the live parameters,
# so an in-place optimizer step on `model` would silently change `old` as well.
# Clone every tensor to keep a real snapshot for later before/after comparisons.
old = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

In [58]:
# Inspect the state dict stored in `old` (parameter name -> tensor).
old

OrderedDict([('linear_relu_stack.0.weight',
              tensor([[-0.0003,  0.0192, -0.0294,  ...,  0.0219,  0.0037,  0.0021],
                      [-0.0198, -0.0150, -0.0104,  ..., -0.0203, -0.0060, -0.0299],
                      [-0.0201,  0.0149, -0.0333,  ..., -0.0203,  0.0012,  0.0080],
                      ...,
                      [-0.0215,  0.0106,  0.0308,  ..., -0.0199,  0.0161, -0.0342],
                      [ 0.0350, -0.0297, -0.0037,  ...,  0.0171,  0.0238, -0.0001],
                      [ 0.0085,  0.0223, -0.0324,  ..., -0.0296,  0.0182, -0.0296]])),
             ('linear_relu_stack.0.bias',
              tensor([ 0.0106,  0.0085,  0.0134, -0.0227, -0.0049,  0.0033, -0.0191, -0.0014,
                       0.0076,  0.0316, -0.0083, -0.0311,  0.0095,  0.0278, -0.0272,  0.0390,
                       0.0321,  0.0125])),
             ('linear_relu_stack.2.weight',
              tensor([[ 0.1389, -0.1488, -0.2273,  0.2386,  0.1592, -0.1654, -0.1655,  0.0761,
          

In [97]:
# Inspect the model's current parameters (parameter name -> tensor).
model.state_dict()

OrderedDict([('linear_relu_stack.0.weight',
              tensor([[-0.0003,  0.0192, -0.0294,  ...,  0.0219,  0.0037,  0.0021],
                      [-0.0198, -0.0150, -0.0104,  ..., -0.0203, -0.0060, -0.0299],
                      [-0.0201,  0.0149, -0.0333,  ..., -0.0203,  0.0012,  0.0080],
                      ...,
                      [-0.0215,  0.0106,  0.0308,  ..., -0.0199,  0.0161, -0.0342],
                      [ 0.0350, -0.0297, -0.0037,  ...,  0.0171,  0.0238, -0.0001],
                      [ 0.0085,  0.0223, -0.0324,  ..., -0.0296,  0.0182, -0.0296]])),
             ('linear_relu_stack.0.bias',
              tensor([ 0.0106,  0.0085,  0.0134, -0.0227, -0.0049,  0.0033, -0.0191, -0.0014,
                       0.0076,  0.0316, -0.0083, -0.0311,  0.0095,  0.0278, -0.0272,  0.0390,
                       0.0321,  0.0125])),
             ('linear_relu_stack.2.weight',
              tensor([[ 0.1389, -0.1488, -0.2273,  0.2386,  0.1592, -0.1654, -0.1655,  0.0761,
          

In [91]:
# For each column of the second Linear layer's weight matrix (one column per hidden
# unit), count how many entries differ between `old` and the current model.
(old['linear_relu_stack.2.weight'] != model.state_dict()['linear_relu_stack.2.weight']).sum(dim=0)

tensor([10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10])

In [92]:
# Second Linear layer's weight matrix as stored in `old`.
old['linear_relu_stack.2.weight']

tensor([[ 0.1389, -0.1488, -0.2273,  0.2386,  0.1592, -0.1654, -0.1655,  0.0761,
          0.1549,  0.2038,  0.0227,  0.1407,  0.0915,  0.0804, -0.1196, -0.1754,
          0.2495,  0.0565],
        [-0.2206, -0.0969,  0.0425,  0.0331, -0.1933,  0.2379, -0.2168,  0.1215,
         -0.1173, -0.1680,  0.1163, -0.0595, -0.0944, -0.0247, -0.0887,  0.2312,
         -0.2317, -0.0841],
        [-0.0355, -0.0584, -0.1456,  0.0293,  0.0121,  0.1590,  0.0774,  0.1704,
         -0.1688,  0.1955, -0.0979, -0.0047,  0.0894, -0.0478, -0.0676,  0.1149,
          0.1776, -0.1549],
        [ 0.1420,  0.1724,  0.1028,  0.0800,  0.1325,  0.2368,  0.1600,  0.0119,
         -0.1670, -0.0076, -0.1387,  0.2184, -0.0699,  0.0170,  0.1075,  0.0907,
         -0.1927, -0.1540],
        [ 0.0391, -0.1812,  0.0886,  0.0566, -0.1027, -0.1981,  0.0364, -0.1420,
          0.2405,  0.1561,  0.1546,  0.1107,  0.0924, -0.1762,  0.2248, -0.0526,
          0.1374,  0.2193],
        [ 0.0487, -0.1665, -0.0167, -0.1352, -0.18

In [93]:
# Second Linear layer's weight matrix as currently held by the model.
model.state_dict()['linear_relu_stack.2.weight']

tensor([[ 0.1390, -0.1494, -0.2147,  0.2142,  0.1166, -0.1546, -0.1654,  0.0631,
          0.1544,  0.2064,  0.0297,  0.1380,  0.0920,  0.0833, -0.1180, -0.1707,
          0.2289,  0.0489],
        [-0.2119, -0.0872,  0.0234,  0.0395, -0.1763,  0.2059, -0.2174,  0.1155,
         -0.1042, -0.1669,  0.1042, -0.0565, -0.0937, -0.0247, -0.0842,  0.2194,
         -0.2220, -0.0808],
        [-0.0286, -0.0655, -0.1446,  0.0312,  0.0048,  0.1498,  0.0774,  0.1614,
         -0.1665,  0.1966, -0.0986, -0.0030,  0.0868, -0.0487, -0.0625,  0.1071,
          0.1698, -0.1533],
        [ 0.1358,  0.1561,  0.0980,  0.0758,  0.1265,  0.2233,  0.1601,  0.0075,
         -0.1609, -0.0058, -0.1354,  0.2079, -0.0700,  0.0170,  0.0997,  0.0820,
         -0.1933, -0.1521],
        [ 0.0403, -0.1731,  0.0918,  0.0625, -0.0918, -0.1864,  0.0361, -0.1316,
          0.2262,  0.1522,  0.1508,  0.1103,  0.0919, -0.1769,  0.2244, -0.0482,
          0.1366,  0.2164],
        [ 0.0453, -0.1708, -0.0112, -0.1373, -0.19

In [89]:
# Element-wise check of which first-layer weights attached to input feature 13
# (column 13, one entry per hidden unit) are identical in `old` and the model.
torch.eq(
    old['linear_relu_stack.0.weight'][:, 13],
    model.state_dict()['linear_relu_stack.0.weight'][:, 13],
)

tensor([False,  True, False,  True, False, False,  True, False, False, False,
        False, False, False, False, False, False, False,  True])

In [None]:
## Training vs. test performance as a function of the number of epochs