# Running a batch of FashionMNIST images through a CNN with forward propagation (no training yet)
## input : 10 × 1 × 28 × 28 (batch size, input channels, height, width)
## output: 10 × 10 (batch size, number of prediction classes) 

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

import torchvision
import torchvision.transforms as transforms

torch.set_printoptions(linewidth=120)

In [2]:
# FashionMNIST training split stored under ./data/FashionMNIST (download is
# enabled), with ToTensor as the per-image transform.
train_set = torchvision.datasets.FashionMNIST(
    root='./data/FashionMNIST',
    train=True,
    download=True,
    transform=transforms.Compose([transforms.ToTensor()]),
)

In [3]:
# Loader that serves the training set in batches of 10 samples.
data_loader = torch.utils.data.DataLoader(train_set, batch_size=10)

In [4]:
# Take a single batch from the loader; it is unpacked into (images, labels)
# further down.
batch = next(iter(data_loader))

In [5]:
class Network(nn.Module):
    """Small CNN for FashionMNIST: two conv/pool stages and three linear layers.

    Maps a batch of single-channel 28 x 28 images, shape (batch, 1, 28, 28),
    to one raw score per class, shape (batch, 10).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=12, kernel_size=5)

        # A 28 x 28 input shrinks 28 -> 24 -> 12 -> 8 -> 4 through the two
        # conv (kernel 5) / pool (2 x 2) stages, leaving 12 maps of 4 x 4.
        self.fc1 = nn.Linear(in_features=12 * 4 * 4, out_features=120)
        self.fc2 = nn.Linear(in_features=120, out_features=60)
        self.out = nn.Linear(in_features=60, out_features=10)

    def forward(self, t):
        """Run the forward pass and return the raw (un-normalized) class scores.

        Args:
            t: image tensor; (batch, 1, 28, 28) is the size the linear layers
                are dimensioned for.

        Returns:
            Tensor of shape (batch, 10) with one score per class.

        Raises:
            RuntimeError: raised by the layers themselves when the input does
                not produce 12 * 4 * 4 features per sample.
        """
        # (1) input layer: the identity, t is used as given

        # (2) hidden conv layer
        t = F.relu(self.conv1(t))
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # (3) hidden conv layer
        t = F.relu(self.conv2(t))
        t = F.max_pool2d(t, kernel_size=2, stride=2)

        # (4) hidden linear layer
        # Flatten by the actual feature-map size rather than a hard-coded
        # 12 * 4 * 4: with a fixed width, an input of the wrong spatial size
        # whose feature count happens to be a multiple of 192 would be folded
        # into extra "batch" rows without any error. Now fc1 rejects it.
        channels, height, width = t.shape[-3:]
        t = t.reshape(-1, channels * height * width)
        t = F.relu(self.fc1(t))

        # (5) hidden linear layer
        t = F.relu(self.fc2(t))

        # (6) output layer
        # Softmax is deliberately not applied here; callers that want
        # probabilities apply F.softmax(..., dim=1) to the returned scores.
        t = self.out(t)

        return t

In [6]:
# Switch off gradient tracking: the cells below only run forward passes,
# so no autograd graph needs to be recorded.
torch.set_grad_enabled(False)

<torch.autograd.grad_mode.set_grad_enabled at 0x1b568904a90>

In [7]:
# Split the batch into its image tensor and its label tensor.
images, labels = batch
images.shape  # (batch size, input channels, height, width)

torch.Size([10, 1, 28, 28])

In [8]:
labels

tensor([9, 0, 0, 3, 0, 2, 7, 2, 5, 5])

In [9]:
network = Network()

In [10]:
preds = network(images)

In [11]:
preds.shape

torch.Size([10, 10])

In [12]:
preds

tensor([[ 0.0698,  0.0035, -0.0743,  0.1250, -0.0914,  0.0294, -0.0745,  0.0299,  0.0903,  0.0782],
        [ 0.0688, -0.0031, -0.0830,  0.1218, -0.0914,  0.0256, -0.0784,  0.0295,  0.0902,  0.0774],
        [ 0.0603,  0.0018, -0.0803,  0.1213, -0.0883,  0.0215, -0.0711,  0.0246,  0.0856,  0.0798],
        [ 0.0615,  0.0036, -0.0794,  0.1208, -0.0900,  0.0300, -0.0766,  0.0280,  0.0892,  0.0779],
        [ 0.0634,  0.0014, -0.0815,  0.1242, -0.0880,  0.0291, -0.0775,  0.0298,  0.0908,  0.0792],
        [ 0.0679, -0.0050, -0.0778,  0.1197, -0.0956,  0.0374, -0.0818,  0.0283,  0.0870,  0.0733],
        [ 0.0740,  0.0014, -0.0828,  0.1286, -0.0900,  0.0313, -0.0742,  0.0258,  0.0864,  0.0719],
        [ 0.0708, -0.0038, -0.0763,  0.1165, -0.0929,  0.0386, -0.0820,  0.0314,  0.0909,  0.0750],
        [ 0.0610, -0.0009, -0.0788,  0.1184, -0.0884,  0.0159, -0.0724,  0.0287,  0.0902,  0.0830],
        [ 0.0589,  0.0056, -0.0757,  0.1230, -0.0885,  0.0120, -0.0728,  0.0305,  0.0908,  0.0858]])

The prediction tensor has a shape of 10 by 10, which gives us two axes that each have a length of ten. This reflects the fact that we have ten images and for each of these ten images we have ten prediction classes.

The elements of the first dimension are arrays of length ten. Each of these arrays contains the ten predictions, one per category, for the corresponding image. 

The elements of the second dimension are numbers. Each number is the assigned value of the specific output class. The output classes are encoded by the indexes, so each index represents a specific output class. 

In [13]:
labels

tensor([9, 0, 0, 3, 0, 2, 7, 2, 5, 5])

In [14]:
# Predicted class index for each image (position of the largest score in its row).
# In the run recorded below the untrained network picks class 3 for every image,
# so only the one image whose label is 3 is predicted correctly.
preds.argmax(dim = 1)

tensor([3, 3, 3, 3, 3, 3, 3, 3, 3, 3])

In [15]:
preds.argmax(dim=1).eq(labels)

tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 0], dtype=torch.uint8)

In [16]:
preds.argmax(dim=1).eq(labels).sum()

tensor(1)

In [17]:
F.softmax(preds,dim=1)

tensor([[0.1050, 0.0982, 0.0909, 0.1109, 0.0894, 0.1008, 0.0909, 0.1009, 0.1072, 0.1059],
        [0.1052, 0.0979, 0.0904, 0.1109, 0.0896, 0.1007, 0.0908, 0.1011, 0.1074, 0.1061],
        [0.1043, 0.0984, 0.0906, 0.1109, 0.0899, 0.1003, 0.0915, 0.1007, 0.1070, 0.1064],
        [0.1043, 0.0985, 0.0906, 0.1107, 0.0897, 0.1011, 0.0909, 0.1009, 0.1073, 0.1061],
        [0.1045, 0.0982, 0.0904, 0.1110, 0.0898, 0.1009, 0.0907, 0.1010, 0.1074, 0.1061],
        [0.1051, 0.0977, 0.0909, 0.1107, 0.0893, 0.1020, 0.0905, 0.1010, 0.1071, 0.1057],
        [0.1056, 0.0982, 0.0902, 0.1115, 0.0896, 0.1011, 0.0910, 0.1006, 0.1069, 0.1053],
        [0.1053, 0.0977, 0.0909, 0.1102, 0.0894, 0.1019, 0.0904, 0.1012, 0.1074, 0.1057],
        [0.1044, 0.0981, 0.0908, 0.1105, 0.0899, 0.0998, 0.0913, 0.1011, 0.1075, 0.1067],
        [0.1040, 0.0986, 0.0909, 0.1109, 0.0898, 0.0992, 0.0912, 0.1011, 0.1074, 0.1069]])

Network weights are randomly generated. Each time we create a new instance of our network, the weights within the network will be different. This means that the predictions we get will be different if we create different networks. 

In [18]:
def get_num_correct(preds, labels):
    """Count how many rows of ``preds`` have their largest score at the labelled class.

    Args:
        preds: 2-D tensor of scores, one row per sample and one column per class.
        labels: tensor holding the true class index of each sample.

    Returns:
        The number of matching predictions as a plain Python number.
    """
    predicted_classes = preds.argmax(dim=1)
    is_correct = predicted_classes.eq(labels)
    correct_total = is_correct.sum()
    return correct_total.item()

In [19]:
get_num_correct(preds,labels)

1

In [20]:
F.softmax(preds, dim=1).sum()

tensor(10.)

In [21]:
print(network)

Network(
  (conv1): Conv2d(1, 6, kernel_size=(5, 5), stride=(1, 1))
  (conv2): Conv2d(6, 12, kernel_size=(5, 5), stride=(1, 1))
  (fc1): Linear(in_features=192, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=60, bias=True)
  (out): Linear(in_features=60, out_features=10, bias=True)
)


In [22]:
network.conv1.weight

Parameter containing:
tensor([[[[-0.1641,  0.0426, -0.1928,  0.1217, -0.0350],
          [-0.0887, -0.0055,  0.1837, -0.0367,  0.1656],
          [-0.0840, -0.1275, -0.1436,  0.0632, -0.1405],
          [ 0.0735,  0.1952, -0.1149,  0.1600,  0.0350],
          [ 0.0294,  0.0723, -0.1525,  0.1415,  0.0529]]],


        [[[ 0.0618,  0.1993, -0.1922,  0.0857,  0.1780],
          [-0.1667, -0.1388, -0.0950, -0.1654,  0.0646],
          [-0.0579, -0.1718, -0.1083,  0.1002, -0.0611],
          [-0.1747, -0.1827, -0.0227,  0.0837,  0.1652],
          [ 0.0329, -0.0912, -0.1391,  0.0490, -0.0896]]],


        [[[ 0.1830, -0.0198, -0.0146, -0.1426, -0.1560],
          [-0.0991, -0.1411,  0.0905, -0.1715,  0.1383],
          [-0.1089,  0.1994, -0.0440, -0.0196, -0.0010],
          [-0.1364, -0.1437,  0.1774,  0.1472,  0.0343],
          [ 0.0750, -0.1873,  0.1725,  0.0040, -0.1847]]],


        [[[-0.0709,  0.0575,  0.0017, -0.0224,  0.0740],
          [-0.1398,  0.1878, -0.0997,  0.1016,  0.1426

In [23]:
network.fc1.weight

Parameter containing:
tensor([[-3.7620e-02, -3.0236e-02, -1.5490e-02,  ...,  5.6669e-02, -4.3223e-03,  2.2445e-02],
        [ 4.8502e-02,  4.5177e-02,  3.0869e-02,  ...,  5.8603e-02, -1.2762e-02, -5.8258e-02],
        [-6.1708e-03, -4.3056e-03, -6.6799e-02,  ..., -5.4304e-02,  6.5948e-03, -4.7317e-03],
        ...,
        [ 2.9953e-02, -3.8773e-05, -5.3876e-02,  ...,  6.1435e-02,  1.5909e-02, -5.0694e-02],
        [-2.7693e-03, -6.3158e-02, -4.2490e-02,  ...,  1.5434e-02,  3.8789e-03,  4.5823e-03],
        [-5.3507e-02, -1.1217e-02,  5.5563e-02,  ..., -6.7847e-02,  6.8375e-02,  6.0344e-02]], requires_grad=True)

In [24]:
network.fc1.weight.shape


torch.Size([120, 192])

In [25]:
# Print the shape of each parameter tensor held by the network.
for param in network.parameters():
    print(param.shape)

torch.Size([6, 1, 5, 5])
torch.Size([6])
torch.Size([12, 6, 5, 5])
torch.Size([12])
torch.Size([120, 192])
torch.Size([120])
torch.Size([60, 120])
torch.Size([60])
torch.Size([10, 60])
torch.Size([10])


In [26]:
# Same listing as above, this time with each parameter's name next to its shape.
for param_name, param in network.named_parameters():
    print(param_name, '\t\t', param.shape)

conv1.weight 		 torch.Size([6, 1, 5, 5])
conv1.bias 		 torch.Size([6])
conv2.weight 		 torch.Size([12, 6, 5, 5])
conv2.bias 		 torch.Size([12])
fc1.weight 		 torch.Size([120, 192])
fc1.bias 		 torch.Size([120])
fc2.weight 		 torch.Size([60, 120])
fc2.bias 		 torch.Size([60])
out.weight 		 torch.Size([10, 60])
out.bias 		 torch.Size([10])
