Building a multi-layer perceptron with one hidden layer from scratch and test it on MNIST data

In [1]:
import torch

# Activation function

In [2]:
def sigma(x):
    return torch.tanh(x)

In [3]:
def dsigma(x):
    return 1 - sigma(x)

# Loss

In [4]:
def loss(v, t):
    return (v-t).pow(2).sum()

In [5]:
def dloss(v, t):
    return 2*(v-t)

# Importing the data

In [6]:
import dlc_practical_prologue as prologue

train_input, train_target, test_input,test_target = prologue.load_data(one_hot_labels = True,
                                                                      normalize = True)

train_target = train_target * 0.9
test_target  = test_target * 0.9
print(train_input.shape)

nb_hidden = 50
epsilon = 1e-6
nb_classes = train_target.size(1)
nb_train_samples = train_input.size(0)
learning_rate = 0.1
step = learning_rate/nb_train_samples

* Using MNIST




** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples
torch.Size([1000, 784])


In [7]:
x = train_input

In [8]:
w1 = torch.empty(nb_hidden, train_input.size(1)).normal_(0, epsilon)
b1 = torch.empty(nb_hidden, train_input.size(0)).normal_(0, epsilon)
w2 = torch.empty(nb_classes, nb_hidden).normal_(0, epsilon)
b2 = torch.empty(nb_classes, train_input.size(0)).normal_(0, epsilon)


dl_dw1 = torch.empty(w1.size())
dl_db1 = torch.empty(b1.size())
dl_dw2 = torch.empty(w2.size())
dl_db2 = torch.empty(b2.size())

In [9]:
w2.shape

torch.Size([10, 50])

In [13]:
def forward_pass(w1, b1, w2, b2, x):
    x0 = x
    s1 = torch.add(torch.mm(w1, torch.transpose(x0, 0, 1)), b1)
    x1 = sigma(s1)
    s2 = torch.add(torch.mm(w2,x1), b2)
    x2 = sigma(s1)
    
    return x0, s1, x1, s2, x2

In [20]:
 x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, x)

In [13]:
#b1.shape
#s1 = torch.add(torch.mm(w1, torch.transpose(x, 0, 1)), b1)
#x1 = sigma(s1)

#x1.shape

In [24]:
forward_pass(w1, w2, b1, b2, x)

(tensor([[-1.5593e-05,  1.0198e-05, -2.3631e-06,  ...,  1.6863e-06,
          -1.6162e-05, -9.4213e-06],
         [-3.7875e-06, -3.2296e-05,  4.7850e-05,  ..., -4.6781e-06,
          -4.6001e-06, -1.1158e-07],
         [ 4.6948e-05,  7.5915e-05,  2.7812e-05,  ...,  2.7371e-05,
           7.3137e-05,  1.9534e-05],
         ...,
         [-1.3455e-06,  4.8133e-05,  1.0704e-05,  ...,  2.6426e-05,
           1.2760e-05, -1.0481e-05],
         [-3.2370e-06,  1.0217e-05, -8.3648e-06,  ...,  1.4750e-05,
          -3.8282e-05, -2.4374e-05],
         [-1.8823e-05, -9.4006e-06,  3.9139e-05,  ...,  5.9774e-06,
          -3.2895e-05, -4.1299e-05]]),
 tensor([[-1.5593e-05,  1.0198e-05, -2.3631e-06,  ...,  1.6863e-06,
          -1.6162e-05, -9.4213e-06],
         [-3.7875e-06, -3.2296e-05,  4.7850e-05,  ..., -4.6781e-06,
          -4.6001e-06, -1.1158e-07],
         [ 4.6948e-05,  7.5915e-05,  2.7812e-05,  ...,  2.7371e-05,
           7.3137e-05,  1.9534e-05],
         ...,
         [-1.3455e-06,  4

In [24]:
def backward_pass(w1, b1, w2, b2,
                  t,
                  x, s1, x1, s2, x2,
                 dl_dw1, dl_db1, dl_dw2, dl_db2):
    
    dl_dx2 = loss(x2, t)
    dl_ds2 = torch.mm(dl_dx2, dsigma)
    dl_dx1 = torch.mm(dl_s2, w2)
    dl_ds1 = torch.mm(dl_dx1, dsigma(s1))
    
    dl_dw2 = torch.mm(dl_ds2, x1)
    dl_db2 = torch.mm(dl_ds2, 1)
    dl_db1 = torch.mm(dl_ds1, 1)
    dl_dw1 = torch.mm(dl_ds1, x)

In [31]:
x2.shape

torch.Size([50, 1000])

In [33]:
train_input.shape

torch.Size([1000, 784])

In [34]:
train_target.shape

torch.Size([1000, 10])

In [30]:
for k in range(1000):
    
    """
    Performing 1,000 gradient steps with a step size equal to 0.1 divided
    by the number of training samples(variable--steps)
    First reset the tensors to zero for summing up the gradients and doing
    a forward and backward pass for each training example
    """
    dl_dw1 = torch.zeros(dl_dw1.shape)
    dl_db1 = torch.zeros(dl_db1.shape)
    dl_dw2 = torch.zeros(dl_dw2.shape)
    dl_db2 = torch.zeros(dl_db2.shape)
    
    for n in range(nb_train_samples):
        #forward prop
        x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, x)
    
        #backward prop
        backward_pass(w1, b1, w2, b2,
                     train_target[n],
                     x, s1, x1, s2, x2,
                     dl_dw1, dl_db1, dl_dw2, dl_db2)
    
        #update rule
        w1 = w1 - step * dl_dw1
        b1 = b1 - step * dl_db1
        w2 = w2 - step * dl_dw2
        w1 = w1 - step * dl_db2
    
    
    

RuntimeError: The size of tensor a (1000) must match the size of tensor b (10) at non-singleton dimension 1

In [27]:
x, s1, x1, s2, x2 = forward_pass(w1, w2, b1, b2, x)

RuntimeError: The size of tensor a (1000) must match the size of tensor b (50) at non-singleton dimension 1