In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import torch
import dlc_practical_prologue as prologue

In [4]:
# Question 1.1

# https://dustinstansbury.github.io/theclevermachine/derivation-common-neural-network-activation-functions

"""
tanh --> gtanh(x) = (2 / (1 + e**(-2*x)) ) - 1 
"""
# param: float tensor
# return: float tensor
def sigma(x):
  f = (2 / (1 + torch.exp(x.mul(-2))) ) - 1
  # f = x.tanh()
  return f

In [5]:
# Question 1.2

"""
tanh der --> gtanh'(x) = 1 - tanh**2(x)
"""
# param: float tensor
# return: float tensor
def dsigma(x):
  df = 1 - sigma(x).pow(2)
  return df

In [6]:
x_test = torch.zeros(1,10) + 1 # torch.ones(10)
df = dsigma(x_test)
print(df)

tensor([[0.4200, 0.4200, 0.4200, 0.4200, 0.4200, 0.4200, 0.4200, 0.4200, 0.4200,
         0.4200]])


In [7]:
# Question 2.1

# p-norms: https://towardsdatascience.com/calculating-vector-p-norms-linear-algebra-for-data-science-iv-400511cffcf0
# Euclidean distance: https://www.dabblingbadger.com/blog/2020/2/27/implementing-euclidean-distance-matrix-calculations-from-scratch-in-python
def loss(v,t):
  return (v - t).pow(2).sum().mean() # https://hackernoon.com/photos/0s78blBiawOe4UYlnA9SeCIgjbA3-uy1263zz5


In [8]:
# Question 2.2

# https://towardsdatascience.com/step-by-step-the-math-behind-neural-networks-d002440227fb
def dloss(v,t):
  return (v - t).mul(2) # torch.div(((v - t).mul(2)),v.shape[0]) # 2 * (v-t) # https://hackernoon.com/photos/0s78blBiawOe4UYlnA9SeCIgjbA3-wp15t3za8

In [9]:
v_test = torch.zeros(1,10).fill_(6) #
t_test = torch.zeros(1,10).fill_(2)
l = loss(v_test, t_test)
print(l)

dl = dloss(v_test, t_test)
print(dl)

tensor(160.)
tensor([[8., 8., 8., 8., 8., 8., 8., 8., 8., 8.]])


In [12]:
# Question 3.1

# https://miro.medium.com/max/700/1*_PhOCrD3sPgPKRIaTv4-Gg.png
# https://youtu.be/bH6VnezBZfI?t=629

def forward_pass(w1, b1, w2, b2, x):

  s_1 = (w1 @ x) + b1 # dot product # L1
  x_1 = sigma(s_1)

  s_2 = (w2 @ x_1) + b2 # dot product # L2
  x_2 = sigma(s_2)

  return x, s_1, x_1 , s_2, x_2


In [None]:
"""
def backward_pass(w1, b1, w2, b2, 
                  t, 
                  x, s1, x1, s2, x2, 
                  dl_dw1, dl_db1, dl_dw2, dl_db2):
    x0 = x
    dl_dx2 = dloss(x2, t)
    dl_ds2 = dsigma(s2) * dl_dx2

    dl_dx1 = w2.t().mv(dl_ds2) 
    dl_ds1 = dsigma(s1) * dl_dx1
    
    dl_dw2.add_(dl_ds2.view(-1, 1).mm(x1.view(1, -1)))
    dl_db2.add_(dl_ds2)

    dl_dw1.add_(dl_ds1.view(-1, 1).mm(x0.view(1, -1)))
    dl_db1.add_(dl_ds1)
"""

In [15]:
# Question 3.2

# https://youtu.be/6RUwfKNdaV0?list=PLaXDtXvwY-oDvedS3f4HW0b4KxqpJ_imw&t=171

# 1 hidden layer
# 1 output layer
# wi.t() ---> https://www.kaggle.com/soham1024/basic-neural-network-from-scratch-in-python?scriptVersionId=33631402&cellId=14
"""
z = w * a + b 
Chain rule:

∂C/∂wi = ∂z/∂w * ∂a/∂z * ∂C/∂a

where 

∂z/∂w = z'(w) = a
∂a/∂z = a'(z) = activation'(z) = sigma'(z)
∂C/∂a = cost'(a) = 1/n sum(y - a)

∂C/∂w = a * activation'(z) * cost'(a)
"""

def backward_pass(  w1, b1, w2, b2,
                  t,
                  x, s1, x1, s2, x2,
                  dl_dw1, dl_db1, dl_dw2, dl_db2):

    delta3 = dloss(x2, t)  # x2 - t # [10]
    term2 = (delta3 * dsigma(s2))  # [10]
    delta2 = w2.t().mm(term2.view(-1, 1))  # [300,1] = [300,10] . [10,1]

    dl_dw2.add_(term2.view(-1, 1).mm(x1.view(-1, 1).t().view(1, -1)))  # [10,300] = [10, 1] . [1,300]
    dl_db2.add_(term2)  # [10] = [10] .* [10]

    term1 = (delta2.squeeze() * dsigma(s1)).view(-1,1)  # [300,1] = [300,1] .* [300] -> [300] .* [300]
    dl_dw1.add_((term1).view(-1, 1).mm(x.view(-1, 1).t().view(1, -1)))  # [300,784] = [300,1] . [1,784]
    dl_db1.add_(term1.squeeze())  # [300] = [300,1] --> [300]

    return dl_dw1, dl_db1, dl_dw2, dl_db2  # [300,784], [300], [10,300], [10]

In [17]:
#TRAIN
#####################################################################

train_input, train_target, test_input, test_target = prologue.load_data(one_hot_labels = True,
                                                                        normalize = True)

nb_classes = train_target.size(1)
nb_train_samples = train_input.size(0)

zeta = 0.90

train_input = train_input * zeta
test_input = test_input * zeta

nb_hidden = 50
eta = 1e-1 / nb_train_samples
epsilon = 1e-6

w1 = torch.empty(nb_hidden, train_input.size(1)).normal_(0, epsilon)
b1 = torch.empty(nb_hidden).normal_(0, epsilon)
w2 = torch.empty(nb_classes, nb_hidden).normal_(0, epsilon)
b2 = torch.empty(nb_classes).normal_(0, epsilon)

dl_dw1 = torch.empty(w1.size())
dl_db1 = torch.empty(b1.size())
dl_dw2 = torch.empty(w2.size())
dl_db2 = torch.empty(b2.size())

for k in range(200):

    # Back-prop

    acc_loss = 0
    nb_train_errors = 0

    dl_dw1.zero_()
    dl_db1.zero_()
    dl_dw2.zero_()
    dl_db2.zero_()

    for n in range(nb_train_samples):
        x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, train_input[n])

        pred = x2.max(0)[1].item()
        if train_target[n, pred] < 0.5: nb_train_errors = nb_train_errors + 1
        acc_loss = acc_loss + loss(x2, train_target[n])

        backward_pass(w1, b1, w2, b2,
                      train_target[n],
                      x0, s1, x1, s2, x2,
                      dl_dw1, dl_db1, dl_dw2, dl_db2)

    # Gradient step

    w1 = w1 - eta * dl_dw1
    b1 = b1 - eta * dl_db1
    w2 = w2 - eta * dl_dw2
    b2 = b2 - eta * dl_db2

    # Test error

    nb_test_errors = 0

    for n in range(test_input.size(0)):
        _, _, _, _, x2 = forward_pass(w1, b1, w2, b2, test_input[n])

        pred = x2.max(0)[1].item()
        if test_target[n, pred] < 0.5: nb_test_errors = nb_test_errors + 1

    print('{:d} acc_train_loss {:.02f} acc_train_error {:.02f}% test_error {:.02f}%'
          .format(k,
                  acc_loss,
                  (100 * nb_train_errors) / train_input.size(0),
                  (100 * nb_test_errors) / test_input.size(0)))

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples
0 acc_train_loss 1000.00 acc_train_error 88.30% test_error 90.10%
1 acc_train_loss 963.68 acc_train_error 88.30% test_error 90.10%
2 acc_train_loss 940.46 acc_train_error 88.30% test_error 90.10%
3 acc_train_loss 925.61 acc_train_error 88.30% test_error 90.10%
4 acc_train_loss 916.12 acc_train_error 88.30% test_error 90.10%
5 acc_train_loss 910.03 acc_train_error 88.30% test_error 90.10%
6 acc_train_loss 906.13 acc_train_error 88.30% test_error 90.10%
7 acc_train_loss 903.63 acc_train_error 88.30% test_error 90.10%
8 acc_train_loss 902.02 acc_train_error 88.30% test_error 90.10%
9 acc_train_loss 900.98 acc_train_error 88.30% test_error 90.10%
10 acc_train_loss 900.32 acc_train_error 88.30% test_error 90.10%
11 acc_train_loss 899.88 acc_train_error 88.30% test_error 90.10%
12 acc_train_loss 899.61 acc_train_error 88.30% test_error 90.10%
13 acc_train_loss 899.43 acc_train_error 8