# Imports

In [22]:
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt

# Output

In [23]:
def print_values():
    global x, y, w_0, b_0, w_1, b_1, alpha, a, b, c, d, f, g, z, h, j, k, l, n1, n2, n3, n4, m, o 
    print("======Forward=======")
    print("\n-------Net----------")
    print(f"\tx: {x}")
    print(f"\tw_0: {w_0}")
    print(f"\ta = w_0*x: {a}")
    print(f"\tb_0: {b_0}")
    print(f"\tc = a+b_0: {c}")
    print(f"\td = max(0, c): {d}")
    print(f"\tw_1: {w_1}")
    print(f"\tf = w_1*d: {f}")
    print(f"\tb_1: {b_1}")
    print(f"\tg = g+b_1: {g}")
    print("\n-------Softmax----------")
    print(f"\th = exp(g): {h}")
    print(f"\tj = sum(h): {j}")
    print(f"\tk = h/j: {k}")
    print("\n-------NLL+Smoothing----------")
    print(f"\tl = -log(k): {l}")
    print(f"\ty: {torch.argmax(y)}")
    print(f"\talpha: {alpha}")
    print(f"\tn1 = (alpha ... alpha): {n1}")
    print(f"\tn2 = (alpha/(N-1) ... alpha/(N-1)): {n2}")
    print(f"\tn3 = 1.0-alpha: {n3}")
    print(f"\tn4 = (alpha/(N-1) ... 1.0-alpha  ... alpha/(N-1)): {n4}")
    print(f"\tm = l*n4: {m}")
    print(f"\to = sum(m): {o}")

def print_grads():
    global x, y, w_0, b_0, w_1, b_1, alpha, a, b, c, d, f, g, z, h, j, k, l, n1, n2, n3, n4, m, o 
    print("======Backward======")
    print("\n-------Net----------")
    print(f"\tw_0.grad = a.grad*x^T: {w_0.grad}")
    print(f"\ta.grad = c.grad*I: {a.grad}")
    print(f"\tb_0.grad = c.grad*I: {b_0.grad}")
    print(f"\tc.grad = d.grad*diag(\{{c<0:1.0,0.0}}): {c.grad}")
    print(f"\td.grad = f.grad*w_1^T: {d.grad}")
    print(f"\tw_1.grad = f.grad*d^T: {w_1.grad}")
    print(f"\tf.grad = g.grad*I: {f.grad}")
    print(f"\tb_1.grad = g.grad*I: {b_1.grad}")
    print(f"\tg.grad = h.grad*diag(exp(g)): {g.grad}")
    print("\n-------Softmax----------")
    print(f"\th.grad = k.grad*diag(j) + j.grad*(1.0 ... 1.0): {h.grad}")
    print(f"\tj.grad = k.grad*h^T: {j.grad}")
    print(f"\tk.grad = l.grad*diag(-1.0/k): {k.grad}")
    print("\n-------NLL+Smoothing----------")
    print(f"\tl.grad = m.grad*diag(n4): {l.grad}")
    print(f"\talpha.grad = n1.grad*(1.0 ... 1.0)^T + n3.grad*(-1.0): {alpha.grad}")
    print(f"\tn1.grad = n2.grad*(1.0/(N-1) ... 1.0/(N-1)): {n1.grad}")
    print(f"\tn2.grad = n3.grad*I(with ij = 0.0): {n2.grad}")
    print(f"\tn3.grad = n4.grad*(0.0 ... 1.0 ... 0.0)^T: {n3.grad}")
    print(f"\tn4.grad = m.grad*diag(l): {n3.grad}")
    print(f"\tm.grad = (1.0 ... 1.0): {m.grad}")

# Define net

In [24]:
x = torch.tensor([1.0, 1.0])
y = torch.tensor([0])

w_0 = torch.randn((2,2), requires_grad=True)
w_0.retain_grad()
b_0 = torch.randn((2), requires_grad=True)
b_0.retain_grad()
w_1 = torch.randn((2,2), requires_grad=True)
w_1.retain_grad()
b_1 = torch.randn((2), requires_grad=True)
b_1.retain_grad()
alpha = torch.rand((1), requires_grad=True)
alpha.retain_grad()

a, b, c, d, f, g, z, h, j, k, l, n1, n2, n3, n4, m, o = None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None

# Forward

In [25]:
def forward():
    global x, y, w_0, b_0, w_1, b_1, alpha, a, b, c, d, f, g, z, h, j, k, l, n1, n2, n3, n4, m, o 
    a = w_0@x
    a.retain_grad()
    c = a+b_0
    c.retain_grad()
    d = torch.maximum(c, torch.zeros_like(c))
    d.retain_grad()
    f = w_1@d
    f.retain_grad()
    g = f+b_1
    g.retain_grad()

    z = g
    z.retain_grad()

    h = torch.exp(z)
    h.retain_grad()
    j = torch.sum(h)
    j.retain_grad()
    k = h/j
    k.retain_grad()
    l = -torch.log(k)
    l.retain_grad()
    n1 = torch.ones((2))*alpha
    n1.retain_grad()
    n2 = n1/(2-1)
    n2.retain_grad()
    n3 = 1.0-alpha
    n3.retain_grad()
    n4 = torch.scatter(n2, 0, y, n3)
    n4.retain_grad()
    m = l*n4
    m.retain_grad()
    o = torch.sum(m)
    o.retain_grad()

# Backward

In [30]:
def backward():
    global x, y, w_0, b_0, w_1, b_1, alpha, a, b, c, d, f, g, z, h, j, k, l, n1, n2, n3, n4, m, o 
    w_0.grad = None
    b_0.grad = None
    w_1.grad = None
    b_1.grad = None
    alpha.grad = None
    o.backward()

# Update

In [27]:
def update_net():
    global x, y, w_0, b_0, w_1, b_1, alpha, a, b, c, d, f, g, z, h, j, k, l, n1, n2, n3, n4, m, o 
    with torch.no_grad():
        w_0.copy_(w_0+0.01*w_0.grad)
        b_0.copy_(b_0+0.01*b_0.grad)
        w_1.copy_(w_1+0.01*w_1.grad)
        b_1.copy_(b_1+0.01*b_1.grad)

def update_alpha():
    global x, y, w_0, b_0, w_1, b_1, alpha, a, b, c, d, f, g, z, h, j, k, l, n1, n2, n3, n4, m, o 
    with torch.no_grad():
        alpha.copy_(alpha+0.01*alpha.grad)

# Execute1

In [33]:
forward()
print_values()
backward()
print("\n")
print_grads()
update_net()


-------Net----------
	x: tensor([1., 1.])
	w_0: tensor([[ 0.0321, -0.8767],
        [ 0.1517,  0.5187]], requires_grad=True)
	a = w_0*x: tensor([-0.8447,  0.6704], grad_fn=<MvBackward>)
	b_0: tensor([-0.9749,  1.0810], requires_grad=True)
	c = a+b_0: tensor([-1.8196,  1.7514], grad_fn=<AddBackward0>)
	d = max(0, c): tensor([0.0000, 1.7514], grad_fn=<MaximumBackward>)
	w_1: tensor([[-0.8707, -0.7331],
        [ 0.1475,  0.2350]], requires_grad=True)
	f = w_1*d: tensor([-1.2839,  0.4115], grad_fn=<MvBackward>)
	b_1: tensor([0.4098, 1.0401], requires_grad=True)
	g = g+b_1: tensor([-0.8741,  1.4517], grad_fn=<AddBackward0>)

-------Softmax----------
	h = exp(g): tensor([0.4173, 4.2703], grad_fn=<ExpBackward>)
	j = sum(h): 4.687602519989014
	k = h/j: tensor([0.0890, 0.9110], grad_fn=<DivBackward0>)

-------NLL+Smoothing----------
	l = -log(k): tensor([2.4190, 0.0932], grad_fn=<NegBackward>)
	y: 0
	alpha: tensor([0.2914], requires_grad=True)
	n1 = (alpha ... alpha): tensor([0.2914, 0.2914],

# Execute 2

In [34]:
forward()
print_values()
backward()
print("\n")
print_grads()
update_alpha()


-------Net----------
	x: tensor([1., 1.])
	w_0: tensor([[ 0.0321, -0.8767],
        [ 0.1577,  0.5247]], requires_grad=True)
	a = w_0*x: tensor([-0.8447,  0.6824], grad_fn=<MvBackward>)
	b_0: tensor([-0.9749,  1.0870], requires_grad=True)
	c = a+b_0: tensor([-1.8196,  1.7694], grad_fn=<AddBackward0>)
	d = max(0, c): tensor([0.0000, 1.7694], grad_fn=<MaximumBackward>)
	w_1: tensor([[-0.8707, -0.7439],
        [ 0.1475,  0.2458]], requires_grad=True)
	f = w_1*d: tensor([-1.3163,  0.4350], grad_fn=<MvBackward>)
	b_1: tensor([0.4036, 1.0463], requires_grad=True)
	g = g+b_1: tensor([-0.9126,  1.4813], grad_fn=<AddBackward0>)

-------Softmax----------
	h = exp(g): tensor([0.4015, 4.3987], grad_fn=<ExpBackward>)
	j = sum(h): 4.800204753875732
	k = h/j: tensor([0.0836, 0.9164], grad_fn=<DivBackward0>)

-------NLL+Smoothing----------
	l = -log(k): tensor([2.4813, 0.0873], grad_fn=<NegBackward>)
	y: 0
	alpha: tensor([0.2914], requires_grad=True)
	n1 = (alpha ... alpha): tensor([0.2914, 0.2914],