In [68]:
import math

def sigmoid(x):
    """Apply the logistic function 1 / (1 + e^-v) to each element of ``x``.

    Args:
        x: Sequence of real numbers.

    Returns:
        A new list with one float per element of ``x``.
    """
    result = []
    for value in x:
        if value >= 0:
            # exp(-value) <= 1 on this branch, so it cannot overflow.
            result.append(1 / (1 + math.exp(-value)))
        else:
            # For very negative inputs exp(-value) exceeds the float range
            # and math.exp raises OverflowError.  e^v / (1 + e^v) is the
            # same quantity and only ever evaluates exp of a negative number.
            e = math.exp(value)
            result.append(e / (1 + e))
    return result

def softmax(outputs):
    """Normalise a sequence of raw scores into values that sum to 1.

    Each result is exp(o_j) / sum_i exp(o_i).  The largest score is
    subtracted from every score before exponentiating; this leaves the
    ratios unchanged but keeps every exponent <= 0, so math.exp cannot
    overflow and the denominator is always at least 1.

    Args:
        outputs: Sequence of real-valued scores.

    Returns:
        A list of floats, one per score.  An empty input yields an empty
        list (the list-typed counterpart of the former scalar ``0``).
    """
    if not outputs:
        return []

    peak = max(outputs)
    # Compute each exponential once and reuse it for the denominator.
    exps = [math.exp(o - peak) for o in outputs]
    total = sum(exps)

    return [e / total for e in exps]

def d_sigmoid(x):
    """Return the element-wise derivative of the logistic function at ``x``.

    Computed as s * (1 - s), where s is the sigmoid of each element.
    """
    derivative = []
    for activation in sigmoid(x):
        derivative.append(activation * (1 - activation))
    return derivative

def cross_entropy(predictions, targets):
    """Return the cross-entropy loss -sum_j targets[j] * log(predictions[j]).

    Args:
        predictions: Sequence of predicted probabilities.
        targets: Sequence of target weights, indexed in step with
            ``predictions`` (e.g. a one-hot vector).

    Returns:
        The loss as a number (natural logarithm).

    Raises:
        ValueError: If a prediction paired with a non-zero target is not
            strictly positive (raised by math.log).
        IndexError: If ``targets`` is shorter than ``predictions``.
    """
    loss = 0

    for j, prediction in enumerate(predictions):
        target = targets[j]
        if target == 0:
            # A zero-weight term contributes nothing; skipping it avoids
            # evaluating log(0) when the matching prediction is exactly 0.
            continue
        loss += target * math.log(prediction)

    return -1 * loss

def lin(inputs, weights, bias):
    """Compute a linear (affine) layer: outputs[n] = sum_i inputs[i] * weights[i][n] + bias[n].

    ``weights`` is indexed as weights[input_index][output_index]; the number
    of outputs is taken from the length of its first row.
    """
    width = len(weights[0])  # number of next-layer neurons
    result = []
    for col in range(width):
        acc = 0
        for row, value in enumerate(inputs):
            acc += value * weights[row][col]
        acc += bias[col]
        result.append(acc)
    return result

def lin_grad(lin_input, output_grad, weights):
    """Back-propagate a gradient through the linear layer computed by ``lin``.

    Args:
        lin_input: The inputs that were fed to the layer.
        output_grad: Gradient of the loss with respect to the layer outputs.
        weights: The layer weights, indexed weights[input_index][output_index].

    Returns:
        A tuple ``(dW, dx, db)``: the gradient with respect to the weights
        (same shape as ``weights``), with respect to the inputs, and with
        respect to the bias (a copy of ``output_grad``).
    """
    n_out = len(weights[0])

    # Start from an all-zero matrix so rows without a matching input stay 0.
    dW = [[0] * n_out for _ in weights]
    dx = []

    for i, x_i in enumerate(lin_input):
        acc = 0
        for n in range(n_out):
            grad = output_grad[n]
            dW[i][n] = grad * x_i
            acc += grad * weights[i][n]
        dx.append(acc)

    db = output_grad.copy()

    return dW, dx, db

def d_cross_entropy(predictions, targets):
    """Return the gradient of the loss with respect to the pre-softmax scores ``o``.

    By the chain rule dl/do = dl/dy * dy/do, i.e. the cross-entropy
    derivative combined with the softmax derivative.  The value returned is
    the element-wise difference predictions - targets.
    NOTE(review): that closed form presumes ``predictions`` came from a
    softmax and ``targets`` sums to 1 -- confirm against the caller.
    """
    gradient = []
    for index, target in enumerate(targets):
        gradient.append(predictions[index] - target)
    return gradient

# Worked example: one forward and one backward pass through a two-layer
# network (linear -> sigmoid -> linear -> softmax -> cross-entropy), printing
# every intermediate value.

# Layer widths; used below only to size the bias vectors.
size_layer_1 = 3
size_layer_2 = 2

# Not referenced anywhere else in the code shown here.
num_of_input = 2

# First-layer weights, indexed W[input_index][hidden_index] (2 rows x 3 columns).
W = [[1.0, 1.0, 1.0], [-1.0, -1.0, -1.0]]

# Second-layer weights, indexed V[hidden_index][output_index] (3 rows x 2 columns).
V = [[1.0, 1.0], [-1.0, -1.0], [-1.0, -1.0]]

# Biases of the first and second layer, initialised to zero.
b = [0] * size_layer_1  
c = [0] * size_layer_2

# Single input vector and its target vector.
x = [1, -1]
t = [1, 0]

# forward
# k: first linear layer output (pre-activation).
k = lin(x, W, b)
print(f'k: {k}')

# h: hidden activations.
h = sigmoid(k)
print(f'h: {h}')

# o: second linear layer output (pre-softmax scores).
o = lin(h, V, c)
print(f'o: {o}')

# y: scores normalised by softmax.
y = softmax(o)
print(f'y: {y}')

# l: cross-entropy loss of y against t.
l = cross_entropy(y, t)
print(f'l: {l}')

# backward
# do: gradient of the loss with respect to o (y - t).
do = d_cross_entropy(y, t)
print(f'do: {do}')

# Gradients of the second linear layer: weights, inputs (h) and bias.
# With these values every row of V holds two equal entries and the entries
# of do cancel, so dh -- and everything derived from it below -- is zero.
dV, dh, dc = lin_grad(h, do, V)
print(f'dV: {dV}')
print(f'dh: {dh}')
print(f'dc: {dc}')

# Chain rule through the sigmoid: dk first holds the sigmoid derivative at k,
# then is overwritten with its element-wise product with dh.
dk = d_sigmoid(k)
dk = [dh[i] * dk[i] for i in range(len(dh))]
print(f'dk: {dk}')

# Gradients of the first linear layer: weights, inputs (x) and bias.
dW, dx, db = lin_grad(x, dk, W)
print(f'dW: {dW}')
print(f'dx: {dx}')
print(f'db: {db}')

k: [2.0, 2.0, 2.0]
h: [0.8807970779778823, 0.8807970779778823, 0.8807970779778823]
o: [-0.8807970779778823, -0.8807970779778823]
y: [0.5, 0.5]
l: 0.6931471805599453
do: [-0.5, 0.5]
dV: [[-0.44039853898894116, 0.44039853898894116], [-0.44039853898894116, 0.44039853898894116], [-0.44039853898894116, 0.44039853898894116]]
dh: [0.0, 0.0, 0.0]
dc: [-0.5, 0.5]
dk: [0.0, 0.0, 0.0]
dW: [[0.0, 0.0, 0.0], [-0.0, -0.0, -0.0]]
dx: [0.0, 0.0]
db: [0.0, 0.0, 0.0]
