Training a simple neural network ($\text{inputs} \rightarrow \text{hidden} \rightarrow \tanh \rightarrow \text{output} \rightarrow \text{sigmoid}$) to mimic an XOR gate. The dimensions are hard-coded and the backward pass is computed manually in NumPy.

The goal is to reinforce the mechanics of neural networks, which I'm afraid I abstracted away while using PyTorch. The learning rate (`lr`), `max_steps`, and the weight initialization are some hyperparameters to play with.

In [60]:
import numpy as np

np.random.seed(1337)

In [80]:
class Net:
    """Tiny fully connected network: 2 inputs -> 4 tanh units -> 1 sigmoid unit.

    The forward pass caches its intermediate values (``z1``, ``a1``, ``z2``,
    ``yhat``) on the instance so that ``backward`` can compute the gradients
    by hand, and ``update`` then applies one plain gradient-descent step.

    ``loss`` holds the most recently computed loss, or ``None`` if no
    labelled forward pass has been run yet.
    """

    def __init__(self):
        # Weight scales are 1/sqrt(2) and 1/sqrt(4) = 0.5, i.e. 1/sqrt(fan_in)
        # of each layer; biases start at zero.
        self.w1 = np.random.randn(2, 4) * np.sqrt(2) ** -1
        self.b1 = np.zeros(shape=(1, 4))
        self.w2 = np.random.randn(4, 1) * 0.5
        self.b2 = np.zeros(shape=(1, 1))
        # Defined up front so the attribute always exists, even before the
        # first labelled forward pass.
        self.loss = None

    def __call__(self, x, y=None):
        """Shorthand for ``forward(x, y)``."""
        return self.forward(x, y)

    def tanh(self, x):
        """Return the element-wise hyperbolic tangent of ``x``.

        Uses ``np.tanh`` rather than the explicit exp-based quotient, which
        evaluates to ``inf / inf = nan`` once ``|x|`` is large enough for
        ``np.exp`` to overflow.
        """
        return np.tanh(x)

    def sigmoid(self, x):
        """Return the element-wise logistic function ``1 / (1 + exp(-x))``."""
        return 1 / (1 + np.exp(-x))

    def forward(self, x, y=None):
        """Run the forward pass and optionally compute the loss.

        Args:
            x: Input batch; it is matrix-multiplied with the (2, 4) first-layer
                weights, so its last dimension must be 2.
            y: Optional targets broadcastable against the (N, 1) predictions.

        Returns:
            A tuple ``(yhat, loss)``. ``loss`` is the mean binary
            cross-entropy when ``y`` is given and ``None`` otherwise.
            ``self.loss`` is only overwritten when ``y`` is given.
        """
        self.z1 = x @ self.w1 + self.b1
        self.a1 = self.tanh(self.z1)
        self.z2 = self.a1 @ self.w2 + self.b2
        self.yhat = self.sigmoid(self.z2)  # prediction

        loss = None
        if y is not None:
            # Clip only inside the loss so that a saturated sigmoid (exactly
            # 0.0 or 1.0) cannot produce log(0); self.yhat itself is untouched.
            eps = 1e-12
            p = np.clip(self.yhat, eps, 1 - eps)
            loss = -(np.mean(y * np.log(p)) + np.mean((1 - y) * np.log(1 - p)))
            self.loss = loss

        return self.yhat, loss

    def backward(self, x, y):
        """Compute parameter gradients from the values cached by ``forward``.

        Must be called after ``forward`` on the same ``x``. The results are
        stored in ``w1_grad``, ``b1_grad``, ``w2_grad`` and ``b2_grad``.

        NOTE: ``dz2`` is not divided by the batch size, so these are the
        gradients of the *summed* cross-entropy, i.e. N times the gradient of
        the mean loss reported by ``forward``. The factor is effectively
        folded into the learning rate.

        The shape comments below are for the 4-sample XOR batch.
        """
        dz2 = self.yhat - y  # (4, 1); sigmoid + cross-entropy simplification

        self.w2_grad = self.a1.T @ dz2  # (4, 4).T @ (4, 1) = (4, 1)
        self.b2_grad = np.sum(dz2, axis=0, keepdims=True)  # (1, 1)

        da1 = dz2 @ self.w2.T  # (4, 1) @ (1, 4) = (4, 4)
        dz1 = da1 * (1 - self.a1 ** 2)  # (4, 4); tanh'(z) = 1 - tanh(z)^2

        self.w1_grad = x.T @ dz1  # (2, 4) @ (4, 4) = (2, 4)
        self.b1_grad = np.sum(dz1, axis=0, keepdims=True)  # (1, 4)

    def update(self, lr: float = 0.01):
        """Apply one gradient-descent step using the gradients from ``backward``.

        Args:
            lr: Learning rate multiplying every gradient.
        """
        params = [self.w1, self.b1, self.w2, self.b2]
        grads = [self.w1_grad, self.b1_grad, self.w2_grad, self.b2_grad]

        for p, g in zip(params, grads):
            # In-place subtraction mutates the arrays held by the instance.
            p -= lr * g



In [81]:
# XOR truth table: every pair of binary inputs, one row per sample.
x = np.array([[a, b] for a in (0, 1) for b in (0, 1)])
# Label column is the XOR of the two input columns, kept as shape (4, 1).
y = x[:, :1] ^ x[:, 1:]

In [82]:
# Fresh network; its weights are drawn from the global NumPy RNG.
model = Net()

max_steps = 1000

for step in range(max_steps):
    # Forward pass: caches the activations and stores the loss on the model.
    out = model.forward(x, y)
    # Backward pass, then one gradient-descent step.
    model.backward(x, y)
    model.update(lr=0.1)

    # Report progress on every 20th step.
    if not step % 20:
        print(f"{step}/{max_steps}: loss = {model.loss:.3f}")

print(f"Final loss = {model.loss:.3f}")


0/1000: loss = 0.708
20/1000: loss = 0.689
40/1000: loss = 0.678
60/1000: loss = 0.655
80/1000: loss = 0.610
100/1000: loss = 0.547
120/1000: loss = 0.484
140/1000: loss = 0.421
160/1000: loss = 0.336
180/1000: loss = 0.239
200/1000: loss = 0.167
220/1000: loss = 0.120
240/1000: loss = 0.091
260/1000: loss = 0.072
280/1000: loss = 0.059
300/1000: loss = 0.050
320/1000: loss = 0.043
340/1000: loss = 0.038
360/1000: loss = 0.034
380/1000: loss = 0.030
400/1000: loss = 0.028
420/1000: loss = 0.025
440/1000: loss = 0.023
460/1000: loss = 0.022
480/1000: loss = 0.020
500/1000: loss = 0.019
520/1000: loss = 0.018
540/1000: loss = 0.017
560/1000: loss = 0.016
580/1000: loss = 0.015
600/1000: loss = 0.014
620/1000: loss = 0.014
640/1000: loss = 0.013
660/1000: loss = 0.012
680/1000: loss = 0.012
700/1000: loss = 0.011
720/1000: loss = 0.011
740/1000: loss = 0.011
760/1000: loss = 0.010
780/1000: loss = 0.010
800/1000: loss = 0.010
820/1000: loss = 0.009
840/1000: loss = 0.009
860/1000: loss = 

In [83]:
# Predictions for the four XOR inputs; the second return value (the loss)
# is not needed here.
preds, _ = model.forward(x)
print(preds)

[[0.00122615]
 [0.99247126]
 [0.99164817]
 [0.01158322]]
