# Two Layer NN

### Forward propagation:
Z1 = X.W1 + b1
A1 = ReLU(Z1)  
Z2 = A1.W2 + b2   
exp_scores = exp(Z2)  
probs = exp_scores / sum(exp_scores, axis=1, keepdims=True)   (normalize each row, i.e. each sample, separately)

### Backward propagation:
delta3 = probs   
delta3[range(len(X)), y] -= 1   
dW2 = A1.T.dot(delta3)   
db2 = sum(delta3, axis=0)   
delta2 = delta3.dot(W2.T) * (A1 > 0)   
dW1 = X.T.dot(delta2)   
db1 = sum(delta2, axis=0)   

In [None]:
import numpy as np 

class NN:
    """Two-layer network with a selectable hidden activation and sigmoid output.

    Only the forward pass is implemented. ``lr`` is stored on the instance but
    no method of this class reads it.
    """

    def __init__(self, input_size, hidden_size, output_size, activation='relu', lr=0.01):
        """Draw all weights and biases from a standard normal distribution.

        Args:
            input_size: Number of input features (rows of ``W1``).
            hidden_size: Number of hidden units.
            output_size: Number of output units (columns of ``W2``).
            activation: Hidden-layer activation name: ``'relu'``,
                ``'sigmoid'`` or ``'tanh'``.
            lr: Learning rate, kept as ``self.lr``.
        """
        self.W1 = np.random.randn(input_size, hidden_size)
        self.W2 = np.random.randn(hidden_size, output_size)
        self.b1 = np.random.randn(hidden_size)
        self.b2 = np.random.randn(output_size)
        self.activation = activation
        # Previously accepted and then discarded; keep it so callers can read it.
        self.lr = lr

    def activation_func(self, z, activation='relu'):
        """Apply the named activation element-wise to ``z``.

        Args:
            z: Pre-activation values (NumPy array or scalar).
            activation: One of ``'relu'``, ``'sigmoid'``, ``'tanh'``.

        Returns:
            The activated values.

        Raises:
            ValueError: If ``activation`` is not a supported name. Falling
                through used to return ``None``, which only failed later
                inside an unrelated NumPy call.
        """
        if activation == 'relu':
            return np.maximum(0, z)
        if activation == 'sigmoid':
            return 1 / (1 + np.exp(-z))
        if activation == 'tanh':
            return np.tanh(z)
        raise ValueError("Unknown activation: {!r}".format(activation))

    def forward(self, X):
        """Run the forward pass and return the sigmoid output activations.

        Args:
            X: Input array whose last dimension equals ``input_size``.

        Returns:
            Output-layer activations ``a2``, each in ``[0, 1]``.
        """
        z1 = np.dot(X, self.W1) + self.b1
        a1 = self.activation_func(z1, self.activation)
        z2 = np.dot(a1, self.W2) + self.b2
        a2 = self.activation_func(z2, 'sigmoid')
        # The original computed a2 and then fell off the end, returning None.
        return a2
        
        

In [1]:
import numpy as np

class TwoLayerNet:
    """Two-layer fully connected classifier: ReLU hidden layer, softmax output.

    Parameters are stored in ``self.params`` under the keys ``'W1'``, ``'b1'``,
    ``'W2'`` and ``'b2'``. Inputs are expected to be 2-D, one sample per row,
    because the softmax normalizes along ``axis=1``.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Draw weights from a standard normal and set biases to zero.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
        """
        self.params = {}
        self.params['W1'] = np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

    def _forward_pass(self, X):
        """Return ``(a1, probs)``: hidden activations and softmax probabilities."""
        W1, b1 = self.params['W1'], self.params['b1']
        W2, b2 = self.params['W2'], self.params['b2']
        z1 = np.dot(X, W1) + b1
        a1 = np.maximum(0, z1)  # ReLU activation function
        z2 = np.dot(a1, W2) + b2
        # Shift each row by its maximum before exponentiating. Softmax is
        # invariant to the shift, but exp() can no longer overflow to inf,
        # which previously turned the division into inf / inf = nan.
        exp_z = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
        probs = exp_z / np.sum(exp_z, axis=1, keepdims=True)
        return a1, probs

    def forward(self, X):
        """Return the class probabilities for every row of ``X``.

        Args:
            X: 2-D input array with ``input_size`` columns.

        Returns:
            Array with one row per sample and ``output_size`` columns; each
            row sums to 1.
        """
        return self._forward_pass(X)[1]

    def loss(self, X, y):
        """Return the mean cross-entropy of the predictions against ``y``.

        Args:
            X: 2-D input array with ``input_size`` columns.
            y: Integer class index for each row of ``X``.

        Returns:
            The negative log-probability of the correct class, averaged over
            the rows of ``X``.
        """
        probs = self.forward(X)
        correct_logprobs = -np.log(probs[np.arange(len(X)), y])
        return np.sum(correct_logprobs) / len(X)

    def train(self, X, y, num_epochs, learning_rate=0.1):
        """Fit the parameters in place with full-batch gradient descent.

        The gradients are summed over the samples (not averaged), so the
        effective step size grows with ``len(X)``. The loss is printed every
        100 epochs, starting at epoch 0.

        Args:
            X: 2-D input array with ``input_size`` columns.
            y: Integer class index for each row of ``X``.
            num_epochs: Number of full passes over ``X``.
            learning_rate: Step size applied to every parameter update.
        """
        params = self.params
        sample_idx = np.arange(len(X))
        for epoch in range(num_epochs):
            # Forward propagation
            a1, probs = self._forward_pass(X)

            # Backpropagation
            # probs is a fresh array each epoch, so editing it in place to
            # form (probs - one_hot(y)) does not corrupt any shared state.
            delta3 = probs
            delta3[sample_idx, y] -= 1
            dW2 = np.dot(a1.T, delta3)
            db2 = np.sum(delta3, axis=0)
            delta2 = np.dot(delta3, params['W2'].T) * (a1 > 0)  # derivative of ReLU
            dW1 = np.dot(X.T, delta2)
            db1 = np.sum(delta2, axis=0)

            # Update parameters
            params['W1'] -= learning_rate * dW1
            params['b1'] -= learning_rate * db1
            params['W2'] -= learning_rate * dW2
            params['b2'] -= learning_rate * db2

            # Print loss for monitoring training progress
            if epoch % 100 == 0:
                loss = self.loss(X, y)
                print("Epoch {}: loss = {}".format(epoch, loss))

In [2]:
# Layer widths passed to TwoLayerNet in the next cell:
# input features, hidden units, output units.
input_size, hidden_size, output_size = 2, 10, 2

In [3]:
# Toy dataset: the four corners of the unit square, labelled with XOR.
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])

# Build the network from the layer sizes defined in the previous cell.
net = TwoLayerNet(input_size, hidden_size, output_size)

# Fit on the whole dataset for 1000 epochs (default learning rate).
net.train(X, y, 1000)

# Evaluate on the training points: pick the most probable class per row.
probs = net.forward(X)
predictions = probs.argmax(axis=1)
print("Predictions: ", predictions)

Epoch 0: loss = 0.9556106562089086
Epoch 100: loss = 0.08039370954169635
Epoch 200: loss = 0.026299533066897248
Epoch 300: loss = 0.014811125174776101
Epoch 400: loss = 0.010067949022975113
Epoch 500: loss = 0.0075615969979532845
Epoch 600: loss = 0.0060023244854278904
Epoch 700: loss = 0.004960782799342221
Epoch 800: loss = 0.00421438903840939
Epoch 900: loss = 0.00364801332349237
Predictions:  [0 1 1 0]


## Follow-ups

Weight initialization: The current implementation draws the weights from a standard (unit-variance) Gaussian distribution. However, it is recommended to use a scaled scheme such as Xavier or He initialization, which adapts the weight variance to the layer size, to improve convergence and avoid vanishing or exploding gradients. One possible implementation of a Xavier-style initialization, which scales each weight matrix by 1/sqrt(fan_in), is:

In [4]:
# Xavier initialization
class CustomizedModel(TwoLayerNet):
    """TwoLayerNet whose weights are re-drawn with 1/sqrt(fan_in) scaling."""

    def __init__(self, input_size, hidden_size, output_size):
        """Build the base network, then overwrite all four parameters.

        Each weight matrix is a standard-normal draw divided by the square
        root of its number of input units; both biases are reset to zero.
        """
        super().__init__(input_size, hidden_size, output_size)

        weight_shapes = (
            ('W1', input_size, hidden_size),
            ('W2', hidden_size, output_size),
        )
        for key, fan_in, fan_out in weight_shapes:
            self.params[key] = np.random.randn(fan_in, fan_out) / np.sqrt(fan_in)

        for key, width in (('b1', hidden_size), ('b2', output_size)):
            self.params[key] = np.zeros(width)

Learning rate decay: The learning rate is a hyperparameter that determines the step size at each iteration during training. However, using a fixed learning rate may lead to suboptimal performance or slow convergence. A common technique is to gradually decrease the learning rate over time, known as learning rate decay, to fine-tune the network weights as the optimization process progresses.


In [5]:
# Learning rate decay (step schedule): multiply the learning rate by
# `lr_decay` once every `lr_decay_epoch` completed epochs.
learning_rate = 0.1
lr_decay = 0.95
lr_decay_epoch = 100
# NOTE: with num_epochs < lr_decay_epoch the rate never decays; raise
# num_epochs (or lower lr_decay_epoch) to see the schedule take effect.
num_epochs = 10
for epoch in range(num_epochs):
    # ...
    # Count completed epochs (epoch + 1). Testing `epoch % lr_decay_epoch == 0`
    # also fires at epoch 0 and shrinks the rate before any training step has
    # used the configured initial value.
    if (epoch + 1) % lr_decay_epoch == 0:
        learning_rate *= lr_decay

Regularization: Overfitting can occur when the model is too complex and the training data is limited. Regularization techniques such as L1 or L2 regularization can be applied to the loss function to prevent overfitting and improve the generalization performance of the model.

In [6]:
# L2 regularization

class CustomizedModel(TwoLayerNet):
    """TwoLayerNet with an L2 (weight-decay) penalty on ``W1`` and ``W2``.

    The penalty is included both in the reported loss and in the gradients
    used by ``train``. Overriding ``loss`` alone is not enough: the inherited
    ``train`` computes its gradients without calling ``loss``, so the penalty
    would change the printed number but never the learned weights.
    """

    def __init__(self, input_size, hidden_size, output_size, reg_lambda=0.1):
        """Build the base network and store the regularization strength.

        Args:
            input_size: Number of input features.
            hidden_size: Number of hidden units.
            output_size: Number of output classes.
            reg_lambda: L2 penalty coefficient; ``0`` disables the penalty.
        """
        super().__init__(input_size, hidden_size, output_size)
        self.reg_lambda = reg_lambda

    def loss(self, X, y):
        """Return the regularized loss.

        Computed as ``(sum of cross-entropies + 0.5 * reg_lambda *
        (||W1||^2 + ||W2||^2)) / len(X)``. Biases are not penalized.

        Args:
            X: 2-D input array with ``input_size`` columns.
            y: Integer class index for each row of ``X``.
        """
        probs = self.forward(X)
        correct_logprobs = -np.log(probs[np.arange(len(X)), y])
        data_loss = np.sum(correct_logprobs)
        l2_penalty = 0.5 * self.reg_lambda * (
            np.sum(self.params['W1'] ** 2) + np.sum(self.params['W2'] ** 2)
        )
        return (data_loss + l2_penalty) / len(X)

    def train(self, X, y, num_epochs, learning_rate=0.1):
        """Full-batch gradient descent including the L2 penalty gradient.

        The data gradients are summed over the samples (same convention as the
        base class), i.e. they are the gradient of ``len(X) * loss``. Under
        that scaling the penalty contributes exactly ``reg_lambda * W`` to
        each weight gradient. The loss is printed every 100 epochs.

        Args:
            X: 2-D input array with ``input_size`` columns.
            y: Integer class index for each row of ``X``.
            num_epochs: Number of full passes over ``X``.
            learning_rate: Step size applied to every parameter update.
        """
        params = self.params
        sample_idx = np.arange(len(X))
        for epoch in range(num_epochs):
            # Forward propagation
            z1 = np.dot(X, params['W1']) + params['b1']
            a1 = np.maximum(0, z1)  # ReLU activation function
            z2 = np.dot(a1, params['W2']) + params['b2']
            # Row-wise max shift keeps exp() from overflowing; softmax is
            # unchanged by the shift.
            exp_z = np.exp(z2 - np.max(z2, axis=1, keepdims=True))
            probs = exp_z / np.sum(exp_z, axis=1, keepdims=True)

            # Backpropagation (probs is a fresh array, safe to edit in place)
            delta3 = probs
            delta3[sample_idx, y] -= 1
            dW2 = np.dot(a1.T, delta3) + self.reg_lambda * params['W2']
            db2 = np.sum(delta3, axis=0)
            delta2 = np.dot(delta3, params['W2'].T) * (a1 > 0)  # derivative of ReLU
            dW1 = np.dot(X.T, delta2) + self.reg_lambda * params['W1']
            db1 = np.sum(delta2, axis=0)

            # Update parameters
            params['W1'] -= learning_rate * dW1
            params['b1'] -= learning_rate * db1
            params['W2'] -= learning_rate * dW2
            params['b2'] -= learning_rate * db2

            # Print loss for monitoring training progress
            if epoch % 100 == 0:
                loss = self.loss(X, y)
                print("Epoch {}: loss = {}".format(epoch, loss))

Mini-batch training: The current implementation updates the weights using the entire training set at each iteration, which can be computationally expensive for large datasets. An alternative is to use mini-batch training, where a random subset of the training data is used at each iteration to update the weights. This can speed up the training process and improve convergence.


In [7]:
# Mini-batch training
batch_size = 64
# Round up so a final, smaller batch covers the remainder. Floor division
# yields zero batches (no training at all) whenever len(X) < batch_size.
num_batches = (len(X) + batch_size - 1) // batch_size
for epoch in range(num_epochs):
    # Reshuffle once per epoch and slice the permutation, so every sample is
    # visited exactly once per epoch (no duplicates, none skipped).
    shuffled_idx = np.random.permutation(len(X))
    for i in range(num_batches):
        # Select the next batch of the shuffled data
        batch_mask = shuffled_idx[i * batch_size:(i + 1) * batch_size]
        X_batch = X[batch_mask]
        y_batch = y[batch_mask]

        # Forward and backward propagation using the batch data
        # ...