In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
from activations import sigmoid, sigmoid_derivative
from cost import MSE_derivative

In [3]:
from data_loaders import MNISTLoader
mloader = MNISTLoader()
train_data, train_labels = mloader.get_training_set()

num_samples = train_labels.shape[0]
# get images as vectors
x = train_data.reshape(num_samples, -1)
# one-hot encode labels
y = np.zeros((num_samples, 10))
y[np.arange(num_samples),train_labels] = 1

In [65]:
def random_init_weights(size):
    return np.random.normal(0, 1, size)

class Network():
    def __init__(self, layers=[]):
        self.weights = []
        self.biases = []
        for i in range(1, len(layers)):
            self.weights.append(random_init_weights((layers[i-1], layers[i])))
            self.biases.append(np.zeros((1, layers[i])))
    
    def forward(self, batch):
        activations = batch
        for weights, biases in zip(self.weights, self.biases):
            activations = sigmoid(np.dot(activations, weights) + biases)
        return activations
    
    def backward(self, batch, targets, dropout=0.5):
        # Do the forward pass and cache the values
        zs, activations = [], [batch]
        for weights, biases in zip(self.weights, self.biases):
            zs.append(np.dot(activations[-1], weights) + biases)
            activation = sigmoid(zs[-1])
            if dropout > 0 and len(activations) < len(self.weights):
                dropout_mask = (np.random.rand(*activation.shape) < dropout) / dropout 
                activation *= dropout_mask
            activations.append(activation)
            
        correct = np.count_nonzero(np.argmax(activations[-1], 1) == np.argmax(targets, 1)) 
        
        # Backwards pass
        weight_gradients = []
        bias_gradients = []
        
        # The output layer error
        error = MSE_derivative(activations[-1], targets) * sigmoid_derivative(zs[-1])
        for l in range(2, len(self.weights)+2):
            # Calculate the gradients from the error
            bias_gradients.append(np.sum(error, 0))
            weight_gradients.append(np.dot(activations[-l].T, error))
            # Backpropagate the error until we get the input layer
            if not l > len(zs):
                error = np.dot(self.weights[-l+1], error.T).T * sigmoid_derivative(zs[-l])
            
        weight_gradients.reverse()
        bias_gradients.reverse()
        
        return correct, weight_gradients, bias_gradients
    
    def fit(self, data, targets, epochs=10, batch_size=5, learning_rate=1e-3, reg_weight=1e-3)
        num_samples = data.shape[0]
        batch_starts = np.arange(batch_size, num_samples, batch_size)
        idxes = np.arange(0, num_samples, 1)
        for epoch in range(epochs):
            np.random.shuffle(idxes)
            batches = np.split(idxes, batch_starts)
            total_correct = 0
            for batch in batches:
                c, weight_gradients, bias_gradients = self.backward(data[batch], targets[batch])
                total_correct += c
                for i in range(len(self.weights)):
                    self.weights[i] = (1 - learning_rate * reg_weight / self.weights[i].size) * self.weights[i]  - learning_rate * (1/batch_size) * weight_gradients[i] 
                    self.biases[i] -= learning_rate * (1/batch_size) * bias_gradients[i] 
            print(f'Epoch {epoch} - {total_correct}/{num_samples}')
        

In [66]:
input_dim = 28 * 28
hidden_dim = 30 
out_dim = 10

net = Network([input_dim, 100, 30, out_dim])

In [None]:
%%time
net.fit(x, y, epochs=50, batch_size=5, learning_rate=1)

Epoch 0 - 37149/60000
Epoch 1 - 37650/60000
Epoch 2 - 37921/60000
Epoch 3 - 38290/60000
Epoch 4 - 38283/60000
Epoch 5 - 38375/60000
Epoch 6 - 38832/60000
Epoch 7 - 38930/60000
Epoch 8 - 39129/60000
Epoch 9 - 39370/60000
Epoch 10 - 40199/60000
Epoch 11 - 41202/60000
Epoch 12 - 41402/60000
Epoch 13 - 41621/60000
Epoch 14 - 41816/60000
Epoch 15 - 41835/60000
Epoch 16 - 41957/60000
Epoch 17 - 42139/60000
Epoch 18 - 42175/60000
Epoch 19 - 42116/60000
Epoch 20 - 42256/60000
Epoch 21 - 42217/60000
Epoch 22 - 42367/60000
Epoch 23 - 42174/60000
Epoch 24 - 42090/60000
Epoch 25 - 42150/60000


In [10]:
net.fit(x, y, epochs=5, batch_size=5, learning_rate=1)

Epoch 0 - 52120/60000
Epoch 1 - 52232/60000
Epoch 2 - 52294/60000
Epoch 3 - 52414/60000
Epoch 4 - 53210/60000


In [11]:
net.fit(x, y, epochs=5, batch_size=5, learning_rate=5e-1)

Epoch 0 - 57800/60000
Epoch 1 - 57845/60000
Epoch 2 - 57913/60000
Epoch 3 - 57945/60000
Epoch 4 - 57992/60000
