In [72]:
import numpy as np

In [None]:
# load data

import cPickle
import gzip
 
with gzip.open('mnist.pkl.gz', 'rb') as f:
    train_set, valid_set, test_set = cPickle.load(f)

In [165]:
class Layer:
    def get_parameters(self):
        return []
    
    def get_grads(self, input_,  next_grad):
        return []

class LinearLayer(Layer):
    
    def __init__(self, input_size, output_size):
        self.w = np.random.uniform(-1./output_size**0.5, 1./output_size**0.5, size=(output_size, input_size))
        self.b = np.zeros(output_size)

    def output(self, x):
        return np.dot(x, self.w.T) + self.b
    
    def grad(self, input_, grad_next_layer):
        return np.dot(grad_next_layer, self.w)
    
    def update_parameters(self, input_, learning_rate, grad_next_layer):
        self.w -= learning_rate * np.dot(input_.T, grad_next_layer)
        self.b -= learning_rate * grad_next_layer
        
    def get_parameters(self):
        return [self.w, self.b]
    
class SigmoidLayer(Layer):
    def __init__(self):
        pass
    
    def sigmoid(self, x):
        return 1. / (1. + np.exp(x))
    
    def output(self, x):
        return self.sigmoid(x)
    
    def grad(self, input_, grad_next_layer):
        s = self.sigmoid(input_)
        return s * (1 - s)
        
class SoftmaxLayer(Layer):
    
    def __init__(self):
        pass
    
    def output(self, x):
        probs = np.exp(x)
        return probs / np.sum(probs, axis=1)[:,None]
    
    def grad(self, input_, grad_next_layer):
        v = input_ - input_**2
        return v
  
        
        

In [161]:
class MLP:
    
    def __init__(self, layers):
        self.layers = layers
        
    def train(self, n_epochs, batch_size, train, valid):
        train_x = train[0]
        train_y = train[1]
        n_batch = int(train_x.shape[0] / batch_size)
        
        for epoch in range(n_epochs):
            
            for current_batch in range(n_batch):
                fprop_results = self.fprop(train_x[current_batch*batch_size:(current_batch+1)*batch_size])
                grads = self.bprop(train_x[current_batch*batch_size:(current_batch+1)*batch_size],
                                   train_y[current_batch*batch_size:(current_batch+1)*batch_size],
                                  fprop_results)
                    
    def fprop(self, data_x, data_y):
        fprop_results = [data_x]
        # fprop
        for layer in self.layers:
            fprop_results.append(layer.output(fprop_results[-1]))
            
        cost = -np.log(fprop_results[-1][np.arange(fprop_results[-1].shape[0]), data_y]).mean()
        return cost, fprop_results
    
    def bprop(self, data_x, data_y, fprop_results):
        activations = fprop_results[-1]
        dc = -1 / activations * (np.arange(activations.shape[1])[:, None] == data_y).T

        grads = [dc]
        for index, layer in enumerate(self.layers[::-1]):
            print index, grads[-1].shape
            grads.append(layer.grad(fprop_results[len(fprop_results)-index-1], grads[-1]))
            
        return grads

    def verify_gradients(self, input_size, n_classes):
    
        random_x = np.random.uniform(0, 1, size=(1, input_size))
        random_y = np.random.randint(10)
        epsilon = 10**-5
        
        cost, fprop_results = self.fprop(random_x, random_y)
        grads = self.bprop(random_x, random_y, fprop_results)
        
        # compute gradients with finite difference
        for index, layer in enumerate(layers):
            for parameter in zip(layer.get_parameters(), layer.get_grads(fprop_results[index], grads[-index])):
                
                if len(parameter.shape) == 2:
                    numerical_estimate = np.zeros(parameter.shape)
                    for i in range(parameter.shape[0]):
                        for j in range(parameter.shape[1]):
                            parameter[i, j] += epsilon
                            cost_right, discard = self.fprop(random_x, random_y)
                            parameter[i, j] -= 2 * epsilon
                            cost_left, discard = self.fprop(random_x, random_y)
                            numerical_estimate[i, j] = (cost_right - cost_left) / (2 * epsilon)
                            
                    print np.allclose(numerical_estimate, numerical_estimate)
    
    def predict(self):
        pass

In [166]:
# build actual mlp

layers = [LinearLayer(784, 100), SigmoidLayer(), LinearLayer(100, 10), SoftmaxLayer()]
mlp = MLP(layers)

mlp.verify_gradients(784, 10)
# mlp.train(5, 100, train_set, valid_set)

0 (1, 10)
1 (1, 10)
2 (1, 100)
3 (1, 100)
