In [1]:

# Package imports
import matplotlib.pyplot as plt
import numpy as np
import sklearn
import sklearn.datasets
import sklearn.linear_model
import matplotlib
import math

class NN():
    
    def __init__(self, config={}):
                        
        self.classes = 3 #2 or 3
        self.steps_count = 20000

        self.samples_count = 201
        self.batch_size = 200
        self.batch_size = min(self.batch_size, self.samples_count-1)
    
        self.reg_lambda = 0.01
        self.learning_rate = 0.01
        
        self.input_dim = 2
        
        self.hidden_dim1 = 10
        self.hidden_dim2 = 10
        self.hidden_dim3 = 10
        
        self.activization_type = "Relu" #Sigmoid, Tanh, Relu
        
        for key, value in config.items():
            setattr(self, key, value)

        self.classes = max(min(self.classes, 3), 0)
        self.output_dim = self.classes
        
        np.random.seed(0)
        
        # layer1
        self.W1 = np.random.randn(self.input_dim, self.hidden_dim1) / np.sqrt(self.input_dim)
        self.b1 = np.zeros((1, self.hidden_dim1))
        
        # layer2
        self.W2 = np.random.randn(self.hidden_dim1, self.hidden_dim2) / np.sqrt(self.hidden_dim1)
        self.b2 = np.zeros((1, self.hidden_dim2)) 
    
        # layer3
        self.W3 = np.random.randn(self.hidden_dim2, self.hidden_dim3) / np.sqrt(self.hidden_dim2)
        self.b3 = np.zeros((1, self.hidden_dim3))    
    
        # layer4
        self.W4 = np.random.randn(self.hidden_dim3, self.output_dim) / np.sqrt(self.hidden_dim3)
        self.b4 = np.zeros((1, self.output_dim))     

    def generateData(self):       
        
        np.random.seed(0)
        
        if (self.classes == 2):
            x, y = sklearn.datasets.make_moons(self.samples_count, noise=0.20)
        else:
            x, y = sklearn.datasets.make_blobs(self.samples_count, n_features=2, center_box=(-1.0, 1.0))
            
        self.train_x = x
        self.train_y = np.eye(np.max(y) + 1)[y]
    
    def train(self, print_loss=True):
        
        for i in xrange(0, self.steps_count):
            
            # Selecting the batch data
            offset = (i * self.batch_size) % (self.samples_count - self.batch_size)
            x = self.train_x[offset:(offset + self.batch_size)]    
            y = self.train_y[offset:(offset + self.batch_size)]
            
            # Forward propagation
            res = self.forwardPropagation(x)
            
            # Back propagation
            self.backpropagation(x, y, res)
            
            # This is expensive because it uses the whole dataset, so we don't want to do it too often.
            if print_loss and i % 5000 == 1:
              print "Loss after iteration %i: %f" %(i, self.calculate_loss(x, y, self.batch_size))
            
        print "Loss: %f" %(self.calculate_loss(self.train_x, self.train_y, self.samples_count))
        print("")
        
    def forwardPropagation(self, x):
        
        #selecting an activization function
        activization = self.get_activization_func()
        
        # layer1
        layer1 = x.dot(self.W1) + self.b1
        a_layer1 = activization(layer1)
        
        # layer2
        layer2 = a_layer1.dot(self.W2) + self.b2
        a_layer2 = activization(layer2)   
    
        # layer3
        layer3 = a_layer2.dot(self.W3) + self.b3
        a_layer3 = activization(layer3)       
    
        # layer4
        layer4 = a_layer3.dot(self.W4) + self.b4
        probs = self.softmax(layer4)
        
        res = {"probs": probs,
                "layer4": layer4,
               
                "a_layer3": a_layer3,
                "layer3": layer3,                
            
                "a_layer2": a_layer2,
                "layer2": layer2,
                
                "a_layer1": a_layer1,
                "layer1": layer1,
            }
        return res
    
    def backpropagation(self, x, y, res):
        
        num_examples = len(x)
        back_activization_func = self.get_back_activization_func()
        
        a_layer1 = res["a_layer1"]
        layer1 = res["layer1"]
        
        a_layer2 = res["a_layer2"]
        layer2 = res["layer2"]

        a_layer3 = res["a_layer3"]
        layer3 = res["layer3"]
        
        probs = res["probs"]
        
        # Backpropagation
        delta5 = probs - y 
                 
        # dlayer4
        dW4 = np.dot(a_layer3.T, delta5)
        db4 = np.sum(delta5, axis=0, keepdims=True)
        
        # dlayer3
        delta4 = back_activization_func(delta5, self.W4, layer3)
        dW3 = np.dot(a_layer2.T, delta4)
        db3 = np.sum(delta4, axis=0, keepdims=True)
        
        # dlayer2
        delta3 = back_activization_func(delta4, self.W3, layer2)
        dW2 = np.dot(a_layer1.T, delta3)
        db2 = np.sum(delta3, axis=0, keepdims=True)        
        
        # dlayer1
        delta2 = back_activization_func(delta3, self.W2, layer1)
        dW1 = np.dot(x.T, delta2)
        db1 = np.sum(delta2, axis=0, keepdims=True) 
    
        # Add regularization terms (b1 and b2 don't have regularization terms)
        dW4 += self.reg_lambda * self.W4        
        dW3 += self.reg_lambda * self.W3
        dW2 += self.reg_lambda * self.W2
        dW1 += self.reg_lambda * self.W1

        # Gradient descent parameter update
        self.W1 += -self.learning_rate * dW1
        self.b1 += -self.learning_rate * db1
        
        self.W2 += -self.learning_rate * dW2
        self.b2 += -self.learning_rate * db2

        self.W3 += -self.learning_rate * dW3
        self.b3 += -self.learning_rate * db3        
 
        self.W4 += -self.learning_rate * dW4
        self.b4 += -self.learning_rate * db4
        
    def predict(self, x):
        res = self.forwardPropagation(x)
        probs = res["probs"]

        return np.argmax(probs, axis=1)    
    
    def calculate_loss(self, x, y, size):
                
        res = self.forwardPropagation(x)
        probs = res["probs"]
            
        # Calculating the loss
        corect_logprobs = -np.log(np.amax(probs * y, axis=1))        
        data_loss = np.sum(corect_logprobs)
                
        # Add regulatization term to loss (optional)
        data_loss += self.reg_lambda / 2 * (np.sum(np.square(self.W1)) +
                                            np.sum(np.square(self.W2)) + 
                                            np.sum(np.square(self.W3))
                                           )
        return 1./size * data_loss
    
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def dsigmoid(self, x):
        return self.sigmoid(x) * (1 - self.sigmoid(x))
    
    def relu(self, x):
        return np.maximum(0, x)

    def drelu(self, a1, W, probs):
        dhidden = np.dot(probs, W.T)
        dhidden[a1 <= 0] = 0
        return dhidden
    
    def softmax(self, z, t = 1.0):
        exp_scores = np.exp(z)
        probs = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        return probs     
        
    def get_activization_func(self):
        
        #selecting the activization function
        if (self.activization_type == "Relu"):
            activization_func = self.relu
            
        elif (self.activization_type == "Sigmoid"):
            activization_func = self.sigmoid
            
        elif (self.activization_type == "Tanh"):
            activization_func = np.tanh
            
        else:
            activization_func = np.tanh
            
        return activization_func
    
    
    def get_back_activization_func(self):
        
        #selecting the back activization function
        if (self.activization_type == "Relu"):
            back_activization_func = self.back_relu
            
        elif (self.activization_type == "Sigmoid"):
            back_activization_func = self.back_sigmoid
            
        elif (self.activization_type == "Tanh"):
            back_activization_func = self.back_tanh
            
        else:
            back_activization_func = self.back_tanh
            
        return back_activization_func
    
    def back_relu(self, delta, w_matrix, layer):
        
        activization_func = self.get_activization_func()
        a_layer = activization_func(layer)
        
        dhidden = np.dot(delta, w_matrix.T)
        dhidden[a_layer <= 0] = 0
        return dhidden
        
    def back_sigmoid(self, delta, w_matrix, layer):
        return delta.dot(w_matrix.T) * self.dsigmoid(layer)

    def back_tanh(self, delta, w_matrix, layer):
        
        activization_func = self.get_activization_func()
        a_layer = activization_func(layer)
        
        return delta.dot(w_matrix.T) * (1 - np.power(a_layer, 2))
    
    def plot_decision_boundary(self, pred_func):
                
        X = self.train_x
        y = np.argmax(self.train_y, axis=1)
        
        # Set min and max values and give it some padding
        x_min, x_max = X[:, 0].min() - .5, X[:, 0].max() + .5
        y_min, y_max = X[:, 1].min() - .5, X[:, 1].max() + .5
        h = 0.01
        
        # Generate a grid of points with distance h between them
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
        
        # Predict the function value for the whole gid
        Z = pred_func(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        
        # Plot the contour and training examples
        plt.contourf(xx, yy, Z, cmap=plt.cm.Spectral)
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.Spectral)    
    
print("Reloaded")

Reloaded


In [11]:
# Experiments with 2 output classes

classes2 = 2
steps_count = 20001
activization_type = "Relu" #Relu, Tanh, Sigmoid

configs2 = [
    {"classes": classes2,
     "steps_count": steps_count,
     "activization_type": activization_type,
     "hidden_dim1": 15,
     "hidden_dim2": 15,
     "hidden_dim3": 15,     
    },
    
#     {"classes": classes2,
#      "steps_count": steps_count,
#      "activization_type": activization_type,     
#      "hidden_dim1": 11,
#      "hidden_dim2": 10,
#     },
    
#     {"classes": classes2,
#      "steps_count": steps_count,
#      "activization_type": activization_type,     
#      "hidden_dim1": 12,
#      "hidden_dim2": 12,
#     },  
]

plt.figure(figsize=(16, 8))
for (i, config) in enumerate(configs2):
    nn = NN(config)
    nn.generateData()
    nn.train()
    loss = nn.calculate_loss(nn.train_x, nn.train_y, nn.samples_count)

    plt.subplot(math.ceil(float(len(configs2)) / 2), 2, i+1)
    nn.plot_decision_boundary(lambda x: nn.predict(x))
    plt.title('Loss: %f, hidden layer1 size %d, hidden layer2 size %d, hidden layer3 size %d' %
              (loss, config["hidden_dim1"], config["hidden_dim2"], config["hidden_dim3"]))
    
plt.show()


Loss after iteration 1: 0.425383
Loss after iteration 5001: 0.693814
Loss after iteration 10001: 0.693361
Loss after iteration 15001: 0.693194
Loss: 0.693183



In [12]:
# Experiments with 3 output classes

classes = 3
steps_count = 50001
activization_type = "Tanh" #Relu, Tanh, Sigmoid
learning_rate = 0.001

configs3 = [
#     {"classes": classes,
#      "steps_count": steps_count,
#      "activization_type": activization_type,
#      "learning_rate": learning_rate,
#      "hidden_dim1": 8,
#      "hidden_dim2": 8,
#     },
    
#     {"classes": classes,
#      "steps_count": steps_count,
#      "activization_type": activization_type,
#      "learning_rate": learning_rate,     
#      "hidden_dim1": 8,
#      "hidden_dim2": 16,
#     },
    
#     {"classes": classes,
#      "steps_count": steps_count,
#      "activization_type": activization_type,
#      "learning_rate": learning_rate,
#      "hidden_dim1": 16,
#      "hidden_dim2": 8,
#     },
    
    {"classes": classes,
     "steps_count": steps_count,
     "activization_type": activization_type,
     "learning_rate": learning_rate,
     "hidden_dim1": 16,
     "hidden_dim2": 16,
     "hidden_dim3": 16,
    },   
]

plt.figure(figsize=(16, 8))
for (i, config) in enumerate(configs3):
    nn = NN(config)
    nn.generateData()
    nn.train()
    loss = nn.calculate_loss(nn.train_x, nn.train_y, nn.samples_count)

    plt.subplot(math.ceil(float(len(configs3)) / 2), 2, i+1)
    nn.plot_decision_boundary(lambda x: nn.predict(x))
    plt.title('Loss: %f, hidden layer1 size %d, hidden layer2 size %d, hidden layer3 size %d' %
              (loss, config["hidden_dim1"], config["hidden_dim2"], config["hidden_dim3"]))
    
plt.show()


Loss after iteration 1: 1.104076
Loss after iteration 5001: 0.345156
Loss after iteration 10001: 0.112056
Loss after iteration 15001: 0.054196
Loss after iteration 20001: 0.044844
Loss after iteration 25001: 0.041664
Loss after iteration 30001: 0.041567
Loss after iteration 35001: 0.038824
Loss after iteration 40001: 0.041901
Loss after iteration 45001: 0.037807
Loss: 0.036130

