### SVHN neural network from scratch

#### Import the dataset from drive

In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import h5py
import numpy as np

# Open the file as readonly
h5f = h5py.File('/content/drive/My Drive/DLCP/Project-1/Data/SVHN_single_grey1.h5', 'r')

# Load the training, test and validation set
x_train = h5f['X_train'][:]
y_train = h5f['y_train'][:]
x_test = h5f['X_test'][:]
y_test = h5f['y_test'][:]


# Close this file
h5f.close()

x_train = x_train.reshape(x_train.shape[0], 1024)
x_test = x_test.reshape(x_test.shape[0], 1024)

# # normalize inputs from 0-255 to 0-1
x_train = x_train / 255.0
x_test = x_test / 255.0

X_val = x_test
y_val = y_test

print('Training set', x_train.shape, y_train.shape)
print('Test set', x_test.shape, y_test.shape)

Training set (42000, 1024) (42000,)
Test set (18000, 1024) (18000,)


In [0]:
print(x_train.shape)
print(y_train.shape)
print(X_val.shape)
print(y_val.shape)

(42000, 1024)
(42000,)
(18000, 1024)
(18000,)


In [0]:
print(x_test.shape)
print(y_test.shape)

(18000, 1024)
(18000,)


#### Define the Fully Connected Layer

In [0]:
import numpy as np 

class Linear():
    def __init__(self, in_size, out_size):
        self.W = np.random.randn(in_size, out_size) * 0.01
        self.b = np.zeros((1, out_size))
        self.params = [self.W, self.b]
        self.gradW = None
        self.gradB = None
        self.gradInput = None

    def forward(self, X):
        self.X = X
        output = np.dot(self.X, self.W) + self.b
        return output

    def backward(self, nextgrad):
        self.gradW = np.dot(self.X.T, nextgrad)
        self.gradB = np.sum(nextgrad, axis=0)
        self.gradInput = np.dot(nextgrad, self.W.T)
        return self.gradInput, [self.gradW, self.gradB]

#### Define the Rectified Linear Activation Layer


In [0]:
class ReLU():
    def __init__(self):
        self.params = []
        self.gradInput = None

    def forward(self, X):
        self.output = np.maximum(X, 0)
        return self.output

    def backward(self, nextgrad):
        self.gradInput = nextgrad.copy()
        self.gradInput[self.output <=0] = 0
        return self.gradInput, []

#### Define the softmax function

In [0]:
def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

#### Define the Cross Entropy Loss

In [0]:
class CrossEntropy:
    def forward(self, X, y):
        self.m = y.shape[0]
        self.p = softmax(X)
        cross_entropy = -np.log(self.p[range(self.m), y]+1e-16)
        loss = np.sum(cross_entropy) / self.m
        return loss
    
    def backward(self, X, y):
        y_idx = y.argmax()        
        grad = softmax(X)
        grad[range(self.m), y] -= 1
        grad /= self.m
        return grad

#### Define the container NN class that enables the forward prop and backward propagation of the entire network. Note, how this class enables us to add layers of different types and also correctly pass gradients using the chain rule. Add L2 Regularization.

In [0]:
class NN():
    def __init__(self, lossfunc=CrossEntropy(), mode='train'):
        self.params = []
        self.layers = []
        self.loss_func = lossfunc
        self.grads = []
        self.mode = mode
        
    def add_layer(self, layer):
        self.layers.append(layer)
        self.params.append(layer.params)

    def forward(self, X):
        for layer in self.layers:
            X = layer.forward(X)
        return X
    
    def backward(self, nextgrad):
        self.clear_grad_param()
        for layer in reversed(self.layers):
            nextgrad, grad = layer.backward(nextgrad)
            self.grads.append(grad)
        return self.grads
    
    def train_step(self, X, y):
        out = self.forward(X)
        loss = self.loss_func.forward(out,y)  + ((Lambda / (2 * y.shape[0])) * np.sum([np.sum(w**2) for w in self.params[0][0]]))
        nextgrad = self.loss_func.backward(out,y) + ((Lambda/y.shape[0]) * np.sum([np.sum(w) for w in self.params[0][0]]))
        grads = self.backward(nextgrad)
        return loss, grads
    
    def predict(self, X):
        X = self.forward(X)
        p = softmax(X)
        return np.argmax(p, axis=1)
    
    def predict_scores(self, X):
        X = self.forward(X)
        p = softmax(X)
        return p
    
    def clear_grad_param(self):
        self.grads = []

#### Define the update function (SGD)

In [0]:
def update(velocity, params, grads, learning_rate=0.01, mu=0.9):
    for v, p, g, in zip(velocity, params, reversed(grads)):
        for i in range(len(g)):
            v[i] = (mu * v[i]) - (learning_rate * g[i])
            p[i] += v[i]

#### Define a function which gives us the minibatches (both the datapoint and the corresponding label)

In [0]:
def minibatch(X, y, minibatch_size):
    n = X.shape[0]
    minibatches = []
    permutation = np.random.permutation(X.shape[0])
    X = X[permutation]
    y = y[permutation]
    
    for i in range(0, n , minibatch_size):
        X_batch = X[i:i + minibatch_size, :]
        y_batch = y[i:i + minibatch_size, ]

        minibatches.append((X_batch, y_batch))
    return minibatches

#### The traning loop

In [0]:
def sgd(net, X_train, y_train, minibatch_size, epoch, learning_rate, mu=0.9, X_val=None, y_val=None, Lambda=0, verb=True):
    val_loss_epoch = []
    minibatches = minibatch(X_train, y_train, minibatch_size)
    minibatches_val = minibatch(X_val, y_val, minibatch_size)
    
    for i in range(epoch):
        loss_batch = []
        val_loss_batch = []
        velocity = []
        for param_layer in net.params:
            p = [np.zeros_like(param) for param in list(param_layer)]
            velocity.append(p)
            
        # iterate over mini batches
        for X_mini, y_mini in minibatches:
            loss, grads = net.train_step(X_mini, y_mini)
            loss_batch.append(loss)
            update(velocity, net.params, grads, learning_rate=learning_rate, mu=mu)

        for X_mini_val, y_mini_val in minibatches_val:
            val_loss, _ = net.train_step(X_mini, y_mini)
            val_loss_batch.append(val_loss)
        
        # accuracy of model at end of epoch after all mini batch updates
        m_train = X_train.shape[0]
        m_val = X_val.shape[0]
        y_train_pred = []
        y_val_pred = []
        y_train1 = []
        y_vall = []
        for ii in range(0, m_train, minibatch_size):
            X_tr = X_train[ii:ii + minibatch_size, : ]
            y_tr = y_train[ii:ii + minibatch_size,]
            y_train1 = np.append(y_train1, y_tr)
            y_train_pred = np.append(y_train_pred, net.predict(X_tr))

        for ii in range(0, m_val, minibatch_size):
            X_va = X_val[ii:ii + minibatch_size, : ]
            y_va = y_val[ii:ii + minibatch_size,]
            y_vall = np.append(y_vall, y_va)
            y_val_pred = np.append(y_val_pred, net.predict(X_va))
            
        train_acc = check_accuracy(y_train1, y_train_pred)
        val_acc = check_accuracy(y_vall, y_val_pred)
        
        ## weights
        w = np.array(net.params[0][0])
        
        ## adding regularization to cost
        mean_train_loss = (sum(loss_batch) / float(len(loss_batch)))
        mean_val_loss = sum(val_loss_batch) / float(len(val_loss_batch))
        
        val_loss_epoch.append(mean_val_loss)
        if verb:
            if i%5 == 0:
                print("Epoch {3}/{4}: Loss = {0} | Training Accuracy = {1}".format(mean_train_loss, train_acc, val_acc, i, epoch))
    return net, [val_acc,  mean_val_loss]

#### Checking the accuracy of the model 

In [0]:
def check_accuracy(y_true, y_pred):
    count = 0
    for i,j in zip(y_true, y_pred):
        if int(i)==j:
            count +=1
    return float(count)/float(len(y_true))

#### Invoking all that we have created until now

In [0]:
from random import shuffle

## input size
input_dim = x_train.shape[1]

def train_and_test_loop(iterations, lr, Lambda, verb=True):
    ## hyperparameters
    iterations = iterations
    learning_rate = lr
    hidden_nodes = 128
    output_nodes = 10

    ## define neural net
    nn = NN()
    nn.add_layer(Linear(input_dim, hidden_nodes))
    nn.add_layer(ReLU())
    nn.add_layer(Linear(hidden_nodes, output_nodes))

    nn, [val_acc, val_loss] = sgd(nn, x_train , y_train, minibatch_size=420, epoch=iterations, learning_rate=learning_rate,\
                      X_val=X_val, y_val=y_val, Lambda=Lambda, verb=verb)
    return [val_acc, val_loss]

#### Repeat all the steps given in Babysitting process on SVHN dataset.

In [0]:
lr = 0.01
Lambda = 0
train_and_test_loop(1, lr, Lambda)

Epoch 0/1: Loss = 2.30254291097064 | Training Accuracy = 0.12335714285714286


0.12227777777777778

Loss and accuracy is reasonable for untrained network 


In [0]:
lr = 0.00001
Lambda = 1e3
train_and_test_loop(1, lr, Lambda)


Epoch 0/1: Loss = 3.1446824563871853e+128 | Training Accuracy = 0.09966666666666667


0.10077777777777777

Loss went up because of Lambda factor 

In [0]:
x_train_subset = x_train[30:50,]
y_train_subset = y_train[30:50,]
x_train = x_train_subset
y_train = y_train_subset

In [0]:
x_train.shape

(20, 1024)

In [0]:
y_train.shape

(20,)

Overfit very small portion of the training data
So, set a small learning rate and turn regularization off

In the code below:
- Take the first 20 examples from SVHN
- turn off regularization(reg=0.0)
- use simple vanilla 'sgd'

In [0]:
%%time

lr = 0.02
Lambda = 0
train_and_test_loop(1000, lr, Lambda)

Epoch 0/1000: Loss = 2.303775449383137 | Training Accuracy = 0.1
Epoch 5/1000: Loss = 2.286312810648771 | Training Accuracy = 0.2
Epoch 10/1000: Loss = 2.2674454469013563 | Training Accuracy = 0.2
Epoch 15/1000: Loss = 2.243303526792781 | Training Accuracy = 0.2
Epoch 20/1000: Loss = 2.2134474609280255 | Training Accuracy = 0.2
Epoch 25/1000: Loss = 2.181553090815501 | Training Accuracy = 0.2
Epoch 30/1000: Loss = 2.154668509191717 | Training Accuracy = 0.2
Epoch 35/1000: Loss = 2.136374610169531 | Training Accuracy = 0.2
Epoch 40/1000: Loss = 2.124409072670691 | Training Accuracy = 0.2
Epoch 45/1000: Loss = 2.1157275418460517 | Training Accuracy = 0.2
Epoch 50/1000: Loss = 2.1087511606039735 | Training Accuracy = 0.2
Epoch 55/1000: Loss = 2.10286769490577 | Training Accuracy = 0.2
Epoch 60/1000: Loss = 2.09775660446433 | Training Accuracy = 0.2
Epoch 65/1000: Loss = 2.093221132563362 | Training Accuracy = 0.2
Epoch 70/1000: Loss = 2.0891039379573835 | Training Accuracy = 0.2
Epoch 75/

Very small loss, train accuracy going to 100, nice! We are successful in overfitting. If your accuracy is not 100%, then tweak the hyperparameters and epoch values.

In [0]:
# Reload the original training and test set 

import h5py
import numpy as np

# Open the file as readonly
h5f = h5py.File('/content/drive/My Drive/DLCP/Project-1/Data/SVHN_single_grey1.h5', 'r')

# Load the training, test and validation set
x_train = h5f['X_train'][:]
y_train = h5f['y_train'][:]
x_test = h5f['X_test'][:]
y_test = h5f['y_test'][:]


# Close this file
h5f.close()

x_train = x_train.reshape(x_train.shape[0], 1024)
x_test = x_test.reshape(x_test.shape[0], 1024)

# # normalize inputs from 0-255 to 0-1
x_train = x_train / 255.0
x_test = x_test / 255.0

X_val = x_test
y_val = y_test

print('Training set', x_train.shape, y_train.shape)
print('Test set', x_test.shape, y_test.shape)

Training set (42000, 1024) (42000,)
Test set (18000, 1024) (18000,)


###Step 4: Start with small regularization and find learning rate that makes the loss go down.

- we start with Lambda(small regularization) = 1e-7
- we start with a small learning rate =1e-7

In [0]:
#set the hyperparameters according to the above instructions
lr = 1e-7
Lambda = 1e-7
iterations = 100
#call the train and test function
train_and_test_loop(iterations=iterations, lr=lr,Lambda=Lambda)

Epoch 0/100: Loss = 2.3129020523458985 | Training Accuracy = 0.1019047619047619
Epoch 50/100: Loss = 2.3122106666718105 | Training Accuracy = 0.10164285714285715


0.09938888888888889

### Step 5: Lets try a (larger) learning rate . 

- Learning rate lr  
- Regularization lambda 


In [0]:
# Set Hyperparameters( High value for lr and low values for lambda)
lr = 1
Lambda = 0
iterations = 1
# Call the train and test function
train_and_test_loop(iterations=iterations, lr=lr,Lambda=Lambda)

Epoch 0/1: Loss = 29.633920730298275 | Training Accuracy = 0.0999047619047619


0.10022222222222223

**Observation** : Loss went up

### Step 6: Train the model for different learning rates (In a range) based on the learning from above steps

- learning rate =
- regularization remains the small, lambda 

In [0]:
lr = 0.01
Lambda = 0.003
train_and_test_loop(iterations=50, lr=lr,Lambda=Lambda)

Epoch 0/50: Loss = 2.3128706805053234 | Training Accuracy = 0.12461904761904762


0.2003888888888889

### Hyperparameter Optimization

### Cross validation Strategy


- Do coarse -> fine cross-validation in stages

- First stage: only a few epochs to get rough idea of what params work
- Second stage: longer running time, finer search
- … (repeat as necessary)

### Tip for detecting explosions in the solver: 
- If the cost is ever > 3 * original cost, break out early




### For example: Run coarse search for 10 times with different lr and Lambda values each with 100 epochs.


In [0]:
%%time
#Run coarse search for a coarse range of lr and lambda values and print the results of the 
#first 10 epochs and figure out the range of lr and lambda for finer search
import math
import numpy as np
for k in range(1,10):
    lr = math.pow(10, np.random.uniform(-7.0, 4.0))
    Lambda = math.pow(10, np.random.uniform(-5,5))
    best_acc = train_and_test_loop(50, lr, Lambda, False)
    print("Try {0}/{1}: Best_val_acc: {2}, lr: {3}, Lambda: {4}\n".format(k, 100, best_acc, lr, Lambda))

  return umr_sum(a, axis, dtype, out, keepdims)
  
  return umr_maximum(a, axis, None, out, keepdims)


Try 1/100: Best_val_acc: 0.10077777777777777, lr: 2.271906758517457, Lambda: 30903.229871518823

Try 2/100: Best_val_acc: 0.10077777777777777, lr: 0.012953111407207103, Lambda: 2136.0687033782715

Try 3/100: Best_val_acc: 0.10077777777777777, lr: 223.57309535727194, Lambda: 0.008505449308264152

Try 4/100: Best_val_acc: 0.10066666666666667, lr: 3.3375363641427936e-07, Lambda: 20.03948678128411

Try 5/100: Best_val_acc: 0.17333333333333334, lr: 0.00028048841273538885, Lambda: 0.00026288007569977496

Try 6/100: Best_val_acc: 0.10033333333333333, lr: 1.91826859188316e-06, Lambda: 0.0001006377382343227

Try 7/100: Best_val_acc: 0.10077777777777777, lr: 2256.444651199504, Lambda: 0.00016525793568113983

Try 8/100: Best_val_acc: 0.10077777777777777, lr: 0.11385850756164226, Lambda: 12.70771186488584

Try 9/100: Best_val_acc: 0.11683333333333333, lr: 0.10734762345471321, Lambda: 1.6164428866218095e-05

CPU times: user 33min 40s, sys: 15min 11s, total: 48min 51s
Wall time: 26min 35s



**Learning rate and Lambda values corresponding to better accuracy rates**

*   Try 5/100: Best_val_acc: 0.17333333333333334, lr: 0.00028048841273538885, Lambda: 0.00026288007569977496
*   Try 9/100: Best_val_acc: 0.11683333333333333, lr: 0.10734762345471321, Lambda: 1.6164428866218095e-05





### Now run finer search

In [0]:
#Set a finer range of hyperparameters and figure out even finer range
import math
import numpy as np
for k in range(1,10):
    lr = math.pow(10, np.random.uniform(-3.0, 1.0))
    Lambda = math.pow(10, np.random.uniform(-5,2))
    best_acc = train_and_test_loop(10, lr, Lambda, False)
    print("Try {0}/{1}: Best_val_acc: {2}, lr: {3}, Lambda: {4}\n".format(k, 10, best_acc, lr, Lambda))

  
  return umr_maximum(a, axis, None, out, keepdims)
  if sys.path[0] == '':


Try 1/10: Best_val_acc: [0.10077777777777777, nan], lr: 0.15387519821913492, Lambda: 0.00013496767005277493

Try 2/10: Best_val_acc: [0.24983333333333332, 2.242363286315263], lr: 0.00664917999214434, Lambda: 0.21088513336418893

Try 3/10: Best_val_acc: [0.0971111111111111, 2.3014919191907377], lr: 0.0016633161935173803, Lambda: 3.78245218499077e-05

Try 4/10: Best_val_acc: [0.5913333333333334, 1.1222306557805626], lr: 0.04392805692170639, Lambda: 0.001266685087183563

Try 5/10: Best_val_acc: [0.10077777777777777, 2.3546362981163472e+172], lr: 0.1572610007672735, Lambda: 2.1984793744019777

Try 6/10: Best_val_acc: [0.10077777777777777, nan], lr: 3.7165775485436705, Lambda: 6.697891989596913

Try 7/10: Best_val_acc: [0.6500555555555556, 1.0711399305850882], lr: 0.025595405777120297, Lambda: 1.050944460504173e-05

Try 8/10: Best_val_acc: [0.10077777777777777, nan], lr: 0.35999478694745557, Lambda: 1.04700774838969e-05

Try 9/10: Best_val_acc: [0.10077777777777777, 1.8158077792124403e+73],

**Learning rate and Lambda values corresponding to better accuracy rates**


*   Try 7/10: Best_val_acc: [0.6500555555555556, 1.0711399305850882], lr: 0.025595405777120297, Lambda: 1.050944460504173e-05
*   Try 4/10: Best_val_acc: [0.5913333333333334, 1.1222306557805626], lr: 0.04392805692170639, Lambda: 0.001266685087183563





### Running deep with the best possible lr and lambda and report the accuracy 

In [0]:
#Set the best hyperparameters found in the previous steps
lr =  0.02
Lambda =  0.0003
iterations = 1000
#Call the train and test function (with score)
train_and_test_loop(iterations, lr, Lambda)

Epoch 0/1000: Loss = 2.3024304641468736 | Training Accuracy = 0.1079047619047619
Epoch 5/1000: Loss = 1.8038414137294374 | Training Accuracy = 0.4488809523809524
Epoch 10/1000: Loss = 1.2254019065202586 | Training Accuracy = 0.6408571428571429
Epoch 15/1000: Loss = 1.0381807354321118 | Training Accuracy = 0.6973571428571429
Epoch 20/1000: Loss = 0.9364721139791099 | Training Accuracy = 0.7283095238095239
Epoch 25/1000: Loss = 0.8592783572683627 | Training Accuracy = 0.7375
Epoch 30/1000: Loss = 0.7922491726768469 | Training Accuracy = 0.7585714285714286
Epoch 35/1000: Loss = 0.7387019783120299 | Training Accuracy = 0.7800952380952381
Epoch 40/1000: Loss = 0.7052297970016913 | Training Accuracy = 0.799
Epoch 45/1000: Loss = 0.6761506174631248 | Training Accuracy = 0.8115714285714286
Epoch 50/1000: Loss = 0.6483869543343672 | Training Accuracy = 0.8117619047619048
Epoch 55/1000: Loss = 0.6291619966062538 | Training Accuracy = 0.8130952380952381
Epoch 60/1000: Loss = 0.610811283341715 | T

0.823