# Introduction

    The dataset used is a class-balanced subset of the MNIST handwritten-digit classification dataset.

    The model is a fully connected neural network implemented from scratch in NumPy, with 4 layers of different widths (500, 100 and 64 hidden nodes, followed by one output node per class).

# Data & library import

In [1]:
from torchvision import datasets, transforms
import numpy as np
import matplotlib.pyplot as plt

# Preprocessing pipeline: ToTensor converts each image to a float tensor scaled to [0, 1];
# Normalize then applies (x - 0.5) / 0.5, mapping pixel values to [-1, 1].
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,))
])

# MNIST training split, stored under ./data and downloaded when not already present.
train_dataset = datasets.MNIST(root='./data', train=True, transform=transform, download=True,)
# MNIST test split; no download flag is passed, so it reads the files fetched by the call above.
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 19665775.63it/s]


Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 623693.87it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 5631704.97it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 4129748.27it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw






In [2]:
def obtain_subset(dataset,num_sample, num_classes=2):
    """Extract a class-balanced subset of flattened images from ``dataset``.

    The dataset is scanned in order, and samples whose label is in
    ``range(num_classes)`` are collected until ``num_sample`` samples have been
    gathered, taking at most ``num_sample / num_classes`` samples (rounded up)
    per class.

    Parameters
    ----------
    dataset : indexable sequence of ``(image, label)`` pairs
        ``image`` must be convertible with ``np.asarray`` and contain exactly
        784 values (e.g. a 1x28x28 tensor); ``label`` must be an integer.
    num_sample : int
        Total number of samples to collect. It is normally a multiple of
        ``num_classes``; otherwise the classes seen first receive one extra
        sample each until ``num_sample`` is reached.
    num_classes : int, default 2
        Only labels ``0 .. num_classes - 1`` are kept.

    Returns
    -------
    new_x : np.ndarray, shape (num_sample, 784), float64
    new_y : np.ndarray, shape (num_sample, 1), int8
        If ``dataset`` holds fewer matching samples than requested, the unused
        trailing rows stay zero; the printed per-class counts reveal this.
    """
    new_x = np.zeros((num_sample, 784))
    new_y = np.zeros((num_sample,1),dtype=np.int8)

    # Number of samples collected so far for each accepted label.
    lab = {c: 0 for c in range(num_classes)}
    per_class = num_sample/num_classes

    j = 0
    # Iterate over the dataset that was passed in (not a module-level global).
    for i in range(len(dataset)):
        # Stop as soon as the buffers are full: avoids scanning the rest of the
        # dataset and prevents writing past the end of new_x / new_y.
        if j >= num_sample:
            break
        # Index once per sample; every access re-runs the dataset's transform.
        image, label = dataset[i]
        label = int(label)
        if label in lab and lab[label] < per_class:
            lab[label] += 1
            new_x[j,:] = np.asarray(image).reshape(-1)
            new_y[j] = label
            j += 1
    print(lab)
    return new_x, new_y

# Build class-balanced subsets restricted to labels 0-2:
# 3000 samples (1000 per class) and 999 samples (333 per class).
tr_x, tr_y = obtain_subset(train_dataset,3000,3)
tst_x, tst_y = obtain_subset(test_dataset,999,3)

# Display the array shapes as the cell output for a quick sanity check.
tr_x.shape, tr_y.shape, tst_x.shape, tst_y.shape

{0: 1000, 1: 1000, 2: 1000}
{0: 333, 1: 333, 2: 333}


((3000, 784), (3000, 1), (999, 784), (999, 1))

# Model Implementation

In [72]:
class DNNs:
    """Fully connected classifier trained with mini-batch gradient descent (NumPy only).

    Every layer, hidden and output alike, uses a temperature-scaled softmax as
    its activation. Layer widths follow a fixed template: up to three hidden
    layers of 500, 100 and 64 nodes, followed by an output layer with
    ``num_classes`` nodes.

    Parameters
    ----------
    num_classes : int, default 2
        Number of output classes; labels are expected in ``0 .. num_classes-1``.
    num_layers : int, default 4
        Total number of weight layers (hidden + output). Must be between 1 and 4.
    num_epochs : int, default 10
        Maximum number of passes over the training data.
    batch_size : int, default 64
        Number of samples per gradient step.
    lr : float, default 0.001
        Initial learning rate. ``fit`` decays ``self.lr`` in place.
    step : int, default 10
        The learning rate is multiplied by ``step_rate`` every ``step`` epochs.
    step_rate : float, default 0.1
        Multiplicative learning-rate decay factor.
    """

    # Hidden-layer widths in order; the first ``num_layers - 1`` entries are used.
    _HIDDEN_NODES = (500, 100, 64)

    def __init__(self, num_classes=2, num_layers = 4, num_epochs=10, batch_size=64,
                 lr=0.001, step=10, step_rate=0.1):
        self.num_classes = num_classes
        self.num_layer= num_layers
        self.num_epochs = num_epochs
        self.batch_size = batch_size
        self.lr = lr
        self.step = step
        self.step_rate = step_rate

    def fit(self,x,y):
        """Train the network on ``x`` and ``y`` with plain mini-batch gradient descent.

        Parameters
        ----------
        x : np.ndarray, shape (n_samples, n_features)
        y : np.ndarray, shape (n_samples, 1)
            Integer class labels.

        Raises
        ------
        ValueError
            If ``x`` contains no samples, or if ``num_layers`` is unsupported.

        Notes
        -----
        Weights are re-initialised on every call. ``self.lr`` is decayed in
        place, so it keeps its decayed value after training. Training stops
        early when the epoch loss rises by more than 0.3 over the previous epoch.
        """
        if x.shape[0] == 0:
            raise ValueError('fit() requires at least one training sample')

        #initialization
        self.initialize_layers(x.shape[1])

        loss = np.inf
        # Only full batches are used (the remainder of each shuffled epoch is
        # dropped). When the data set is smaller than one batch, a single batch
        # containing every sample is used so that training still updates weights.
        batches = max(1, x.shape[0]//self.batch_size)

        for epoch in range(self.num_epochs):

            # Reshuffle the samples at the start of every epoch.
            rand_ind = np.random.permutation(x.shape[0])
            x_rand = x[rand_ind,:]
            y_rand = y[rand_ind,:]

            for batch in range(batches):
                begin = batch*self.batch_size
                end = begin + self.batch_size

                x_new = x_rand[begin:end,:]
                y_new = y_rand[begin:end,:]

                #forward compute
                self.compute_forward(x_new)

                #compute gradients via backpropagation
                self.compute_backward(x_new,y_new)

                #update weights (vanilla gradient descent)
                for layer in range(self.num_layer):
                    self.w[layer] -= self.lr * self.dw[layer]
                    self.b[layer] -= self.lr * self.db[layer]

            #compute loss over the whole training set
            prev_loss = loss
            self.compute_forward(x_rand)
            loss = np.mean(self.loss_cce(y_rand, self.a[-1]))
            print(f'Epoch: {epoch} LR {self.lr} Loss: {loss}')

            if loss > 0.3 + prev_loss:
                print("Early stopping")
                break

            # Step-wise learning-rate decay.
            if (epoch+1)%self.step == 0:
                self.lr *= self.step_rate


    def predict(self,x):
        """Return the predicted class index per row of ``x``, shape (n_samples, 1)."""
        self.compute_forward(x)
        return np.argmax(self.a[-1],axis=1,keepdims=True)

    def initialize_one_layer(self,a,b):
        """Create the parameters of one dense layer mapping ``a`` inputs to ``b`` outputs.

        Returns
        -------
        w : np.ndarray, shape (a, b)
            Weights drawn from a standard normal distribution (no fan-in scaling).
        bias : np.ndarray, shape (1, b)
            Biases initialised to zero.
        """
        # For the first layer on MNIST: input is m x 784 and w is 784 x num_nodes.
        w = np.random.randn(a,b)
        bias = np.zeros((1,b))
        return w, bias

    def initialize_layers(self,num_feats):
        """Allocate ``self.w`` / ``self.b`` for all layers given ``num_feats`` input features.

        Raises
        ------
        ValueError
            If ``num_layers`` is not between 1 and ``len(_HIDDEN_NODES) + 1``.
        """
        max_layers = len(self._HIDDEN_NODES) + 1
        if not 1 <= self.num_layer <= max_layers:
            raise ValueError(
                f'num_layers must be between 1 and {max_layers}, got {self.num_layer}')

        # The last layer always has one node per class, whatever the depth.
        num_nodes = list(self._HIDDEN_NODES[:self.num_layer-1]) + [self.num_classes]
        self.w, self.b = [], []

        #initialization
        print('Initialiazing weights')
        fan_in = num_feats
        for i, fan_out in enumerate(num_nodes):
            w,b = self.initialize_one_layer(fan_in,fan_out)
            print(f'Layer {i} :, w - {w.shape}, b - {b.shape}')
            self.w.append(w)
            self.b.append(b)
            fan_in = fan_out
        print()

    def compute_forward(self,x):
        """Run a forward pass, storing pre-activations in ``self.z`` and activations in ``self.a``."""
        self.a = [[] for _ in range(self.num_layer)]
        self.z = [[] for _ in range(self.num_layer)]

        for i in range(self.num_layer):
            # Input of layer i: the raw features for the first layer, otherwise
            # the previous layer's activation.
            aa = x if i==0 else self.a[i-1]
            self.z[i] = np.dot(aa,self.w[i]) + self.b[i]
            self.a[i] = self.softmax(self.z[i])


    def compute_backward(self,x,y):
        """Back-propagate and store batch-averaged gradients in ``self.dw`` / ``self.db``.

        Must be called after ``compute_forward`` on the same ``x``.

        NOTE(review): hidden layers use ``grad_softmax`` (element-wise
        ``s * (1 - s)``), which ignores the cross-terms of the softmax Jacobian,
        and the 1/temp factor of the temperature-scaled softmax is not applied.
        The training behaviour is kept as originally written.
        """
        self.dw = [[] for _ in range(self.num_layer)]
        self.db = [[] for _ in range(self.num_layer)]
        one_hot = self.get_one_hot(y)

        for i in range(self.num_layer-1,-1,-1):
            if i==self.num_layer-1:
                # Output layer: softmax + cross-entropy gradient.
                dz = self.a[-1] - one_hot
                assert dz.shape == self.z[i].shape
            else:
                # Propagate the error of layer i+1 back through its weights.
                dz = np.dot(dz, self.w[i+1].T) * self.grad_softmax(self.z[i])
                assert dz.shape == self.z[i].shape
            if i==0:
                self.dw[i] = (1/x.shape[0]) * np.dot(x.T,dz)
            else:
                self.dw[i] = (1/x.shape[0]) * np.dot(self.a[i-1].T,dz)

            self.db[i] = (1/x.shape[0]) * np.sum(dz,axis=0)

    def loss_cce(self,y,pred):
        """Return, for each class, the mean over samples of ``-one_hot * log(pred)``.

        The result has shape (num_classes,); ``fit`` averages it again over
        classes. Predictions are floored at 1e-12 before the logarithm so that
        an exact zero probability cannot produce ``nan`` / ``inf``.
        """
        one_hot = self.get_one_hot(y)
        safe_pred = np.maximum(pred, 1e-12)
        return -np.mean(one_hot*np.log(safe_pred),axis=0)

    def softmax(self,x,temp=1.05):
        """Row-wise softmax of ``x / temp``.

        The temperature slightly flattens the distribution to keep predictions
        away from exactly 0 or 1. The row maximum is subtracted before
        exponentiating, which leaves the result mathematically unchanged but
        prevents overflow for large inputs.
        """
        scaled = x/temp
        shifted = scaled - np.max(scaled,axis=1,keepdims=True)
        t = np.exp(shifted)
        return t/(np.sum(t,axis=1, keepdims=True) )

    def grad_softmax(self,x):
        """Return ``s * (1 - s)`` element-wise, where ``s = softmax(x)``."""
        t = self.softmax(x)
        return t*(1-t)

    def get_one_hot(self,y):
        """One-hot encode integer labels ``y`` into shape (n_samples, num_classes)."""
        labels = np.asarray(y).reshape(-1).astype(int)
        one_hot = np.zeros((labels.shape[0],self.num_classes))
        one_hot[np.arange(labels.shape[0]), labels] = 1
        return one_hot

    def acc(self,y,pred):
        """Return classification accuracy in percent.

        Both inputs are flattened first, so a (n, 1) label array and a (n,)
        prediction array are compared element by element instead of being
        broadcast against each other.
        """
        return np.mean(np.ravel(y)==np.ravel(pred))*100

In [70]:
# Configure a 4-layer network for the 3-class subset. The learning rate starts at 3 and is
# multiplied by 0.5 after every 20 epochs (step=20, step_rate=0.5); training runs for at most 50 epochs.
model = DNNs(num_classes=3, num_layers = 4, num_epochs=50, batch_size=128, lr=3,step=20, step_rate=0.5)
print('Training')
model.fit(tr_x,tr_y)
# Accuracy (in percent) on the data the model was fitted on.
tr_pred =  model.predict(tr_x)
print(f'Training model accuracy: {model.acc(tr_y,tr_pred):.4} %')
# Accuracy (in percent) on tst_x / tst_y.
tst_pred =  model.predict(tst_x)
print(f'Testing model accuracy: {model.acc(tst_y,tst_pred):.4} %')

Training
Initialiazing weights
Layer 0 :, w - (784, 500), b - (1, 500)
Layer 1 :, w - (500, 100), b - (1, 100)
Layer 2 :, w - (100, 64), b - (1, 64)
Layer 3 :, w - (64, 3), b - (1, 3)

Epoch: 0 LR 3 Loss: 0.36825319453061
Epoch: 1 LR 3 Loss: 0.3658933982594727
Epoch: 2 LR 3 Loss: 0.37053120036013837
Epoch: 3 LR 3 Loss: 0.3635354024251159
Epoch: 4 LR 3 Loss: 0.36264447121823834
Epoch: 5 LR 3 Loss: 0.3500388907582326
Epoch: 6 LR 3 Loss: 0.23106141259356805
Epoch: 7 LR 3 Loss: 0.19116149674330432
Epoch: 8 LR 3 Loss: 0.18234005386011287
Epoch: 9 LR 3 Loss: 0.1277214491340815
Epoch: 10 LR 3 Loss: 0.09964955747997391
Epoch: 11 LR 3 Loss: 0.10219736001386186
Epoch: 12 LR 3 Loss: 0.06855875071232308
Epoch: 13 LR 3 Loss: 0.09650873482197075
Epoch: 14 LR 3 Loss: 0.06628796822870356
Epoch: 15 LR 3 Loss: 0.0731376038800537
Epoch: 16 LR 3 Loss: 0.054205839863626544
Epoch: 17 LR 3 Loss: 0.054840671019798615
Epoch: 18 LR 3 Loss: 0.06580979616207762
Epoch: 19 LR 3 Loss: 0.050778930516030406
Epoch: 20 