## Import Necessary Libraries

In [3]:
import numpy as np
import pandas as pd
import plotly.express as px

## Activation Functions `ReLU` and `Softmax`

In [4]:
class ReLU:

    def __init__(self):
        self.state = None

    def forward(self, x):
        forward = np.where(x < 0, 0, x)
        self.state = forward
        return forward

    def derivative(self):
        derivative = np.where(self.state < 0, 0, 1)
        return derivative
    
    def __call__(self, x):
        return self.forward(x)
    
    
class Softmax:
    
    def __init__(self):
        self.state = None
    
    def forward(self, z):
        return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
    
    def __call__(self, z):
        return self.forward(z)
        

## `Layer` is implementing a Linear Layer

In [5]:
class Layer:

    def init_weights(self):
        self.w = np.random.normal(self.mu, self.std, size=(self.out_features, self.in_features))
        self.b = np.random.normal(self.mu, self.std, size=(1, self.out_features))


    def __init__(self, in_features, out_features, mu=0):
        self.in_features = in_features
        self.out_features = out_features
        self.std = np.sqrt(2/self.in_features)
        self.mu = mu
        self.init_weights()

    def forward(self, x):
        self.x = x
        self.batch_size = x.shape[0]
        z = np.dot(x, self.w.T) + self.b
        return z
    
    def backward(self, gradient, lamda):
        self.dw = (np.dot(gradient.T, self.x) / self.batch_size) + lamda * self.w
        self.db = np.sum(gradient, axis=0, keepdims=True) / self.batch_size
        dx = np.dot(gradient, self.w)
        return dx
    
    def step(self, lr):
        self.w += -1 * (lr * self.dw)
        self.b += -1 * (lr * self.db)

    


## Define Cost Function `Cross Entropy`

In [6]:
class CrossEntropy:

    def derivative(self, output, labels):
        return output - labels
    
    def compute_cost(self, y, output, layers, lamda):
        cross = -1 * np.mean(np.sum(y * np.log(output), axis=1))
        L2_reg = 0
        for i in range(len(layers)):
            L2_reg += np.sum(layers[i].w ** 2)
        return cross + (lamda /(2 * len(y))) * L2_reg

## Implementation of MLP

In [29]:
class MLP:


    def initialize_layers(self):
        self.linear_layers = []
        for k in range(self.n_layers):
            in_features, out_features = self.layer_dims[k]
            self.linear_layers.append(Layer(in_features, out_features))


    def __init__(self, layer_dims:list, activations:list, lr:int, criterion, lamda:int):
        self.layer_dims = layer_dims
        self.n_layers = len(layer_dims)
        self.activations = activations
        self.lr = lr
        self.criterion = criterion
        self.lamda = lamda
        self.output = None
        self.initialize_layers()

    def forward(self, x):
        forward_input = x
        for k in range(self.n_layers):
            output = self.linear_layers[k].forward(forward_input)
            output = self.activations[k](output)
            forward_input = output 
        return output

   
    def backward(self, output, labels, lamda):
        global_derivative = self.criterion.derivative(output, labels)
        for k in reversed(range(len(self.linear_layers))):
            if k < self.n_layers-1:
                global_derivative *=  self.activations[k].derivative()
            linear_backward = self.linear_layers[k].backward(global_derivative, lamda)
            global_derivative = linear_backward
        return global_derivative
    
    def step(self):
        for layer in self.linear_layers:
            layer.step(self.lr)
    
    
    def fit(self, batches, epoch, x_val, y_val, Print=True):
        batch_mean_costs = []
        validation_costs = []
        for epoch_no in range(epoch):
            batch_costs = []
            for x_batch, y_batch in batches:
                output = self.forward(x_batch)
                cost = self.criterion.compute_cost(y_batch, output, self.linear_layers, self.lamda)
                self.backward(output, y_batch, self.lamda)
                self.step()
                batch_costs.append(cost)
            y_val_pred = self.forward(x_val)
            val_cost = self.criterion.compute_cost(y_val, y_val_pred, self.linear_layers, self.lamda)
            batch_mean_costs.append(np.mean(batch_costs))
            validation_costs.append(val_cost)
            if Print:
                print(f'train cost in epoch {epoch_no} is : {np.mean(batch_costs)}, ------ , valid cost : {val_cost}')
        return batch_mean_costs, validation_costs
    
    def predict(self, x):
        return np.argmax(self.forward(x), axis=1)

    def total_loss(self, labels):
        return self.criterion(self.output, labels).sum()

    def __call__(self, x):
        return self.forward(x)


## Functions to PreProcess the Data

In [14]:
def split_batch(x, y, batch_size):
    dt = np.concatenate([x, y] , axis=1)
    np.random.shuffle(dt)
    length = len(dt) // batch_size
    batches = []
    for batch_n in range(length):
        row_start, row_end = batch_n * batch_size, (batch_n + 1) * batch_size
        x_batch, y_batch = dt[row_start:row_end, :-10], dt[row_start:row_end, -10:]
        batches.append((x_batch, y_batch))
    return batches

In [15]:
def one_hot_encode(label):
    label = label.astype(int)
    result = np.zeros(shape=(len(label), 10))
    for i in range(len(label)):
        result[i, label[i,0]] = 1
    return result

## Read and PreProcess the DataSet

In [16]:
with open('MNIST/train.npy', 'rb') as f:
    x_train = np.load(f, allow_pickle=True)
    y_train = np.load(f, allow_pickle=True).reshape((-1,1)).astype(int)
    
with open('MNIST/val.npy', 'rb') as f:
    x_val = np.load(f, allow_pickle=True)
    y_val = np.load(f, allow_pickle=True).reshape((-1,1)).astype(int)
    
with open('MNIST/test.npy', 'rb') as f:
    x_test = np.load(f, allow_pickle=True)
    y_test = np.load(f, allow_pickle=True)

In [17]:
x_train = x_train / 255
x_val = x_val / 255
x_test = x_test / 255

In [18]:
y_train = one_hot_encode(y_train)
y_val = one_hot_encode(y_val)

## Define The Network and Run it

In [19]:
np.random.seed(42)
layers_dim = [(784, 32), (32, 32), (32, 32),(32, 10)]
activations = [ReLU(), ReLU(), ReLU(), Softmax()]
criterion = CrossEntropy()
m = MLP(layers_dim, activations, 0.01, criterion, 0.01)
batches = split_batch(x_train, y_train, 32)
train_hiss, valid_hiss = m.fit(batches, 15, x_val, y_val)

train cost in epoch 0 is : 0.8544293475629804, ------ , valid cost : 0.5845370626222814
train cost in epoch 1 is : 0.5954809953620532, ------ , valid cost : 0.5140353723413784
train cost in epoch 2 is : 0.542572069113794, ------ , valid cost : 0.4923875334361744
train cost in epoch 3 is : 0.5188174245406463, ------ , valid cost : 0.4807585996301968
train cost in epoch 4 is : 0.5048924286478808, ------ , valid cost : 0.47336279640513745
train cost in epoch 5 is : 0.4960173907179948, ------ , valid cost : 0.4691195753346706
train cost in epoch 6 is : 0.4901874544995113, ------ , valid cost : 0.46690305674433735
train cost in epoch 7 is : 0.4862095231510532, ------ , valid cost : 0.4655901410120334
train cost in epoch 8 is : 0.483595929557341, ------ , valid cost : 0.46505925305641244
train cost in epoch 9 is : 0.4819777488033443, ------ , valid cost : 0.46530217393886203
train cost in epoch 10 is : 0.4810544139094572, ------ , valid cost : 0.4659194168401588
train cost in epoch 11 is : 0

In [20]:
fig = px.line(x=list(range(15)), y=train_hiss, title='Cost of model in train and validation set')
fig.add_scatter(x=list(range(15)), y=valid_hiss, name='Validation').update_layout(xaxis_title='Epoch' , yaxis_title="Cost")

## Tune the Hyperparameters

In [22]:
lr_values = [0.01, 0.03, 0.1]
lamda_values = [0.01, 0.03, 0.1, 0.3]
train_dict, valid_dict = {}, {}
for lr in lr_values:
    for lamda in lamda_values:
        layers_dim = [(784, 32), (32, 32), (32, 32),(32, 10)]
        activations = [ReLU(), ReLU(), ReLU(), Softmax()]
        criterion = CrossEntropy()
        m = MLP(layers_dim, activations, lr, criterion, lamda)
        batches = split_batch(x_train, y_train, 32)
        train_hiss, valid_hiss = m.fit(batches, 15, x_val, y_val, Print=False)
        train_dict[f"lr:{lr},lamda:{lamda}"] = train_hiss
        valid_dict[f'lr:{lr},lamda:{lamda}'] = valid_hiss


overflow encountered in exp


invalid value encountered in divide


divide by zero encountered in log


invalid value encountered in multiply



In [23]:
px.box(pd.DataFrame(train_dict), title='Cost Function of Model in train set')

In [24]:
px.box(pd.DataFrame(valid_dict), title='Cost Function of Model in valid set')

## Predict Labels

In [30]:
layers_dim = [(784, 32), (32, 32), (32, 32),(32, 10)]
activations = [ReLU(), ReLU(), ReLU(), Softmax()]
criterion = CrossEntropy()
m = MLP(layers_dim, activations, 0.01, criterion, 0.01)
batches = split_batch(x_train, y_train, 32)
train_hiss, valid_hiss = m.fit(batches, 15, x_val, y_val)

train cost in epoch 0 is : 0.824997924375596, ------ , valid cost : 0.5713652040270525
train cost in epoch 1 is : 0.5822197283526266, ------ , valid cost : 0.5179752946811726
train cost in epoch 2 is : 0.5389537467649123, ------ , valid cost : 0.4952383367995091
train cost in epoch 3 is : 0.51550238332516, ------ , valid cost : 0.4816858338187448
train cost in epoch 4 is : 0.5011221908940215, ------ , valid cost : 0.47354964430937896
train cost in epoch 5 is : 0.49193076119720386, ------ , valid cost : 0.46827847578997406
train cost in epoch 6 is : 0.48589748061329785, ------ , valid cost : 0.4653153413432535
train cost in epoch 7 is : 0.4818518938335065, ------ , valid cost : 0.46368965322458966
train cost in epoch 8 is : 0.47920861590765196, ------ , valid cost : 0.46299820951289605
train cost in epoch 9 is : 0.4774663628044219, ------ , valid cost : 0.46291087115602814
train cost in epoch 10 is : 0.47630474899522957, ------ , valid cost : 0.4631892917628122
train cost in epoch 11 is

In [31]:
y_pred = m.predict(x_val)
y_val = np.argmax(y_val, axis=1)

## Print Classification Metrics

In [33]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.81      1023
           1       0.98      0.94      0.96       988
           2       0.77      0.71      0.74      1008
           3       0.86      0.86      0.86      1021
           4       0.70      0.82      0.75      1050
           5       0.93      0.91      0.92       996
           6       0.63      0.50      0.56       970
           7       0.88      0.91      0.90       955
           8       0.91      0.96      0.93       968
           9       0.93      0.93      0.93      1021

    accuracy                           0.84     10000
   macro avg       0.84      0.84      0.84     10000
weighted avg       0.84      0.84      0.84     10000

