 # Assignemnt 3

### Loading MNIST dataset 

In [1]:
import numpy as np
from torchvision.datasets import MNIST

def download_mnist(is_train: bool):
    dataset = MNIST(root='./data',transform=lambda x: np.array(x).flatten(),download=True,train=is_train)
    mnist_data = []
    mnist_labels = []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)
    return np.array(mnist_data), np.array(mnist_labels)
train_X, train_Y = download_mnist(True)
test_X, test_Y = download_mnist(False)

In [2]:
def encode_labels(labels):
    encoded_labels = np.zeros((len(labels), 10))
    for i, label in enumerate(labels):
        encoded_labels[i][label] = 1
    return encoded_labels

train_Y = encode_labels(train_Y)
test_Y = encode_labels(test_Y)

In [3]:
def normalize_data(data):
    return data / 255

train_X = normalize_data(train_X)
test_X = normalize_data(test_X)

In [4]:

print(train_Y.shape)

print(test_X.shape)
print(test_Y.shape)


(60000, 10)
(10000, 784)
(10000, 10)


### Training process

In [5]:
def split_into_batches(data, labels, batch_size):
    batches = []
    permutatuion = np.random.permutation(data.shape[0])
    data_shuffled = data[permutatuion]
    labels_shuffled = labels[permutatuion]
    for i in range(0, data.shape[0], batch_size):
        batches.append((data_shuffled[i:i+batch_size], labels_shuffled[i:i+batch_size]))
    return batches

In [6]:
def sigmoid(x):
    clipped = np.clip(x, -500, 500)
    return 1 / (1 + np.exp(-clipped))


def tanh(x):
    return np.tanh(x)

def relu(x):
    return np.maximum(0, x)

In [7]:
def sigmoid_prime(x):
    s = sigmoid(x)
    return s * (1 - s)

def tanh_prime(x):
    return 1 - np.tanh(x) ** 2

def relu_prime(x):
    return (x > 0).astype(float)



In [8]:
def softmax(z):
    exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))  
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)


In [9]:
def forward_propagation(X, W, b,dropout_rate=None):
    activations = [X]
    zs=[]
    for w,b in zip(W, b):
        z = np.dot(activations[-1], w) + b
        activation= relu(z)
        if dropout_rate:
            mask = (np.random.rand(*activation.shape) > dropout_rate).astype(float)
            z = z * mask
            activation = activation * mask / (1 - dropout_rate)
        zs.append(z)
        activations.append(activation)
    activations[-1] = softmax(zs[-1])
    return activations, zs

In [10]:
def backward_propagation(W, b, t, activations, zs):
   batch_size = t.shape[0]
   gradient_b = [np.zeros(b.shape) for b in b]
   gradient_W = [np.zeros(w.shape) for w in W]
   error = activations[-1] - t
   for layer in range(len(W) - 1, -1, -1):
       gradient_b[layer] = np.mean(error, axis=0, keepdims=True)
       gradient_W[layer] = np.dot(activations[layer].T, error) / batch_size
       if layer > 0:  
            error = np.dot(error, W[layer].T) * relu_prime(zs[layer-1])
   return gradient_W, gradient_b

In [11]:
def get_learning_rate(initial_lr, epoch, decay_rate=0.1, decay_steps=10):
    return initial_lr * (1 / (1 + decay_rate * (epoch // decay_steps)))

In [12]:
def gradient_descent(X, y_true, y_pred, W, b, learning_rate):
    size = X.shape[0] 
    dW = np.dot(X.T, (y_pred - y_true)) / size
    db = np.sum(y_pred - y_true, axis=0) / size
    
    W -= learning_rate * dW
    b -= learning_rate * db
    return W, b

In [13]:
def train_batch(batch,batch_labels,weights,biases,learning_rate,dropout_rate=None):
      activations, zs = forward_propagation(batch, weights, biases,dropout_rate)
      gradient_W, gradient_b = backward_propagation(weights, biases, batch_labels, activations, zs)
      for i in range(len(weights)):
          weights[i], biases[i] = weights[i] - learning_rate * gradient_W[i], biases[i] - learning_rate * gradient_b[i]
      return weights, biases

In [14]:
def initialize_layers(input_size, hidden_layers, output_size):
    weights = []
    biases = []
    layers = [input_size] + hidden_layers + [output_size]
  
    for i in range(1, len(layers)):
        fan_in = layers[i-1]
        fan_out = layers[i]
        W=np.random.randn(fan_in,fan_out) * np.sqrt(2 / fan_in)
        b = np.ones((1, fan_out)) * 0.1
        weights.append(W)
        biases.append(b)
    return weights, biases





### Testing process

In [15]:
def test(data,labels,weights,biases):
    activations,zs = forward_propagation(data, weights, biases)
    predictions = np.argmax(activations[-1], axis=1)
    true_labels = np.argmax(labels, axis=1)
    accuracy = np.mean(predictions == true_labels)
    return accuracy
   

### Solution

In [16]:
import os
def save_model(weights, biases,epochs):
    os.makedirs(f'models_dropout/epochs_{epochs}', exist_ok=True)
    for layer in range(len(weights)):
        np.save(f'models_dropout/epochs_{epochs}/weights_{layer+1}.npy', weights[layer])
        np.save(f'models_dropout/epochs_{epochs}/biases_{layer+1}.npy', biases[layer])

def load_model(epochs,layers):
    weights = []
    biases = []
    for i in range(layers):
        weights.append(np.load(f'models_dropout/epochs_{epochs}/weights_{i+1}.npy'))
        biases.append(np.load(f'models_dropout/epochs_{epochs}/biases_{i+1}.npy'))
    return weights, biases

In [17]:
from typing import TypedDict

class Hyperparameters(TypedDict):
    learning_rate: float
    epochs: int
    batch_size: int
    hidden_layers: list[int]
    dropout_rate: float | None
    patience: int
    reduce_factor: float
    min_lr: float

In [18]:
def split_validation(data, labels, ratio):
    permutation = np.random.permutation(data.shape[0])
    validation_size = int(data.shape[0] * ratio)
    validation_indices = permutation[:validation_size]
    training_indices = permutation[validation_size:]
    return data[training_indices], labels[training_indices], data[validation_indices], labels[validation_indices]

In [19]:


from time import time


def train(X, y, hyperparameters):
    training_data, training_labels, validation_data, validation_labels = split_validation(X, y, 0.2)
    weights, biases = initialize_layers(training_data.shape[1], hyperparameters['hidden_layers'], training_labels.shape[1])
    start = time()
    patience = hyperparameters['patience']
    reduce_factor = hyperparameters['reduce_factor']
    min_lr = hyperparameters['min_lr']
    initial_learning_rate = hyperparameters['learning_rate']
    learning_rate = initial_learning_rate  
    best_val_accuracy = 0  
    plateau_count = 0  

    for epoch in range(hyperparameters['epochs']):
        batches = split_into_batches(training_data, training_labels, hyperparameters['batch_size'])
        
      
        for batch, batch_labels in batches:
            weights, biases = train_batch(batch, batch_labels, weights, biases, learning_rate, hyperparameters['dropout_rate'])
        
        validation_accuracy = test(validation_data, validation_labels, weights, biases) * 100
        training_accuracy = test(training_data, training_labels, weights, biases) * 100
        duration = time() - start
        
     
        if (epoch + 1) % 5 == 0:
            print(f'Epoch {epoch+1}/{hyperparameters["epochs"]} - Validation Accuracy: {validation_accuracy:.2f}% Training Accuracy: {training_accuracy:.2f}% | Duration: {duration:.2f}s')

     
        if validation_accuracy > best_val_accuracy + 0.1:
            best_val_accuracy = validation_accuracy
            plateau_count = 0  
            save_model(weights, biases, epoch + 1)
        else:
            if best_val_accuracy >= 95:
                print(f'Early stopping at epoch {epoch+1}. Best validation accuracy: {best_val_accuracy:.2f}%')
                break
            plateau_count += 1

      
        if plateau_count >= patience:
            new_learning_rate = max(learning_rate * reduce_factor, min_lr)
            if new_learning_rate < learning_rate:
                learning_rate = new_learning_rate
            plateau_count = 0 


    return weights, biases

In [21]:
hyperparameters = Hyperparameters(
    learning_rate=0.9,        
    epochs=60,                 
    batch_size=128,          
    hidden_layers=[100],      
    dropout_rate=0.05,         
    patience=2,               
    reduce_factor=0.5,        
    min_lr=5e-3               
)

model_weight, model_biases= train(train_X,train_Y,hyperparameters)
save_model(model_weight, model_biases, hyperparameters['epochs'])
test_accuracy = test(test_X, test_Y, model_weight, model_biases) * 100
print(f'Test Accuracy: {test_accuracy:.2f}%')


Epoch 5/60 - Validation Accuracy: 94.01% Training Accuracy: 94.31% | Duration: 37.18s
Epoch 10/60 - Validation Accuracy: 94.86% Training Accuracy: 95.30% | Duration: 73.25s
Epoch 15/60 - Validation Accuracy: 95.00% Training Accuracy: 95.53% | Duration: 109.06s
Early stopping at epoch 15. Best validation accuracy: 95.17%
Test Accuracy: 95.32%
