<a href="https://colab.research.google.com/github/Apoorvmittal11/23-CS-072-DL-LAB-EXPERIMENT/blob/main/DL%20EXPERIMENT2/23_CS_072_DL_EXPERIMENT_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Build and train a fully connected neural network without relying on deep learning libraries such as TensorFlow or PyTorch.


Dataset: MNIST

You can use the torch library only to load the dataset like this:

import torch

import torchvision

train_dataset = torchvision.datasets.MNIST(

    root='./data',
    train=True,
    transform=torchvision.transforms.ToTensor(),
    download=True
)

val_dataset = torchvision.datasets.MNIST(

    root='./data',
    train=False,
    transform=torchvision.transforms.ToTensor(),
    download=True
)

define batches

train_loader = torch.utils.data.DataLoader(dataset=train_dataset,batch_size=64,shuffle=True)

val_loader= torch.utils.data.DataLoader(dataset=val_dataset,batch_size=64,shuffle= False)

After this, make sure you convert the tensors to numpy, as you will be implementing this in numpy.

for images, labels in train_loader:
    # images: torch.Tensor of shape (batch_size, 1, 28, 28)
    # labels: torch.Tensor of shape (batch_size,)

    # Step 1: Move to CPU (important if CUDA is enabled)
    images = images.cpu()
    labels = labels.cpu()

    # Step 2: Convert to NumPy
    images_np = images.numpy()
    labels_np = labels.numpy()

Why .cpu() Is Required
* NumPy arrays live in host (CPU) memory, so NumPy cannot read a tensor that is stored on a CUDA device

* Calling .cpu() ensures the tensor is in host memory

* Torch is allowed up to .cpu().numpy(); beyond that point, only NumPy is permitted.  

**Implementation Requirements**

* Load the MNIST dataset
* Normalize input pixel values
* One-hot encode class labels
* Split the training data into training and validation sets


In [1]:
import numpy as np
import torch
import torchvision
import matplotlib.pyplot as plt

# Mini-batch size shared by the training and validation loaders.
BATCH_SIZE = 64

# MNIST split selected by train=True; files go to ./data and every image is
# converted to a torch tensor by ToTensor().
train_dataset = torchvision.datasets.MNIST(
    './data',
    train=True,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

# The train=False split is what this notebook uses as its validation set.
val_dataset = torchvision.datasets.MNIST(
    './data',
    train=False,
    download=True,
    transform=torchvision.transforms.ToTensor(),
)

# Only the training loader shuffles; validation batches keep a fixed order.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = torch.utils.data.DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


def process_batch(images, labels, num_classes=10):
    """Convert one torch mini-batch into the NumPy arrays used for training.

    Args:
        images: torch tensor whose first axis is the batch axis (the MNIST
            loader yields ``(batch_size, 1, 28, 28)``).
        labels: torch tensor of integer class indices, one per image.
        num_classes: width of the one-hot encoding. Defaults to 10, the value
            that was previously hard-coded; every label must lie in
            ``[0, num_classes)``.

    Returns:
        Tuple ``(input_data, one_hot_labels, labels_np)``:
        ``input_data`` is ``(batch_size, features)`` with each image
        flattened to one row, ``one_hot_labels`` is
        ``(batch_size, num_classes)``, and ``labels_np`` holds the original
        integer labels as a NumPy array.
    """
    # Torch is only used up to here: move to host memory, then hand to NumPy.
    images_np = images.cpu().numpy()
    labels_np = labels.cpu().numpy()

    # Spell out the feature count instead of using -1 so that an empty batch
    # (first axis of length 0) still reshapes cleanly.
    n_samples = images_np.shape[0]
    n_features = int(np.prod(images_np.shape[1:]))
    input_data = images_np.reshape(n_samples, n_features)

    # One-hot encode: row i gets a single 1 in column labels_np[i].
    one_hot_labels = np.zeros((labels_np.size, num_classes))
    one_hot_labels[np.arange(labels_np.size), labels_np] = 1

    return input_data, one_hot_labels, labels_np

100%|██████████| 9.91M/9.91M [00:00<00:00, 42.1MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.19MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 10.2MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 7.88MB/s]


Inside the NeuralNetwork class, implement the following methods:

1.)forward()
* Perform forward propagation through all layers
* Compute and store intermediate activations


2.)backward()
* Perform backpropagation
* Compute gradients of weights and biases


3.)compute_loss()
* Calculate the loss (e.g., Cross-Entropy Loss)


4.)update_parameters()
* Update weights and biases using gradient descent
* predict() for inference
* evaluate() to compute accuracy.

In [2]:
class Activations:
    """Namespace of activation functions, their derivatives, and softmax.

    Every method is static and operates on NumPy arrays.
    """

    @staticmethod
    def relu(z):
        """Return ``z`` with every negative entry replaced by 0."""
        return np.maximum(0, z)

    @staticmethod
    def relu_deriv(z):
        """Return 1.0 where ``z`` is strictly positive and 0.0 elsewhere."""
        positive = z > 0
        return positive.astype(float)

    @staticmethod
    def sigmoid(z):
        """Return the logistic function of ``z``, element-wise."""
        # Bound the input first so np.exp cannot overflow.
        bounded = np.clip(z, -500, 500)
        return 1 / (np.exp(-bounded) + 1)

    @staticmethod
    def sigmoid_deriv(z):
        """Return the derivative of the sigmoid evaluated at pre-activation ``z``."""
        sig = Activations.sigmoid(z)
        return sig * (1 - sig)

    @staticmethod
    def tanh(z):
        """Return the hyperbolic tangent of ``z``, element-wise."""
        return np.tanh(z)

    @staticmethod
    def tanh_deriv(z):
        """Return the derivative of tanh evaluated at pre-activation ``z``."""
        t = np.tanh(z)
        return 1 - t ** 2

    @staticmethod
    def softmax(z):
        """Return row-wise softmax of a 2-D array (each row sums to 1)."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # the exponentials from overflowing.
        row_max = np.max(z, axis=1, keepdims=True)
        exps = np.exp(z - row_max)
        return exps / np.sum(exps, axis=1, keepdims=True)

In [3]:
class NeuralNetwork:
    """Fully connected classifier with a softmax output layer, in NumPy.

    All hidden layers share one activation: ``'relu'``, ``'sigmoid'`` or
    ``'tanh'``. Any other name leaves the hidden layers linear (identity).
    Parameters are stored in ``self.params`` under the keys ``'W1'..'WL'``
    and ``'b1'..'bL'``; ``forward`` stores the per-layer pre-activations
    (``'Z<i>'``) and activations (``'A<i>'``) in ``self.cache`` for
    ``backward`` to use.
    """

    def __init__(self, layer_sizes, activation='relu', learning_rate=0.01, seed=42):
        """Create and initialise the network parameters.

        Args:
            layer_sizes: sequence ``[n_in, n_hidden_1, ..., n_out]`` with at
                least two entries.
            activation: hidden-layer activation name (see class docstring).
            learning_rate: step size used by ``update_parameters``.
            seed: seed for the weight initialisation. The default of 42
                reproduces the weights of the earlier hard-coded seed;
                ``None`` asks NumPy for a non-deterministic seed.

        Raises:
            ValueError: if ``layer_sizes`` has fewer than two entries.
        """
        if len(layer_sizes) < 2:
            raise ValueError("layer_sizes needs at least an input and an output size")

        self.layer_sizes = layer_sizes
        self.learning_rate = learning_rate
        self.params = {}
        self.act_func_name = activation

        # A private generator yields the same stream np.random.seed(seed)
        # would, without resetting the process-wide NumPy RNG every time a
        # model is constructed.
        rng = np.random.RandomState(seed)

        for i in range(1, len(layer_sizes)):
            input_dim = layer_sizes[i - 1]
            output_dim = layer_sizes[i]

            # Scale weights by fan-in: sqrt(2/n) for ReLU, sqrt(1/n) otherwise.
            if activation == 'relu':
                scale = np.sqrt(2.0 / input_dim)
            else:
                scale = np.sqrt(1.0 / input_dim)

            self.params['W' + str(i)] = rng.randn(input_dim, output_dim) * scale
            self.params['b' + str(i)] = np.zeros((1, output_dim))

    def _get_activation(self, z):
        """Apply the configured hidden activation; unknown names act as identity."""
        if self.act_func_name == 'relu':
            return Activations.relu(z)
        elif self.act_func_name == 'sigmoid':
            return Activations.sigmoid(z)
        elif self.act_func_name == 'tanh':
            return Activations.tanh(z)
        return z

    def _get_activation_deriv(self, z):
        """Derivative of the hidden activation at ``z`` (1 for the identity fallback)."""
        if self.act_func_name == 'relu':
            return Activations.relu_deriv(z)
        elif self.act_func_name == 'sigmoid':
            return Activations.sigmoid_deriv(z)
        elif self.act_func_name == 'tanh':
            return Activations.tanh_deriv(z)
        return 1

    def forward(self, X):
        """Run a forward pass and return the softmax class probabilities.

        Args:
            X: array of shape ``(batch, layer_sizes[0])``.

        Returns:
            Array of shape ``(batch, layer_sizes[-1])``. As a side effect
            ``self.cache`` is replaced with this pass's ``'A0'``, ``'Z<i>'``
            and ``'A<i>'`` entries.
        """
        self.cache = {'A0': X}
        L = len(self.layer_sizes) - 1

        # Hidden layers: affine transform followed by the shared activation.
        A = X
        for i in range(1, L):
            Z = np.dot(A, self.params['W' + str(i)]) + self.params['b' + str(i)]
            A = self._get_activation(Z)
            self.cache['Z' + str(i)] = Z
            self.cache['A' + str(i)] = A

        # Output layer: affine transform followed by softmax.
        Z_out = np.dot(A, self.params['W' + str(L)]) + self.params['b' + str(L)]
        A_out = Activations.softmax(Z_out)

        self.cache['Z' + str(L)] = Z_out
        self.cache['A' + str(L)] = A_out

        return A_out

    def compute_loss(self, Y_hat, Y):
        """Return the mean cross-entropy between ``Y_hat`` and one-hot ``Y``.

        Args:
            Y_hat: predicted probabilities, shape ``(batch, classes)``.
            Y: one-hot targets with the same shape.
        """
        m = Y.shape[0]
        # Small offset so that a predicted probability of 0 never hits log(0).
        epsilon = 1e-15
        return -np.sum(Y * np.log(Y_hat + epsilon)) / m

    def backward(self, Y, Y_hat):
        """Backpropagate through the activations cached by the last ``forward``.

        Args:
            Y: one-hot targets, shape ``(batch, classes)``.
            Y_hat: the probabilities ``forward`` returned for the same batch.

        Returns:
            Dict with ``'dW<i>'`` and ``'db<i>'`` for every layer, each
            averaged over the batch.
        """
        grads = {}
        L = len(self.layer_sizes) - 1
        m = Y.shape[0]

        # For softmax + cross-entropy the gradient w.r.t. the output
        # pre-activation is simply (prediction - target).
        dZ = Y_hat - Y

        grads['dW' + str(L)] = np.dot(self.cache['A' + str(L - 1)].T, dZ) / m
        grads['db' + str(L)] = np.sum(dZ, axis=0, keepdims=True) / m

        # Walk the hidden layers from last to first.
        for i in range(L - 1, 0, -1):
            dA = np.dot(dZ, self.params['W' + str(i + 1)].T)
            dZ = dA * self._get_activation_deriv(self.cache['Z' + str(i)])

            grads['dW' + str(i)] = np.dot(self.cache['A' + str(i - 1)].T, dZ) / m
            grads['db' + str(i)] = np.sum(dZ, axis=0, keepdims=True) / m

        return grads

    def update_parameters(self, grads):
        """Apply one in-place gradient-descent step using ``grads`` from ``backward``."""
        L = len(self.layer_sizes) - 1
        for i in range(1, L + 1):
            self.params['W' + str(i)] -= self.learning_rate * grads['dW' + str(i)]
            self.params['b' + str(i)] -= self.learning_rate * grads['db' + str(i)]

    def predict(self, X):
        """Return the most probable class index for every row of ``X``."""
        A_out = self.forward(X)
        return np.argmax(A_out, axis=1)

    def evaluate(self, X, Y_true):
        """Return the classification accuracy on ``X``.

        Args:
            X: inputs, shape ``(batch, layer_sizes[0])``.
            Y_true: integer class labels of shape ``(batch,)``. One-hot
                labels ``(batch, classes)`` and column vectors ``(batch, 1)``
                are also accepted and reduced to class indices first, since
                comparing them directly would broadcast incorrectly.
        """
        Y_true = np.asarray(Y_true)
        if Y_true.ndim == 2:
            if Y_true.shape[1] > 1:
                Y_true = np.argmax(Y_true, axis=1)
            else:
                Y_true = Y_true.ravel()

        predictions = self.predict(X)
        return np.mean(predictions == Y_true)

A.Train the model for multiple epochs

B.Compute and record:
* Training loss
* Training accuracy
* Validation loss
* Validation accuracy per epoch

In [11]:
def _run_epoch(model, loader, train):
    """Make one pass over ``loader`` and return ``(mean_loss, accuracy)``.

    When ``train`` is true the model is updated after every batch; otherwise
    only the forward pass is run.
    """
    loss_sum = 0.0
    n_correct = 0
    n_seen = 0

    for images, labels in loader:
        X, Y_onehot, Y_labels = process_batch(images, labels)
        batch_size = Y_labels.shape[0]

        Y_hat = model.forward(X)

        # compute_loss returns the batch mean; weight it by the batch size so
        # a smaller final batch does not count as much as a full one.
        loss_sum += model.compute_loss(Y_hat, Y_onehot) * batch_size

        if train:
            model.update_parameters(model.backward(Y_onehot, Y_hat))

        # Accuracy uses the predictions made before this batch's update.
        n_correct += int(np.sum(np.argmax(Y_hat, axis=1) == Y_labels))
        n_seen += batch_size

    if n_seen == 0:
        raise ValueError("loader yielded no samples; cannot compute loss or accuracy")

    return loss_sum / n_seen, n_correct / n_seen


def train_model(model, train_loader, val_loader, epochs=5):
    """Train ``model`` with mini-batch gradient descent and track metrics.

    Args:
        model: object exposing ``forward``, ``compute_loss``, ``backward``
            and ``update_parameters`` (e.g. ``NeuralNetwork``).
        train_loader: iterable of ``(images, labels)`` torch batches used for
            the parameter updates.
        val_loader: iterable of ``(images, labels)`` torch batches that is
            only evaluated, never trained on.
        epochs: number of full passes over ``train_loader``.

    Returns:
        Dict with the keys ``'train_loss'``, ``'train_acc'``, ``'val_loss'``
        and ``'val_acc'``, each a list holding one per-sample average per
        epoch.

    Raises:
        ValueError: if either loader yields no samples.
    """
    history = {
        'train_loss': [], 'train_acc': [],
        'val_loss': [], 'val_acc': []
    }

    for epoch in range(epochs):
        avg_train_loss, train_acc = _run_epoch(model, train_loader, train=True)
        avg_val_loss, val_acc = _run_epoch(model, val_loader, train=False)

        history['train_loss'].append(avg_train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(avg_val_loss)
        history['val_acc'].append(val_acc)

        print(f"Epoch {epoch+1}/{epochs} | "
              f"Train Loss: {avg_train_loss:.4f} | Train Acc: {train_acc:.4f} | "
              f"Val Loss: {avg_val_loss:.4f} | Val Acc: {val_acc:.4f}")

    return history


# Baseline run: 784 inputs (flattened 28x28 images), one hidden layer of 128
# ReLU units, 10 output classes.
model = NeuralNetwork(
    layer_sizes=[784, 128, 10],
    activation='relu',
    learning_rate=0.1,
)
history = train_model(model, train_loader, val_loader, epochs=5)

Epoch 1/5 | Train Loss: 0.3758 | Train Acc: 0.8936 | Val Loss: 0.2358 | Val Acc: 0.9306
Epoch 2/5 | Train Loss: 0.1996 | Train Acc: 0.9433 | Val Loss: 0.2018 | Val Acc: 0.9392
Epoch 3/5 | Train Loss: 0.1487 | Train Acc: 0.9577 | Val Loss: 0.1336 | Val Acc: 0.9588
Epoch 4/5 | Train Loss: 0.1199 | Train Acc: 0.9661 | Val Loss: 0.1105 | Val Acc: 0.9675
Epoch 5/5 | Train Loss: 0.1010 | Train Acc: 0.9715 | Val Loss: 0.1008 | Val Acc: 0.9713


**Experiments-**

Try out different hyperparameter configurations and log the results for each experiment. Since these results will be included in your submission files under the experiment section, ensure that all tables and plots are properly saved:

* Number of hidden layers
* Number of neurons per layer
* Activation functions (ReLU, Sigmoid, Tanh)

Use softmax activation in the output layer

In [6]:
# Experiment 1: one hidden layer of 128 units with ReLU.
print("1: ReLU, 1 Hidden Layer ")
model_1 = NeuralNetwork(
    layer_sizes=[784, 128, 10],
    activation='relu',
    learning_rate=0.1,
)
history_1 = train_model(model_1, train_loader, val_loader, epochs=5)

# Experiment 2: two hidden layers (128 then 64 units) with tanh.
print("\n2: Tanh, 2 Hidden Layers ")
model_2 = NeuralNetwork(
    layer_sizes=[784, 128, 64, 10],
    activation='tanh',
    learning_rate=0.1,
)
history_2 = train_model(model_2, train_loader, val_loader, epochs=5)

# Experiment 3: one hidden layer of 128 units with sigmoid.
print("\n3: Sigmoid, 1 Hidden Layer ")
model_3 = NeuralNetwork(
    layer_sizes=[784, 128, 10],
    activation='sigmoid',
    learning_rate=0.1,
)
history_3 = train_model(model_3, train_loader, val_loader, epochs=5)

1: ReLU, 1 Hidden Layer 
Epoch 1/5 | Train Loss: 0.3753 | Train Acc: 0.8950 | Val Loss: 0.2223 | Val Acc: 0.9349
Epoch 2/5 | Train Loss: 0.1987 | Train Acc: 0.9433 | Val Loss: 0.1593 | Val Acc: 0.9533
Epoch 3/5 | Train Loss: 0.1489 | Train Acc: 0.9572 | Val Loss: 0.1366 | Val Acc: 0.9595
Epoch 4/5 | Train Loss: 0.1192 | Train Acc: 0.9669 | Val Loss: 0.1076 | Val Acc: 0.9675
Epoch 5/5 | Train Loss: 0.1006 | Train Acc: 0.9718 | Val Loss: 0.0947 | Val Acc: 0.9723

2: Tanh, 2 Hidden Layers 
Epoch 1/5 | Train Loss: 0.3869 | Train Acc: 0.8938 | Val Loss: 0.2502 | Val Acc: 0.9297
Epoch 2/5 | Train Loss: 0.2085 | Train Acc: 0.9392 | Val Loss: 0.1887 | Val Acc: 0.9415
Epoch 3/5 | Train Loss: 0.1535 | Train Acc: 0.9558 | Val Loss: 0.1315 | Val Acc: 0.9607
Epoch 4/5 | Train Loss: 0.1223 | Train Acc: 0.9641 | Val Loss: 0.1226 | Val Acc: 0.9624
Epoch 5/5 | Train Loss: 0.1017 | Train Acc: 0.9698 | Val Loss: 0.1066 | Val Acc: 0.9659

3: Sigmoid, 1 Hidden Layer 
Epoch 1/5 | Train Loss: 0.7823 | Train 