# Task B
In general, a basic neural network architecture consists of an input
layer, one or more hidden layers, and an output layer.

You are supposed to build 15 distinct artificial neural network classifiers by varying one or
more parameters from the following list:

- (i) Number of hidden layers: 2 or 3
- (ii) Total number of neurons in the hidden layer: 100 or 150
- (iii) Activation function: any one of tanh, sigmoid, or ReLU

---

You need to train your network on the MNIST dataset. You can use any optimization algorithm,
such as stochastic gradient descent or the Adam optimizer. You need to evaluate your network's
performance on a test set of images from the MNIST dataset. You can calculate the accuracy and
the confusion matrix to measure your network's performance.
Perform a comparative study of these 15 models and determine the best classifier. Do you have
a classifier that is not statistically significantly different from the best classifier? Detail
the results with full explanations.

## Importing the Libraries

In [None]:
# Importing PyTorch
import torch
from torch import nn
import numpy as np
import matplotlib.pyplot as plt
import sklearn.metrics as metrics

# Importing TorchVision
import torchvision
from torchvision import datasets
from torchvision.transforms import ToTensor

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Report the installed torch / torchvision versions so that runs can be reproduced.
print(f"PyTorch version: {torch.__version__}\ntorchvision version: {torchvision.__version__}")

## Importing the Dataset

In [None]:
# Load the MNIST training split (train=True) and test split (train=False).
# Both are stored under ./data, fetched with download=True, and every image is
# converted to a tensor by ToTensor(); labels are left untouched.
train_data, test_data = (
    datasets.MNIST(
        root='data',
        train=is_train,
        download=True,
        transform=ToTensor(),
        target_transform=None,
    )
    for is_train in (True, False)
)

In [None]:
# Inspect the first training sample; indexing the dataset yields an
# (image, label) pair.
first_sample = train_data[0]
image = first_sample[0]
label = first_sample[1]

# Plot the first channel of the image as a matrix
plt.matshow(image[0])

# Print the class label that belongs to the image
print(label)

In [None]:
# Display the shape of the sample image tensor loaded in the previous cell.
image.shape

In [None]:
# Display the dataset sizes, in this order:
# train images, test images, train labels, test labels.
len(train_data.data), len(test_data.data), len(train_data.targets), len(test_data.targets)

## Loading the Data

In [None]:
# Wrap the datasets in DataLoaders, which hand the data back as iterable batches.
# The training loader shuffles and serves batches of BATCH_SIZE samples.
# The test loader keeps the original order and uses the DataLoader default
# batch size of 1 (made explicit here), so each test batch holds one image.

from torch.utils.data import DataLoader

# Single source of truth for the training batch size (used by the loader and the log line).
BATCH_SIZE = 100

train_data_loader = DataLoader(
    dataset=train_data,
    batch_size=BATCH_SIZE,
    shuffle=True
)

test_data_loader = DataLoader(
    dataset=test_data,
    batch_size=1,
    shuffle=False
)

print(f"Dataloaders: {train_data_loader}, {test_data_loader}")
print(f"Length of train dataloader: {len(train_data_loader)} batches of {BATCH_SIZE}")
print(f"Length of test dataloader: {len(test_data_loader)}")


## Making the Model

In [None]:
class MNISTModel(nn.Module):
    """Fully connected classifier with a configurable stack of hidden layers.

    The network flattens its input, applies ``hidden_layers`` blocks of
    ``Linear -> activation`` (each ``hidden_units`` wide) and finishes with a
    ``Linear`` projection to ``output_shape`` followed by ``LogSoftmax`` over
    the class dimension, so ``forward`` returns log-probabilities.

    Args:
        input_shape: Number of features per sample after flattening.
        hidden_layers: Number of hidden layers to build (0 gives a direct
            input-to-output projection).
        hidden_units: Number of neurons in every hidden layer.
        activation_func: Activation used after every hidden layer:
            ``"t"`` (tanh), ``"s"`` (sigmoid) or ``"r"`` (ReLU).
        output_shape: Number of output classes.

    Raises:
        ValueError: If ``activation_func`` is not a known code or
            ``hidden_layers`` is negative.
    """

    # Maps the single-letter activation codes to their module classes.
    _ACTIVATIONS = {"t": nn.Tanh, "s": nn.Sigmoid, "r": nn.ReLU}

    def __init__(self, input_shape, hidden_layers, hidden_units, activation_func, output_shape):
        # Call the super class's init function
        super().__init__()

        # Fail loudly instead of silently building a network without non-linearities.
        if activation_func not in self._ACTIVATIONS:
            raise ValueError(
                f"Unknown activation_func {activation_func!r}; "
                f"expected one of {sorted(self._ACTIVATIONS)}"
            )
        if hidden_layers < 0:
            raise ValueError(f"hidden_layers must be >= 0, got {hidden_layers}")

        activation = self._ACTIVATIONS[activation_func]
        self.hidden_units = hidden_units

        layers = [nn.Flatten()]
        in_features = input_shape
        # Exactly `hidden_layers` hidden blocks, all using the requested
        # activation (the first block included).
        for _ in range(hidden_layers):
            layers.append(nn.Linear(in_features=in_features, out_features=hidden_units))
            layers.append(activation())
            in_features = hidden_units

        layers.append(nn.Linear(in_features=in_features, out_features=output_shape))
        # dim=1 normalises across classes for (batch, classes) inputs; leaving
        # dim implicit is deprecated in PyTorch.
        layers.append(nn.LogSoftmax(dim=1))

        self.layer_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return per-class log-probabilities for the batch ``x``."""
        return self.layer_stack(x)

## Instantiating the Model and Defining the Loss Function

In [None]:
# Configuration of the single reference model trained in the next cells.
model_config = {
    "input_shape": 784,
    "hidden_layers": 3,
    "hidden_units": 100,
    "activation_func": "s",
    "output_shape": 10,
}
mnistModel = MNISTModel(**model_config)

# Keep the model on the CPU; as the last expression of the cell this also
# displays the module returned by .to().
mnistModel.to('cpu')

In [None]:
# Optimiser (Adam over all model parameters) and training objective
# (cross-entropy) for the single-model run.
learning_rate = 0.001
optimizer = torch.optim.Adam(params=mnistModel.parameters(), lr=learning_rate)
loss_fn = nn.CrossEntropyLoss()

## Training the Model

In [None]:
# ---------Training---------
epochs = 10

# One pass over the whole training loader per epoch.
for epoch in range(epochs):
    # Training mode does not change between batches, so set it once per epoch
    # rather than on every step.
    mnistModel.train()

    for X, y in train_data_loader:
        # Clear the gradients left over from the previous step
        optimizer.zero_grad()
        # Forward pass
        y_pred = mnistModel(X)
        # Loss between the predictions and the true labels
        loss = loss_fn(y_pred, y)
        # Back-propagate and update the weights
        loss.backward()
        optimizer.step()

        

## Testing the Model

### Checking Accuracy

In [None]:
# Evaluate the trained model on the test set: collect the true and predicted
# labels for every sample and report the overall accuracy.
mnistModel.eval()

y_pred = []
y_true = []

with torch.no_grad():
    # We loop through all the test data
    for images, labels in test_data_loader:
        # Per-class scores for every image in the batch
        outputs = mnistModel(images)
        # The predicted digit is the class with the highest score
        predicted = outputs.argmax(dim=1)
        # Record every sample of the batch (not just the first one) as plain ints
        y_pred.extend(predicted.tolist())
        y_true.extend(labels.tolist())

n_samples = len(y_true)
n_correct = sum(pred == true for pred, true in zip(y_pred, y_true))

# Guard against an empty test loader to avoid dividing by zero.
if n_samples:
    print(f"Accuracy on the test set: {100.0 * n_correct / n_samples:.2f}% ({n_correct}/{n_samples})")

In [None]:
import sklearn.metrics as metrics

# Print the confusion matrix explicitly: a bare expression is only displayed
# by the notebook when it is the last statement of the cell.
print("Confusion matrix for ANN :")
print(metrics.confusion_matrix(y_true, y_pred))

# Per-class precision / recall / F1 summary.
print(f"Classification report for ANN :\n{metrics.classification_report(y_true, y_pred)}\n")


## Iterating through all models


In [None]:
import itertools

# Hyper-parameter grid: every combination below is trained and evaluated.
# NOTE(review): this grid has 2 * 2 * 3 = 12 combinations, whereas the task
# statement asks for 15 distinct classifiers -- confirm which additional
# configurations are intended.
hidden_sizes = [2, 3]
neurons_in_hidden_layers = [100, 150]
activation_functions = ["t", "s", "r"]
epochs = 100
model_no = 0

# One entry per trained model, kept so the models can be compared afterwards.
results = []


def _train_model(model, data_loader, loss_fn, optimizer, n_epochs):
    """Train ``model`` in place for ``n_epochs`` passes over ``data_loader``."""
    for _ in range(n_epochs):
        # Training mode is loop-invariant across batches; set it once per epoch.
        model.train()
        for X, y in data_loader:
            # Clear the gradients left over from the previous step
            optimizer.zero_grad()
            loss = loss_fn(model(X), y)
            # Back-propagate and update the weights
            loss.backward()
            optimizer.step()


def _collect_predictions(model, data_loader):
    """Return ``(y_true, y_pred)`` as lists of ints covering every sample."""
    true_labels = []
    predicted_labels = []
    model.eval()
    with torch.no_grad():
        for images, labels in data_loader:
            # The predicted digit is the class with the highest score
            predicted = model(images).argmax(dim=1)
            # Record the whole batch, so this works for any batch size
            predicted_labels.extend(predicted.tolist())
            true_labels.extend(labels.tolist())
    return true_labels, predicted_labels


# itertools.product iterates in the same order as the equivalent nested loops
# (hidden_sizes outermost, activation_functions innermost).
grid = itertools.product(hidden_sizes, neurons_in_hidden_layers, activation_functions)

for model_no, (hs, nihl, af) in enumerate(grid, start=1):
    mnistModel = MNISTModel(
        input_shape=784,
        hidden_layers=hs,
        hidden_units=nihl,
        output_shape=10,
        activation_func=af
    )

    print(f"Model #{model_no}")

    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(mnistModel.parameters(), lr=0.001)

    # ---------Training---------
    _train_model(mnistModel, train_data_loader, loss_fn, optimizer, epochs)

    # ---------Testing---------
    y_true, y_pred = _collect_predictions(mnistModel, test_data_loader)

    n_correct = sum(pred == true for pred, true in zip(y_pred, y_true))
    # Guard against an empty test loader to avoid dividing by zero.
    accuracy = n_correct / len(y_true) if y_true else 0.0
    results.append(
        {
            "model_no": model_no,
            "hidden_layers": hs,
            "hidden_units": nihl,
            "activation_func": af,
            "accuracy": accuracy,
        }
    )

    # Print the confusion matrix explicitly; a bare expression inside a loop
    # is never displayed by the notebook.
    print(f"Accuracy: {100.0 * accuracy:.2f}%")
    print(metrics.confusion_matrix(y_true, y_pred))
    report = metrics.classification_report(y_true, y_pred)
    print(f"Classification report for Hidden Layers: {hs}, Neurons in Each Hidden Layer: {nihl}, Activation Function: {af} :\n{report}\n")

# Summarise the comparison: the configuration with the highest test accuracy.
if results:
    best = max(results, key=lambda entry: entry["accuracy"])
    print(
        f"Best model: #{best['model_no']} "
        f"(Hidden Layers: {best['hidden_layers']}, "
        f"Neurons in Each Hidden Layer: {best['hidden_units']}, "
        f"Activation Function: {best['activation_func']}) "
        f"with accuracy {100.0 * best['accuracy']:.2f}%"
    )