# ***1 : Defining Neural Network Class***

In [20]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class NeuralNet(nn.Module):
    def __init__(self):
        super(NeuralNet, self).__init__()
        self.fc1 = nn.Linear(784, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 10)

    def forward(self, x):

        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# ***2 : Flattening the Image***

In [21]:
flatten = nn.Flatten()
sample_img, _ = next(iter(train_loader))
flattened_img = flatten(sample_img)
print(flattened_img.shape)

torch.Size([64, 784])


# ***3 : Adding Hidden Layer***

In [22]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(784, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# ***4 : Adding Output Layer***

In [23]:
class SimpleNN(nn.Module):
    def __init__(self):
        super(SimpleNN, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(784, 128)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.flatten(x)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# ***5 : Initializing weights***

In [24]:
torch.manual_seed(42)

def initialize_weights(model):
    for m in model.modules():
        if isinstance(m, nn.Linear):
            nn.init.xavier_uniform_(m.weight)
            nn.init.zeros_(m.bias)

model = SimpleNN()
initialize_weights(model)

# ***6 : Using CrossEntropyLoss***

In [25]:
criterion = nn.CrossEntropyLoss()

# ***7 : Configuring Adam Optimizer***

In [26]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# ***8 : Initial Loss Before Training***

In [27]:
images, labels = next(iter(train_loader))
outputs = model(images)
initial_loss = criterion(outputs, labels)
print("Initial Loss:", initial_loss.item())

Initial Loss: 2.4456543922424316


# ***9 : Training LOOP***

In [29]:
num_epochs = 5

for epoch in range(num_epochs):
    for images, labels in train_loader:
        outputs = model(images)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}")

Epoch [1/5], Loss: 0.0142
Epoch [2/5], Loss: 0.0089
Epoch [3/5], Loss: 0.0582
Epoch [4/5], Loss: 0.0123
Epoch [5/5], Loss: 0.0364


# ***10 : Evaluating Model Performance***

In [30]:
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = 100 * correct / total
print(f"Test Accuracy: {accuracy:.2f}%")

Test Accuracy: 97.74%


In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

all_labels = []
all_preds = []

with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        all_labels.extend(labels.numpy())
        all_preds.extend(predicted.numpy())

accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='macro')
recall = recall_score(all_labels, all_preds, average='macro')
f1 = f1_score(all_labels, all_preds, average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

Accuracy: 0.9774
Precision: 0.9775
Recall: 0.9772
F1-Score: 0.9773


# ***11 : Adding a Hidden Layer***

In [32]:
class DeepNN(nn.Module):
    def __init__(self):
        super(DeepNN, self).__init__()
        self.flatten = nn.Flatten()
        self.fc1 = nn.Linear(784, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 64)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = self.flatten(x)
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.fc3(x)
        return x

# ***12 : Comparing Training Loss and Metric Changes***

In [33]:
model = DeepNN()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

num_epochs = 5
for epoch in range(num_epochs):
    running_loss = 0.0
    for images, labels in train_loader:
        outputs = model(images)
        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {avg_loss:.4f}")

all_labels = []
all_preds = []
with torch.no_grad():
    for images, labels in test_loader:
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        all_labels.extend(labels.numpy())
        all_preds.extend(predicted.numpy())

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(all_labels, all_preds)
precision = precision_score(all_labels, all_preds, average='macro')
recall = recall_score(all_labels, all_preds, average='macro')
f1 = f1_score(all_labels, all_preds, average='macro')

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")


Epoch [1/5], Average Loss: 0.2976
Epoch [2/5], Average Loss: 0.1123
Epoch [3/5], Average Loss: 0.0741
Epoch [4/5], Average Loss: 0.0550
Epoch [5/5], Average Loss: 0.0401
Accuracy: 0.9783
Precision: 0.9784
Recall: 0.9781
F1-Score: 0.9782


### ðŸ§© Step: Discussion â€” Trade-offs Between Computation and Accuracy

Adding extra hidden layers or increasing neuron counts often improves a modelâ€™s ability to learn complex patterns, but it comes with costs:

#### 1. **Computation Time**

* More layers and parameters mean more multiplications during training.
* Each epoch takes longer to complete, especially on CPUs or smaller GPUs.

#### 2. **Memory Usage**

* Larger models consume more GPU or RAM memory for weights, gradients, and activations.
* This can cause slower data loading and even out-of-memory errors in limited environments.

#### 3. **Accuracy and Generalization**

* While deeper or wider networks may achieve higher training accuracy, they risk overfitting â€” performing worse on unseen data.
* Smaller models might generalize better if the dataset is simple (like MNIST).

#### 4. **Optimal Balance**

* The best architecture finds balance â€” minimal complexity for maximum performance.
* For MNIST, 1â€“2 hidden layers with 128â€“256 neurons typically reach >97% accuracy without unnecessary computation.

# ***13 : Saving Model State***

In [34]:
torch.save(model.state_dict(), "mnist_model_state_dict.pth")
print("Model state dictionary saved successfully.")

Model state dictionary saved successfully.


In [35]:
from google.colab import files
files.download("mnist_model_state_dict.pth")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>