# Neural Networks Practical Workshop

In this workshop, we'll explore neural networks from the ground up. We'll first implement a neural network from scratch using NumPy, then use PyTorch for more efficient implementations.

## 1. Setup and Libraries

In [None]:
# Import necessary libraries
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# For reproducibility: seed NumPy's global RNG (used by the from-scratch
# networks for weight initialisation and shuffling) and PyTorch's default
# generator (used for layer initialisation and DataLoader shuffling).
np.random.seed(42)
torch.manual_seed(42)

## 2. Neural Network from Scratch using NumPy

Let's start by implementing a simple neural network from scratch to understand the fundamental concepts.

In [None]:
class NeuralNetworkFromScratch:
    """Minimal two-layer classifier implemented with NumPy.

    Architecture: input -> dense + sigmoid -> dense + softmax. Training is
    full-batch gradient descent on the cross-entropy loss.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the parameters.

        Weights are drawn from a standard normal scaled by 0.01; biases start
        at zero and are stored as row vectors so they broadcast over a batch.
        """
        self.W1 = np.random.randn(input_size, hidden_size) * 0.01
        self.b1 = np.zeros((1, hidden_size))
        self.W2 = np.random.randn(hidden_size, output_size) * 0.01
        self.b2 = np.zeros((1, output_size))

    @staticmethod
    def _one_hot(y, num_classes):
        """Return a ``(n_samples, num_classes)`` one-hot matrix for integer labels ``y``."""
        labels = np.asarray(y).reshape(-1)
        encoded = np.zeros((labels.size, num_classes))
        encoded[np.arange(labels.size), labels] = 1
        return encoded

    def sigmoid(self, x):
        """Element-wise logistic function.

        Uses the identity ``sigmoid(x) = 0.5 * (1 + tanh(x / 2))`` instead of
        ``1 / (1 + exp(-x))`` so large-magnitude inputs saturate to 0 or 1
        without overflowing ``exp``.
        """
        return 0.5 * (1.0 + np.tanh(0.5 * x))

    def sigmoid_derivative(self, x):
        """Derivative of the sigmoid, expressed in terms of its *output*.

        ``x`` must already be ``sigmoid(z)`` (as in ``backward``, which passes
        ``self.a1``), not the pre-activation ``z``.
        """
        return x * (1 - x)

    def softmax(self, x):
        """Row-wise softmax; the row maximum is subtracted first for stability."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        """Run a forward pass and return the class probabilities.

        The intermediate values ``z1``, ``a1``, ``z2`` and ``a2`` are cached on
        the instance because ``backward`` reads ``a1``.
        """
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.sigmoid(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.softmax(self.z2)
        return self.a2

    def backward(self, X, y, output):
        """Return ``(dW1, db1, dW2, db2)`` for the batch ``(X, y)``.

        ``output`` is the result of ``forward(X)``; ``forward`` must have been
        called on the same ``X`` so the cached ``a1`` matches.

        Note: the gradients are *summed* over the batch rather than averaged,
        so the effective step taken in ``train`` scales with the batch size.
        """
        y_one_hot = self._one_hot(y, output.shape[1])

        # Softmax + cross-entropy: the gradient w.r.t. the logits is p - y
        dz2 = output - y_one_hot
        dW2 = np.dot(self.a1.T, dz2)
        db2 = np.sum(dz2, axis=0, keepdims=True)

        # Propagate through the hidden sigmoid layer
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * self.sigmoid_derivative(self.a1)
        dW1 = np.dot(X.T, dz1)
        db1 = np.sum(dz1, axis=0, keepdims=True)

        return dW1, db1, dW2, db2

    def train(self, X, y, learning_rate=0.1, epochs=1000):
        """Train with full-batch gradient descent.

        Prints the loss every 100 epochs and returns the list of per-epoch
        mean cross-entropy losses (computed before each update).
        """
        losses = []
        # The targets never change, so encode them once instead of every epoch.
        # The number of classes equals the width of the output layer.
        y_one_hot = self._one_hot(y, self.W2.shape[1])
        n_samples = y_one_hot.shape[0]

        for epoch in range(epochs):
            output = self.forward(X)

            # 1e-8 keeps log() finite if a probability underflows to zero
            loss = -np.sum(y_one_hot * np.log(output + 1e-8)) / n_samples
            losses.append(loss)

            dW1, db1, dW2, db2 = self.backward(X, y, output)

            self.W1 -= learning_rate * dW1
            self.b1 -= learning_rate * db1
            self.W2 -= learning_rate * dW2
            self.b2 -= learning_rate * db2

            if epoch % 100 == 0:
                print(f"Epoch {epoch}, Loss: {loss:.4f}")

        return losses

    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        output = self.forward(X)
        return np.argmax(output, axis=1)

### 2.1 Test our Neural Network from Scratch with a Toy Dataset

In [None]:
# Create a simple dataset (XOR problem): four 2-bit inputs, labelled 1 when
# exactly one of the two bits is set
X = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
y = np.array([0, 1, 1, 0])

# Initialize and train the network (2 inputs -> 4 hidden units -> 2 classes)
nn_scratch = NeuralNetworkFromScratch(input_size=2, hidden_size=4, output_size=2)
losses = nn_scratch.train(X, y, learning_rate=0.5, epochs=5000)

# Plot the loss curve (one value per epoch, as returned by train)
plt.figure(figsize=(10, 6))
plt.plot(losses)
plt.title('Loss over training epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.grid(True)
plt.show()

# Test the network on the same four points it was trained on, so the figure
# printed below is training accuracy rather than a generalisation measure
predictions = nn_scratch.predict(X)
print("Predictions:", predictions)
print("Actual:     ", y)
print(f"Accuracy: {np.mean(predictions == y) * 100:.2f}%")

## 3. Load the MNIST Dataset for a Real-World Example

In [None]:
# Preprocessing pipeline: convert each image to a tensor, then normalise its
# single channel (0.1307 / 0.3081 are presumably the MNIST pixel mean and
# standard deviation -- confirm against the dataset statistics)
transform = transforms.Compose(
    [
        transforms.ToTensor(),
        transforms.Normalize(mean=(0.1307,), std=(0.3081,)),
    ]
)

# Datasets live under ./data; only the training split requests a download
train_dataset = datasets.MNIST(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.MNIST(root='./data', train=False, transform=transform)

# Mini-batch loaders: the training stream is shuffled, the test stream is not
batch_size = 64
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

### 3.1 Visualize Some MNIST Examples

In [None]:
# Display a few examples: pull one (shuffled) batch from the training loader
examples = iter(train_loader)
images, labels = next(examples)

# Show the first nine images of that batch in a 3x3 grid
plt.figure(figsize=(12, 8))
for i in range(9):
    plt.subplot(3, 3, i+1)
    # images[i] is indexed once more with [0] to select the channel to draw
    plt.imshow(images[i][0], cmap='gray')
    plt.title(f'Label: {labels[i]}')
    plt.axis('off')
plt.tight_layout()
plt.show()

## 4. Neural Network with PyTorch

Now that we understand the fundamentals, let's use PyTorch to build a more efficient neural network.

In [None]:
class NeuralNetwork(nn.Module):
    """Fully connected 784 -> 128 -> 64 -> 10 classifier with ReLU activations.

    ``forward`` flattens its input and returns raw logits (no softmax).
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        # Build the layer list first so the stack reads top-to-bottom
        layers = [
            nn.Linear(28 * 28, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` per sample and return the logits of the dense stack."""
        return self.linear_relu_stack(self.flatten(x))

# Pick the GPU when one is available, otherwise stay on the CPU, and place
# the model's parameters on that device
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = NeuralNetwork().to(device=device)

# Cross-entropy on the raw logits, optimised with SGD plus momentum
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(params=model.parameters(), lr=0.01, momentum=0.9)

### 4.1 Training and Testing Functions

In [None]:
def train(model, dataloader, loss_fn, optimizer, device):
    """Run one training epoch and return ``(epoch_loss, accuracy)``.

    Args:
        model: network to optimise; it is switched to training mode.
        dataloader: sized iterable of ``(inputs, labels)`` batches.
        loss_fn: criterion called as ``loss_fn(predictions, labels)``. It is
            assumed to return the *mean* loss over the batch (the default for
            ``nn.CrossEntropyLoss``).
        optimizer: optimizer already bound to ``model``'s parameters.
        device: device the batches are moved to before the forward pass.

    Returns:
        Tuple of the mean loss per sample and the accuracy in percent, both
        measured over every sample seen during the epoch.

    Raises:
        ZeroDivisionError: if ``dataloader`` yields no samples.
    """
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    num_batches = len(dataloader)

    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Forward pass
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Weight each batch mean by its size so a short final batch does not
        # count as much as a full one in the epoch average
        batch_count = y.size(0)
        batch_loss = loss.item()
        running_loss += batch_loss * batch_count
        total += batch_count
        correct += (pred.argmax(dim=1) == y).sum().item()

        if batch % 100 == 0:
            print(f"Batch [{batch}/{num_batches}] Loss: {batch_loss:.4f}")

    epoch_loss = running_loss / total
    accuracy = 100 * correct / total
    return epoch_loss, accuracy

def test(model, dataloader, loss_fn, device):
    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            test_loss += loss_fn(pred, y).item()
            _, predicted = torch.max(pred.data, 1)
            total += y.size(0)
            correct += (predicted == y).sum().item()
    
    test_loss /= len(dataloader)
    accuracy = 100 * correct / total
    print(f"Test Error: \n Accuracy: {accuracy:.2f}%, Avg loss: {test_loss:.4f}")
    return test_loss, accuracy

### 4.2 Train the PyTorch Model

In [None]:
# Train for a fixed number of epochs, recording train and test metrics after each
epochs = 5
train_losses, train_accuracies = [], []
test_losses, test_accuracies = [], []

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}\n-------------------------------")
    train_loss, train_acc = train(model, train_loader, loss_fn, optimizer, device)
    test_loss, test_acc = test(model, test_loader, loss_fn, device)

    # Extend the histories that the plotting cell reads
    train_losses += [train_loss]
    test_losses += [test_loss]
    train_accuracies += [train_acc]
    test_accuracies += [test_acc]

print("Done!")

### 4.3 Visualize Results

In [None]:
# One figure with two side-by-side panels: loss (left) and accuracy (right).
# The x axis is the 1-based epoch number; each history holds one value per epoch.
plt.figure(figsize=(12, 5))

# Plot losses
plt.subplot(1, 2, 1)
plt.plot(range(1, epochs+1), train_losses, label='Train Loss')
plt.plot(range(1, epochs+1), test_losses, label='Test Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Test Loss')
plt.legend()
plt.grid(True)

# Plot accuracies
plt.subplot(1, 2, 2)
plt.plot(range(1, epochs+1), train_accuracies, label='Train Accuracy')
plt.plot(range(1, epochs+1), test_accuracies, label='Test Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.title('Training and Test Accuracy')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

### 4.4 Visualize Model Predictions

In [None]:
# Get a batch of test images (the first batch; the test loader is not shuffled)
test_examples = iter(test_loader)
test_images, test_labels = next(test_examples)

# Make predictions. Switch to evaluation mode explicitly instead of relying
# on whatever mode the last train()/test() call left the model in; this
# matters as soon as the model contains layers such as Dropout.
model.eval()
with torch.no_grad():
    test_images_device = test_images.to(device)
    predictions = model(test_images_device)
    # Index of the largest logit per row is the predicted class
    predicted_classes = torch.max(predictions, 1)[1]

# Display the first nine images with their true and predicted labels.
# The CPU copy of the images is used for plotting.
plt.figure(figsize=(12, 8))
for i in range(9):
    plt.subplot(3, 3, i+1)
    plt.imshow(test_images[i][0], cmap='gray')
    plt.title(f'Actual: {test_labels[i]}, Predicted: {predicted_classes[i].cpu().numpy()}')
    plt.axis('off')
plt.tight_layout()
plt.show()

## 5. Experimentation

Now it's your turn to experiment. Try changing these aspects of the model:

1. Number of hidden layers and neurons
2. Activation functions (ReLU, Tanh, Sigmoid, etc.)
3. Learning rate and optimizer
4. Batch size
5. Regularization techniques (Dropout, L2 regularization)

Note how these changes affect the model's performance and training time.

In [None]:
# Example: the same 784 -> 128 -> 64 -> 10 network, with dropout after each
# hidden activation for regularization
class NeuralNetworkWithDropout(nn.Module):
    """Dense ReLU classifier with a Dropout layer after each hidden activation.

    ``dropout_rate`` is the probability passed to both ``nn.Dropout`` layers.
    ``forward`` flattens its input and returns raw logits.
    """

    def __init__(self, dropout_rate=0.2):
        super().__init__()
        self.flatten = nn.Flatten()
        layers = [
            nn.Linear(28 * 28, 128),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(64, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` per sample and return the logits of the dense stack."""
        return self.linear_relu_stack(self.flatten(x))

# Build the dropout variant (30% drop probability) on the selected device
model_dropout = NeuralNetworkWithDropout(dropout_rate=0.3).to(device=device)

# Pair it with Adam and a smaller learning rate than the SGD run
optimizer_dropout = optim.Adam(params=model_dropout.parameters(), lr=0.001)

# You can now train this new model using the same training loop

## 6. Summary and Conclusions

In this workshop, we've:

1. Implemented a neural network from scratch to understand the fundamentals
2. Used PyTorch to build and train a more efficient model
3. Visualized the model's performance and predictions
4. Experimented with different model architectures and hyperparameters

Key takeaways:
- The fundamental operations (forward pass, activation, backpropagation) are the same in both implementations
- PyTorch provides automatic differentiation and optimized operations for faster training
- Model architecture and hyperparameters significantly impact performance
- Understanding the underlying principles helps in designing and debugging neural networks

## 7. Custom Network vs PyTorch: Direct Comparison

Let's directly compare a neural network built from scratch with a PyTorch model using identical architectures on the MNIST dataset. This will help us understand the differences in implementation, performance, and training speed.

In [None]:
# Prepare MNIST data for the custom model (flattened format)
import time
from sklearn.metrics import accuracy_score

# Get a smaller subset of MNIST for faster training
train_subset_size = 5000
test_subset_size = 1000


def _collect_flat_subset(loader, subset_size):
    """Collect the first ``subset_size`` examples yielded by ``loader``.

    Returns two lists of NumPy arrays, one entry per consumed batch: the
    images flattened to one row per sample, and the matching labels. The
    running total counts *samples* (not batches), so iteration stops as soon
    as ``subset_size`` examples have been gathered.
    """
    sample_chunks = []
    label_chunks = []
    collected = 0

    for images, labels in loader:
        # Take the whole batch, or only what is still missing from the subset
        take = min(len(labels), subset_size - collected)
        if take <= 0:
            break
        sample_chunks.append(images[:take].reshape(take, -1).cpu().numpy())
        label_chunks.append(labels[:take].cpu().numpy())
        collected += take
        if collected >= subset_size:
            break

    return sample_chunks, label_chunks


# Extract a subset of training data
train_samples, train_labels = _collect_flat_subset(train_loader, train_subset_size)

# Extract a subset of test data
test_samples, test_labels = _collect_flat_subset(test_loader, test_subset_size)

# Convert lists of arrays to single arrays
X_train = np.vstack(train_samples)
y_train = np.concatenate(train_labels)
X_test = np.vstack(test_samples)
y_test = np.concatenate(test_labels)

print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train.shape}")
print(f"Test data shape: {X_test.shape}")
print(f"Test labels shape: {y_test.shape}")

In [None]:
# Define a custom neural network with the same architecture as our PyTorch model
class MNISTNeuralNetworkFromScratch:
    """Three-layer NumPy classifier: dense+ReLU, dense+ReLU, dense+softmax.

    Trained with mini-batch gradient descent (no momentum) on the mean
    cross-entropy loss.
    """

    def __init__(self, input_size=784, hidden1_size=128, hidden2_size=64, output_size=10):
        """Create the parameters.

        Each weight matrix is drawn from a standard normal and scaled by
        ``sqrt(1 / fan_in)`` (Xavier-style initialization); biases start at
        zero and are stored as row vectors so they broadcast over a batch.
        """
        self.W1 = np.random.randn(input_size, hidden1_size) * np.sqrt(1 / input_size)
        self.b1 = np.zeros((1, hidden1_size))
        self.W2 = np.random.randn(hidden1_size, hidden2_size) * np.sqrt(1 / hidden1_size)
        self.b2 = np.zeros((1, hidden2_size))
        self.W3 = np.random.randn(hidden2_size, output_size) * np.sqrt(1 / hidden2_size)
        self.b3 = np.zeros((1, output_size))

    @staticmethod
    def _one_hot(y, num_classes):
        """Return a ``(y.size, num_classes)`` one-hot matrix for integer labels ``y``."""
        encoded = np.zeros((y.size, num_classes))
        encoded[np.arange(y.size), y] = 1
        return encoded

    def relu(self, x):
        """Element-wise ReLU: ``max(0, x)``."""
        return np.maximum(0, x)

    def relu_derivative(self, x):
        """Return 1 where ``x > 0`` and 0 elsewhere.

        ``backward`` passes the ReLU *outputs* here, which is equivalent to
        passing the pre-activations because both are positive at the same
        positions.
        """
        return np.where(x > 0, 1, 0)

    def softmax(self, x):
        """Row-wise softmax; the row maximum is subtracted first for stability."""
        exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
        return exp_x / np.sum(exp_x, axis=1, keepdims=True)

    def forward(self, X):
        """Run a forward pass and return the class probabilities.

        Intermediate values (``z1``..``z3``, ``a1``..``a3``) are cached on the
        instance because ``backward`` reads ``a1`` and ``a2``.
        """
        self.z1 = np.dot(X, self.W1) + self.b1
        self.a1 = self.relu(self.z1)
        self.z2 = np.dot(self.a1, self.W2) + self.b2
        self.a2 = self.relu(self.z2)
        self.z3 = np.dot(self.a2, self.W3) + self.b3
        self.a3 = self.softmax(self.z3)
        return self.a3

    def backward(self, X, y, output, learning_rate=0.01):
        """Backpropagate one batch and update all parameters in place.

        ``output`` is the result of ``forward(X)``; ``forward`` must have been
        called on the same ``X`` so the cached activations match. Gradients
        are averaged over the batch. Returns nothing.
        """
        batch_size = X.shape[0]
        y_one_hot = self._one_hot(y, output.shape[1])

        # Output layer: softmax + cross-entropy gives p - y w.r.t. the logits
        dz3 = output - y_one_hot
        dW3 = (1/batch_size) * np.dot(self.a2.T, dz3)
        db3 = (1/batch_size) * np.sum(dz3, axis=0, keepdims=True)

        # Second hidden layer
        da2 = np.dot(dz3, self.W3.T)
        dz2 = da2 * self.relu_derivative(self.a2)
        dW2 = (1/batch_size) * np.dot(self.a1.T, dz2)
        db2 = (1/batch_size) * np.sum(dz2, axis=0, keepdims=True)

        # First hidden layer
        da1 = np.dot(dz2, self.W2.T)
        dz1 = da1 * self.relu_derivative(self.a1)
        dW1 = (1/batch_size) * np.dot(X.T, dz1)
        db1 = (1/batch_size) * np.sum(dz1, axis=0, keepdims=True)

        # All gradients are computed before any weight changes, so the order
        # of the updates below does not matter
        self.W3 -= learning_rate * dW3
        self.b3 -= learning_rate * db3
        self.W2 -= learning_rate * dW2
        self.b2 -= learning_rate * db2
        self.W1 -= learning_rate * dW1
        self.b1 -= learning_rate * db1

    def compute_loss(self, y_true, y_pred):
        """Return the mean cross-entropy of probabilities ``y_pred`` against labels ``y_true``."""
        y_one_hot = self._one_hot(y_true, y_pred.shape[1])
        # 1e-8 keeps log() finite if a probability underflows to zero
        loss = -np.mean(np.sum(y_one_hot * np.log(y_pred + 1e-8), axis=1))
        return loss

    def train(self, X, y, batch_size=64, epochs=10, learning_rate=0.01):
        """Train with shuffled mini-batches.

        Every sample is visited once per epoch: when ``len(X)`` is not a
        multiple of ``batch_size`` the final, shorter batch is still used, so
        the reported per-sample loss and accuracy cover the whole dataset.

        Returns:
            ``(losses, accuracies, training_time)``: per-epoch mean loss,
            per-epoch training accuracy in percent, and elapsed wall-clock
            seconds. Progress is printed after every epoch.
        """
        n_samples = X.shape[0]

        # Keep track of metrics
        losses = []
        accuracies = []

        start_time = time.time()

        for epoch in range(epochs):
            epoch_loss = 0
            epoch_correct = 0
            indices = np.random.permutation(n_samples)
            X_shuffled = X[indices]
            y_shuffled = y[indices]

            # Stepping by batch_size (instead of looping n_samples // batch_size
            # times) keeps the trailing partial batch in the epoch
            for start_idx in range(0, n_samples, batch_size):
                end_idx = min(start_idx + batch_size, n_samples)
                X_batch = X_shuffled[start_idx:end_idx]
                y_batch = y_shuffled[start_idx:end_idx]

                # Forward pass
                y_pred = self.forward(X_batch)

                # Accumulate the loss weighted by the actual batch length
                batch_loss = self.compute_loss(y_batch, y_pred)
                epoch_loss += batch_loss * (end_idx - start_idx)

                # Count correct predictions
                batch_preds = np.argmax(y_pred, axis=1)
                epoch_correct += np.sum(batch_preds == y_batch)

                # Backward pass (also applies the parameter update)
                self.backward(X_batch, y_batch, y_pred, learning_rate)

            # Calculate epoch metrics
            epoch_loss /= n_samples
            epoch_accuracy = epoch_correct / n_samples * 100

            losses.append(epoch_loss)
            accuracies.append(epoch_accuracy)

            # Print progress
            print(f"Epoch {epoch+1}/{epochs} - Loss: {epoch_loss:.4f} - Accuracy: {epoch_accuracy:.2f}%")

        training_time = time.time() - start_time
        print(f"Training completed in {training_time:.2f} seconds")

        return losses, accuracies, training_time

    def predict(self, X):
        """Return the index of the most probable class for each row of ``X``."""
        output = self.forward(X)
        return np.argmax(output, axis=1)

In [None]:
# Create an equivalent PyTorch model with exactly the same architecture
class EquivalentPyTorchModel(nn.Module):
    """Three dense layers with ReLU between them; ``forward`` returns raw logits.

    No softmax is applied here because ``nn.CrossEntropyLoss`` includes it.
    Inputs are expected to be flattened already (one row per sample).
    """

    def __init__(self, input_size=784, hidden1_size=128, hidden2_size=64, output_size=10):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden1_size)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden1_size, hidden2_size)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(hidden2_size, output_size)

    def forward(self, x):
        """Return the logits for a batch of flattened inputs."""
        hidden1 = self.relu1(self.fc1(x))
        hidden2 = self.relu2(self.fc2(hidden1))
        return self.fc3(hidden2)

# Initialize the PyTorch model
pytorch_model = EquivalentPyTorchModel().to(device)

# Define loss function and optimizer.
# NOTE: this optimizer uses momentum=0.9, whereas the custom NumPy network
# applies plain gradient-descent updates, so the two training runs share an
# architecture and learning rate but not an update rule.
criterion = nn.CrossEntropyLoss()
py_optimizer = optim.SGD(pytorch_model.parameters(), lr=0.01, momentum=0.9)

# Convert data to PyTorch tensors (moved to the target device up front)
X_train_tensor = torch.FloatTensor(X_train).to(device)
y_train_tensor = torch.LongTensor(y_train).to(device)
X_test_tensor = torch.FloatTensor(X_test).to(device)
y_test_tensor = torch.LongTensor(y_test).to(device)

# Create data loaders for PyTorch training.
# NOTE: train_dataset / test_dataset are rebound here and no longer refer to
# the torchvision MNIST datasets created when the data was first loaded.
from torch.utils.data import TensorDataset, DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=64)

In [None]:
# Function to train the PyTorch model
def train_pytorch_model(model, dataloader, criterion, optimizer, device, num_epochs=10):
    """Train ``model`` for ``num_epochs`` epochs and time the whole run.

    Returns ``(losses, accuracies, training_time)``: the per-epoch mean loss
    per sample, the per-epoch training accuracy in percent, and the elapsed
    wall-clock seconds. Progress is printed after every epoch.
    """
    loss_history = []
    accuracy_history = []

    t0 = time.time()

    for epoch_idx in range(num_epochs):
        model.train()
        loss_sum = 0.0
        n_correct = 0
        n_seen = 0

        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            # Clear gradients left over from the previous step
            optimizer.zero_grad()

            # Forward pass
            logits = model(batch_inputs)
            batch_loss = criterion(logits, batch_labels)

            # Backward pass and parameter update
            batch_loss.backward()
            optimizer.step()

            # The criterion's value is scaled by the batch length so the epoch
            # figure becomes a per-sample average
            loss_sum += batch_loss.item() * batch_inputs.size(0)
            predicted = torch.max(logits, 1)[1]
            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()

        mean_loss = loss_sum / n_seen
        accuracy_pct = n_correct / n_seen * 100

        loss_history.append(mean_loss)
        accuracy_history.append(accuracy_pct)

        print(f"Epoch {epoch_idx+1}/{num_epochs} - Loss: {mean_loss:.4f} - Accuracy: {accuracy_pct:.2f}%")

    elapsed = time.time() - t0
    print(f"Training completed in {elapsed:.2f} seconds")

    return loss_history, accuracy_history, elapsed

# Function to evaluate the PyTorch model
def evaluate_pytorch_model(model, dataloader, device):
    """Evaluate ``model`` on ``dataloader`` with gradients disabled.

    Prints the accuracy and returns ``(accuracy, all_preds, all_labels)``:
    the accuracy in percent plus flat lists of the predicted and true labels
    in the order the batches were visited.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch_inputs, batch_labels in dataloader:
            batch_inputs = batch_inputs.to(device)
            batch_labels = batch_labels.to(device)

            # Index of the largest logit per row is the predicted class
            predicted = torch.max(model(batch_inputs), 1)[1]

            n_seen += batch_labels.size(0)
            n_correct += (predicted == batch_labels).sum().item()

            # Move to the CPU before converting so this also works on a GPU
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    accuracy = n_correct / n_seen * 100
    print(f"Test Accuracy: {accuracy:.2f}%")

    return accuracy, all_preds, all_labels

In [None]:
# Fit the NumPy network on the extracted training arrays
print("Training custom neural network from scratch...")
custom_nn = MNISTNeuralNetworkFromScratch(
    input_size=784,
    hidden1_size=128,
    hidden2_size=64,
    output_size=10,
)
custom_losses, custom_accuracies, custom_time = custom_nn.train(
    X_train,
    y_train,
    epochs=10,
    batch_size=64,
    learning_rate=0.01,
)

# Score it on the held-out test arrays (percentage of exact label matches)
custom_predictions = custom_nn.predict(X_test)
custom_test_accuracy = 100 * np.mean(custom_predictions == y_test)
print(f"Custom model test accuracy: {custom_test_accuracy:.2f}%")

In [None]:
# Train the PyTorch model for the same number of epochs as the custom network
print("\nTraining PyTorch model...")
pytorch_losses, pytorch_accuracies, pytorch_time = train_pytorch_model(
    pytorch_model, train_dataloader, criterion, py_optimizer, device, num_epochs=10
)

# Evaluate PyTorch model on test data.
# NOTE: this rebinds the notebook-level name test_labels to the list of true
# labels returned by the evaluation helper.
pytorch_test_accuracy, pytorch_predictions, test_labels = evaluate_pytorch_model(
    pytorch_model, test_dataloader, device
)

In [None]:
# Compare the results in a fixed-width table (20-character columns)
print("\nPerformance Comparison:")
print(f"{'Model':<20} {'Training Time (s)':<20} {'Test Accuracy (%)':<20}")
print(f"{'-'*60}")
print(f"{'Custom Neural Net':<20} {custom_time:<20.2f} {custom_test_accuracy:<20.2f}")
print(f"{'PyTorch Model':<20} {pytorch_time:<20.2f} {pytorch_test_accuracy:<20.2f}")
# Speedup is custom time divided by PyTorch time, so a value above 1 means
# the PyTorch run finished faster
print(f"{'Speedup':<20} {custom_time/pytorch_time:.2f}x")

# Plot training metrics comparison: two stacked panels sharing the epoch axis
plt.figure(figsize=(12, 10))

# Plot training loss
plt.subplot(2, 1, 1)
plt.plot(range(1, len(custom_losses) + 1), custom_losses, 'b-', label='Custom Implementation')
plt.plot(range(1, len(pytorch_losses) + 1), pytorch_losses, 'r-', label='PyTorch Implementation')
plt.title('Training Loss Comparison')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)

# Plot training accuracy
plt.subplot(2, 1, 2)
plt.plot(range(1, len(custom_accuracies) + 1), custom_accuracies, 'b-', label='Custom Implementation')
plt.plot(range(1, len(pytorch_accuracies) + 1), pytorch_accuracies, 'r-', label='PyTorch Implementation')
plt.title('Training Accuracy Comparison')
plt.xlabel('Epoch')
plt.ylabel('Accuracy (%)')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Compare predictions visually: top row shows the digit images, bottom row
# shows what each model predicted for the image directly above
plt.figure(figsize=(15, 10))

# Select a few random test examples (distinct indices, drawn without replacement)
num_samples = 5
random_indices = np.random.choice(len(X_test), num_samples, replace=False)

for i, idx in enumerate(random_indices):
    # Top row: the flattened sample reshaped back to a 28x28 image
    plt.subplot(2, num_samples, i + 1)
    plt.imshow(X_test[idx].reshape(28, 28), cmap='gray')
    plt.title(f"True: {y_test[idx]}")
    plt.axis('off')
    
    # Bottom row: bar heights are the predicted class indices themselves (not
    # confidences); equal-height bars mean the two models agree
    plt.subplot(2, num_samples, i + 1 + num_samples)
    plt.bar(['Custom', 'PyTorch'], [custom_predictions[idx], pytorch_predictions[idx]], color=['blue', 'red'])
    plt.title(f"Custom: {custom_predictions[idx]}, PyTorch: {pytorch_predictions[idx]}")

plt.tight_layout()
plt.show()

### Discussion of Comparison Results

From the experiment above, we can observe several key points:

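1. **Training Speed**: PyTorch typically trains faster thanks to its optimized backend and, when a GPU is available, hardware acceleration. Note that our NumPy implementation is also vectorized, so for a network this small running on a CPU the gap can be modest; check the measured times printed above.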

2. **Implementation Complexity**: Our custom implementation required explicit coding of the forward and backward passes, while PyTorch handled this automatically through its autograd system.

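3. **Performance**: Both implementations compute the same forward and backward passes, so with identical settings they should converge to similar results given enough training time. In this comparison, however, the PyTorch optimizer uses momentum (0.9) while the custom network uses plain gradient descent, so an accuracy gap after 10 epochs largely reflects the optimizer rather than the framework. PyTorch's optimized, well-tested operations also tend to give better performance and stability.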

4. **Memory Usage**: Custom implementations may use less memory for small models but don't scale as efficiently for larger networks.

5. **Code Length**: The custom implementation required significantly more code to achieve the same functionality.

## Key Insights from Building Neural Networks from Scratch

- Understanding the fundamental operations helps debug complex models in modern frameworks
- Implementing backpropagation from scratch gives insights into gradient flow and training dynamics
- Knowledge of the underlying mathematics makes it easier to adapt and extend existing architectures
- Frameworks like PyTorch abstract away much of the complexity while maintaining flexibility for research and application