In [1]:
import numpy as np
import torch
import torchvision
from torchvision import transforms
import matplotlib.pyplot as plt


In [2]:

train_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=True,
    transform=transforms.ToTensor(),
    download=True
)

val_dataset = torchvision.datasets.MNIST(
    root='./data',
    train=False,
    transform=transforms.ToTensor(),
    download=True
)

train_loader = torch.utils.data.DataLoader(
    dataset=train_dataset,
    batch_size=64,
    shuffle=True
)

val_loader = torch.utils.data.DataLoader(
    dataset=val_dataset,
    batch_size=64,
    shuffle=False
)


In [3]:


def one_hot(labels, num_classes=10):
    oh = np.zeros((labels.size, num_classes))
    oh[np.arange(labels.size), labels] = 1
    return oh

def softmax(z):
    z = z - np.max(z, axis=1, keepdims=True)
    exp_z = np.exp(z)
    return exp_z / np.sum(exp_z, axis=1, keepdims=True)

def relu(z):
    return np.maximum(0, z)

def relu_derivative(z):
    return (z > 0).astype(float)

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_derivative(z):
    s = sigmoid(z)
    return s * (1 - s)

def tanh(z):
    return np.tanh(z)

def tanh_derivative(z):
    return 1 - np.tanh(z)**2


In [4]:

class NeuralNetwork:
    def __init__(self, layer_sizes, activation='relu', lr=0.01):
        self.layer_sizes = layer_sizes
        self.activation_name = activation
        self.lr = lr
        
        self.weights = []
        self.biases = []
        self.Z = []
        self.A = []
        
        for i in range(len(layer_sizes) - 1):
            w = np.random.randn(layer_sizes[i], layer_sizes[i+1]) * np.sqrt(2 / layer_sizes[i])
            b = np.zeros((1, layer_sizes[i+1]))
            self.weights.append(w)
            self.biases.append(b)

    def activation(self, z):
        if self.activation_name == 'relu':
            return relu(z)
        elif self.activation_name == 'sigmoid':
            return sigmoid(z)
        elif self.activation_name == 'tanh':
            return tanh(z)

    def activation_derivative(self, z):
        if self.activation_name == 'relu':
            return relu_derivative(z)
        elif self.activation_name == 'sigmoid':
            return sigmoid_derivative(z)
        elif self.activation_name == 'tanh':
            return tanh_derivative(z)

    def forward(self, X):
        self.A = [X]
        self.Z = []
        
        for i in range(len(self.weights) - 1):
            z = self.A[-1] @ self.weights[i] + self.biases[i]
            a = self.activation(z)
            self.Z.append(z)
            self.A.append(a)
        
        z = self.A[-1] @ self.weights[-1] + self.biases[-1]
        a = softmax(z)
        self.Z.append(z)
        self.A.append(a)
        return a

    def compute_loss(self, y_true, y_pred):
        m = y_true.shape[0]
        return -np.sum(y_true * np.log(y_pred + 1e-9)) / m

    def backward(self, y_true):
        m = y_true.shape[0]
        dZ = self.A[-1] - y_true
        self.dW = []
        self.db = []
        
        for i in reversed(range(len(self.weights))):
            dW = self.A[i].T @ dZ / m
            db = np.sum(dZ, axis=0, keepdims=True) / m
            self.dW.insert(0, dW)
            self.db.insert(0, db)
            if i != 0:
                dZ = (dZ @ self.weights[i].T) * self.activation_derivative(self.Z[i-1])

    def update_parameters(self):
        for i in range(len(self.weights)):
            self.weights[i] -= self.lr * self.dW[i]
            self.biases[i] -= self.lr * self.db[i]

    def predict(self, X):
        return np.argmax(self.forward(X), axis=1)

    def evaluate(self, loader):
        total_loss, correct, total = 0, 0, 0
        for images, labels in loader:
            images = images.cpu().numpy().reshape(images.size(0), -1) / 255.0
            labels = labels.cpu().numpy()
            y = one_hot(labels)
            preds = self.forward(images)
            total_loss += self.compute_loss(y, preds) * images.shape[0]
            correct += np.sum(np.argmax(preds, axis=1) == labels)
            total += images.shape[0]
        return total_loss / total, correct / total


In [5]:

def train_model(model, train_loader, val_loader, epochs):
    history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}
    
    for epoch in range(epochs):
        total_loss, correct, total = 0, 0, 0
        
        for images, labels in train_loader:
            images = images.cpu().numpy().reshape(images.size(0), -1) / 255.0
            labels = labels.cpu().numpy()
            y = one_hot(labels)
            
            preds = model.forward(images)
            loss = model.compute_loss(y, preds)
            
            model.backward(y)
            model.update_parameters()
            
            total_loss += loss * images.shape[0]
            correct += np.sum(np.argmax(preds, axis=1) == labels)
            total += images.shape[0]
        
        train_loss = total_loss / total
        train_acc = correct / total
        val_loss, val_acc = model.evaluate(val_loader)
        
        history['train_loss'].append(train_loss)
        history['train_acc'].append(train_acc)
        history['val_loss'].append(val_loss)
        history['val_acc'].append(val_acc)
        
        print(f"Epoch {epoch+1}: Train Acc={train_acc:.4f}, Val Acc={val_acc:.4f}")
    
    return history


In [6]:


experiments = [
    {'layers': [784, 128, 10], 'activation': 'relu'},
    {'layers': [784, 256, 128, 10], 'activation': 'relu'},
    {'layers': [784, 128, 10], 'activation': 'sigmoid'},
    {'layers': [784, 128, 10], 'activation': 'tanh'},
]

results = []

for exp in experiments:
    print("\nRunning", exp)
    model = NeuralNetwork(exp['layers'], activation=exp['activation'], lr=0.01)
    hist = train_model(model, train_loader, val_loader, epochs=5)
    results.append((exp, hist))



Running {'layers': [784, 128, 10], 'activation': 'relu'}
Epoch 1: Train Acc=0.1183, Val Acc=0.1135
Epoch 2: Train Acc=0.1124, Val Acc=0.1135
Epoch 3: Train Acc=0.1124, Val Acc=0.1135
Epoch 4: Train Acc=0.1124, Val Acc=0.1135
Epoch 5: Train Acc=0.1124, Val Acc=0.1135

Running {'layers': [784, 256, 128, 10], 'activation': 'relu'}
Epoch 1: Train Acc=0.1216, Val Acc=0.1135
Epoch 2: Train Acc=0.1124, Val Acc=0.1135
Epoch 3: Train Acc=0.1124, Val Acc=0.1135
Epoch 4: Train Acc=0.1124, Val Acc=0.1135
Epoch 5: Train Acc=0.1124, Val Acc=0.1135

Running {'layers': [784, 128, 10], 'activation': 'sigmoid'}
Epoch 1: Train Acc=0.1087, Val Acc=0.1135
Epoch 2: Train Acc=0.1101, Val Acc=0.1135
Epoch 3: Train Acc=0.1092, Val Acc=0.1135
Epoch 4: Train Acc=0.1118, Val Acc=0.0892
Epoch 5: Train Acc=0.1119, Val Acc=0.1135

Running {'layers': [784, 128, 10], 'activation': 'tanh'}
Epoch 1: Train Acc=0.1156, Val Acc=0.1135
Epoch 2: Train Acc=0.1124, Val Acc=0.1135
Epoch 3: Train Acc=0.1124, Val Acc=0.1135
Epoc

In [7]:


for i, (cfg, hist) in enumerate(results):
    plt.figure()
    plt.plot(hist['train_loss'], label='Train Loss')
    plt.plot(hist['val_loss'], label='Val Loss')
    plt.legend()
    plt.title(str(cfg))
    plt.savefig(f'exp_{i}_loss.png')
    plt.close()
    
    plt.figure()
    plt.plot(hist['train_acc'], label='Train Acc')
    plt.plot(hist['val_acc'], label='Val Acc')
    plt.legend()
    plt.title(str(cfg))
    plt.savefig(f'exp_{i}_acc.png')
    plt.close()
