In [None]:
# Import necessary libraries
import numpy as np
import pickle
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score, confusion_matrix,accuracy_score
from torchvision.datasets import FashionMNIST
from torchvision.transforms import ToTensor
import os


In [2]:
# Updated data loading function with train-validation split
def load_data(validation_split=0.2):
    # Load training and test data
    train_data = FashionMNIST(root=".", train=True, download=True, transform=ToTensor())
    test_data = FashionMNIST(root=".", train=False, download=True, transform=ToTensor())
    
    # Convert to numpy arrays
    X_train = np.array([np.array(img[0]).flatten() for img in train_data])
    y_train = np.array(train_data.targets)
    X_test = np.array([np.array(img[0]).flatten() for img in test_data])
    y_test = np.array(test_data.targets)
    
    # Split training data into train and validation sets
    num_train_samples = int((1 - validation_split) * X_train.shape[0])
    indices = np.arange(X_train.shape[0])
    np.random.shuffle(indices)
    
    X_train, X_val = X_train[indices[:num_train_samples]], X_train[indices[num_train_samples:]]
    y_train, y_val = y_train[indices[:num_train_samples]], y_train[indices[num_train_samples:]]
    
    return X_train, y_train, X_val, y_val, X_test, y_test

# Load the data with 20% validation split
X_train, y_train, X_val, y_val, X_test, y_test = load_data()


In [3]:
class DenseLayer:
    def __init__(self, input_dim, output_dim):
        # Initialize weights and biases as float32 to reduce size
        self.weights = np.random.randn(input_dim, output_dim).astype(np.float32) * 0.01
        self.bias = np.zeros((1, output_dim), dtype=np.float32)
    
    def forward(self, X):
        self.X = X  # Store input for backpropagation
        return np.dot(X, self.weights) + self.bias
    
    def backward(self, grad_output, learning_rate):
        grad_input = np.dot(grad_output, self.weights.T)
        grad_weights = np.dot(self.X.T, grad_output)
        grad_bias = np.sum(grad_output, axis=0, keepdims=True)
        
        # Update weights and biases
        self.weights -= learning_rate * grad_weights.astype(np.float32)
        self.bias -= learning_rate * grad_bias.astype(np.float32)
        
        return grad_input



In [4]:
class BatchNormalization:
    def __init__(self, num_features, epsilon=1e-5, momentum=0.9):
        self.epsilon = epsilon
        self.momentum = momentum
        self.gamma = np.ones((1, num_features), dtype=np.float32)
        self.beta = np.zeros((1, num_features), dtype=np.float32)
        self.running_mean = np.zeros((1, num_features), dtype=np.float32)
        self.running_var = np.ones((1, num_features), dtype=np.float32)
    
    def forward(self, X, training=True):
        if training:
            batch_mean = np.mean(X, axis=0).astype(np.float32)
            batch_var = np.var(X, axis=0).astype(np.float32)
            self.X_centered = (X - batch_mean).astype(np.float32)
            self.stddev_inv = (1. / np.sqrt(batch_var + self.epsilon)).astype(np.float32)
            
            # Normalize
            self.X_norm = self.X_centered * self.stddev_inv
            self.out = self.gamma * self.X_norm + self.beta
            
            # Update running statistics
            self.running_mean = (self.momentum * self.running_mean + (1 - self.momentum) * batch_mean).astype(np.float32)
            self.running_var = (self.momentum * self.running_var + (1 - self.momentum) * batch_var).astype(np.float32)
        else:
            # Use running statistics for inference
            self.X_norm = (X - self.running_mean) / np.sqrt(self.running_var + self.epsilon)
            self.out = self.gamma * self.X_norm + self.beta
            
        return self.out
    
    def backward(self, grad_output, learning_rate):
        grad_gamma = np.sum(grad_output * self.X_norm, axis=0).astype(np.float32)
        grad_beta = np.sum(grad_output, axis=0).astype(np.float32)
        
        grad_X_norm = grad_output * self.gamma
        grad_var = np.sum(grad_X_norm * self.X_centered, axis=0) * -0.5 * self.stddev_inv**3
        grad_mean = np.sum(grad_X_norm * -self.stddev_inv, axis=0) + grad_var * np.mean(-2. * self.X_centered, axis=0)
        
        grad_input = (grad_X_norm * self.stddev_inv + grad_var * 2 * self.X_centered / grad_output.shape[0] + grad_mean / grad_output.shape[0]).astype(np.float32)
        
        # Update gamma and beta
        self.gamma -= learning_rate * grad_gamma
        self.beta -= learning_rate * grad_beta
        
        return grad_input


In [5]:
class ReLU:
    def forward(self, X):
        self.X = X
        return np.maximum(0, X)
    
    def backward(self, grad_output, learning_rate):
        return grad_output * (self.X > 0)


In [6]:
class Dropout:
    def __init__(self, dropout_rate):
        self.dropout_rate = dropout_rate
        self.mask = None  # Initialize mask as None
    
    def forward(self, X, training=True):
        if training:
            # Apply dropout mask during training
            # print("in dropout")
            self.mask = (np.random.rand(*X.shape) > self.dropout_rate) / (1 - self.dropout_rate)
            # if(self.mask is None):
            #     print("mask is none")
            # else :
            #     print("mask is not none")
            return X * self.mask
        else:
            # No dropout during inference; mask is effectively all 1s
            self.mask = np.ones_like(X)
            return X
    
    def backward(self, grad_output, learning_rate=None):
        # Use the mask for backpropagation
        if self.mask is None:
            raise ValueError("Mask has not been initialized. Ensure forward pass is called before backward.")
        return grad_output * self.mask


In [7]:
class Adam:
    def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8):
        self.learning_rate = learning_rate
        self.beta1 = beta1
        self.beta2 = beta2
        self.epsilon = epsilon
        self.m = {}
        self.v = {}
        self.t = 0

    def update(self, layer):
        if layer not in self.m:
            # Initialize first and second moment vectors
            self.m[layer] = np.zeros_like(layer.weights)
            self.v[layer] = np.zeros_like(layer.weights)

        # Update time step
        self.t += 1

        # Calculate gradients
        grad_weights = layer.grad_weights
        grad_bias = layer.grad_bias

        # Update biased first moment estimate
        self.m[layer] = self.beta1 * self.m[layer] + (1 - self.beta1) * grad_weights
        self.v[layer] = self.beta2 * self.v[layer] + (1 - self.beta2) * (grad_weights**2)

        # Bias-corrected first and second moment estimates
        m_hat = self.m[layer] / (1 - self.beta1**self.t)
        v_hat = self.v[layer] / (1 - self.beta2**self.t)

        # Update weights and biases
        layer.weights -= self.learning_rate * m_hat / (np.sqrt(v_hat) + self.epsilon)
        layer.bias -= self.learning_rate * grad_bias  # No moment estimates for biases


In [8]:
class Softmax:
    def forward(self, X):
        # Apply softmax activation
        exp_values = np.exp(X - np.max(X, axis=1, keepdims=True))
        self.output = exp_values / np.sum(exp_values, axis=1, keepdims=True)
        return self.output
    
    def backward(self, grad_output, learning_rate=None):
        # Gradient for the softmax layer (assuming Cross-Entropy Loss)
        return grad_output



In [9]:
class NeuralNetwork:
    def __init__(self, layers):
        self.layers = layers
    
    def forward(self, X, training=True):
        for layer in self.layers:
            X = layer.forward(X)
        return X
    
    def backward(self, grad_output, learning_rate):
        for layer in reversed(self.layers):
            grad_output = layer.backward(grad_output, learning_rate)
    
    def clean_for_saving(self):
        # Remove unnecessary attributes from layers to reduce file size
        for layer in self.layers:
            if hasattr(layer, 'X'):
                del layer.X  # Remove input storage used in backpropagation
            if hasattr(layer, 'X_centered'):
                del layer.X_centered
            if hasattr(layer, 'stddev_inv'):
                del layer.stddev_inv
            if hasattr(layer, 'X_norm'):
                del layer.X_norm
            if hasattr(layer, 'grad_weights'):
                del layer.grad_weights
            if hasattr(layer, 'grad_bias'):
                del layer.grad_bias


In [None]:

def evaluate(network, X_test, y_test):
    # Forward pass through the test data
    predictions = network.forward(X_test, training=False)
    predicted_labels = np.argmax(predictions, axis=1)
    
    # Calculate accuracy and F1 score
    accuracy = accuracy_score(y_test, predicted_labels)
    f1 = f1_score(y_test, predicted_labels, average="macro")
    conf_matrix = confusion_matrix(y_test, predicted_labels)
    
    print(f"Test Accuracy: {accuracy:.4f}")
    print(f"Macro F1 Score: {f1:.4f}")
    print("Confusion Matrix:\n", conf_matrix)
    
    return accuracy, f1, conf_matrix


In [11]:
# Save model using pickle
def save_model(network, filename):
    network.clean_for_saving()
    with open(filename, 'wb') as f:
        pickle.dump(network, f)

# Load model from pickle file
def load_model(filename):
    with open(filename, 'rb') as f:
        network = pickle.load(f)
    return network


In [12]:
# Function to compute loss (cross-entropy)
def compute_loss(predictions, targets):
    # One-hot encode targets
    targets_one_hot = np.eye(10)[targets]
    # Calculate loss
    loss = -np.mean(np.sum(targets_one_hot * np.log(predictions + 1e-9), axis=1))
    return loss

In [None]:
# Ensure the output directory exists
os.makedirs("Figures", exist_ok=True)

In [14]:
# Function to plot confusion matrix using Seaborn
def plot_confusion_matrix(conf_matrix, title):
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
    plt.title(title)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    # plt.show()
    plt.savefig(f"Figures/{title}.png")
    plt.close()

In [15]:
# Training function with added confusion matrix plotting
def train_model(network, X_train, y_train, X_val, y_val, epochs, batch_size, learning_rate, optimizer,archnum):
    # Lists to store metrics
    history = {
        'train_loss': [],
        'val_loss': [],
        'train_accuracy': [],
        'val_accuracy': [],
        'val_macro_f1': []
    }
    best_f1 = 0  # Track the best validation macro-F1 score for this run

    for epoch in range(epochs):
        # Adjust learning rate every 3 epochs
        if epoch > 0 and epoch % 3 == 0:
            learning_rate *= 0.5

        # Shuffle training data
        indices = np.arange(X_train.shape[0])
        np.random.shuffle(indices)
        X_train, y_train = X_train[indices], y_train[indices]
        
        # Training in mini-batches
        epoch_loss = 0
        for i in range(0, X_train.shape[0], batch_size):
            X_batch = X_train[i:i+batch_size]
            y_batch = y_train[i:i+batch_size]
            
            # Forward pass
            predictions = network.forward(X_batch)
            loss = compute_loss(predictions, y_batch)
            epoch_loss += loss
            
            # Backward pass
            grad_output = predictions - np.eye(10)[y_batch]
            network.backward(grad_output, learning_rate)
        
        # Calculate and store average training loss
        avg_train_loss = epoch_loss / (X_train.shape[0] // batch_size)
        
        # Validation pass
        val_predictions = network.forward(X_val, training=False)
        val_loss = compute_loss(val_predictions, y_val)
        
        # Calculate training and validation accuracy
        train_preds = np.argmax(network.forward(X_train, training=False), axis=1)
        val_preds = np.argmax(val_predictions, axis=1)
        
        train_accuracy = accuracy_score(y_train, train_preds)
        val_accuracy = accuracy_score(y_val, val_preds)
        
        # Calculate validation macro-F1 score
        val_macro_f1 = f1_score(y_val, val_preds, average="macro")
        
        # Append metrics to history
        history['train_loss'].append(avg_train_loss)
        history['val_loss'].append(val_loss)
        history['train_accuracy'].append(train_accuracy)
        history['val_accuracy'].append(val_accuracy)
        history['val_macro_f1'].append(val_macro_f1)
        
        print(f"Epoch {epoch+1}/{epochs} - "
              f"Train Loss: {avg_train_loss:.4f}, Val Loss: {val_loss:.4f}, "
              f"Train Acc: {train_accuracy:.4f}, Val Acc: {val_accuracy:.4f}, "
              f"Val Macro-F1: {val_macro_f1:.4f}")
        
    # Confusion matrices for train and validation
    train_conf_matrix = confusion_matrix(y_train, train_preds)
    val_conf_matrix = confusion_matrix(y_val, val_preds)
    
    # Plot confusion matrices
    plot_confusion_matrix(train_conf_matrix, f"Train Confusion Matrix - Arch {archnum} - LR {learning_rate}")
    plot_confusion_matrix(val_conf_matrix, f"Validation Confusion Matrix - Arch {archnum} - LR {learning_rate}")

    return history, max(history['val_macro_f1'])  # Return history and best F1 score for this model


In [16]:


# Define base colors for each architecture
architecture_colors = sns.color_palette("dark", 3)  # 3 darker colors for 3 architectures

# Define line styles and markers for each learning rate
line_styles = ['-', '--', '-.', ':']
# markers = ['o', 's', 'D', 'x']  # Circle, square, diamond, x-mark

def plot_metric(history, metric, title):
    # Loop through each (learning rate, architecture) configuration
    for idx, ((lr, arch), hist) in enumerate(history.items()):
        # Determine color based on architecture and line style/marker based on learning rate
        arch_index = int(arch.split('_')[1]) - 1  # Extract architecture index (e.g., 'arch_1' -> 0)
        color = architecture_colors[arch_index]  # Base color for the architecture
        line_style = line_styles[idx % len(line_styles)]  # Cycle through line styles
        # marker = markers[idx % len(markers)]  # Cycle through markers
        
        # Plot the metric with chosen color, line style, and marker
        plt.plot(hist[metric], label=f"{arch},LR:{lr}", color=color, linestyle=line_style)

    plt.title(title)
    plt.xlabel("Epochs")
    plt.ylabel(metric.replace('_', ' ').title())
    plt.legend()
    
    # Save the plot
    plt.savefig(f"Figures/{metric}.png")
    plt.close()




In [17]:
# Define architectures
architectures = [
    [DenseLayer(784, 128), ReLU(), DenseLayer(128, 10), Softmax()],
    [DenseLayer(784, 256), ReLU(), DenseLayer(256, 128), ReLU(), DenseLayer(128, 10), Softmax()],
    [DenseLayer(784, 512), ReLU(), BatchNormalization(512), DenseLayer(512, 256), ReLU(), DenseLayer(256, 10), Softmax()]
]

# Test different learning rates
learning_rates = [0.005, 0.0025, 0.001, 0.0005]

# Dictionary to store results and the best configuration
results = {}
best_model_config = None
best_model_f1 = 0
best_model_path = 'best_model_final.pickle'

for i, arch in enumerate(architectures):
    for lr in learning_rates:
        print(f"\nTraining model with architecture {i+1} and learning rate {lr}")
        network = NeuralNetwork(arch)
        optimizer = Adam(learning_rate=lr)
        
        # Train the model without saving weights in each epoch
        history, max_val_f1 = train_model(
            network, X_train, y_train, X_val, y_val, epochs=10, batch_size=64, learning_rate=lr, optimizer=optimizer
        , archnum = i+1)
        
        # Save training history
        results[(lr, f"arch_{i+1}")] = history

        # If this model has the best F1 score across all configurations, save its weights
        if max_val_f1 > best_model_f1:
            best_model_f1 = max_val_f1
            best_model_config = (lr, f"arch_{i+1}")
            save_model(network, best_model_path)
            print(f"New best model found: LR: {lr}, Arch: {i+1} with Val Macro-F1: {max_val_f1:.4f}")







Training model with architecture 1 and learning rate 0.005
Epoch 1/10 - Train Loss: 0.6449, Val Loss: 0.5009, Train Acc: 0.8148, Val Acc: 0.8107, Val Macro-F1: 0.8084
Epoch 2/10 - Train Loss: 0.4369, Val Loss: 0.4885, Train Acc: 0.8277, Val Acc: 0.8204, Val Macro-F1: 0.8129
Epoch 3/10 - Train Loss: 0.3906, Val Loss: 0.3989, Train Acc: 0.8622, Val Acc: 0.8502, Val Macro-F1: 0.8501
Epoch 4/10 - Train Loss: 0.3302, Val Loss: 0.3508, Train Acc: 0.8836, Val Acc: 0.8730, Val Macro-F1: 0.8717
Epoch 5/10 - Train Loss: 0.3156, Val Loss: 0.3404, Train Acc: 0.8919, Val Acc: 0.8739, Val Macro-F1: 0.8719
Epoch 6/10 - Train Loss: 0.3066, Val Loss: 0.3381, Train Acc: 0.8942, Val Acc: 0.8742, Val Macro-F1: 0.8712
Epoch 7/10 - Train Loss: 0.2801, Val Loss: 0.3251, Train Acc: 0.9021, Val Acc: 0.8818, Val Macro-F1: 0.8804
Epoch 8/10 - Train Loss: 0.2747, Val Loss: 0.3238, Train Acc: 0.9039, Val Acc: 0.8823, Val Macro-F1: 0.8808
Epoch 9/10 - Train Loss: 0.2706, Val Loss: 0.3185, Train Acc: 0.9049, Val Ac

In [18]:
# Plot metrics for all architectures and learning rates
plot_metric(results, 'train_loss', 'Training Loss over Epochs')
plot_metric(results, 'val_loss', 'Validation Loss over Epochs')
plot_metric(results, 'train_accuracy', 'Training Accuracy over Epochs')
plot_metric(results, 'val_accuracy', 'Validation Accuracy over Epochs')
plot_metric(results, 'val_macro_f1', 'Validation Macro F1 Score over Epochs')

In [19]:
# Display the best model configuration
print(f"Best Model Configuration: Learning Rate: {best_model_config[0]}, Architecture: {best_model_config[1]}")
print(f"Best Validation Macro-F1 Score: {best_model_f1:.4f}")

Best Model Configuration: Learning Rate: 0.0005, Architecture: arch_2
Best Validation Macro-F1 Score: 0.8984


In [20]:
best_model_path = 'best_model_final.pickle'
best_network = load_model(best_model_path)

# Test set evaluation
test_predictions = best_network.forward(X_test, training=False)
test_accuracy = accuracy_score(y_test, np.argmax(test_predictions, axis=1))
test_macro_f1 = f1_score(y_test, np.argmax(test_predictions, axis=1), average="macro")
test_conf_matrix = confusion_matrix(y_test, np.argmax(test_predictions, axis=1))
plot_confusion_matrix(test_conf_matrix, "Test Confusion Matrix")

print(f"Test Accuracy: {test_accuracy:.4f}")
print(f"Test Macro F1 Score: {test_macro_f1:.4f}")
print("Confusion Matrix:\n", test_conf_matrix)

Test Accuracy: 0.8933
Test Macro F1 Score: 0.8935
Confusion Matrix:
 [[825   0  21  19   2   1 124   0   8   0]
 [  2 972   1  19   4   0   1   0   1   0]
 [ 10   0 822  13  81   0  73   0   1   0]
 [ 18  10  11 896  33   0  27   1   4   0]
 [  0   1  85  25 839   0  50   0   0   0]
 [  0   0   0   1   0 957   0  22   1  19]
 [ 98   2  71  25  71   0 725   0   8   0]
 [  0   0   0   0   0  15   0 965   0  20]
 [  5   1   2   3   3   4   5   4 973   0]
 [  0   0   0   0   0   5   1  35   0 959]]


In [21]:
X_train.shape[1]

784