In [19]:
# Cell 1: Imports
import numpy as np

In [20]:


# Cell 2: Activation Functions
def relu(Z):
    return np.maximum(0, Z)

def sigmoid(Z):
    return 1 / (1 + np.exp(-Z))

def relu_derivative(Z):
    return Z > 0

def sigmoid_derivative(Z):
    return sigmoid(Z) * (1 - sigmoid(Z))

def softmax(Z):
    return np.exp(Z) / np.sum(np.exp(Z), axis=1, keepdims=True)

In [21]:
def tanh(Z):
    """
    Hyperbolic tangent activation function
    Z: Input numpy array
    Returns: tanh(Z)
    """
    return np.tanh(Z)

def tanh_derivative(Z):
    """
    Derivative of tanh activation function
    Z: Input numpy array
    Returns: 1 - tanh²(Z)
    """
    return 1 - np.square(np.tanh(Z))

In [22]:


# Cell 3: Data Preprocessing
def to_one_hot(y, num_classes):
    """
    Convert class labels to one-hot encoded vectors
    y: labels (can be 1D array, column vector, or row vector)
    num_classes: number of classes
    """
    y = np.array(y).reshape(-1)
    m = len(y)
    one_hot = np.zeros((m, num_classes))
    
    if num_classes == 1:
        return y.reshape(-1, 1)
    
    one_hot[np.arange(m), y.astype(int)] = 1
    return one_hot

In [23]:

# Cell 4: Neural Network Core Functions
def initialize_parameters(layers):
    parameters = {}
    for i in range(1, len(layers)):
        parameters[f'W{i}'] = np.random.randn(layers[i], layers[i - 1]) * 0.01
        parameters[f'b{i}'] = np.zeros((layers[i], 1))
    return parameters

In [24]:
def forward_propagation(X, parameters, layers, hidden_activation, output_activation):
    cache = {"A0": X}
    A = X
    for i in range(1, len(layers)-1):
        Z = np.dot(parameters[f'W{i}'], A.T) + parameters[f'b{i}']
        A = hidden_activation(Z.T)
        cache[f"Z{i}"], cache[f"A{i}"] = Z, A
    Z = np.dot(parameters[f'W{len(layers)-1}'], A.T) + parameters[f'b{len(layers)-1}']
    A = output_activation(Z.T)
    cache[f"Z{len(layers)-1}"], cache[f"A{len(layers)-1}"] = Z, A
    return A, cache


In [25]:


# def compute_loss(y_true, y_pred):
#     m = y_true.shape[0]
#     loss = -np.mean(y_true * np.log(y_pred) + (1 - y_true) * np.log(1 - y_pred))
#     return loss

In [26]:
def compute_loss(y_true, y_pred):
    """
    Categorical cross-entropy loss
    """
    m = y_true.shape[0]
    # Add small epsilon to avoid log(0)
    eps = 1e-15
    y_pred = np.clip(y_pred, eps, 1 - eps)
    return -np.sum(y_true * np.log(y_pred)) / m

In [27]:


# def back_propagation(y, parameters, cache, layers):
#     grads = {}
#     m = y.shape[0]
#     L = len(layers) - 1
    
#     if len(y.shape) == 1 or y.shape[1] == 1:
#         y = to_one_hot(y, layers[-1])
    
#     A = cache[f"A{L}"]
#     dZ = A - y
    
#     for i in reversed(range(1, L+1)):
#         dW = (1/m) * np.dot(dZ.T, cache[f"A{i-1}"])
#         db = (1/m) * np.sum(dZ, axis=0).reshape(-1, 1)
        
#         grads[f"dW{i}"], grads[f"db{i}"] = dW, db
        
#         if i > 1:
#             dA = np.dot(dZ, parameters[f"W{i}"])
#             dZ = dA * relu_derivative(cache[f"Z{i-1}"].T)
    
#     return grads

In [28]:
def back_propagation(y, parameters, cache, layers, hidden_activation_derivative):
    grads = {}
    m = y.shape[0]
    L = len(layers) - 1
    
    # For the output layer (using softmax)
    dZ = cache[f"A{L}"] - y
    
    for i in reversed(range(1, L+1)):
        dW = (1/m) * np.dot(dZ.T, cache[f"A{i-1}"])
        db = (1/m) * np.sum(dZ, axis=0, keepdims=True).T
        
        grads[f"dW{i}"], grads[f"db{i}"] = dW, db
        
        if i > 1:
            dA = np.dot(dZ, parameters[f"W{i}"])
            dZ = dA * hidden_activation_derivative(cache[f"Z{i-1}"].T)
    
    return grads

In [29]:


def update_parameters(parameters, grads, learning_rate):
    """
    Update network parameters using gradients
    """
    for i in range(1, len(parameters) // 2 + 1):
        parameters[f"W{i}"] -= learning_rate * grads[f"dW{i}"]
        parameters[f"b{i}"] -= learning_rate * grads[f"db{i}"]
    return parameters

In [30]:
#  #Cell 5: Training Function
# def train(X, y, layers, learning_rate=0.1, epochs=1000):
#     parameters = initialize_parameters(layers)
#     for epoch in range(epochs):
#         y_pred, cache = forward_propagation(X, parameters, layers)
#         loss = compute_loss(y, y_pred)
#         if epoch % 100 == 0:
#             print(f"Epoch {epoch}, Loss: {loss}")
#         grads = back_propagation(y, parameters, cache, layers)
#         parameters = update_parameters(parameters, grads, learning_rate)
#     return parameters

In [31]:
def train_network(X, y, layers, learning_rate=0.01, epochs=1000, batch_size=32, 
                 hidden_activation=relu, output_activation=softmax,
                 hidden_activation_derivative=relu_derivative):
    parameters = initialize_parameters(layers)
    n_samples = X.shape[0]
    
    # Ensure batch_size is not larger than dataset
    batch_size = min(batch_size, n_samples)
    n_batches = max(n_samples // batch_size, 1)  # Ensure at least 1 batch
    
    print("Training Progress:")
    for epoch in range(epochs):
        epoch_loss = 0
        
        # Shuffle the data
        indices = np.random.permutation(n_samples)
        X_shuffled = X[indices]
        y_shuffled = y[indices]
        
        # Mini-batch training
        for batch in range(n_batches):
            start_idx = batch * batch_size
            end_idx = min(start_idx + batch_size, n_samples)
            
            X_batch = X_shuffled[start_idx:end_idx]
            y_batch = y_shuffled[start_idx:end_idx]
            
            # Forward pass with specified activation functions
            y_pred, cache = forward_propagation(X_batch, parameters, layers, 
                                              hidden_activation, output_activation)
            
            # Compute loss
            loss = compute_loss(y_batch, y_pred)
            epoch_loss += loss
            
            # Backward pass with specified derivative
            grads = back_propagation(y_batch, parameters, cache, layers, 
                                   hidden_activation_derivative)
            
            # Update parameters
            parameters = update_parameters(parameters, grads, learning_rate)
        
        # Print progress
        if epoch % 10 == 0:
            avg_loss = epoch_loss / n_batches
            print(f"Epoch {epoch}/{epochs}, Loss: {avg_loss:.4f}")
    
    return parameters

In [32]:
def predict(X, parameters, layers, hidden_activation, output_activation):
    y_pred, _ = forward_propagation(X, parameters, layers, hidden_activation, output_activation)
    return y_pred

In [33]:
import pandas as pd

In [38]:
from sklearn.preprocessing import StandardScaler
# Now use the new function name in your training code:
print("Loading dataset from CSV files...")
train_data = pd.read_csv("data/emnist-mnist-train.csv")
test_data = pd.read_csv("data/emnist-mnist-test.csv")

# Separate features (X) and labels (y)
X_train = train_data.iloc[:, 1:].values  # All columns except first
y_train = train_data.iloc[:, 0].values   # First column contains labels

X_test = test_data.iloc[:, 1:].values
y_test = test_data.iloc[:, 0].values

# Normalize the data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Convert labels to one-hot encoding
num_classes = len(np.unique(y_train))
y_train_one_hot = to_one_hot(y_train, num_classes=num_classes)
y_test_one_hot = to_one_hot(y_test, num_classes=num_classes)

print(f"Training data shape: {X_train.shape}")
print(f"Training labels shape: {y_train_one_hot.shape}")
print(f"Number of classes: {num_classes}")

# Define network architecture
input_size = X_train.shape[1]  # 784 features
layers = [input_size, 256, 128, num_classes]

# Train the network using the new function name
print("\nTraining the network...")
parameters = train_network(X_train, y_train_one_hot, 
                         layers=layers,
                         learning_rate=0.01,
                         epochs=100,
                         batch_size=32, 
                        hidden_activation=tanh,
                        hidden_activation_derivative=tanh_derivative,
                        output_activation=softmax)

# Test the model
predictions = predict(X_test, parameters, layers, tanh, softmax)
predicted_classes = np.argmax(predictions, axis=1)
actual_classes = y_test

# Calculate accuracy
accuracy = np.mean(predicted_classes == actual_classes)
print(f"\nTest Accuracy: {accuracy * 100:.2f}%")

Loading dataset from CSV files...
Training data shape: (59999, 784)
Training labels shape: (59999, 10)
Number of classes: 10

Training the network...
Training Progress:
Epoch 0/100, Loss: 1.1633
Epoch 10/100, Loss: 0.0947
Epoch 20/100, Loss: 0.0391
Epoch 30/100, Loss: 0.0166
Epoch 40/100, Loss: 0.0078
Epoch 50/100, Loss: 0.0043
Epoch 60/100, Loss: 0.0027
Epoch 70/100, Loss: 0.0019
Epoch 80/100, Loss: 0.0015
Epoch 90/100, Loss: 0.0012

Test Accuracy: 97.87%
