In [161]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import wandb
import datetime
import time
from tqdm import tqdm
from keras.datasets import fashion_mnist

In [14]:
wandb.login()

wandb: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: bullseye2608 (bullseye2608-indian-institute-of-technology-madras) to https://api.wandb.ai. Use `wandb login --relogin` to force relogin


True

In [141]:
def create_validation_set(X, Y, val_ratio=0.2, seed=None):
    """
    Splits X and Y into training and validation sets.

    Parameters:
    - X: Input data of shape (N, d).
    - Y: Corresponding labels of shape (N, ...).
    - val_ratio: Fraction of data to be used for validation (default is 0.2).
    - seed: Random seed for reproducibility.

    Returns:
    - X_train, X_val, Y_train, Y_val: The split datasets.
    """
    if seed is not None:
        np.random.seed(seed)
    
    n_samples = X.shape[0]
    # Generate a random permutation of indices
    indices = np.random.permutation(n_samples)
    # Determine split index based on the validation ratio
    split_index = int(n_samples * (1 - val_ratio))
    train_indices = indices[:split_index]
    val_indices = indices[split_index:]
    
    X_train = X[train_indices]
    Y_train = Y[train_indices]
    X_val = X[val_indices]
    Y_val = Y[val_indices]
    
    return X_train, X_val, Y_train, Y_val

In [164]:
(X_train, y_train), (X_test, y_test) = fashion_mnist.load_data()
(X_train, X_val, y_train, y_val) = create_validation_set(X_train, y_train, val_ratio=0.1, seed=42)

class_names = ['T-shirt/top', 'Trouser', 'Pullover', 'Dress', 'Coat', 
               'Sandal', 'Shirt', 'Sneaker', 'Bag', 'Ankle boot']

# Create a DataFrame for the training data
X_train_flat = X_train.reshape(X_train.shape[0], -1)
train_df = pd.DataFrame(X_train_flat)
train_df['label'] = y_train
train_df['label_name'] = [class_names[label] for label in y_train]

# Create a DataFrame for the validation data
X_val_flat = X_val.reshape(X_val.shape[0], -1)
val_df = pd.DataFrame(X_val_flat)
val_df['label'] = y_val
val_df['label_name'] = [class_names[label] for label in y_val]

# Create a DataFrame for the test data
X_test_flat = X_test.reshape(X_test.shape[0], -1)
test_df = pd.DataFrame(X_test_flat)
test_df['label'] = y_test
test_df['label_name'] = [class_names[label] for label in y_test]

In [21]:
import wandb

# Initialize a W&B run
wandb.init(
    entity="bullseye2608-indian-institute-of-technology-madras",
    project="my-awesome-project", 
    name="Images_"+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
)

# Log images to W&B
wandb.log({
    "fashion_mnist_samples": [
        wandb.Image(X_train[np.where(y_train == i)[0][0]], caption=class_names[i])
        for i in range(10)
    ]
})

wandb.finish()

In [165]:
def sigmoid(x):
    # For positive values, use standard formula
    # For negative values, use a mathematically equivalent but more stable form
    mask = x >= 0
    result = np.zeros_like(x, dtype=float)
    
    # For positive inputs: 1/(1+exp(-x))
    result[mask] = 1 / (1 + np.exp(-x[mask]))
    
    # For negative inputs: exp(x)/(1+exp(x))
    # This avoids computing exp of large positive numbers
    exp_x = np.exp(x[~mask])
    result[~mask] = exp_x / (1 + exp_x)
    
    return result

def softmax(X):
    # Subtract the max for numerical stability
    exps = np.exp(X - np.max(X, axis=-1, keepdims=True))
    return exps / np.sum(exps, axis=-1, keepdims=True)

def cross_entropy_loss(y_true, y_pred, epsilon=1e-15):
    """
    Calculates the cross-entropy loss between true labels and predicted probabilities.
    
    Parameters:
    -----------
    y_true : numpy.ndarray
        One-hot encoded true labels or class indices.
        If class indices, shape should be (n_samples,)
        If one-hot encoded, shape should be (n_samples, n_classes)
    y_pred : numpy.ndarray
        Predicted probabilities, shape (n_samples, n_classes)
    epsilon : float, optional
        Small constant added to log to avoid numerical instability
        
    Returns:
    --------
    loss : float
        Average cross-entropy loss across all samples
    """
    # Convert y_true to one-hot if it's provided as class indices
    if len(y_true.shape) == 1 or y_true.shape[1] == 1:
        n_samples = len(y_true)
        n_classes = y_pred.shape[1]
        y_true_one_hot = np.zeros((n_samples, n_classes))
        y_true_one_hot[np.arange(n_samples), y_true.astype(int).flatten()] = 1
        y_true = y_true_one_hot
    
    # Clip predictions to avoid log(0)
    y_pred = np.clip(y_pred, epsilon, 1 - epsilon)
    
    # Calculate cross entropy loss
    loss = -np.sum(y_true * np.log(y_pred)) / y_true.shape[0]
    
    return loss


def one_hot(y, transpose=False):
    one_hot_y = np.zeros((y.size, y.max() + 1))
    one_hot_y[np.arange(y.size), y] = 1
    if transpose:
        one_hot_y = one_hot_y.T
    return one_hot_y

def sigmoid_derivative(X):
    # Ensure X is in a safe range to avoid overflow
    X = np.clip(X, 1e-7, 1 - 1e-7)
    return X * (1 - X)

In [None]:
def initialise_weights_and_biases(input_size=784, num_hidden_layers=2, num_neurons=32, output_size=10):
    weights, biases = [], []
    if type(num_neurons)==int:
        sizes = [input_size] + [num_neurons]*num_hidden_layers + [output_size]
    else:
        assert len(num_neurons)==num_hidden_layers
        sizes = [input_size] + list(num_neurons) + [output_size]
    
    for i in range(1,num_hidden_layers+1):
        weights.append(np.random.uniform(-0.5, 0.5, (sizes[i], sizes[i-1])))
        biases.append(np.zeros(sizes[i]).reshape(1,-1))
    
    weights.append(np.random.uniform(-0.5, 0.5, (sizes[-1], sizes[-2])))
    biases.append(np.zeros(sizes[-1]).reshape(1,-1))
    
    return weights, biases

def forward_propagation(X, W, B):
    L = len(W)
    assert L==len(B) and L>=2
    A, H = [], []
    
    A_temp = np.dot(X, W[0].T) + B[0]
    H_temp = sigmoid(A_temp)
    
    A.append(A_temp)
    H.append(H_temp)
    
    for i in range(1,L-1):
        A_temp = np.dot(H[i-1], W[i].T) + B[i]
        H_temp = sigmoid(A_temp)
        
        A.append(A_temp)
        H.append(H_temp)
    
    A_temp = np.dot(H[L-2], W[L-1].T) + B[L-1]
    H_temp = softmax(A_temp)
    
    A.append(A_temp)
    H.append(H_temp)
    
    return A, H

def back_propagation(X, y_true, W, A, H):
    N = X.shape[0]
    L = len(W)
    assert N==y_true.size and L==len(A) and L==len(H)
    
    one_hot_y = one_hot(y_true)
    y_pred = H[2]
    dW, dB = [None] * L, [None] * L
    
    dA2 = y_pred - one_hot_y # NxK
    dW2 = (np.dot(dA2.T, H[1])) / N # KxH2
    dB2 = np.sum(dA2, axis=0).reshape(1,-1) / N # 1xK
    
    dA1 = np.dot(dA2, W[2]) * sigmoid_derivative(A[1]) # NxH2
    dW1 = (np.dot(dA1.T, H[0])) / N # H2xH1
    dB1 = np.sum(dA1, axis=0).reshape(1,-1) / N # 1xH2
    
    dA0 = np.dot(dA1, W[1]) * sigmoid_derivative(A[0]) # NxH1
    dW0 = (np.dot(dA0.T, X)) / N # H1xD
    dB0 = np.sum(dA0, axis=0).reshape(1,-1) / N # 1xH1
    
    dW = [dW0, dW1, dW2]
    dB = [dB0, dB1, dB2]
    
    return dW, dB
    
def back_propagation_2(X, y_true, W, A, H):
    N = X.shape[0]
    L = len(W)
    assert N==y_true.size and L==len(A) and L==len(H)
    
    one_hot_y = one_hot(y_true)
    y_pred = H[L-1]
    dW, dB = [None] * L, [None] * L
    
    dA = y_pred - one_hot_y # NxK
    
    for k in range(L-1, 0, -1):
        dWk = (np.dot(dA.T, H[k-1])) / N
        dBk = np.sum(dA, axis=0).reshape(1,-1) / N
        
        dA = np.dot(dA, W[k]) * sigmoid_derivative(A[k-1])
        
        dW[k] = dWk
        dB[k] = dBk
    
    dW0 = (np.dot(dA.T, X)) / N
    dB0 = np.sum(dA, axis=0).reshape(1,-1) / N
    
    dW[0] = dW0
    dB[0] = dB0
    
    
    return dW, dB
    
def gradient_descent(X_training, y_true, X_val, y_val, W, B, learning_rate=0.01, num_iterations=1000, log_every=100):
    L = len(W)
    training_error = np.zeros(num_iterations//log_every)
    validation_error = np.zeros(num_iterations//log_every)
    for i in range(num_iterations):
        A, H = forward_propagation(X_training, W, B)
        dW, dB = back_propagation_2(X_training, y_true, W, A, H)
        
        for j in range(L):
            
            W[j] -= learning_rate * dW[j]
            B[j] -= learning_rate * dB[j]
        
        if i%log_every==0:
            print(f'Loss after {i} iterations: {cross_entropy_loss(one_hot(y_true), H[-1])}')
            training_error[i//log_every] = cross_entropy_loss(one_hot(y_true), H[-1])
            A_val, H_val = forward_propagation(X_val, W, B)
            validation_error[i//log_every] = cross_entropy_loss(one_hot(y_val), H_val[-1])
        
            
    
    return W, B

def gradient_descent_with_momentum(X, y_true, W, B, learning_rate=0.01, momentum=0.4, num_iterations=100, log_every=100):
    L = len(W)
    
    # Initialize velocity vectors with zeros (same shape as theta)
    velocity_W = [np.zeros_like(W[j]) for j in range(L)]
    velocity_B = [np.zeros_like(B[j]) for j in range(L)]
    
    for i in range(num_iterations):
        # Forward and backward passes
        A, H = forward_propagation(X, W, B)
        dW, dB = back_propagation_2(X, y_true, W, A, H)
        
        # Update with momentum
        for j in range(L):
            # Update velocity vectors
            velocity_W[j] = momentum * velocity_W[j] - learning_rate * dW[j]
            velocity_B[j] = momentum * velocity_B[j] - learning_rate * dB[j]
            
            # Update parameters using velocities
            W[j] += velocity_W[j]
            B[j] += velocity_B[j]
        
        # Print loss periodically
        if i % log_every == 0:
            print(f'Loss after {i} iterations: {cross_entropy_loss(one_hot(y_true), H[-1])}')
    
    return W, B

def accuracy(y_true, y_pred):
    return np.mean(y_true == y_pred)

def test_model(X, y_true, W, B):
    A, H = forward_propagation(X, W, B)
    y_pred = np.argmax(H[-1], axis=1)
    print(f'Accuracy: {accuracy(y_true, y_pred)}')
    return accuracy(y_true, y_pred)
    
        
    
    
    

In [180]:
num_trials = 19

W, B = initialise_weights_and_biases(input_size=784, 
                            num_hidden_layers=1, 
                            num_neurons=41, 
                            output_size=10)

A, H = forward_propagation(X=X_train_flat[:num_trials,:], W=W, B=B)

dW, dB = back_propagation_2(X=X_train_flat[:num_trials,:], y_true=y_train[:num_trials], W=W, A=A, H=H)


In [159]:
num_trials = 2000

W, B = initialise_weights_and_biases(input_size=784, 
                            num_hidden_layers=2, 
                            num_neurons=32, 
                            output_size=10)

W_new, B_new = gradient_descent(X=X_train_flat[:num_trials,:],
                        y_true=y_train[:num_trials],
                        W=W, B=B,
                        learning_rate=0.1,
                        num_iterations=10000,
                        log_every=100)


Loss after 0 iterations: 2.4794911390802543
Loss after 100 iterations: 2.3100971387366136
Loss after 200 iterations: 2.3070401218241057
Loss after 300 iterations: 2.3043496454801415
Loss after 400 iterations: 2.3019473853354846
Loss after 500 iterations: 2.299782538467519
Loss after 600 iterations: 2.2978115876749605
Loss after 700 iterations: 2.2960054203113067
Loss after 800 iterations: 2.2943383629156933
Loss after 900 iterations: 2.292786905395302
Loss after 1000 iterations: 2.2913326623615404
Loss after 1100 iterations: 2.289961018552156
Loss after 1200 iterations: 2.2886614949328465


KeyboardInterrupt: 

In [184]:
num_trials = 60000

W, B = initialise_weights_and_biases(input_size=784, 
                            num_hidden_layers=1, 
                            num_neurons=32, 
                            output_size=10)

test_model(X=X_test_flat, y_true=y_test, W=W, B=B)

Accuracy: 0.1026


0.1026

In [187]:
W_new, B_new = gradient_descent_with_momentum(X=X_train_flat,
                                            y_true=y_train,
                                            W=W, B=B,
                                            learning_rate=0.1,
                                            momentum=0.9,
                                            num_iterations=4000,
                                            log_every=100)

Loss after 0 iterations: 0.48514154444668295
Loss after 100 iterations: 0.4828476850289215
Loss after 200 iterations: 0.47857999910444526
Loss after 300 iterations: 0.4776903544899087
Loss after 400 iterations: 0.47467012584318047
Loss after 500 iterations: 0.4786551332616049
Loss after 600 iterations: 0.4675180506768316
Loss after 700 iterations: 0.4655332038737685
Loss after 800 iterations: 0.46140897317226875
Loss after 900 iterations: 0.4550303110271173
Loss after 1000 iterations: 0.4566424441010105
Loss after 1100 iterations: 0.45294048996458736
Loss after 1200 iterations: 0.44811272127419083
Loss after 1300 iterations: 0.45441297349559917
Loss after 1400 iterations: 0.4427219270550769
Loss after 1500 iterations: 0.43603545811857947
Loss after 1600 iterations: 0.4360636680995777
Loss after 1700 iterations: 0.435871623727895
Loss after 1800 iterations: 0.438704125446202
Loss after 1900 iterations: 0.4364155868088221
Loss after 2000 iterations: 0.4313598490391894
Loss after 2100 ite

In [188]:
test_model(X=X_val_flat, y_true=y_val, W=W_new, B=B_new)

Accuracy: 0.8226666666666667


0.8226666666666667