# Exercise 3 - Train small networks with cyclic learning rates

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time
import sys
import os
import pickle

# The notebook expects to run from the 'Assignment3' project root (the
# `src.` imports below are resolved relative to it). When started from any
# other directory, step up exactly one level.
if os.path.basename(os.getcwd()) != 'Assignment3':
    os.chdir(os.path.abspath(os.pardir))


from src.exercise3 import ConvolutionalNetwork, load_cifar_data, train_with_cyclical_lr


In [2]:

def LoadBatch(batch_id, dtype=np.float64, data_dir=None):
    """
    Load one CIFAR-10 batch file and return image data, one-hot labels, and raw labels.

    Parameters:
        batch_id (int or str): When a string, it is used verbatim as the file
            name inside the data directory (e.g. "test_batch"). Otherwise the
            file "data_batch_<batch_id>" is loaded.
        dtype (type): Data type for the image and one-hot label arrays
            (e.g. np.float32 or np.float64).
        data_dir (str, optional): Directory containing the CIFAR-10 batch
            files. When omitted, the original hard-coded location is used, so
            existing callers behave exactly as before.

    Returns:
        X (numpy.ndarray): Image data of shape (d, n), pixel values divided by
            255 (shape (3072, 10000) for a standard CIFAR-10 batch).
        Y (numpy.ndarray): One-hot encoded labels of shape (10, n), same dtype as X.
        y (numpy.ndarray): Integer labels as a row vector of shape (1, n).
    """
    # NOTE(review): the fallback is an absolute, machine-specific path; pass
    # `data_dir` to run this notebook on another machine.
    default_dir = "/Users/axhome/AX/KTH/Courses/DD2424-DeepLearning/Assignment3/data/cifar-10-batches-py"
    cifar10_path = default_dir if data_dir is None else data_dir

    # A string id names the file directly; anything else selects a numbered training batch.
    if isinstance(batch_id, str):
        batch_file = os.path.join(cifar10_path, batch_id)
    else:
        batch_file = os.path.join(cifar10_path, f"data_batch_{batch_id}")

    # SECURITY: pickle.load can execute arbitrary code; only load batch files
    # obtained from a trusted source.
    with open(batch_file, 'rb') as file:
        batch = pickle.load(file, encoding='bytes')

    images = batch[b'data']              # one image per row
    labels = np.array(batch[b'labels'])  # one integer class id per image

    # Scale pixels by 1/255 and put one image per column.
    X = (images.astype(dtype) / 255.0).T
    n_samples = X.shape[1]

    # One-hot encode: row = class id, column = sample index.
    K = 10  # Number of classes in CIFAR-10
    Y = np.zeros((K, n_samples), dtype=dtype)
    Y[labels, np.arange(n_samples)] = 1

    # Raw labels are returned as a (1, n) row vector.
    labels = labels.reshape(1, len(labels))

    return X, Y, labels


def preprocessData(X_train_raw, X_val_raw, X_test_raw):
    """
    Standardize all three splits with the per-feature statistics of the training split.

    The mean and standard deviation are computed along axis 1 of the training
    matrix (one value per row), and the same shift/scale is applied to the
    validation and test matrices.

    Features that are constant over the training split have zero standard
    deviation; they are divided by 1 instead, so they map to 0 in the
    training data rather than producing NaN/inf values.

    Parameters:
      X_train_raw, X_val_raw, X_test_raw: Raw data matrices with one feature
        per row and one sample per column.

    Returns:
      X_train, X_val, X_test: Normalized datasets
    """
    X_train_mean = np.mean(X_train_raw, axis=1, keepdims=True)
    X_train_std = np.std(X_train_raw, axis=1, keepdims=True)

    # np.std returned a fresh array, so patching it in place cannot touch the
    # caller's data. Guard against division by zero for constant features.
    X_train_std[X_train_std == 0] = 1.0

    X_train = (X_train_raw - X_train_mean) / X_train_std
    X_val = (X_val_raw - X_train_mean) / X_train_std
    X_test = (X_test_raw - X_train_mean) / X_train_std

    return X_train, X_val, X_test


In [3]:
def get_cyclical_learning_rate(update_step, cycle_num, eta_min, eta_max, step_sizes):
    """
    Triangular cyclical learning-rate schedule.

    Within one cycle the rate climbs linearly from eta_min to eta_max over
    the first `step_sizes[cycle_num]` updates and then falls linearly back
    towards eta_min over the same number of updates.

    Args:
        update_step: Step counter within the cycle (wrapped modulo the cycle length)
        cycle_num: Index of the current cycle (0-indexed) into step_sizes
        eta_min: Lowest learning rate of the schedule
        eta_max: Highest learning rate of the schedule
        step_sizes: Half-cycle length for each cycle

    Returns:
        eta: Learning rate to use for this update
    """
    half_cycle = step_sizes[cycle_num]
    position = update_step % (2 * half_cycle)

    # Descending half: start at the peak and walk back down.
    if position >= half_cycle:
        return eta_max - ((position - half_cycle) / half_cycle) * (eta_max - eta_min)

    # Ascending half: start at the floor and walk up.
    return eta_min + (position / half_cycle) * (eta_max - eta_min)

In [4]:
def train_with_cyclical_lr(model, X_train, Y_train, X_val, Y_val, CLRparams, lambda_reg=0.0, 
                           use_label_smoothing=False, epsilon=0.1, logging_freq=10, verbose=True):
    """
    Train the network using mini-batch gradient descent with cyclical learning rates.

    The half-cycle length starts at CLRparams["n_s"] and doubles after every
    completed cycle, so cycle i consists of 2 * n_s * 2**i parameter updates.
    The training data is reshuffled at the start of every pass over it.

    Args:
        model: ConvolutionalNetwork instance. Must provide forward(X),
            backward(Y, P, lambda_reg), update_parameters(grads, eta) and
            compute_loss_and_accuracy(X, Y, lambda_reg).
        X_train: Training data, shape (input_dim, n_train)
        Y_train: Training labels, shape (output_dim, n_train)
        X_val: Validation data, shape (input_dim, n_val)
        Y_val: Validation labels, shape (output_dim, n_val)
        CLRparams: Dictionary containing:
            - "n_batch": Mini-batch size
            - "eta_min": Minimum learning rate
            - "eta_max": Maximum learning rate
            - "n_s": Initial step size (half cycle length)
            - "n_cycles": Number of cycles to run
        lambda_reg: L2 regularization parameter
        use_label_smoothing: Whether to train on smoothed labels (metrics are
            always computed against the original labels)
        epsilon: Label smoothing parameter
        logging_freq: Metrics are recorded every `logging_freq` updates
        verbose: Whether to print progress

    Returns:
        history: Dictionary with lists "loss_train", "loss_val", "acc_train",
            "acc_val", "update_steps" and "learning_rates" (one entry per
            logged update), the float "training_time" in seconds, and the
            lists "cost_train"/"cost_val", which are created but not filled
            by this function.

    Raises:
        ValueError: If the mini-batch size is smaller than 1 or larger than
            the number of training samples (no mini-batch could be formed and
            the training loop would never make progress).
    """
    # Extract parameters
    batch_size = CLRparams["n_batch"]
    eta_min = CLRparams["eta_min"]
    eta_max = CLRparams["eta_max"]
    initial_step_size = CLRparams["n_s"]
    n_cycles = CLRparams["n_cycles"]

    n_train = X_train.shape[1]
    batches_per_epoch = n_train // batch_size if batch_size >= 1 else 0
    if batches_per_epoch == 0:
        raise ValueError(
            f"n_batch must be between 1 and the number of training samples "
            f"({n_train}), got {batch_size}"
        )

    # Half-cycle length for each cycle (doubling each time)
    step_sizes = [initial_step_size * (2 ** i) for i in range(n_cycles)]

    # A full cycle is two half-cycles, hence the factor 2.
    total_updates = sum(step_sizes) * 2

    # Apply label smoothing if requested
    if use_label_smoothing:
        # NOTE(review): apply_label_smoothing is not defined or imported in the
        # visible notebook cells; confirm it is in scope before enabling this.
        Y_train_smooth = apply_label_smoothing(Y_train, epsilon)
    else:
        Y_train_smooth = Y_train

    history = {
        "cost_train": [], "cost_val": [],
        "loss_train": [], "loss_val": [],
        "acc_train": [], "acc_val": [],
        "update_steps": [],
        "learning_rates": []
    }

    update_step = 0    # updates performed so far, across all cycles
    current_cycle = 0  # index into step_sizes
    cycle_step = 0     # updates performed inside the current cycle

    start_time = time.time()

    while update_step < total_updates:
        # Shuffle data for each epoch
        shuffle_idx = np.random.permutation(n_train)
        X_shuffled = X_train[:, shuffle_idx]
        Y_shuffled = Y_train_smooth[:, shuffle_idx]

        for j in range(batches_per_epoch):
            if update_step >= total_updates:
                break

            # Only full mini-batches are used; a trailing partial batch is skipped.
            j_start = j * batch_size
            j_end = j_start + batch_size
            X_batch = X_shuffled[:, j_start:j_end]
            Y_batch = Y_shuffled[:, j_start:j_end]

            P_batch, _ = model.forward(X_batch)
            grads = model.backward(Y_batch, P_batch, lambda_reg)

            eta = get_cyclical_learning_rate(cycle_step, current_cycle, eta_min, eta_max, step_sizes)
            model.update_parameters(grads, eta)

            cycle_step += 1
            update_step += 1

            # Log BEFORE advancing the cycle. Otherwise the break taken when
            # the last cycle completes would skip logging the final update,
            # and the reported "final" metrics would be stale.
            if update_step % logging_freq == 0:
                train_loss, train_acc = model.compute_loss_and_accuracy(X_train, Y_train, lambda_reg)
                val_loss, val_acc = model.compute_loss_and_accuracy(X_val, Y_val, lambda_reg)

                history["loss_train"].append(train_loss)
                history["acc_train"].append(train_acc)
                history["loss_val"].append(val_loss)
                history["acc_val"].append(val_acc)
                history["update_steps"].append(update_step)
                history["learning_rates"].append(eta)

                if verbose and update_step % (logging_freq * 10) == 0:
                    print(f"Update {update_step}/{total_updates}, "
                          f"Cycle {current_cycle+1}/{n_cycles}, "
                          f"Step {cycle_step}/{2*step_sizes[current_cycle]}, "
                          f"LR: {eta:.6f}, "
                          f"Train Loss: {train_loss:.4f}, "
                          f"Val Acc: {val_acc:.4f}")

            # Move on to the next (twice as long) cycle once this one is done.
            if cycle_step >= 2 * step_sizes[current_cycle]:
                current_cycle += 1
                cycle_step = 0

                if current_cycle >= n_cycles:
                    break

    training_time = time.time() - start_time
    history["training_time"] = training_time

    if verbose:
        print(f"\nTraining completed in {training_time:.2f} seconds")
        # Nothing is logged when the run is shorter than logging_freq updates.
        if history["acc_val"]:
            print(f"Final validation accuracy: {history['acc_val'][-1]:.4f}")

    return history

In [5]:
from Assignment3.src.utils import plot_architecture_comparison


def compare_architectures(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    """
    Train and evaluate the four Exercise 3 architectures under one shared setup.

    Every architecture is trained from a freshly constructed
    ConvolutionalNetwork with the same cyclical-learning-rate parameters and
    the same L2 strength, then scored on the test split. A comparison figure
    is produced once all runs have finished.

    Args:
        X_train, Y_train: Training data and labels
        X_val, Y_val: Validation data and labels
        X_test, Y_test: Test data and labels

    Returns:
        results: List with one dictionary per architecture, holding the keys
            "name", "parameters", "history", "test_accuracy" and "training_time".
    """
    # L2 strength (from Exercise 2 or previous experiments)
    lambda_reg = 0.003

    # Cyclical learning-rate settings shared by all runs
    clr_params = {
        "n_batch": 100,
        "eta_min": 1e-5,
        "eta_max": 1e-1,
        "n_s": 800,
        "n_cycles": 3
    }

    # (f, nf) per candidate; the hidden layer width nh is 50 throughout.
    filter_configs = [(2, 3), (4, 10), (8, 40), (16, 160)]
    architectures = [
        {"name": f"Architecture {number}", "f": f, "nf": nf, "nh": 50}
        for number, (f, nf) in enumerate(filter_configs, start=1)
    ]

    results = []
    for spec in architectures:
        print(f"\nTraining {spec['name']}: f={spec['f']}, nf={spec['nf']}, nh={spec['nh']}")

        network = ConvolutionalNetwork(f=spec['f'], nf=spec['nf'], nh=spec['nh'])

        run_history = train_with_cyclical_lr(
            network, X_train, Y_train, X_val, Y_val,
            clr_params, lambda_reg,
            logging_freq=50, verbose=True
        )

        # Only the accuracy of the test evaluation is kept.
        _, test_acc = network.compute_loss_and_accuracy(X_test, Y_test, lambda_reg)

        results.append({
            "name": spec["name"],
            "parameters": spec,
            "history": run_history,
            "test_accuracy": test_acc,
            "training_time": run_history["training_time"]
        })

        print(f"Test accuracy: {test_acc:.4f}")

    plot_architecture_comparison(results)

    return results

In [6]:
def train_longer_with_increasing_steps(X_train, Y_train, X_val, Y_val, X_test, Y_test):
    """
    Train architectures 2 and 3, plus a wide variant of architecture 2, and compare them.

    Each network is built fresh, trained with train_with_cyclical_lr using
    the parameters below, and evaluated on the test split. Loss/accuracy
    curves and a test-accuracy bar chart are plotted at the end.

    Args:
        X_train, Y_train: Training data and labels
        X_val, Y_val: Validation data and labels
        X_test, Y_test: Test data and labels

    Returns:
        results: List with one dictionary per architecture, holding the keys
            "name", "parameters", "history", "test_accuracy" and "training_time".
    """
    lambda_reg = 0.003

    # Cyclical learning-rate settings
    clr_params = {
        "n_batch": 100,
        "eta_min": 1e-5,
        "eta_max": 1e-1,
        "n_s": 800,  # Initial step size
        "n_cycles": 3
    }

    # (name, f, nf) per network; nh is 50 for all of them.
    candidates = [
        ("Architecture 2", 4, 10),
        ("Architecture 3", 8, 40),
        ("Architecture 2 Wide", 4, 40),
    ]
    architectures = [
        {"name": name, "f": f, "nf": nf, "nh": 50}
        for name, f, nf in candidates
    ]

    results = []
    for spec in architectures:
        print(f"\nLonger training for {spec['name']}: f={spec['f']}, nf={spec['nf']}, nh={spec['nh']}")

        network = ConvolutionalNetwork(f=spec['f'], nf=spec['nf'], nh=spec['nh'])

        run_history = train_with_cyclical_lr(
            network, X_train, Y_train, X_val, Y_val,
            clr_params, lambda_reg,
            logging_freq=50, verbose=True
        )

        # Only the accuracy of the test evaluation is kept.
        _, test_acc = network.compute_loss_and_accuracy(X_test, Y_test, lambda_reg)

        results.append({
            "name": spec["name"],
            "parameters": spec,
            "history": run_history,
            "test_accuracy": test_acc,
            "training_time": run_history["training_time"]
        })

        print(f"Test accuracy: {test_acc:.4f}")

    plot_loss_curves_comparison(results)

    return results

def plot_loss_curves_comparison(results):
    """
    Draw a 2x2 grid of training curves plus a bar chart of test accuracies.

    The grid shows training loss, validation loss, training accuracy and
    validation accuracy against the update step, one line per entry of
    `results`. Both figures are saved as PNG files in the current working
    directory and then shown.

    Args:
        results: List of dictionaries with training results; each needs the
            keys "name", "test_accuracy" and "history" (with "update_steps",
            "loss_train", "loss_val", "acc_train" and "acc_val").
    """
    # One row per panel, in row-major grid order:
    # (history key, legend suffix, y-axis label, panel title)
    panels = [
        ("loss_train", "Train", "Training Loss", "Training Loss Comparison"),
        ("loss_val", "Val", "Validation Loss", "Validation Loss Comparison"),
        ("acc_train", "Train", "Training Accuracy", "Training Accuracy Comparison"),
        ("acc_val", "Val", "Validation Accuracy", "Validation Accuracy Comparison"),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))

    for ax, (metric, split, y_label, title) in zip(axes.flat, panels):
        for run in results:
            curve = run["history"]
            ax.plot(curve["update_steps"], curve[metric],
                    label=f"{run['name']} ({split})")
        ax.set_xlabel("Update Step")
        ax.set_ylabel(y_label)
        ax.set_title(title)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.savefig('longer_training_comparison.png')
    plt.show()

    # Second figure: final test accuracy per run
    plt.figure(figsize=(10, 6))
    run_names = []
    accuracies = []
    for run in results:
        run_names.append(run["name"])
        accuracies.append(run["test_accuracy"])

    bars = plt.bar(run_names, accuracies, color='skyblue')
    plt.ylabel('Test Accuracy')
    plt.title('Test Accuracy Comparison - Longer Training')
    plt.ylim([0, 1])

    # Write each accuracy just above its bar
    for rect in bars:
        top = rect.get_height()
        x_center = rect.get_x() + rect.get_width()/2.
        plt.text(x_center, top + 0.01, f'{top:.4f}', ha='center', va='bottom')

    plt.tight_layout()
    plt.savefig('longer_training_test_accuracy.png')
    plt.show()

In [7]:
def plot_architecture_comparison(results):
    """
    Show test accuracy and training time per architecture as two bar charts.

    The figure is saved as 'architecture_comparison.png' in the current
    working directory and then displayed.

    Args:
        results: List of dictionaries with architecture results; each needs
            the keys "name", "test_accuracy" and "training_time".
    """
    def _label_bars(ax, bars, offset, template):
        # Write each bar's height just above it, formatted with `template`.
        for rect in bars:
            top = rect.get_height()
            ax.text(rect.get_x() + rect.get_width()/2., top + offset,
                    template.format(top), ha='center', va='bottom')

    names, test_accs, train_times = [], [], []
    for entry in results:
        names.append(entry["name"])
        test_accs.append(entry["test_accuracy"])
        train_times.append(entry["training_time"])

    bar_width = 0.7
    positions = np.arange(len(names))

    fig, (acc_ax, time_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: test accuracy
    acc_bars = acc_ax.bar(positions, test_accs, bar_width, color='skyblue')
    acc_ax.set_ylabel('Test Accuracy')
    acc_ax.set_title('Test Accuracy by Architecture')
    acc_ax.set_xticks(positions)
    acc_ax.set_xticklabels(names, rotation=45, ha='right')
    acc_ax.set_ylim([0, 1])
    _label_bars(acc_ax, acc_bars, 0.01, '{:.4f}')

    # Right panel: wall-clock training time
    time_bars = time_ax.bar(positions, train_times, bar_width, color='salmon')
    time_ax.set_ylabel('Training Time (seconds)')
    time_ax.set_title('Training Time by Architecture')
    time_ax.set_xticks(positions)
    time_ax.set_xticklabels(names, rotation=45, ha='right')
    _label_bars(time_ax, time_bars, 0.5, '{:.1f}s')

    plt.tight_layout()
    plt.savefig('architecture_comparison.png')
    plt.show()

In [8]:
def run_exercise_3():
    """
    Run Exercise 3 of Assignment 3 end to end.

    Loads training batch 1, validation batch 2 and the test batch, normalizes
    them with the training statistics, then runs the architecture comparison
    followed by the longer-training experiment.

    Returns:
        Dictionary with the keys "architecture_comparison" and
        "longer_training", each holding the result list returned by the
        corresponding experiment.
    """
    print("Running Exercise 3 - Training Small Networks with Cyclical Learning Rates")

    print("Loading and preprocessing data...")
    # The third value returned by LoadBatch (integer labels) is not needed here.
    X_train_raw, Y_train, _ = LoadBatch(1)
    X_val_raw, Y_val, _ = LoadBatch(2)
    X_test_raw, Y_test, _ = LoadBatch("test_batch")

    X_train, X_val, X_test = preprocessData(X_train_raw, X_val_raw, X_test_raw)
    splits = (X_train, Y_train, X_val, Y_val, X_test, Y_test)

    outcome = {}

    print("\nPart 1: Comparing different architectures...")
    outcome["architecture_comparison"] = compare_architectures(*splits)

    print("\nPart 2: Training with longer cycles and increasing step sizes...")
    outcome["longer_training"] = train_longer_with_increasing_steps(*splits)

    print("\nExercise 3 completed!")

    return outcome

In [9]:
# Run Exercise 3
results = run_exercise_3()

# Print summary of results.
# Both entries of `results` are LISTS with one dictionary per trained
# architecture (keys: "name", "parameters", "history", "test_accuracy",
# "training_time"), so they are iterated directly rather than via .items().
print("\nSummary of Results:")
print("-" * 50)

for heading, runs in (
    ("\nArchitecture Comparison:", results["architecture_comparison"]),
    ("\nLonger Training Results:", results["longer_training"]),
):
    print(heading)
    for run in runs:
        print(f"\n{run['name']}:")
        # Validation accuracy is only available if at least one logging step happened.
        val_accs = run["history"]["acc_val"]
        if val_accs:
            print(f"Final Validation Accuracy: {val_accs[-1]:.4f}")
        print(f"Test Accuracy: {run['test_accuracy']:.4f}")
        print(f"Training Time: {run['training_time']:.2f} seconds")


Running Exercise 3 - Training Small Networks with Cyclical Learning Rates
Loading and preprocessing data...

Part 1: Comparing different architectures...

Training Architecture 1: f=2, nf=3, nh=50
Update 500/11200, Cycle 1/3, Step 500/1600, LR: 0.062379, Train Loss: 2.3022, Val Acc: 0.1010
Update 1000/11200, Cycle 1/3, Step 1000/1600, LR: 0.075127, Train Loss: 2.3022, Val Acc: 0.1008
Update 1500/11200, Cycle 1/3, Step 1500/1600, LR: 0.012634, Train Loss: 2.3022, Val Acc: 0.1010
Update 2000/11200, Cycle 2/3, Step 400/3200, LR: 0.024945, Train Loss: 2.3022, Val Acc: 0.1008
Update 2500/11200, Cycle 2/3, Step 900/3200, LR: 0.056192, Train Loss: 2.3022, Val Acc: 0.1008
Update 3000/11200, Cycle 2/3, Step 1400/3200, LR: 0.087439, Train Loss: 2.3022, Val Acc: 0.1010
Update 3500/11200, Cycle 2/3, Step 1900/3200, LR: 0.081314, Train Loss: 2.3022, Val Acc: 0.1010
Update 4000/11200, Cycle 2/3, Step 2400/3200, LR: 0.050067, Train Loss: 2.3022, Val Acc: 0.1008
Update 4500/11200, Cycle 2/3, Step 2900

KeyboardInterrupt: 