In [4]:
import numpy as np
from sklearn.datasets import fetch_openml
import matplotlib.pyplot as plt

class ConvLayer:
    def __init__(self, num_filters, filter_size, input_channels=1):
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.input_channels = input_channels
        self.filters = np.random.randn(num_filters, input_channels, filter_size, filter_size) / np.sqrt(filter_size * filter_size * input_channels)
        
    def forward(self, input):
        self.input = input
        self.batch_size, self.input_channels, self.height, self.width = input.shape
        output_height = self.height - self.filter_size + 1
        output_width = self.width - self.filter_size + 1
        
        output = np.zeros((self.batch_size, self.num_filters, output_height, output_width))
        
        for i in range(output_height):
            for j in range(output_width):
                input_slice = input[:, :, i:i+self.filter_size, j:j+self.filter_size]
                for k in range(self.num_filters):
                    output[:, k, i, j] = np.sum(input_slice * self.filters[k], axis=(1,2,3))
        
        return output
    
    def backward(self, d_output, learning_rate):
        d_input = np.zeros_like(self.input)
        d_filters = np.zeros_like(self.filters)
        
        for i in range(d_output.shape[2]):
            for j in range(d_output.shape[3]):
                input_slice = self.input[:, :, i:i+self.filter_size, j:j+self.filter_size]
                for k in range(self.num_filters):
                    d_filters[k] += np.sum(input_slice * d_output[:, k, i, j][:, None, None, None], axis=0)
                    d_input[:, :, i:i+self.filter_size, j:j+self.filter_size] += \
                        self.filters[k] * d_output[:, k, i, j][:, None, None, None]
        
        self.filters -= learning_rate * d_filters
        return d_input

class MaxPoolLayer:
    def __init__(self, pool_size):
        self.pool_size = pool_size
        
    def forward(self, input):
        self.input = input
        self.batch_size, self.channels, self.height, self.width = input.shape
        self.output_height = self.height // self.pool_size
        self.output_width = self.width // self.pool_size
        
        output = np.zeros((self.batch_size, self.channels, self.output_height, self.output_width))
        
        for i in range(self.output_height):
            for j in range(self.output_width):
                h_start = i * self.pool_size
                h_end = h_start + self.pool_size
                w_start = j * self.pool_size
                w_end = w_start + self.pool_size
                input_slice = input[:, :, h_start:h_end, w_start:w_end]
                output[:, :, i, j] = np.max(input_slice, axis=(2, 3))
                
        return output
    
    def backward(self, d_output):
        d_input = np.zeros_like(self.input)
        
        for i in range(self.output_height):
            for j in range(self.output_width):
                h_start = i * self.pool_size
                h_end = h_start + self.pool_size
                w_start = j * self.pool_size
                w_end = w_start + self.pool_size
                
                input_slice = self.input[:, :, h_start:h_end, w_start:w_end]
                mask = input_slice == np.max(input_slice, axis=(2, 3))[:, :, None, None]
                d_input[:, :, h_start:h_end, w_start:w_end] = mask * d_output[:, :, i, j][:, :, None, None]
                
        return d_input

class FCLayer:
    def __init__(self, input_size, output_size):
        self.weights = np.random.randn(input_size, output_size) / np.sqrt(input_size)
        self.bias = np.zeros(output_size)
        
    def forward(self, input):
        self.input = input
        return np.dot(input, self.weights) + self.bias
    
    def backward(self, d_output, learning_rate):
        d_input = np.dot(d_output, self.weights.T)
        d_weights = np.dot(self.input.T, d_output)
        d_bias = np.sum(d_output, axis=0)
        
        self.weights -= learning_rate * d_weights
        self.bias -= learning_rate * d_bias
        return d_input

def relu(x):
    return np.maximum(0, x)

def relu_derivative(x):
    return (x > 0).astype(float)

def softmax(x):
    exp_x = np.exp(x - np.max(x, axis=1, keepdims=True))
    return exp_x / np.sum(exp_x, axis=1, keepdims=True)

def cross_entropy_loss(predictions, targets):
    epsilon = 1e-15
    predictions = np.clip(predictions, epsilon, 1 - epsilon)
    return -np.mean(np.sum(targets * np.log(predictions), axis=1))

class CNN:
    def __init__(self):
        self.conv1 = ConvLayer(num_filters=16, filter_size=3, input_channels=1)
        self.pool1 = MaxPoolLayer(pool_size=2)
        self.conv2 = ConvLayer(num_filters=32, filter_size=3, input_channels=16)
        self.pool2 = MaxPoolLayer(pool_size=2)
        self.fc1 = FCLayer(32 * 5 * 5, 128)
        self.fc2 = FCLayer(128, 10)
        
    def forward(self, x):
        # Ensure input has the correct shape (batch_size, channels, height, width)
        if len(x.shape) == 3:
            x = x[:, np.newaxis, :, :]
            
        # Save intermediate values for backpropagation
        self.layer_outputs = []
        
        # First convolution block
        x = self.conv1.forward(x)
        self.layer_outputs.append(x)
        x = relu(x)
        self.layer_outputs.append(x)
        x = self.pool1.forward(x)
        self.layer_outputs.append(x)
        
        # Second convolution block
        x = self.conv2.forward(x)
        self.layer_outputs.append(x)
        x = relu(x)
        self.layer_outputs.append(x)
        x = self.pool2.forward(x)
        self.layer_outputs.append(x)
        
        # Flatten and fully connected layers
        x = x.reshape(x.shape[0], -1)
        self.layer_outputs.append(x)
        x = self.fc1.forward(x)
        self.layer_outputs.append(x)
        x = relu(x)
        self.layer_outputs.append(x)
        x = self.fc2.forward(x)
        self.layer_outputs.append(x)
        
        # Output probability distribution
        x = softmax(x)
        return x
    
    def backward(self, d_output, learning_rate):
        # Backward pass through fully connected layers
        d_layer = d_output
        d_layer = self.fc2.backward(d_layer, learning_rate)
        d_layer *= relu_derivative(self.layer_outputs[-2])
        d_layer = self.fc1.backward(d_layer, learning_rate)
        
        # Reshape back to conv dimensions
        d_layer = d_layer.reshape(self.layer_outputs[5].shape)
        
        # Backward pass through second conv block
        d_layer = self.pool2.backward(d_layer)
        d_layer *= relu_derivative(self.layer_outputs[4])
        d_layer = self.conv2.backward(d_layer, learning_rate)
        
        # Backward pass through first conv block
        d_layer = self.pool1.backward(d_layer)
        d_layer *= relu_derivative(self.layer_outputs[1])
        d_layer = self.conv1.backward(d_layer, learning_rate)

def train_cnn(model, X_train, y_train, X_val, y_val, epochs=10, batch_size=32, learning_rate=0.01):
    train_losses = []
    train_accuracies = []
    val_losses = []
    val_accuracies = []
    
    n_batches = len(X_train) // batch_size
    
    for epoch in range(epochs):
        # Training
        epoch_loss = 0
        correct = 0
        
        # Shuffle training data
        indices = np.random.permutation(len(X_train))
        X_train = X_train[indices]
        y_train = y_train[indices]
        
        for i in range(n_batches):
            start_idx = i * batch_size
            end_idx = start_idx + batch_size
            
            # Get batch
            batch_X = X_train[start_idx:end_idx]
            batch_y = y_train[start_idx:end_idx]
            
            # Forward pass
            predictions = model.forward(batch_X)
            loss = cross_entropy_loss(predictions, batch_y)
            epoch_loss += loss
            
            # Calculate accuracy
            predicted_classes = np.argmax(predictions, axis=1)
            true_classes = np.argmax(batch_y, axis=1)
            correct += np.sum(predicted_classes == true_classes)
            
            # Backward pass
            d_output = predictions - batch_y
            model.backward(d_output, learning_rate)
            
            if i % 50 == 0:
                print(f"Epoch {epoch + 1}, Batch {i}/{n_batches}, Loss: {loss:.4f}")
        
        # Calculate training metrics
        epoch_loss /= n_batches
        epoch_accuracy = correct / (n_batches * batch_size)
        train_losses.append(epoch_loss)
        train_accuracies.append(epoch_accuracy)
        
        # Validation
        val_predictions = model.forward(X_val)
        val_loss = cross_entropy_loss(val_predictions, y_val)
        val_predicted_classes = np.argmax(val_predictions, axis=1)
        val_true_classes = np.argmax(y_val, axis=1)
        val_accuracy = np.mean(val_predicted_classes == val_true_classes)
        
        val_losses.append(val_loss)
        val_accuracies.append(val_accuracy)
        
        print(f"\nEpoch {epoch + 1}/{epochs}")
        print(f"Training Loss: {epoch_loss:.4f}, Training Accuracy: {epoch_accuracy:.4f}")
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}\n")
    
    return train_losses, train_accuracies, val_losses, val_accuracies

def plot_training_history(train_losses, train_accuracies, val_losses, val_accuracies):
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
    
    # Plot losses
    ax1.plot(train_losses, label='Training Loss')
    ax1.plot(val_losses, label='Validation Loss')
    ax1.set_title('Loss over epochs')
    ax1.set_xlabel('Epoch')
    ax1.set_ylabel('Loss')
    ax1.legend()
    
    # Plot accuracies
    ax2.plot(train_accuracies, label='Training Accuracy')
    ax2.plot(val_accuracies, label='Validation Accuracy')
    ax2.set_title('Accuracy over epochs')
    ax2.set_xlabel('Epoch')
    ax2.set_ylabel('Accuracy')
    ax2.legend()
    
    plt.tight_layout()
    plt.show()

def load_and_preprocess_data():
    # Load MNIST dataset
    X, y = fetch_openml('mnist_784', version=1, return_X_y=True, as_frame=False)
    X = X.astype('float32') / 255.0
    
    # Reshape images to (samples, height, width)
    X = X.reshape(-1, 28, 28)
    
    # Split data
    train_size = 60000
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]
    
    # Convert labels to one-hot encoding
    y_train_onehot = np.zeros((len(y_train), 10))
    y_test_onehot = np.zeros((len(y_test), 10))
    y_train_onehot[np.arange(len(y_train)), y_train.astype(int)] = 1
    y_test_onehot[np.arange(len(y_test)), y_test.astype(int)] = 1
    
    return X_train, y_train_onehot, X_test, y_test_onehot

def main():
    # Load and preprocess data
    print("Loading data...")
    X_train, y_train, X_test, y_test = load_and_preprocess_data()
    print("Data loaded and preprocessed.")
    
    # Create and train model
    print("Creating and training model...")
    model = CNN()
    train_losses, train_accuracies, val_losses, val_accuracies = train_cnn(
        model, 
        X_train[:50000], 
        y_train[:50000], 
        X_train[50000:], 
        y_train[50000:],
        epochs=10,
        batch_size=32,
        learning_rate=0.01
    )
    
    # Plot training history
    plot_training_history(train_losses, train_accuracies, val_losses, val_accuracies)
    
    # Evaluate on test set
    test_predictions = model.forward(X_test)
    test_loss = cross_entropy_loss(test_predictions, y_test)
    test_accuracy = np.mean(np.argmax(test_predictions, axis=1) == np.argmax(y_test, axis=1))
    print(f"\nTest Loss: {test_loss:.4f}")
    print(f"Test Accuracy: {test_accuracy:.4f}")

if __name__ == "__main__":
    main()

Loading data...
Data loaded and preprocessed.
Creating and training model...
Epoch 1, Batch 0/1562, Loss: 2.3097
Epoch 1, Batch 50/1562, Loss: 0.7680
Epoch 1, Batch 100/1562, Loss: 0.3055
Epoch 1, Batch 150/1562, Loss: 0.2971
Epoch 1, Batch 200/1562, Loss: 0.1328
Epoch 1, Batch 250/1562, Loss: 0.0876
Epoch 1, Batch 300/1562, Loss: 0.0852
Epoch 1, Batch 350/1562, Loss: 0.1792
Epoch 1, Batch 400/1562, Loss: 0.2912
Epoch 1, Batch 450/1562, Loss: 0.0975
Epoch 1, Batch 500/1562, Loss: 0.2299
Epoch 1, Batch 550/1562, Loss: 0.0899
Epoch 1, Batch 600/1562, Loss: 0.0430
Epoch 1, Batch 650/1562, Loss: 0.1020
Epoch 1, Batch 700/1562, Loss: 0.1524
Epoch 1, Batch 750/1562, Loss: 0.0702
Epoch 1, Batch 800/1562, Loss: 0.0075
Epoch 1, Batch 850/1562, Loss: 0.0386
Epoch 1, Batch 900/1562, Loss: 0.1104
Epoch 1, Batch 950/1562, Loss: 0.1684
Epoch 1, Batch 1000/1562, Loss: 0.0201
Epoch 1, Batch 1050/1562, Loss: 0.0008
Epoch 1, Batch 1100/1562, Loss: 0.1495
Epoch 1, Batch 1150/1562, Loss: 0.1859
Epoch 1, B

KeyboardInterrupt: 