# CUDA Enabled CNN from Scratch
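This notebook uses CuPy to run the CNN on a CUDA-enabled GPU. Make sure CuPy is installed in your environment. Note that `cupy` is imported under the alias `np`, so every `np.` call below refers to CuPy rather than NumPy.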

In [3]:
import cupy as np
from convCUDA import Conv3x3
from maxpoolCUDA import MaxPool2
from softmaxCUDA import Softmax
from sklearn.datasets import fetch_openml

# Fetch the MNIST table through scikit-learn's OpenML loader.
print("Downloading MNIST from OpenML...")
mnist = fetch_openml('mnist_784', version=1)

# Reshape the flat pixel rows into 28x28 uint8 images and pass them to
# `np.asarray` (`np` is the CuPy alias in this notebook). Labels are cast to
# int and kept exactly as `.values` returns them.
images = np.asarray(mnist.data.values.reshape(-1, 28, 28).astype(np.uint8))
labels = mnist.target.astype(int).values

# The first TEST_SIZE samples are held out for testing; everything after
# them is used for training.
TEST_SIZE = 10000
test_images, train_images = images[:TEST_SIZE], images[TEST_SIZE:]
test_labels, train_labels = labels[:TEST_SIZE], labels[TEST_SIZE:]

# Network layers, listed in forward order.
conv = Conv3x3(8)                    # 28x28x1 -> 26x26x8
pool = MaxPool2()                    # 26x26x8 -> 13x13x8
softmax = Softmax(13 * 13 * 8, 10)   # 13x13x8 -> 10

def forward(image, label):
    '''
    Runs one image through conv -> pool -> softmax and scores the result.
    - image is a 2d array of raw pixel values; it is divided by 255 and
      shifted by -0.5 before entering the network (in this notebook `np`
      is CuPy, so the arrays passed in are CuPy arrays)
    - label is the index of the correct class
    Returns a tuple (out, loss, acc): the softmax layer's output, the
    cross-entropy loss -log(out[label]), and 1 if argmax(out) equals
    label, otherwise 0.
    '''
    normalized = (image / 255) - 0.5
    probs = softmax.forward(pool.forward(conv.forward(normalized)))

    cross_entropy = -np.log(probs[label])
    # bool() performs the same truth test the conditional expression would.
    is_correct = int(bool(np.argmax(probs) == label))

    return probs, cross_entropy, is_correct

def train(im, label, lr=.005):
    '''
    Performs a single training step on one image/label pair.
    - im is a 2d array of raw pixel values (forwarded to forward())
    - label is the index of the correct class
    - lr is the learning rate handed to the softmax and conv backprop calls
    Returns (loss, acc) as computed by forward() before the backward pass.
    '''
    probs, loss, acc = forward(im, label)

    # Gradient of the cross-entropy loss with respect to the softmax
    # output: zero everywhere except at the true class.
    grad = np.zeros(10)
    grad[label] = -1 / probs[label]

    # Walk the gradient back through the layers in reverse order. Only the
    # softmax and conv layers receive the learning rate (presumably they
    # are the ones holding trainable weights - confirm in their modules).
    grad = softmax.backprop(grad, lr)
    conv.backprop(pool.backprop(grad), lr)

    return loss, acc

print('MNIST CNN initialized!')

# Train!
# Single pass over the training set, one sample per step. `loss` and
# `num_correct` accumulate over a window of 100 steps and are reset after
# each progress report.
loss = 0
num_correct = 0
for i, (im, label) in enumerate(zip(train_images, train_labels)):
    step_loss, acc = train(im, label)
    loss += step_loss
    num_correct += acc

    # Report *after* the step so that every window - including the first -
    # covers exactly 100 samples. That keeps the division by 100 and the
    # count-as-percentage display correct.
    if i % 100 == 99:
        print(
            '[Step %d] Past 100 steps: Average Loss %.3f | Accuracy: %d%%' %
            (i + 1, loss / 100, num_correct)
        )
        loss = 0
        num_correct = 0

# Parallel mini-batch pass
# NOTE(review): this loop only runs the batched forward pass and reports
# loss/accuracy. It never calls a backprop method, so no weights are
# updated here despite the "training" label - add a batched backward pass
# if training is intended.
batch_size = 64
# Integer division: a trailing partial batch is skipped.
num_batches = train_images.shape[0] // batch_size
loss_total = 0
correct_total = 0
for b in range(num_batches):
    start = b * batch_size
    stop = start + batch_size
    # Apply the same scaling forward() uses so both code paths feed the
    # network identically distributed inputs.
    # NOTE(review): confirm conv.forward_batch does not already scale its input.
    batch_imgs = (train_images[start:stop] / 255) - 0.5  # shape (N, 28, 28)
    # The labels are a host-side array (from pandas `.values`). Move them to
    # the device: CuPy does not implicitly mix NumPy and CuPy arrays in
    # operations such as the `==` below.
    batch_labels = np.asarray(train_labels[start:stop])
    # Vectorized forward pass (shape comments are the expected layouts):
    conv_out = conv.forward_batch(batch_imgs)  # expected (N, 26, 26, num_filters)
    pool_out = pool.forward_batch(conv_out)    # expected (N, 13, 13, num_filters)
    # Flatten for softmax:
    N = pool_out.shape[0]
    flat = pool_out.reshape(N, -1)
    softmax_out = softmax.forward_batch(flat)  # expected (N, 10)
    # Cross-entropy over the batch: pick each row's probability for its
    # true class (equivalent to multiplying by a one-hot mask). The 1e-7
    # keeps log() finite when a probability is exactly zero.
    true_class_probs = softmax_out[np.arange(N), batch_labels]
    loss = -np.sum(np.log(true_class_probs + 1e-7)) / N
    pred_labels = np.argmax(softmax_out, axis=1)
    acc = np.mean(pred_labels == batch_labels) * 100
    loss_total += loss
    correct_total += acc
    if (b + 1) % 10 == 0:
        # float() pulls the scalar results back to the host for formatting.
        print(f'Batch {(b+1)} / {num_batches}: Loss = {float(loss):.3f}, Accuracy = {float(acc):.1f}%')

# Report the per-batch averages that the accumulators were collecting.
if num_batches:
    print(
        f'Mini-batch pass: Average Loss = {float(loss_total) / num_batches:.3f}, '
        f'Average Accuracy = {float(correct_total) / num_batches:.1f}%'
    )

# Synchronize the current device so all queued GPU work has completed
np.cuda.Device().synchronize()

# Evaluate on the held-out samples: forward pass only, no weight updates.
print('\n--- Testing the CNN ---')
num_tests = len(test_images)
loss = 0
num_correct = 0
for sample, target in zip(test_images, test_labels):
    _, sample_loss, hit = forward(sample, target)
    loss += sample_loss
    num_correct += hit

# Mean loss, and accuracy as a fraction of the test samples.
print('Test Loss:', loss / num_tests)
print('Test Accuracy:', num_correct / num_tests)

Downloading MNIST from OpenML...
MNIST CNN initialized!
MNIST CNN initialized!


AttributeError: 'MaxPool2' object has no attribute 'last_input'