In [None]:
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.20.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.5 kB)
Collecting astunparse>=1.6.0 (from tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow)
  Downloading flatbuffers-25.9.23-py2.py3-none-any.whl.metadata (875 bytes)
Collecting google_pasta>=0.1.1 (from tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting libclang>=13.0.0 (from tensorflow)
  Downloading libclang-18.1.1-py2.py3-none-manylinux2010_x86_64.whl.metadata (5.2 kB)
Collecting tensorboard~=2.20.0 (from tensorflow)
  Downloading tensorboard-2.20.0-py3-none-any.whl.metadata (1.8 kB)
Collecting wheel<1.0,>=0.23.0 (from astunparse>=1.6.0->tensorflow)
  Downloading wheel-0.45.1-py3-none-any.whl.metadata (2.3 kB)
Collecting tensorboard-data-server<0.8.0,>=0.7.0 (from tensorboard~=2.20.0->tensorflow)
  Downloading tensorboard_data_server-0.

In [None]:
import numpy as np
from tensorflow.keras.datasets import mnist
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

def load_data_library(val_split=0.2):
    """Load MNIST through Keras and prepare train/validation/test arrays.

    Images are cast to float32, divided by 255 and reshaped to
    ``(-1, 28, 28, 1)``. Training labels are one-hot encoded with 10 classes.
    The test labels returned by ``mnist.load_data()`` are discarded.

    Args:
        val_split: Value forwarded to ``train_test_split`` as ``test_size``;
            the split uses ``random_state=42``.

    Returns:
        ``((X_train, y_train), (X_val, y_val), X_test)``.
    """
    print("1. Loading MNIST data from Keras library...")

    train_pair, test_pair = mnist.load_data()
    X_train_full, y_train_full = train_pair
    X_test_raw, _ = test_pair

    print(f"Original X_train shape: {X_train_full.shape}")
    print(f"Original X_test shape: {X_test_raw.shape}")

    def _prepare(images):
        # Scale to [0, 1] as float32 and append a single channel axis.
        return (images.astype("float32") / 255.0).reshape(-1, 28, 28, 1)

    X_train_full = _prepare(X_train_full)
    X_test = _prepare(X_test_raw)

    splits = train_test_split(
        X_train_full,
        to_categorical(y_train_full, 10),
        test_size=val_split,
        random_state=42
    )
    X_train, X_val, y_train, y_val = splits

    print("\n--- Final Shapes ---")
    report = (
        ("X_train: ", X_train),
        ("y_train: ", y_train),
        ("X_val:   ", X_val),
        ("y_val:   ", y_val),
        ("X_test:  ", X_test),
    )
    for label, array in report:
        print(f"{label}{array.shape}")

    return (X_train, y_train), (X_val, y_val), X_test

# Runs when this cell is executed as the main module (the captured output
# below shows that it did). The unpacked names become module-level variables
# (X_train, y_train, X_val, y_val, X_test).
if __name__ == '__main__':
    (X_train, y_train), (X_val, y_val), X_test = load_data_library()
    print("\nLibrary load successful!")



1. Loading MNIST data from Keras library...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/mnist.npz
[1m11490434/11490434[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Original X_train shape: (60000, 28, 28)
Original X_test shape: (10000, 28, 28)

--- Final Shapes ---
X_train: (48000, 28, 28, 1)
y_train: (48000, 10)
X_val:   (12000, 28, 28, 1)
y_val:   (12000, 10)
X_test:  (10000, 28, 28, 1)

Library load successful!


In [None]:
import numpy as np

class Layer:
    """Base class for network layers.

    Subclasses override ``forward`` and ``backward``. ``input`` is a slot
    subclasses may use to cache the forward input for the backward pass.
    """

    def __init__(self):
        self.input = None

    def forward(self, input):
        """Compute the layer output for ``input``; must be overridden."""
        raise NotImplementedError()

    def backward(self, output_gradient):
        """Map the gradient w.r.t. the output to the gradient w.r.t. the input."""
        raise NotImplementedError()


class Conv2D(Layer):
    """2-D convolution (cross-correlation) over channels-last batches.

    Inputs are indexed as ``(batch, height, width, channels)``. Weights have
    shape ``(num_filters, filter_size, filter_size, input_channels)`` and are
    drawn from a normal distribution scaled by ``sqrt(2 / fan_in)``.

    The output spatial size follows the standard formula
    ``(padded_size - filter_size) // stride + 1``, so a 28x28 input with a 3x3
    filter, stride 1 and no padding yields 26x26. Any layer sized from this
    output (e.g. a Dense layer after Flatten) must use these dimensions.

    Args:
        input_channels: Number of channels in the input.
        num_filters: Number of output channels.
        filter_size: Side length of the square kernel.
        stride: Step between consecutive windows (>= 1).
        padding: Number of zero rows/columns added on every spatial border (>= 0).

    Raises:
        ValueError: If ``stride < 1`` or ``padding < 0``.
    """

    def __init__(self, input_channels, num_filters, filter_size, stride=1, padding=0):
        super().__init__()
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")
        if padding < 0:
            raise ValueError(f"padding must be >= 0, got {padding}")
        self.num_filters = num_filters
        self.filter_size = filter_size
        self.stride = stride
        self.padding = padding

        scale = np.sqrt(2. / (filter_size * filter_size * input_channels))
        self.weights = np.random.randn(
            num_filters, filter_size, filter_size, input_channels
        ) * scale
        self.biases = np.zeros(num_filters)

    def _output_size(self, padded_size):
        """Number of window positions that fit along one padded axis."""
        return (padded_size - self.filter_size) // self.stride + 1

    def _windows(self, h_out, w_out):
        """Yield ``(y, x, rows, cols)`` for every output position."""
        for y in range(h_out):
            y_start = y * self.stride
            rows = slice(y_start, y_start + self.filter_size)
            for x in range(w_out):
                x_start = x * self.stride
                cols = slice(x_start, x_start + self.filter_size)
                yield y, x, rows, cols

    def forward(self, input):
        """Convolve ``input`` with every filter and add the biases.

        Args:
            input: Array of shape ``(batch, height, width, channels)``.

        Returns:
            Array of shape ``(batch, h_out, w_out, num_filters)``.

        Raises:
            ValueError: If the filter does not fit inside the padded input.
        """
        self.input = input
        pad = self.padding
        if pad > 0:
            self.input_padded = np.pad(
                input,
                ((0, 0), (pad, pad), (pad, pad), (0, 0)),
                'constant'
            )
        else:
            self.input_padded = input

        batch_size, h_pad, w_pad, _ = self.input_padded.shape
        h_out = self._output_size(h_pad)
        w_out = self._output_size(w_pad)
        if h_out <= 0 or w_out <= 0:
            raise ValueError(
                f"Filter of size {self.filter_size} does not fit padded input "
                f"of size {h_pad}x{w_pad}"
            )

        output = np.zeros((batch_size, h_out, w_out, self.num_filters))
        for y, x, rows, cols in self._windows(h_out, w_out):
            # patch: (batch, f, f, c_in); contracting with the weights over
            # (f, f, c_in) gives one value per (sample, filter).
            patch = self.input_padded[:, rows, cols, :]
            output[:, y, x, :] = np.tensordot(
                patch, self.weights, axes=([1, 2, 3], [1, 2, 3])
            ) + self.biases

        return output

    def backward(self, output_gradient):
        """Accumulate parameter gradients and return the input gradient.

        Stores ``self.d_weights`` and ``self.d_biases`` (summed over the
        batch) for the optimizer.

        Args:
            output_gradient: Array shaped like the output of ``forward``.

        Returns:
            Gradient with the same shape as the (unpadded) forward input.
        """
        _, h_out, w_out, _ = output_gradient.shape

        d_input_padded = np.zeros_like(self.input_padded)
        self.d_weights = np.zeros_like(self.weights)
        # Each bias touches every sample and every spatial position of its filter.
        self.d_biases = np.sum(output_gradient, axis=(0, 1, 2)).astype(
            self.biases.dtype, copy=False
        )

        for y, x, rows, cols in self._windows(h_out, w_out):
            grad = output_gradient[:, y, x, :]           # (batch, filters)
            patch = self.input_padded[:, rows, cols, :]  # (batch, f, f, c_in)
            # Sum over the batch: (filters, f, f, c_in).
            self.d_weights += np.tensordot(grad, patch, axes=([0], [0]))
            # Sum over the filters: (batch, f, f, c_in).
            d_input_padded[:, rows, cols, :] += np.tensordot(
                grad, self.weights, axes=([1], [0])
            )

        if self.padding > 0:
            pad = self.padding
            # Drop the gradient that fell on the zero border.
            return d_input_padded[:, pad:-pad, pad:-pad, :]
        return d_input_padded

class MaxPooling(Layer):
    """Max pooling over square windows of channels-last batches.

    Inputs are indexed as ``(batch, height, width, channels)``. Each channel
    is pooled independently.

    Args:
        pool_size: Side length of the square pooling window.
        stride: Step between consecutive windows.
    """

    def __init__(self, pool_size, stride):
        super().__init__()
        self.pool_size = pool_size
        self.stride = stride

    def _windows(self, h_out, w_out):
        """Yield ``(y, x, rows, cols)`` for every output position."""
        for y in range(h_out):
            y_start = y * self.stride
            rows = slice(y_start, y_start + self.pool_size)
            for x in range(w_out):
                x_start = x * self.stride
                cols = slice(x_start, x_start + self.pool_size)
                yield y, x, rows, cols

    def forward(self, input):
        """Take the maximum of every pooling window.

        Args:
            input: Array of shape ``(batch, height, width, channels)``.

        Returns:
            Array of shape ``(batch, h_out, w_out, channels)`` where
            ``h_out = (height - pool_size) // stride + 1`` (same for width).

        Raises:
            ValueError: If the window does not fit inside the input.
        """
        self.input = input
        batch_size, h_in, w_in, c_in = input.shape

        h_out = (h_in - self.pool_size) // self.stride + 1
        w_out = (w_in - self.pool_size) // self.stride + 1
        if h_out <= 0 or w_out <= 0:
            raise ValueError(f"Output dimension is zero/negative! h_in={h_in}, pool={self.pool_size}")

        output = np.zeros((batch_size, h_out, w_out, c_in))
        # The h_out/w_out formula guarantees every window lies fully inside
        # the input, so no slice can be empty.
        for y, x, rows, cols in self._windows(h_out, w_out):
            output[:, y, x, :] = input[:, rows, cols, :].max(axis=(1, 2))
        return output

    def backward(self, output_gradient):
        """Route each output gradient to the element that produced the max.

        When several elements of a window tie for the maximum, the gradient
        goes to the first one only (row-major order) so the total gradient
        is not multiplied by the number of ties.

        Args:
            output_gradient: Array shaped like the output of ``forward``.

        Returns:
            Gradient with the same shape as the forward input.
        """
        batch_size, h_out, w_out, c_in = output_gradient.shape
        d_input = np.zeros_like(self.input)

        k = self.pool_size
        flat_positions = np.arange(k * k).reshape(1, -1, 1)

        for y, x, rows, cols in self._windows(h_out, w_out):
            # Flatten the window so a single argmax identifies the winner.
            window = self.input[:, rows, cols, :].reshape(batch_size, k * k, c_in)
            winner = window.argmax(axis=1)  # (batch, channels)
            mask = (flat_positions == winner[:, None, :]).reshape(batch_size, k, k, c_in)
            # += because windows overlap when stride < pool_size.
            d_input[:, rows, cols, :] += mask * output_gradient[:, y, x, :][:, None, None, :]
        return d_input

class ReLU(Layer):
    """Element-wise rectified linear unit: ``max(0, x)``."""

    def forward(self, input):
        """Cache ``input`` and return it with negatives clamped to zero."""
        self.input = input
        rectified = np.maximum(0, input)
        return rectified

    def backward(self, output_gradient):
        """Pass the gradient through where the cached input was positive."""
        active = self.input > 0
        return output_gradient * active.astype(float)

class Flatten(Layer):
    """Collapse every axis after the first into a single feature axis."""

    def __init__(self):
        super().__init__()
        # Shape seen by the last forward call; needed to undo the reshape.
        self.input_shape = None

    def forward(self, input):
        """Return ``input`` reshaped to ``(input.shape[0], -1)``."""
        self.input_shape = input.shape
        return input.reshape(self.input_shape[0], -1)

    def backward(self, output_gradient):
        """Reshape the gradient back to the shape recorded in ``forward``."""
        original_shape = self.input_shape
        return output_gradient.reshape(original_shape)

class Dense(Layer):
    """Fully connected layer computing ``input . weights + biases``.

    Args:
        input_size: Number of input features (rows of ``weights``).
        output_size: Number of output features (columns of ``weights``).
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        # Normal initialisation scaled by sqrt(2 / input_size).
        init_scale = np.sqrt(2. / input_size)
        self.weights = np.random.randn(input_size, output_size) * init_scale
        self.biases = np.zeros((1, output_size))

    def forward(self, input):
        """Cache ``input`` and return the affine transform."""
        self.input = input
        projected = np.dot(input, self.weights)
        return projected + self.biases

    def backward(self, output_gradient):
        """Store ``d_weights``/``d_biases`` and return the input gradient."""
        cached_input = self.input
        self.d_weights = np.dot(cached_input.T, output_gradient)
        self.d_biases = np.sum(output_gradient, axis=0, keepdims=True)
        return np.dot(output_gradient, self.weights.T)

class Softmax(Layer):
    """Row-wise softmax over axis 1.

    ``output`` caches the probabilities of the last forward call so that
    ``backward`` can apply the softmax Jacobian.
    """

    def __init__(self):
        super().__init__()
        self.output = None

    def forward(self, input):
        """Return softmax probabilities for each row of ``input``."""
        # Subtracting the row maximum leaves the result unchanged but keeps
        # np.exp from overflowing.
        shifted = input - np.max(input, axis=1, keepdims=True)
        exp_scores = np.exp(shifted)
        self.output = exp_scores / np.sum(exp_scores, axis=1, keepdims=True)
        return self.output

    def backward(self, output_gradient):
        """Return the gradient w.r.t. the softmax input.

        For probabilities ``s`` and upstream gradient ``g`` the
        Jacobian-vector product is ``s * (g - sum(g * s))`` per row.

        Raises:
            RuntimeError: If called before ``forward``.
        """
        if self.output is None:
            raise RuntimeError("Softmax.backward called before forward")
        weighted_sum = np.sum(output_gradient * self.output, axis=1, keepdims=True)
        return self.output * (output_gradient - weighted_sum)

In [None]:
class SoftmaxCrossEntropy:
    """Softmax followed by mean cross-entropy against one-hot targets."""

    def __init__(self):
        self.y_pred = None
        self.y_true = None

    def loss(self, logits, y_true):
        """Return the mean negative log-probability of the true classes.

        Caches the targets and the clipped softmax probabilities for
        ``backward``.
        """
        self.y_true = y_true

        # Shift by the row maximum before exponentiating for stability.
        shifted = logits - np.max(logits, axis=1, keepdims=True)
        unnormalised = np.exp(shifted)
        probabilities = unnormalised / np.sum(unnormalised, axis=1, keepdims=True)

        # Keep probabilities away from 0 and 1 so the log stays finite.
        epsilon = 1e-12
        self.y_pred = np.clip(probabilities, epsilon, 1. - epsilon)

        row_index = range(len(y_true))
        target_class = y_true.argmax(axis=1)
        picked = self.y_pred[row_index, target_class]
        return np.mean(-np.log(picked))

    def backward(self):
        """Return the gradient of the mean loss w.r.t. the logits."""
        batch_size = len(self.y_true)
        return (self.y_pred - self.y_true) / batch_size

In [None]:
import numpy as np
import time

class SGD:
    """Plain stochastic gradient descent.

    Args:
        learning_rate: Step size applied to every gradient.
    """

    def __init__(self, learning_rate=0.01):
        self.learning_rate = learning_rate

    def update(self, layer):
        """Apply one in-place gradient step to ``layer``.

        Layers without a ``weights`` attribute are left untouched. Layers
        with one are expected to expose ``d_weights`` and ``d_biases``.
        """
        if not hasattr(layer, 'weights'):
            return
        layer.weights -= self.learning_rate * layer.d_weights
        layer.biases -= self.learning_rate * layer.d_biases


def train(network, loss_layer, X_train, y_train, epochs=5, batch_size=32, learning_rate=0.01):
    """Train ``network`` with mini-batch SGD and print per-epoch progress.

    Args:
        network: Sequence of layers exposing ``forward`` and ``backward``.
        loss_layer: Object exposing ``loss(output, y_batch)`` and ``backward()``.
        X_train: Training inputs; must support NumPy integer-array indexing.
        y_train: Training targets aligned with ``X_train``.
        epochs: Number of passes over the data.
        batch_size: Samples per gradient step (>= 1).
        learning_rate: Step size passed to ``SGD``.

    Returns:
        List with the mean batch loss of every epoch.

    Raises:
        ValueError: If ``X_train`` is empty, if ``X_train`` and ``y_train``
            differ in length, or if ``batch_size`` is smaller than 1.
    """
    num_samples = len(X_train)
    if num_samples == 0:
        raise ValueError("X_train is empty; nothing to train on.")
    if len(y_train) != num_samples:
        raise ValueError("X_train and y_train must have the same length.")
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    optimizer = SGD(learning_rate)
    history = []
    print(f"Training on {num_samples} samples...")
    for epoch in range(epochs):
        start_time = time.time()

        # Shuffle an index vector instead of copying the whole dataset.
        order = np.arange(num_samples)
        np.random.shuffle(order)

        epoch_loss = 0.0
        num_batches = 0
        for start in range(0, num_samples, batch_size):
            batch_idx = order[start:start + batch_size]
            X_batch = X_train[batch_idx]
            y_batch = y_train[batch_idx]

            output = X_batch
            for layer in network:
                output = layer.forward(output)

            epoch_loss += loss_layer.loss(output, y_batch)
            num_batches += 1

            grad = loss_layer.backward()
            for layer in reversed(network):
                grad = layer.backward(grad)
            for layer in network:
                optimizer.update(layer)

        # Divide by the batches actually processed; this includes a partial
        # final batch and avoids dividing by zero when num_samples < batch_size.
        avg_loss = epoch_loss / num_batches
        history.append(avg_loss)
        duration = time.time() - start_time
        print(f"Epoch {epoch + 1}/{epochs} - Loss: {avg_loss:.4f} - Time: {duration:.2f}s")
    return history

def predict(network, X):
    """Run ``X`` through every layer and return the arg-max class per row.

    The final layer output is treated as logits: a softmax over axis 1 is
    applied before taking the arg-max.
    """
    activations = X
    for layer in network:
        activations = layer.forward(activations)

    shifted = activations - np.max(activations, axis=1, keepdims=True)
    unnormalised = np.exp(shifted)
    class_probs = unnormalised / np.sum(unnormalised, axis=1, keepdims=True)
    return np.argmax(class_probs, axis=1)

def accuracy(network, X, y_true_one_hot):
    """Return the fraction of rows whose predicted class matches the target.

    ``y_true_one_hot`` is reduced to class indices with an arg-max over axis 1.
    """
    predicted = predict(network, X)
    expected = np.argmax(y_true_one_hot, axis=1)
    matches = predicted == expected
    return np.mean(matches)


# Seed NumPy's global RNG so weight initialisation and batch shuffling are
# repeatable on a fresh kernel.
SEED = 42
np.random.seed(SEED)

feature_layers = [
    Conv2D(input_channels=1, num_filters=8, filter_size=3, stride=1, padding=0),
    ReLU(),
    MaxPooling(pool_size=2, stride=2),
    Flatten(),
]

# Measure the flattened feature width with a one-sample probe instead of
# hard-coding it, so the Dense layer always matches what the layers above
# actually produce.
probe = np.zeros((1,) + tuple(X_train.shape[1:]), dtype=X_train.dtype)
for layer in feature_layers:
    probe = layer.forward(probe)
flattened_size = probe.shape[1]

network = feature_layers + [
    Dense(input_size=flattened_size, output_size=10)
]
loss_layer = SoftmaxCrossEntropy()

print("Starting training... (This might be slow because it's pure NumPy!)")

train(
    network,
    loss_layer,
    X_train,
    y_train,
    epochs=3,
    batch_size=32,
    learning_rate=0.05
)

print("\nCalculating VAlidation Accuracy...")
acc = accuracy(network, X_val, y_val)
print(f"Validation Accuracy: {acc * 100:.2f}%")

Starting training... (This might be slow because it's pure NumPy!)
Training on 48000 samples...
DEBUG: MaxPooling Input Shape: (32, 25, 25, 8)
Epoch 1/0 - Loss: 0.3441 - Time: 3346.83s
Epoch 2/1 - Loss: 0.1743 - Time: 3320.07s
