In [1]:
import numpy as np
import time
from torchvision.datasets import MNIST

In [2]:
def one_hot_encode(labels, num_classes):
    return np.eye(num_classes)[labels]

def download_mnist(is_train: bool):
    dataset = MNIST(root='./data',
                    transform=lambda x: np.array(x).flatten(),
                    download=True,
                    train=is_train)
    
    mnist_data = []
    mnist_labels = []
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)
    
    mnist_data = np.array(mnist_data, dtype='float64')
    mnist_labels = np.array(mnist_labels)
    
    mnist_labels = one_hot_encode(mnist_labels, num_classes=10)
    
    return mnist_data, mnist_labels
    
train_X, train_Y = download_mnist(True)
test_X, test_Y = download_mnist(False)

print(f"train_X shape: {train_X.shape}")
print(f"train_Y (one-hot) shape: {train_Y.shape}")
print(f"test_X shape: {test_X.shape}")
print(f"test_Y (one-hot) shape: {test_Y.shape}")

train_X shape: (60000, 784)
train_Y (one-hot) shape: (60000, 10)
test_X shape: (10000, 784)
test_Y (one-hot) shape: (10000, 10)


## Normalize the data

In [3]:
train_X /= 255.0
test_X /= 255.0

In [4]:
# print(f"Min value: {min([min(t) for t in train_X])}")
# print(f"Max value: {max([max(t) for t in train_X])}")

## Hyperparameters

In [5]:
input_size = train_X.shape[1]
output_size = train_Y.shape[1]
batch_size = 64
learning_rate = 0.07
epochs = 75

## Batches

In [6]:
def generate_batches(X, Y, batch_size):
    num_samples = X.shape[0]
    for i in range(0, num_samples, batch_size):
        X_batch = X[i:i+batch_size]
        Y_batch = Y[i:i+batch_size]
        yield X_batch, Y_batch

## Activation Functions

In [7]:
def softmax(x):
    return np.exp(x)/np.sum(np.exp(x),axis=1, keepdims=True)

def relu(x):
    return np.maximum(0, x)

def softmax_derivative(output):
    return output * (1 - output)

def relu_derivative(x):
    return (x > 0).astype(float)

## Loss functions

In [8]:
def cross_entropy(batch_Y, Y_pred):
    return -np.sum(batch_Y * np.log(Y_pred + 1e-10)) / len(batch_Y)
    

## Main Loop

In [9]:
# W = np.random.randn(input_size, output_size) * 0.1
# b = np.zeros(output_size)

# for epoch in range(epochs):
#     for batch_X, batch_Y in generate_batches(train_X, train_Y, batch_size):
#         #print(f"Batch_X shape:{batch_X.shape}")
#         #print(f"Weights shape: {W.shape}")
#         #print(f"Bias shape: {b.shape}")
        
#         Z = np.dot(batch_X, W) + b 
#         #print(f"Z shape: {Z.shape}")
#         Y_pred = softmax(Z)
#         #print(f"Y_pred shape: {Y_pred.shape}")
#         Y_hat = np.argmax(Y_pred, axis=1)
#         #print(f"Prediction shape: {Y_hat.shape}")

#         loss = -np.sum(batch_Y * np.log(Y_pred + 1e-10)) / batch_size  # Cross-entropy loss
        
#         # Backprop
#         error = batch_Y - Y_pred

#         dW = np.dot(batch_X.T, error) / batch_size
#         db = np.sum(error, axis=0) / batch_size

#         W += learning_rate * dW
#         b += learning_rate * db
        
#     print(f"Epoch {epoch+1}/{epochs} Loss: {loss:.4f}")


In [10]:
activation_func_dict = {
    'softmax': softmax,
    'relu': relu
}

loss_func_dict = {
    "crossentropy": cross_entropy
}


activation_deriv_func_dict = {
    'relu': relu_derivative,
    'softmax': softmax_derivative
}

In [11]:
class LayerDense:
    def __init__(self, shape, activation='softmax'):
        n_inputs, n_neurons = shape[0], shape[1]
        self.weights = 0.15 * np.random.randn(n_neurons, n_inputs).T
        self.biases = np.zeros((1, n_neurons))
        self.activation = activation_func_dict[activation]
        self.activation_deriv = activation_deriv_func_dict[activation]

        # Unitialized
        self.inputs = None
        self.output = None
        self.d_weights = None
        self.d_biases = None

    def forward(self, inputs):
        self.inputs = inputs
        Z = np.dot(inputs, self.weights) + self.biases
        self.output = self.activation(Z)
        return self.output

    def backward(self, d_output):
        d_activation = d_output * self.activation_deriv(self.output)

        # Gradients
        batch_size = self.inputs.shape[0]
        self.d_weights = np.dot(self.inputs.T, d_activation) / batch_size
        self.d_biases = np.sum(d_activation, axis=0, keepdims=True) / batch_size
        d_input = np.dot(d_activation, self.weights.T)

        return d_input

class Dropout:
    def __init__(self, rate=0.1):
        self.rate = rate
        self.mask = None

    def forward(self, inputs, training=True):
        if training:
            # 1 with probability (1 - rate), 0 with probability rate
            self.mask = np.random.binomial(1, 1 - self.rate, size=inputs.shape) / (1 - self.rate)
            return inputs * self.mask
        else:
            # inference, pass the input unchanged
            return inputs

    def backward(self, d_output):
        # dropout mask during the backward pass
        return d_output * self.mask
        
class NeuralNetwork:
    def __init__(self, layers, debug=False):
        self.layers = layers
        self.debug = debug

        if debug:
            for i, layer in enumerate(layers):
                print(f"Layer_{i} shape: {layer.weights.shape}")
        
    def __forward(self, batch_X, training=True):
        input_values = batch_X
        for layer in self.layers:
            if isinstance(layer, Dropout):
                output_values = layer.forward(input_values, training=training)
            else:
                output_values = layer.forward(input_values)
            input_values = output_values
        
        if self.debug:
            print(f"Output:\n{output_values}")
    
        return output_values


    def __backward(self, batch_Y, Y_pred):
        d_output = Y_pred - batch_Y  
        # Cross-entropy loss derivative with softmax output
        
        for layer in reversed(self.layers):
            d_output = layer.backward(d_output)

    def train(self, train_X, train_Y, epochs, loss_type="crossentropy", learning_rate=0.01):
        print("Starting the crazy stuff")
        loss_func = loss_func_dict[loss_type]
        for epoch in range(epochs):
            for batch_X, batch_Y in generate_batches(train_X, train_Y, batch_size):
                Y_pred = self.__forward(batch_X, training=True)
                
                loss = loss_func(batch_Y, Y_pred)
                error = batch_Y - Y_pred

                self.__backward(batch_Y, Y_pred)

                for layer in self.layers:
                    if isinstance(layer, Dropout):
                        continue
                    
                    layer.weights -= learning_rate * layer.d_weights
                    layer.biases -= learning_rate * layer.d_biases
                
            print(f"Epoch {epoch + 1}, Loss: {loss:.4f}")

    def evaluate(self, test_X, test_Y):
        y_test_pred = model.__forward(test_X, training=False)
    
        y_test_hat = np.argmax(y_test_pred, axis=1)
    
        correct_predictions = np.sum(y_test_hat == np.argmax(test_Y, axis=1))
        accuracy = correct_predictions / test_Y.shape[0]
        
        print(f"Test Accuracy: {accuracy:.4f}")

            
    

In [12]:
# layer1 = LayerDense((4, 5))
# layer2 = LayerDense((5, 2))

# print(f"Weights: (one line is the weights for one neuron):\n {layer1.weights.T}")

# print(f"Biases for this layer: {layer1.biases}")

In [13]:
model = NeuralNetwork([
        LayerDense((input_size, 100), activation='relu'),
        Dropout(rate=0.33),
        LayerDense((100, output_size), activation='softmax'),
    ], debug=False)


In [14]:
start_time = time.time()
model.train(train_X, train_Y, epochs=epochs, learning_rate=learning_rate)
end_time = time.time()

print(f"Training completed in {(end_time-start_time):.2f} seconds")
model.evaluate(test_X, test_Y)

Starting the crazy stuff
Epoch 1, Loss: 0.7185
Epoch 2, Loss: 0.3452
Epoch 3, Loss: 0.3505
Epoch 4, Loss: 0.2063
Epoch 5, Loss: 0.2340
Epoch 6, Loss: 0.2236
Epoch 7, Loss: 0.2503
Epoch 8, Loss: 0.1055
Epoch 9, Loss: 0.1243
Epoch 10, Loss: 0.1457
Epoch 11, Loss: 0.1165
Epoch 12, Loss: 0.1298
Epoch 13, Loss: 0.1334
Epoch 14, Loss: 0.0999
Epoch 15, Loss: 0.1112
Epoch 16, Loss: 0.1117
Epoch 17, Loss: 0.1447
Epoch 18, Loss: 0.0569
Epoch 19, Loss: 0.1050
Epoch 20, Loss: 0.0808
Epoch 21, Loss: 0.0903
Epoch 22, Loss: 0.0648
Epoch 23, Loss: 0.0983
Epoch 24, Loss: 0.0629
Epoch 25, Loss: 0.0883
Epoch 26, Loss: 0.0679
Epoch 27, Loss: 0.0701
Epoch 28, Loss: 0.0700
Epoch 29, Loss: 0.0602
Epoch 30, Loss: 0.0787
Epoch 31, Loss: 0.0562
Epoch 32, Loss: 0.1091
Epoch 33, Loss: 0.0376
Epoch 34, Loss: 0.0543
Epoch 35, Loss: 0.0783
Epoch 36, Loss: 0.0383
Epoch 37, Loss: 0.0570
Epoch 38, Loss: 0.0440
Epoch 39, Loss: 0.0636
Epoch 40, Loss: 0.0738
Epoch 41, Loss: 0.0541
Epoch 42, Loss: 0.0579
Epoch 43, Loss: 0.