In [1]:
import numpy as np
from torchvision.datasets import MNIST
def download_mnist(is_train: bool):
    dataset = MNIST(root='./data1',
        transform=lambda x: np.array(x).flatten(),
        download=True,
        train=is_train)
    
    mnist_data = []
    mnist_labels = []
    
    for image, label in dataset:
        mnist_data.append(image)
        mnist_labels.append(label)
        
    return mnist_data, mnist_labels
    
i_train_X, i_train_Y = download_mnist(True)
i_test_X, i_test_Y = download_mnist(False)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to ./data1\MNIST\raw\train-images-idx3-ubyte.gz


100.0%


Extracting ./data1\MNIST\raw\train-images-idx3-ubyte.gz to ./data1\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to ./data1\MNIST\raw\train-labels-idx1-ubyte.gz


100.0%


Extracting ./data1\MNIST\raw\train-labels-idx1-ubyte.gz to ./data1\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to ./data1\MNIST\raw\t10k-images-idx3-ubyte.gz


100.0%


Extracting ./data1\MNIST\raw\t10k-images-idx3-ubyte.gz to ./data1\MNIST\raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
<urlopen error [SSL: CERTIFICATE_VERIFY_FAILED] certificate verify failed: certificate has expired (_ssl.c:1000)>

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to ./data1\MNIST\raw\t10k-labels-idx1-ubyte.gz


100.0%


Extracting ./data1\MNIST\raw\t10k-labels-idx1-ubyte.gz to ./data1\MNIST\raw



In [2]:
def convert_to_one_hot_encoding(labels: np.ndarray) -> np.ndarray:
    return np.eye(10)[labels]

def normalized(input: np.ndarray) -> np.ndarray:
    return input / 255

In [3]:
def transform_initial_data(train_X, train_Y, test_X, test_Y):
    train_Y = convert_to_one_hot_encoding(train_Y)
    test_Y = convert_to_one_hot_encoding(test_Y)
    
    train_X = np.array(train_X)
    test_X = np.array(test_X)

    train_X = normalized(train_X)
    test_X = normalized(test_X)
    
    return train_X, train_Y, test_X, test_Y

In [25]:
import math
def xavier_uniform(fan_in, fan_out):
    return math.sqrt(6/(fan_in + fan_out))

def he_uniform(fan_in, fan_out):
    return math.sqrt(6 / fan_in)

In [93]:
def initialize_weights_and_biases(neurons_per_layer):
    limits = [
        he_uniform(neurons_per_layer[i-1], neurons_per_layer[i])
        for i in range(1, len(neurons_per_layer))
    ]

    weights = [
        np.random.uniform(low=-limits[i-1], high=limits[i-1], size=(neurons_per_layer[i-1], neurons_per_layer[i]))
        for i in range(1, len(neurons_per_layer))
    ]

    biases = [
        #np.random.uniform(low=-limits[i-1], high=limits[i-1], size=(1, neurons_per_layer[i]))
        np.zeros((1, neurons_per_layer[i]))
        for i in range(1, len(neurons_per_layer))
    ]

    return weights, biases
    

In [119]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def sigmoid_prime(y):
    return y * (1 - y)

def tanh(z):
    return 2*sigmoid(2*z) - 1

def tanh_prime(y):
    return 1 - y*y

RELU_LEAK = 0.01

def leaky_relu(z):
    return np.where(z>0, z, z*RELU_LEAK)

def leaky_relu_prime(y):
    return np.where(y>0, 1, RELU_LEAK)


In [27]:
def softmax(z):
        e_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return e_z / e_z.sum(axis=1, keepdims=True)

In [129]:
activation_functions = {
        'leaky_relu': (leaky_relu, leaky_relu_prime),
        'sigmoid': (sigmoid, sigmoid_prime),
        'tanh': (tanh, tanh_prime)
    }

activation_function, activation_function_prime = activation_functions['leaky_relu']

In [130]:
def forward_propagation_train(inputs, weights, biases):
    output_per_layer = [inputs]
    DROPOUT_RATE = 0.0
    
    for i in range(len(weights)):
        w = weights[i]
        b = biases[i]
        x = output_per_layer[-1]
        
        z = x @ w + b

        #DROPOUT
        if i < len(weights) - 1:
            z *= np.random.choice([0, 1/(1-DROPOUT_RATE)], z.shape, p=[DROPOUT_RATE, 1-DROPOUT_RATE])
            
        y = activation_function(z) if i < len(weights) - 1 else softmax(z)
        output_per_layer.append(y)
        
    return output_per_layer

def forward_propagation_test(inputs, weights, biases):
    output_per_layer = [inputs]
    
    for i in range(len(weights)):
        w = weights[i]
        b = biases[i]
        x = output_per_layer[-1]
        
        z = x @ w + b
        
        y = activation_function(z) if i < len(weights) - 1 else softmax(z)
        output_per_layer.append(y)
        
    return output_per_layer

In [10]:
def cross_entropy(prediction_output, train_output):
        return - np.sum(train_output * np.log(prediction_output))

In [117]:
def back_propagation(weights, biases, outputs_per_layer, labels, learning_rate):
        error = outputs_per_layer[-1] - labels
        #LEARNING_RATE = 0.0001
        FORGET_RATE = 0.00001
        for i in reversed(range(len(weights))):
            y = outputs_per_layer[i]
            y_1 = outputs_per_layer[i+1]

            back_error = activation_function_prime(y_1) * error @ weights[i].T

            weights[i] -= learning_rate * (y.T @ error  + FORGET_RATE * weights[i])
            biases[i] -= learning_rate * error.sum(axis = 0)

            error = back_error

In [16]:
def predict(y):
    value_count = y.shape[1]
    y = np.argmax(y, axis=1)
    y = np.eye(value_count)[y]
    return y

def test_neural_network(test_input, weights, biases, test_output):
    fp = forward_propagation_test(test_input, weights, biases)
    predictions = predict(fp[-1])

    accuracy = np.mean(np.sum(predictions * test_output, axis=1))

    return accuracy

In [108]:
def update_learning_rate(current_learning_rate, accuracies, epoch, last_update_epoch):
    min_learning_rate = 1e-6
    patience = 7
    factor = 0.5
    
    if epoch - last_update_epoch >= patience:
        if len(accuracies) > patience and all(accuracies[-patience - 1] >= acc for acc in accuracies[-patience:]):
            new_learning_rate = max(current_learning_rate * factor, min_learning_rate)
            print(f"Reducing learning rate from {current_learning_rate} to {new_learning_rate}")
            return new_learning_rate, epoch
        
    return current_learning_rate, last_update_epoch

In [131]:
def train_neural_network(train_input, train_output, test_input, test_output):
    
    EPOCH_COUNT = 500
    BATCH_SIZE = 50

    train_count = train_input.shape[0]
    neurons_per_layer = [train_input.shape[1], 100, train_output.shape[1]]
    weights, biases = initialize_weights_and_biases(neurons_per_layer)

    accuracies_train = []
    accuracies_test = []
    learning_rate = 0.001
    last_update_epoch = -1

    for i in range(EPOCH_COUNT):
        indices = np.arange(0, train_count)
        np.random.shuffle(indices)

        train_X = train_input[indices]
        train_Y = train_output[indices]

        for j in range(0, train_count, BATCH_SIZE):
            batch_X = train_X[j:j + BATCH_SIZE]
            batch_Y = train_Y[j:j + BATCH_SIZE]

            outputs_per_layer = forward_propagation_train(batch_X, weights, biases)

            back_propagation(weights, biases, outputs_per_layer, batch_Y, learning_rate)

        acc_train = test_neural_network(train_input, weights, biases, train_output)
        acc_test = test_neural_network(test_input, weights, biases, test_output)
        accuracies_train.append(acc_train)
        accuracies_test.append(acc_test)

        learning_rate, last_update_epoch = update_learning_rate(learning_rate, accuracies_train, i, last_update_epoch)

        print(f"{i+1}: test:{acc_test * 100: .2f}% train:{acc_train * 100: .2f}% {learning_rate}")

In [132]:
train_X, train_Y, test_X, test_Y = transform_initial_data(i_train_X, i_train_Y, i_test_X, i_test_Y)

train_neural_network(train_X, train_Y, test_X, test_Y)

1: test: 91.00% train: 90.68% 0.001
2: test: 91.63% train: 91.62% 0.001
3: test: 92.30% train: 92.35% 0.001
4: test: 92.51% train: 92.68% 0.001
5: test: 92.95% train: 93.08% 0.001
6: test: 92.92% train: 93.25% 0.001
7: test: 92.92% train: 93.19% 0.001
8: test: 93.12% train: 93.38% 0.001
9: test: 93.16% train: 93.45% 0.001
10: test: 93.35% train: 93.66% 0.001
11: test: 93.13% train: 93.69% 0.001
12: test: 93.23% train: 93.74% 0.001
13: test: 93.29% train: 93.74% 0.001
14: test: 93.24% train: 93.68% 0.001
15: test: 92.97% train: 93.49% 0.001
16: test: 93.20% train: 93.78% 0.001
17: test: 93.25% train: 93.69% 0.001
18: test: 93.38% train: 93.85% 0.001
19: test: 93.44% train: 93.92% 0.001
20: test: 93.31% train: 93.76% 0.001
21: test: 93.26% train: 93.85% 0.001
22: test: 93.52% train: 93.90% 0.001
23: test: 93.39% train: 93.94% 0.001
24: test: 93.40% train: 93.99% 0.001
25: test: 93.51% train: 93.94% 0.001
26: test: 93.42% train: 93.93% 0.001
27: test: 93.15% train: 93.78% 0.001
28: test: 

KeyboardInterrupt: 