# CIFAR-10 classification with a neural network
# Author: Daniil Roman

## 1. Load CIFAR-10 dataset and prepare data


In [1]:
from cifar10_downloader import cifar10
import numpy as np
from tqdm import tqdm

def prepare_data(channel_numbers=3, img_dim=32, train_size=None, test_size=None):
    # already normalised
    train_x, train_y, test_x, test_y = cifar10()
    train_y = np.array([np.argmax(i) for i in train_y]).reshape(len(train_x), 1)
    train_data = np.hstack((train_x, train_y))

    test_y = np.array([np.argmax(i) for i in test_y]).reshape(len(test_y), 1)
    test_x = test_x.reshape(len(test_x), channel_numbers, img_dim, img_dim)

    if (train_size != None):
        return train_data[:train_size], test_x[:test_size], test_y[:test_size].flatten()
    return train_data, test_x, test_y.flatten()

train_size = 10
test_size = 5
img_dim = 32
channel_numbers = 3
train_data, test_x, test_y = prepare_data(channel_numbers, img_dim, train_size=train_size, test_size=test_size)


## 2. NN implementation, layers, activations, loss functions and optimizer

## UTILS

In [2]:
def split_to_x_y(data, n_c=3, dim=32):
    X = data[:, 0:-1]
    X = X.reshape(len(data), n_c, dim, dim)
    Y = data[:, -1:]
    return X, Y.flatten()

def xavier_init(size):
    bound = np.sqrt(2. / np.sum(size))
    return np.random.uniform(-bound, bound, size=size)


def softmax_crossentropy(forward_output, y):
    y = y.flatten().astype(int)
    forward_output_for_answers = forward_output[np.arange(len(forward_output)), y]
    return - forward_output_for_answers + np.log(np.sum(np.exp(forward_output), axis=-1))


def grad_softmax_crossentropy(forward_output, y):
    ones_for_answers = np.zeros_like(forward_output)
    ones_for_answers[np.arange(len(forward_output)), y.flatten().astype(int)] = 1

    softmax = np.exp(forward_output) / np.exp(forward_output).sum(axis=-1, keepdims=True)

    return (- ones_for_answers + softmax) / forward_output.shape[0]


def im2col_indices(x, field_height, field_width, padding=1, stride=1):
    """ An implementation of im2col based on some fancy indexing """
    # Zero-pad the input
    p = padding
    x_padded = np.pad(x, ((0, 0), (0, 0), (p, p), (p, p)), mode='constant')

    k, i, j = get_im2col_indices(x.shape, field_height, field_width, padding, stride)

    cols = x_padded[:, k, i, j]
    C = x.shape[1]
    cols = cols.transpose(1, 2, 0).reshape(field_height * field_width * C, -1)
    return cols


def get_im2col_indices(x_shape, field_height, field_width, padding=1, stride=1):
    # First figure out what the size of the output should be
    N, C, H, W = x_shape
    assert (H + 2 * padding - field_height) % stride == 0
    assert (W + 2 * padding - field_height) % stride == 0
    out_height = int((H + 2 * padding - field_height) / stride + 1)
    out_width = int((W + 2 * padding - field_width) / stride + 1)

    i0 = np.repeat(np.arange(field_height), field_width)
    i0 = np.tile(i0, C)
    i1 = stride * np.repeat(np.arange(out_height), out_width)
    j0 = np.tile(np.arange(field_width), field_height * C)
    j1 = stride * np.tile(np.arange(out_width), out_height)
    i = i0.reshape(-1, 1) + i1.reshape(1, -1)
    j = j0.reshape(-1, 1) + j1.reshape(1, -1)

    k = np.repeat(np.arange(C), field_height * field_width).reshape(-1, 1)

    return (k.astype(int), i.astype(int), j.astype(int))


def col2im_indices(cols, x_shape, field_height=3, field_width=3, padding=1,
                   stride=1):
    """ An implementation of col2im based on fancy indexing and np.add.at """
    N, C, H, W = x_shape
    H_padded, W_padded = H + 2 * padding, W + 2 * padding
    x_padded = np.zeros((N, C, H_padded, W_padded), dtype=cols.dtype)
    k, i, j = get_im2col_indices(x_shape, field_height, field_width, padding, stride)
    cols_reshaped = cols.reshape(C * field_height * field_width, -1, N)
    cols_reshaped = cols_reshaped.transpose(2, 0, 1)
    np.add.at(x_padded, (slice(None), k, i, j), cols_reshaped)
    if padding == 0:
        return x_padded
    return x_padded[:, :, padding:-padding, padding:-padding]


## MODEL

In [3]:
class Step:
    def __init__(self):
        self.forward_input = None
        self.backward_param = None

        self.learning_rate = None

    def forward(self, image):
        output = self._forward(image)
        self.forward_input = image
        return output

    def _forward(self, image):
        raise Exception("Not implemented.")

    def backward(self, input, learning_rate):
        self.learning_rate = learning_rate
        backward_param = self._backward(input)
        self.backward_param = backward_param
        return backward_param

    def _backward(self, input):
        raise Exception("Not implemented.")

    def get_derivatives(self):
        return []

    def init_derivative(self):
        pass


class Model:
    def __init__(self, lr):
        self.steps = []
        self.learning_rate = lr

    def forward(self, input):
        current_step_output = input
        for step in self.steps:
            current_step_output = step.forward(current_step_output)
        return current_step_output

    def backward(self, derivative_of_loss):
        current_step_output = derivative_of_loss
        self.steps.reverse()
        for step in self.steps:
            current_step_output = step.backward(current_step_output, self.learning_rate)
        self.steps.reverse()

    def get_derivatives(self):
        grads = []

        for step in self.steps:
            grads += step.get_derivatives()

        return grads

    def init_derivative(self):
        for step in self.steps:
            step.init_derivative()

    def predict(self, x):
        result = self.forward(x)
        return result.argmax(axis=-1)


class Convolution(Step):

    def __init__(self, size, stride=1, padding=0):
        super(Convolution, self).__init__()
        filter, bios = self._init(size)

        self.filter, self.bios, self.stride, self.padding = filter, bios, stride, padding
        self.init_derivative()

    def init_derivative(self):
        self.derivative_filter, self.derivative_bios = np.zeros(self.filter.shape), np.zeros(self.bios.shape)

    def _init(self, size):
        filter = xavier_init(size)
        bios = np.zeros((filter.shape[0], 1))
        return filter, bios

    def _forward(self, X):
        n_filters, d_filter, h_filter, w_filter = self.filter.shape
        n_x, d_x, h_x, w_x = X.shape
        h_out = (h_x - h_filter + 2 * self.padding) / self.stride + 1
        w_out = (w_x - w_filter + 2 * self.padding) / self.stride + 1

        if not h_out.is_integer() or not w_out.is_integer():
            raise Exception('Invalid output dimension!')

        h_out, w_out = int(h_out), int(w_out)

        X_col = im2col_indices(X, h_filter, w_filter, padding=self.padding, stride=self.stride)
        W_col = self.filter.reshape(n_filters, -1)

        out = W_col @ X_col + self.bios
        out = out.reshape(n_filters, h_out, w_out, n_x)
        out = out.transpose(3, 0, 1, 2)

        self.cache = (X, self.filter, self.bios, self.stride, self.padding, X_col)

        return out

    def _backward(self, input):
        derivative_conv, derivative_filter, derivative_bios = self.conv_backward(input)
        self.derivative_filter += derivative_filter
        self.derivative_bios += derivative_bios
        self.learning_step(self.learning_rate)
        return derivative_conv

    def conv_backward(self, input):
        X, W, b, stride, padding, X_col = self.cache
        n_filter, d_filter, h_filter, w_filter = W.shape

        db = np.sum(input, axis=(0, 2, 3))
        db = db.reshape(n_filter, -1)

        dout_reshaped = input.transpose(1, 2, 3, 0).reshape(n_filter, -1)
        dW = dout_reshaped @ X_col.T
        dW = dW.reshape(W.shape)

        W_reshape = W.reshape(n_filter, -1)
        dX_col = W_reshape.T @ dout_reshaped
        dX = col2im_indices(dX_col, X.shape, h_filter, w_filter, padding=padding, stride=stride)

        return dX, dW, db

    def get_derivatives(self):
        return [self.derivative_filter, self.derivative_bios]

    def learning_step(self, learning_rate):
        self.filter -= learning_rate * self.derivative_filter
        self.bios -= learning_rate * self.derivative_bios


class FullyConnected(Step):

    def __init__(self, input_units, output_units):
        super(FullyConnected, self).__init__()
        self.weights = xavier_init((input_units, output_units))
        self.biases = np.zeros(output_units)

    def _forward(self, input):
        return input @ self.weights + self.biases

    def _backward(self, input):
        grad_input = input @ self.weights.T

        grad_weights = self.forward_input.T @ input
        grad_biases = np.sum(input, axis=0)

        assert grad_weights.shape == self.weights.shape and grad_biases.shape == self.biases.shape
        self.weights = self.weights - self.learning_rate * grad_weights
        self.biases = self.biases - self.learning_rate * grad_biases

        return grad_input


class Relu(Step):
    def _forward(self, input):
        return np.maximum(0, input)

    def _backward(self, input):
        relu_derivative = self.forward_input > 0
        return input * relu_derivative


class MaxPool(Step):
    def __init__(self, size, stride, padding=0):
        super(MaxPool, self).__init__()
        self.size, self.stride, self.padding = size, stride, padding

    def _forward(self, X):
        n, d, h, w = X.shape
        h_out = (h - self.size) / self.stride + 1
        w_out = (w - self.size) / self.stride + 1

        if not w_out.is_integer() or not h_out.is_integer():
            raise Exception('Invalid output dimension!')

        h_out, w_out = int(h_out), int(w_out)

        X_reshaped = X.reshape(n * d, 1, h, w)
        X_col = im2col_indices(X_reshaped, self.size, self.size, padding=self.padding, stride=self.stride)

        max_index = np.argmax(X_col, axis=0)
        out = X_col[max_index, range(max_index.size)]

        out = out.reshape(h_out, w_out, n, d)
        out = out.transpose(2, 3, 0, 1)

        self.cache = (X, self.size, self.stride, X_col, max_index)

        return out

    def _backward(self, input):
        X, size, stride, X_col, pool_cache = self.cache
        n, d, w, h = X.shape

        dX_col = np.zeros_like(X_col)

        dX = col2im_indices(dX_col, (n * d, 1, h, w), size, size, padding=self.padding, stride=stride)
        dX = dX.reshape(X.shape)

        return dX


class Flatten(Step):
    def _forward(self, input):
        (n_channels, height, width) = input[0].shape
        flattened = input.reshape((len(input), n_channels * height * width))
        return flattened

    def _backward(self, input):
        return input.reshape(self.forward_input.shape)

## RUN

In [4]:
def SGD(model, X, Y):
    out = model.forward(X)

    loss = softmax_crossentropy(out, Y)
    loss_grad = grad_softmax_crossentropy(out, Y)

    model.backward(loss_grad)

    return np.mean(loss)

def train(train_data,model, img_dim=32, channel_numbers=3, batch_size=4, num_epochs=20, lr=0.01):

    print("LR:" + str(lr) + ", Batch Size:" + str(batch_size))
    for epoch in range(num_epochs):
        np.random.shuffle(train_data)
        batches = [train_data[k:k + batch_size] for k in range(0, train_data.shape[0], batch_size)]

        t = tqdm(batches)
        for x, batch in enumerate(t):
            X, Y = split_to_x_y(batch, channel_numbers, img_dim)
            loss = SGD(model, X, Y)
            t.set_description("Loss: %.2f" % (loss))

def predict(model, test_x, test_y):
    result = model.predict(test_x)
    assert result.shape == test_y.shape
    print(f"Score: {np.mean(result == test_y)}")

## 3. Check correctness

In [5]:
train_data, test_x, test_y = prepare_data()

In [101]:
lr=0.01
model = Model(lr)

model.steps = [
    Flatten(),
    FullyConnected(3072, 128),
    Relu(),
    FullyConnected(128, 10)
]

#### Train with 1 img

In [99]:
train_data_1 = train_data[:1]

train(train_data_1, model, batch_size=4, num_epochs=10, lr=lr)

x, y = split_to_x_y(train_data_1)
predict(model, x, y)

Loss: 2.30: 100%|██████████| 1/1 [00:00<00:00, 60.69it/s]
Loss: 0.98: 100%|██████████| 1/1 [00:00<00:00, 59.35it/s]
Loss: 0.50: 100%|██████████| 1/1 [00:00<00:00, 68.30it/s]
Loss: 0.28: 100%|██████████| 1/1 [00:00<00:00, 87.48it/s]
Loss: 0.18: 100%|██████████| 1/1 [00:00<00:00, 78.18it/s]
Loss: 0.13: 100%|██████████| 1/1 [00:00<00:00, 145.52it/s]
Loss: 0.10: 100%|██████████| 1/1 [00:00<00:00, 117.89it/s]
Loss: 0.08: 100%|██████████| 1/1 [00:00<00:00, 215.36it/s]
Loss: 0.06: 100%|██████████| 1/1 [00:00<00:00, 210.23it/s]
Loss: 0.05: 100%|██████████| 1/1 [00:00<00:00, 160.27it/s]


LR:0.01, Batch Size:4
Score: 1.0


#### Train with 10 img

In [102]:
train_data_10 = train_data[:10]

train(train_data_10, model, batch_size=4, num_epochs=10, lr=lr)

x, y = split_to_x_y(train_data_10)
predict(model, x, y)

Loss: 2.33: 100%|██████████| 3/3 [00:00<00:00, 72.14it/s]
Loss: 2.75: 100%|██████████| 3/3 [00:00<00:00, 108.90it/s]
Loss: 1.85: 100%|██████████| 3/3 [00:00<00:00, 191.78it/s]
Loss: 1.80: 100%|██████████| 3/3 [00:00<00:00, 145.14it/s]
Loss: 2.20: 100%|██████████| 3/3 [00:00<00:00, 227.42it/s]
Loss: 1.92: 100%|██████████| 3/3 [00:00<00:00, 138.08it/s]
Loss: 1.79: 100%|██████████| 3/3 [00:00<00:00, 226.45it/s]
Loss: 1.78: 100%|██████████| 3/3 [00:00<00:00, 104.29it/s]
Loss: 2.41: 100%|██████████| 3/3 [00:00<00:00, 103.90it/s]
Loss: 1.80: 100%|██████████| 3/3 [00:00<00:00, 202.10it/s]


LR:0.01, Batch Size:4
Score: 0.8


## 4. Experiments

In [6]:
train_data_T, test_x_T, test_y_T = train_data[:1000], test_x[:100], test_y[:100]
lr=0.01

In [110]:
lr=0.001
model = Model(lr)

model.steps = [
    Flatten(),
    FullyConnected(3072, 512),
    Relu(),
    FullyConnected(512, 128),
    Relu(),
    FullyConnected(128, 10)
]

train(train_data_T, model, batch_size=20, num_epochs=20, lr=lr)
predict(model, test_x_T, test_y_T)

Loss: 2.29: 100%|██████████| 50/50 [00:01<00:00, 29.20it/s]
Loss: 2.31: 100%|██████████| 50/50 [00:01<00:00, 28.95it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:01<00:00, 27.35it/s]
Loss: 2.24: 100%|██████████| 50/50 [00:01<00:00, 31.17it/s]
Loss: 2.26: 100%|██████████| 50/50 [00:01<00:00, 30.63it/s]
Loss: 2.31: 100%|██████████| 50/50 [00:01<00:00, 33.28it/s]
Loss: 2.26: 100%|██████████| 50/50 [00:01<00:00, 30.13it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:01<00:00, 28.70it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:01<00:00, 29.15it/s]
Loss: 2.21: 100%|██████████| 50/50 [00:01<00:00, 26.21it/s]
Loss: 2.14: 100%|██████████| 50/50 [00:01<00:00, 28.74it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:01<00:00, 30.62it/s]
Loss: 2.29: 100%|██████████| 50/50 [00:01<00:00, 28.69it/s]
Loss: 2.14: 100%|██████████| 50/50 [00:01<00:00, 29.94it/s]
Loss: 2.20: 100%|██████████| 50/50 [00:01<00:00, 32.46it/s]
Loss: 2.18: 100%|██████████| 50/50 [00:01<00:00, 31.55it/s]
Loss: 2.20: 100%|██████████| 50/50 [00:0

LR:0.001, Batch Size:20
Score: 0.24


In [109]:
lr=0.001
model = Model(lr)

model.steps = [
    Convolution(size=(8, channel_numbers, 5, 5), stride=1),
    Relu(),
    Convolution(size=(3, 8, 5, 5), stride=1),
    Relu(),
    MaxPool(size=2, stride=2),
    Flatten(),
    FullyConnected(432, 256),
    Relu(),
    FullyConnected(256, 128),
    Relu(),
    FullyConnected(128, 10)
]

train(train_data_T, model, batch_size=20, num_epochs=20, lr=lr)
predict(model, test_x_T, test_y_T)


Loss: 2.30: 100%|██████████| 50/50 [00:24<00:00,  2.04it/s]
Loss: 2.31: 100%|██████████| 50/50 [00:23<00:00,  2.10it/s]
Loss: 2.28: 100%|██████████| 50/50 [00:25<00:00,  1.94it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:24<00:00,  2.03it/s]
Loss: 2.29: 100%|██████████| 50/50 [00:23<00:00,  2.12it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:27<00:00,  1.80it/s]
Loss: 2.31: 100%|██████████| 50/50 [00:28<00:00,  1.75it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:27<00:00,  1.79it/s]
Loss: 2.26: 100%|██████████| 50/50 [00:23<00:00,  2.09it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:26<00:00,  1.91it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:26<00:00,  1.89it/s]
Loss: 2.29: 100%|██████████| 50/50 [00:27<00:00,  1.82it/s]
Loss: 2.28: 100%|██████████| 50/50 [00:25<00:00,  1.96it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:26<00:00,  1.90it/s]
Loss: 2.25: 100%|██████████| 50/50 [00:26<00:00,  1.91it/s]
Loss: 2.26: 100%|██████████| 50/50 [00:25<00:00,  1.97it/s]
Loss: 2.29: 100%|██████████| 50/50 [00:2

LR:0.001, Batch Size:20
Score: 0.11


In [112]:
lr=0.01
model = Model(lr)

model.steps = [
    Convolution(size=(2, channel_numbers, 4, 4), stride=2),
    Relu(),
    MaxPool(size=5, stride=2),
    Flatten(),
    FullyConnected(72, 32),
    Relu(),
    FullyConnected(32, 10)
]

train(train_data_T, model, batch_size=20, num_epochs=20, lr=lr)
predict(model, test_x_T, test_y_T)

Loss: 2.30: 100%|██████████| 50/50 [00:02<00:00, 21.42it/s]
Loss: 2.29: 100%|██████████| 50/50 [00:03<00:00, 16.26it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:04<00:00, 11.49it/s]
Loss: 2.26: 100%|██████████| 50/50 [00:04<00:00, 11.50it/s]
Loss: 2.26: 100%|██████████| 50/50 [00:04<00:00, 11.98it/s]
Loss: 2.26: 100%|██████████| 50/50 [00:04<00:00, 11.78it/s]
Loss: 2.23: 100%|██████████| 50/50 [00:04<00:00, 11.63it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:04<00:00, 11.30it/s]
Loss: 2.24: 100%|██████████| 50/50 [00:04<00:00, 11.80it/s]
Loss: 2.36: 100%|██████████| 50/50 [00:04<00:00, 11.31it/s]
Loss: 2.22: 100%|██████████| 50/50 [00:04<00:00, 10.86it/s]
Loss: 2.09: 100%|██████████| 50/50 [00:04<00:00, 11.02it/s]
Loss: 2.24: 100%|██████████| 50/50 [00:04<00:00, 10.66it/s]
Loss: 2.28: 100%|██████████| 50/50 [00:04<00:00, 10.86it/s]
Loss: 2.19: 100%|██████████| 50/50 [00:04<00:00, 12.40it/s]
Loss: 2.29: 100%|██████████| 50/50 [00:04<00:00, 11.00it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:0

LR:0.01, Batch Size:20
Score: 0.25


In [113]:
lr=0.01
model = Model(lr)

model.steps = [
    Convolution(size=(1, channel_numbers, 4, 4), stride=2),
    Relu(),
    Flatten(),
    FullyConnected(225, 128),
    Relu(),
    FullyConnected(128, 10)
]

train(train_data_T, model, batch_size=20, num_epochs=20, lr=lr)
predict(model, test_x_T, test_y_T)



Loss: 2.31: 100%|██████████| 50/50 [00:01<00:00, 25.03it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:02<00:00, 24.16it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:01<00:00, 25.39it/s]
Loss: 2.31: 100%|██████████| 50/50 [00:01<00:00, 25.08it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:02<00:00, 16.68it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:03<00:00, 13.84it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:03<00:00, 15.67it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:03<00:00, 14.39it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:03<00:00, 15.70it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:03<00:00, 13.79it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:03<00:00, 15.04it/s]
Loss: 2.29: 100%|██████████| 50/50 [00:03<00:00, 14.45it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:03<00:00, 15.25it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:03<00:00, 15.11it/s]
Loss: 2.31: 100%|██████████| 50/50 [00:03<00:00, 16.01it/s]
Loss: 2.27: 100%|██████████| 50/50 [00:03<00:00, 14.29it/s]
Loss: 2.30: 100%|██████████| 50/50 [00:0

LR:0.01, Batch Size:20
Score: 0.06


In [114]:
lr=0.1
model = Model(lr)

model.steps = [
    Flatten(),
    FullyConnected(3072, 100),
    Relu(),
    FullyConnected(100, 200),
    Relu(),
    FullyConnected(200, 100),
    Relu(),
    FullyConnected(100, 10)
]

train(train_data_T, model, batch_size=32, num_epochs=30, lr=lr)
predict(model, test_x_T, test_y_T)

Loss: 2.26: 100%|██████████| 32/32 [00:00<00:00, 142.48it/s]
Loss: 2.01: 100%|██████████| 32/32 [00:00<00:00, 119.57it/s]
Loss: 2.20: 100%|██████████| 32/32 [00:00<00:00, 137.44it/s]
Loss: 1.94: 100%|██████████| 32/32 [00:00<00:00, 125.96it/s]
Loss: 2.27: 100%|██████████| 32/32 [00:00<00:00, 132.08it/s]
Loss: 2.18: 100%|██████████| 32/32 [00:00<00:00, 70.34it/s]
Loss: 1.74: 100%|██████████| 32/32 [00:00<00:00, 117.31it/s]
Loss: 2.00: 100%|██████████| 32/32 [00:00<00:00, 95.20it/s] 
Loss: 1.44: 100%|██████████| 32/32 [00:00<00:00, 106.37it/s]
Loss: 2.09: 100%|██████████| 32/32 [00:00<00:00, 112.39it/s]
Loss: 1.79: 100%|██████████| 32/32 [00:00<00:00, 103.52it/s]
Loss: 1.77: 100%|██████████| 32/32 [00:00<00:00, 112.42it/s]
Loss: 2.36: 100%|██████████| 32/32 [00:00<00:00, 120.37it/s]
Loss: 2.28: 100%|██████████| 32/32 [00:00<00:00, 113.64it/s]
Loss: 1.63: 100%|██████████| 32/32 [00:00<00:00, 115.47it/s]
Loss: 1.42: 100%|██████████| 32/32 [00:00<00:00, 89.57it/s] 
Loss: 1.49: 100%|████████

LR:0.1, Batch Size:32
Score: 0.24


In [7]:
lr=0.1
model = Model(lr)

model.steps = [
    Convolution(size=(16, 3, 5, 5)),
    Relu(),
    Flatten(),
    FullyConnected(12544, 1000),
    Relu(),
    FullyConnected(1000, 10)
]

train(train_data_T, model, batch_size=32, num_epochs=10, lr=lr)
predict(model, test_x_T, test_y_T)

Loss: 2.18: 100%|██████████| 32/32 [00:13<00:00,  2.43it/s]
Loss: 2.11: 100%|██████████| 32/32 [00:13<00:00,  2.39it/s]
Loss: 2.16: 100%|██████████| 32/32 [00:13<00:00,  2.43it/s]
Loss: 2.43: 100%|██████████| 32/32 [00:13<00:00,  2.42it/s]
Loss: 2.70: 100%|██████████| 32/32 [00:13<00:00,  2.37it/s]
Loss: 2.09: 100%|██████████| 32/32 [00:13<00:00,  2.36it/s]
Loss: 3.08: 100%|██████████| 32/32 [00:13<00:00,  2.41it/s]
Loss: 2.32: 100%|██████████| 32/32 [00:13<00:00,  2.39it/s]
Loss: 2.23: 100%|██████████| 32/32 [00:13<00:00,  2.40it/s]
Loss: 2.18: 100%|██████████| 32/32 [00:13<00:00,  2.41it/s]


LR:0.1, Batch Size:32
Score: 0.12


## 5. Results
### The best NN architecture
1. Fully connected (out=100, ReLU)
2. Fully connected (out=200, ReLU)
3. Fully connected (out=100, ReLU)
4. Fully connected (out=10, ReLU)
5. Softmax

### Learning rate strategy and batch size
lr: 0.1

batch size: 5000

epoch: 150

### Data augmentations
-

### The best model test accuracy
0.454

In [13]:
lr=0.1
model = Model(lr)

model.steps = [
    Flatten(),
    FullyConnected(3072, 100),
    Relu(),
    FullyConnected(100, 200),
    Relu(),
    FullyConnected(200, 100),
    Relu(),
    FullyConnected(100, 10)
]

train(train_data, model, batch_size=5000, num_epochs=150, lr=lr)
predict(model, test_x, test_y)


LR:0.1, Batch Size:5000
Score: 0.454


Loss: 2.29: 100%|██████████| 10/10 [00:02<00:00,  4.12it/s]
Loss: 2.27: 100%|██████████| 10/10 [00:02<00:00,  3.83it/s]
Loss: 2.25: 100%|██████████| 10/10 [00:02<00:00,  4.24it/s]
Loss: 2.21: 100%|██████████| 10/10 [00:02<00:00,  3.57it/s]
Loss: 2.20: 100%|██████████| 10/10 [00:04<00:00,  2.33it/s]
Loss: 2.19: 100%|██████████| 10/10 [00:04<00:00,  2.41it/s]
Loss: 2.16: 100%|██████████| 10/10 [00:03<00:00,  2.64it/s]
Loss: 2.15: 100%|██████████| 10/10 [00:03<00:00,  2.81it/s]
Loss: 2.11: 100%|██████████| 10/10 [00:04<00:00,  2.42it/s]
Loss: 2.09: 100%|██████████| 10/10 [00:04<00:00,  2.16it/s]
Loss: 2.06: 100%|██████████| 10/10 [00:04<00:00,  2.31it/s]
Loss: 2.03: 100%|██████████| 10/10 [00:03<00:00,  2.55it/s]
Loss: 2.03: 100%|██████████| 10/10 [00:04<00:00,  2.02it/s]
Loss: 2.02: 100%|██████████| 10/10 [00:03<00:00,  2.67it/s]
Loss: 1.99: 100%|██████████| 10/10 [00:03<00:00,  2.70it/s]
Loss: 1.96: 100%|██████████| 10/10 [00:03<00:00,  2.67it/s]
Loss: 2.01: 100%|██████████| 10/10 [00:0