## Prepare your dataset

In [None]:
import torch
import torchvision
import torchvision.transforms as transforms
import tqdm

def load_mnist_data(root_path='./data', batch_size=4):
    """Build the MNIST training and test data loaders.

    Args:
        root_path: Directory the dataset is stored in (it is downloaded there
            when missing, because ``download=True`` is passed).
        batch_size: Number of samples per batch for both loaders.

    Returns:
        A ``(trainloader, testloader)`` tuple. Only the training loader
        shuffles its samples.
    """
    # NOTE: ``(0.5)`` is a parenthesised float, not a 1-tuple; it is passed on
    # to Normalize as the mean and the std respectively.
    preprocessing = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5), (0.5)),
    ])

    def _make_loader(train):
        # The training split is shuffled, the test split keeps its order.
        dataset = torchvision.datasets.MNIST(
            root=root_path, train=train, download=True, transform=preprocessing
        )
        return torch.utils.data.DataLoader(
            dataset, batch_size=batch_size, shuffle=train, num_workers=2
        )

    return _make_loader(True), _make_loader(False)

## Building your neural network

In [None]:
from typing import Any, Callable, Optional, Tuple, Union

import numpy as np

##################################
# For matrices of arbitrary size #
##################################
class MyWeightTensor:
    """Thin wrapper around a NumPy array (or a plain scalar) with a few operators.

    Attributes:
        shape: The shape the tensor was created with. An ``int`` shape is
            normalised to a 1-tuple when the values are freshly initialised;
            when ``init_weights`` is supplied the given ``shape`` is stored as is.
        values: The wrapped data (``np.ndarray`` or ``int``/``float``).

    Note:
        ``*`` is a matrix product (``np.dot``), NOT an element-wise product.
    """

    def __init__(
        self,
        shape: Union[Tuple[int, ...], int, float],
        init_weight_fn: Callable = np.random.randn,
        init_weights: Optional[Union['MyWeightTensor', np.ndarray, int, float]] = None,
    ):
        """Create a tensor either from existing values or from an initialiser.

        Args:
            shape: Tuple shape, or a single ``int`` for a 1-d tensor. A ``float``
                is only accepted together with scalar ``init_weights``.
            init_weight_fn: Called as ``init_weight_fn(*shape)`` (or
                ``init_weight_fn(shape)`` for an ``int`` shape) when no
                ``init_weights`` are given.
            init_weights: Existing values to wrap. A ``MyWeightTensor`` shares
                its ``values`` object with the new instance (no copy is made).

        Raises:
            AssertionError: If ``shape`` or ``init_weights`` has an unsupported
                type for the chosen combination.
        """
        assert isinstance(shape, (tuple, int, float)), f'Allowed shapes: tuple, int, float, got: {type(shape)}'
        self.shape = shape

        if init_weights is None:
            # A float cannot be unpacked into the initialiser; fail with a clear
            # message instead of an opaque TypeError from ``*shape``.
            assert isinstance(shape, (tuple, int)), \
                f'A shape of type {type(shape)} requires explicit init_weights'
            if isinstance(shape, int):
                self.shape = (shape,)
                self.values = init_weight_fn(shape)
            else:
                self.values = init_weight_fn(*shape)
        elif isinstance(init_weights, MyWeightTensor):
            self.values = init_weights.values
        else:
            if isinstance(shape, tuple):
                assert isinstance(init_weights, np.ndarray), \
                    f'A tuple shape requires np.ndarray init_weights, got: {type(init_weights)}'
            else:
                assert isinstance(init_weights, (int, float)), \
                    f'A scalar shape requires int or float init_weights, got: {type(init_weights)}'
            self.values = init_weights

    @staticmethod
    def _operand(other):
        """Return the raw value of ``other`` after checking it is a supported type."""
        if isinstance(other, MyWeightTensor):
            return other.values
        assert isinstance(other, (np.ndarray, int, float))
        return other

    @staticmethod
    def _wrap(result) -> 'MyWeightTensor':
        """Wrap a computation result, deriving the shape from the result itself.

        NumPy scalars and Python numbers are promoted to 0-d arrays so that the
        stored shape always matches the stored values.
        """
        result = np.asanyarray(result)
        return MyWeightTensor(shape=result.shape, init_weights=result)

    @property
    def T(self) -> 'MyWeightTensor':
        """Transposed view of the values, wrapped in a new tensor."""
        return self._wrap(np.transpose(self.values))

    def __add__(self, other) -> 'MyWeightTensor':
        """Element-wise (broadcasting) addition with a tensor, array or number."""
        return self._wrap(self.values + self._operand(other))

    def __mul__(self, other) -> 'MyWeightTensor':
        """Matrix product ``np.dot(self.values, other)`` with a tensor, array or number."""
        return self._wrap(np.dot(self.values, self._operand(other)))


###############################
# For creating a linear layer #
###############################
class MyLinearLayer:
    """Fully connected layer computing ``x . W^T + b`` without an activation.

    Attributes:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
        weights: ``MyWeightTensor`` created with shape ``(out_features, in_features)``.
        bias: ``MyWeightTensor`` created with shape ``out_features``.
        latest_input: Input of the most recent forward pass (``None`` before the first).
        latest_output: Output of the most recent forward pass (``None`` before the first).
    """

    def __init__(self, in_features: int, out_features: int, init_weight_fn: Callable = np.random.randn) -> None:
        """Create the layer and initialise weights and bias with ``init_weight_fn``."""
        self.in_features = in_features
        self.out_features = out_features

        self.weights = MyWeightTensor(shape=(out_features, in_features), init_weight_fn=init_weight_fn)
        self.bias = MyWeightTensor(shape=out_features, init_weight_fn=init_weight_fn)

        self.latest_input = None
        self.latest_output = None

    def __call__(self, tensor: Union[np.ndarray, 'MyWeightTensor']) -> 'MyWeightTensor':
        """Run the forward pass and remember input and output.

        Args:
            tensor: Either a 2-d batch (first axis is the batch) or a single
                sample without a batch axis.

        Returns:
            The layer output wrapped in a new ``MyWeightTensor``; a batched input
            yields one output row per sample.
        """
        self.latest_input = tensor

        if len(tensor.shape) == 2:
            # Batched: (out, in) . (in, batch) -> (out, batch). The bias is added
            # as a column so NumPy broadcasts it across the batch axis.
            bias_column = np.reshape(self.bias.values, (-1, 1))
            pre_activation = self.weights * tensor.T + bias_column
        else:
            pre_activation = self.weights * tensor + self.bias.values

        # Transpose back so the batch axis (if any) comes first again.
        self.latest_output = pre_activation.T

        return MyWeightTensor(shape=self.latest_output.shape, init_weights=self.latest_output)

    def derivative(self) -> np.ndarray:
        """Derivative of the (identity) activation for the latest output.

        Returns:
            An array of ones with the shape of the latest output.

        Raises:
            AssertionError: If no forward pass has been run yet.
        """
        assert self.latest_output is not None, 'Cannot calculate grad without a single forward pass.'
        # Linear activation derivation
        return np.ones(shape=self.latest_output.shape)

In [None]:
####################################
# Creating a custom neural network #
####################################

def xavier_normal_init(*shape) -> np.ndarray:
    """Draw a tensor from a zero-mean normal with Xavier/Glorot scaling.

    The standard deviation is ``sqrt(2 / (fan_in + fan_out))`` where the two
    fans are the first and the last entry of ``shape`` (identical for a 1-d
    shape).

    Args:
        *shape: One or two dimension sizes.

    Returns:
        Array of the requested shape.

    Raises:
        AssertionError: If more than two dimensions are requested.
    """
    assert len(shape) <= 2, 'Can only init max 2d tensors'

    # For a 1-d shape first and last entry coincide, so both fans are equal.
    fan_in, fan_out = shape[0], shape[-1]
    gain = 1.0
    scale = gain * np.sqrt(2.0 / (fan_in + fan_out))

    return np.random.normal(loc=0.0, scale=scale, size=shape)


class MyNeuralNetwork:
    """Three stacked linear layers mapping 784 inputs to 10 outputs."""

    def __init__(self) -> None:
        """Create the layers 784 -> 32 -> 32 -> 10 with Xavier-normal initialisation."""
        # Alternative initialiser that was tried before:
        #   lambda *shape: np.random.randn(*shape) / 10
        layer_sizes = ((784, 32), (32, 32), (32, 10))
        self.layers = [
            MyLinearLayer(in_features=n_in, out_features=n_out, init_weight_fn=xavier_normal_init)
            for n_in, n_out in layer_sizes
        ]

    def __call__(self, tensor: np.ndarray) -> Any:
        """Feed ``tensor`` through every layer in order and return the last output."""
        activation = tensor
        for layer in self.layers:
            activation = layer(activation)

        return activation

## Implement your loss function

In [None]:
import torch.nn.functional as F


def softmax(input: np.ndarray) -> np.ndarray:
    """Compute a numerically stable softmax.

    For input with two or more dimensions the softmax is taken along axis 1,
    i.e. independently for every entry of the first (batch) axis. A 1-d input
    is treated as a single vector of logits.

    Args:
        input: Logits as an array (or array-like).

    Returns:
        Array of the same shape whose entries along the softmax axis sum to 1.
    """
    logits = np.asarray(input)
    axis = 1 if logits.ndim >= 2 else 0

    # Subtracting the maximum leaves the result unchanged mathematically but
    # keeps np.exp from overflowing to inf (which would produce nan).
    shifted = logits - np.max(logits, axis=axis, keepdims=True)
    exps = np.exp(shifted)

    return exps / np.sum(exps, axis=axis, keepdims=True)


class CrossEntropyLoss:
    """Cross-entropy loss on raw scores (logits) with integer class targets."""

    def __init__(self) -> None:
        pass

    def __call__(
        self,
        predictions: Union['MyWeightTensor', np.ndarray],
        targets: Union['MyWeightTensor', np.ndarray],
    ) -> np.ndarray:
        """Compute the cross entropy between targets and predictions.

        Args:
            predictions: Scores with one row per sample.
            targets: Class indices, either one per sample or as a 2-d array
                that is flattened to one index per sample.

        Returns:
            Array with one cross-entropy loss per sample (batch-wise).

        Raises:
            AssertionError: If predictions and targets differ in their first dimension.
        """
        if isinstance(predictions, MyWeightTensor):
            predictions = predictions.values

        if isinstance(targets, MyWeightTensor):
            targets = targets.values

        assert predictions.shape[0] == targets.shape[0]
        if len(targets.shape) == 2:
            targets = targets.reshape(-1)

        logits = torch.as_tensor(predictions)
        # cross_entropy expects class indices as int64
        labels = torch.as_tensor(targets).long()

        # reduction='none' yields the per-sample losses in a single call
        per_sample = F.cross_entropy(logits, labels, reduction='none')

        return per_sample.detach().numpy().astype(np.float64)

    def derivative(self) -> Callable:
        """Return a function computing ``softmax(y_hat) - one_hot(y)``.

        The returned callable takes the predictions ``y_hat`` and the target
        class indices ``y``. Targets may carry a trailing axis of length one
        (e.g. ``(batch, 1)``) or omit it (e.g. ``(batch,)``).
        """
        # y_hat is the prediction
        # y is the target value
        def _derivative(
            y_hat: Union['MyWeightTensor', np.ndarray],
            y: Union['MyWeightTensor', np.ndarray],
        ) -> np.ndarray:
            if isinstance(y_hat, MyWeightTensor):
                y_hat = y_hat.values

            if isinstance(y, MyWeightTensor):
                y = y.values

            # put_along_axis needs indices with as many dimensions as the
            # array, so give flat targets the missing trailing axis.
            y = np.asarray(y)
            if y.ndim == np.ndim(y_hat) - 1:
                y = np.expand_dims(y, axis=-1)

            one_hot = np.zeros(shape=y_hat.shape)
            np.put_along_axis(one_hot, y, 1, axis=-1)

            return softmax(y_hat) - one_hot

        return _derivative

## Implement the training loop

In [None]:
def train(model: MyNeuralNetwork, batch_size: int, learning_rate: float, loss_fn: Callable, epochs: int = 10):
    """Run the training loop on MNIST and print loss/accuracy after every epoch.

    Args:
        model: Network that is called with a batch of flattened images.
        batch_size: Batch size used for the training data loader.
        learning_rate: Step size for the weight update; not read by the loop
            until the backpropagation section below is filled in.
        loss_fn: Callable returning the per-sample losses for
            ``(outputs, targets)``.
        epochs: Number of passes over the training set.
    """
    train_loader, _ = load_mnist_data(batch_size=batch_size)

    for epoch_idx in range(epochs):
        loss_total = 0.0
        batch_accuracies = []

        progress = tqdm.tqdm(train_loader, desc=f'Training iteration {epoch_idx + 1}')
        for imgs, targets in progress:
            # The custom model works on NumPy data: flatten every 28x28 image
            # into one row and keep the labels as a column.
            flat_imgs = imgs.numpy().reshape(-1, 28 * 28)
            labels = targets.numpy()
            if labels.ndim == 1:
                labels = labels.reshape(-1, 1)

            batch = MyWeightTensor(shape=flat_imgs.shape, init_weights=flat_imgs)
            outputs = model(batch).values

            # Accumulate the mean loss of this batch for the epoch statistics.
            loss_total += np.mean(loss_fn(outputs, labels))

            # Accuracy: share of samples whose highest score is the target class.
            predicted = np.argmax(outputs, axis=1)
            batch_accuracies.append((predicted == labels.flatten()).mean())

            #########################
            # Start backpropagation #
            #########################

            # Your code for backpropagation!


            #######################
            # End backpropagation #
            #######################

        print(f'Epoch {epoch_idx + 1} finished with loss: {loss_total / len(train_loader):.3f} and accuracy: {torch.tensor(batch_accuracies).mean():.3f}')

In [None]:
#############################
# Execute the training loop #
#############################
# Network to be trained.
model = MyNeuralNetwork()
# Hyperparameters handed to train() below.
batch_size = 4
# NOTE(review): train() accepts learning_rate, but its loop does not read it
# yet because the backpropagation section is still a placeholder.
learning_rate = 0.001
epochs = 10
loss_fn = CrossEntropyLoss()

# Runs all epochs; train() loads the MNIST training data itself and prints
# the running loss and accuracy after each epoch.
train(
    model=model,
    batch_size=batch_size,
    learning_rate=learning_rate,
    epochs=epochs,
    loss_fn=loss_fn
)