# Deep Learning Cheatsheet FS24


## Code snippets


#### Imports

In [1]:
import numpy as np
import os
import math
from matplotlib import pyplot
import copy
import zipfile
import csv
import io
import torch
import csv
import pandas as pd
import torchvision
import torchvision.models
import torchvision.transforms as tfs
from torchvision.datasets import ImageFolder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

#### Device

In [None]:
# normal config: use the CUDA GPU when PyTorch reports one, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# apple silicon chip: device handle for the "mps" backend
# NOTE(review): availability of the backend is not checked here — confirm before moving tensors to it
mps_device = torch.device("mps")

### Data Tasks

#### Data Generation

Datasets: 

1. $X_1: t = \sin(3x)$ for $x\in[-1,1]$
2. $X_2: t = e^{-4x^2}$ for $x\in[-1,1]$
3. $X_3: t = x^5 + 3x^4 - 6x^3 -12x^2 + 5x + 129$ for $x\in[-4,2.5]$

Generate dataset $X_1$, for $N=60$ samples randomly drawn from range $x\in[-1,1]$. \
Generate data $X_2$ for $N=50$ samples randomly drawn from range $x\in[-1,1]$.  \
Generate dataset $X_3$ for $N=200$ samples randomly drawn from range $x\in[-4,2.5]$. \
Implement all three datasets as lists of tuples: $\{(\vec x^{[n]}, t^{[n]})\mid 1\leq n\leq N\}$. \

In [None]:
# 1D array with 6 uniformly drawn samples between -1 and 1 (example only, result is not stored)
np.random.uniform(low=-1, high=1, size=5 + 1)
# 2D array of shape (5, 10) drawn from the same range (example only, result is not stored)
np.random.uniform(low=-1, high=1, size=(5, 10))

# special datasets: lists of (input, target) tuples; every input is the vector [1, x],
# i.e. a constant 1 followed by the sampled x, and the target is the function value at x
X1 = [(np.array([1, x]), np.sin(3 * x)) for x in np.random.uniform(low=-1, high=1, size=60)]
X2 = [(np.array([1, x]), np.exp(-4 * x ** 2)) for x in np.random.uniform(low=-1, high=1, size=50)]
X3 = [(np.array([1, x]), x ** 5 + 3 * x ** 4 - 6 * x ** 3 - 12 * x ** 2 + 5 * x + 129) for x in
      np.random.uniform(low=-4, high=2.5, size=200)]

#### Read Data Examples

In [6]:
def data_from_zip_as_np_array(course="por"):
    """Load the student dataset of one course from ``student.zip`` as numpy arrays.

    The archive is downloaded into the working directory when it is not there
    yet; the CSV file is then read straight from the archive without unpacking.

    Args:
        course: suffix of the file ``student-{course}.csv`` inside the archive.

    Returns:
        Tuple ``(inputs, targets)`` with one column per sample. ``inputs`` has
        the rows (bias = 1, school as +1/-1, absences); ``targets`` has a single
        row holding CSV column 32.
    """
    # download data file from URL
    dataset_zip_file = "student.zip"
    if not os.path.exists(dataset_zip_file):
        import urllib.request
        urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip",
                                   dataset_zip_file)
        print("Downloaded datafile", dataset_zip_file)
    # collect inputs
    inputs = []
    targets = []
    # read through dataset (without actually unzipping to a file); the context
    # managers close the archive and the member stream even when parsing fails
    with zipfile.ZipFile(dataset_zip_file) as archive, \
            archive.open(F"student-{course}.csv", 'r') as raw:
        # ... convert bytes to text and read the lines via CSV reader, using the correct delimiter
        reader = csv.reader(io.TextIOWrapper(raw), delimiter=";")
        # ... skip header line (the default keeps an empty file from raising StopIteration)
        next(reader, None)
        for splits in reader:
            # read input values
            inputs.append([
                1.,  #### BIAS ####
                {"GP": 1., "MS": -1.}[splits[0]],  # school
                float(splits[29]),  # absences
            ])
            # read targets values
            targets.append([
                float(splits[32]),  # grade for tertiary school
            ])
    print(F"Loaded dataset with {len(targets)} samples")
    return np.array(inputs).transpose(), np.array(targets).transpose()


def dataset_from_file(dataset_file="winequality-red.csv", delimiter=";"):
    """Read a delimited text file with a header row into input and target tensors.

    Args:
        dataset_file: path of the file; the last column holds the target.
        delimiter: column separator.

    Returns:
        Tuple ``(X, T)``. ``X`` contains all columns but the last as float.
        For ``"winequality-red.csv"`` ``T`` is a long vector of the last
        column minus 3; for any other file it is a float column vector.
    """
    with open(dataset_file, 'r') as handle:
        frame = pd.read_csv(filepath_or_buffer=handle, delimiter=delimiter, header=0)
    table = torch.tensor(frame.values)

    # everything except the last column is input data
    X = table[:, :-1].float()
    last_column = table[:, -1]

    if dataset_file != "winequality-red.csv":
        # generic case: float targets, one row per sample
        return X, last_column.reshape(-1, 1).float()

    # wine data: integer labels, shifted down by 3
    return X, last_column.long() - 3


# initialize data
# NOTE(review): the argument selects "student-my_dataset.csv" inside the archive — replace the
# placeholder with the name of a course file that actually exists
X, T = data_from_zip_as_np_array("my_dataset")

#### Data initialization examples

In [None]:
# data initialization
K = 15
# D is the length of the first dimension of X
D = len(X)
O = 3
# Weight initialization Xavier method: uniform values in +-1/sqrt(D) and +-1/sqrt(K)
# W1 has shape (K + 1, D), W2 has shape (O, K + 1)
W1 = np.random.uniform(low=-1 / np.sqrt(D), high=1 / np.sqrt(D), size=(K + 1, D))
W2 = np.random.uniform(low=-1 / np.sqrt(K), high=1 / np.sqrt(K), size=(O, K + 1))
# kept as a list so that the individual weight matrices can be reassigned
Theta = [W1, W2]

#### Data split

In [None]:
def split_training_data(X, T, train_percentage=0.8, shuffle=True):
    """Split samples into a training and a validation part.

    Samples are the rows (axis 0) of ``X`` and ``T``.

    Args:
        X: inputs, one sample per row.
        T: targets, one sample per row (same number of rows as ``X``).
        train_percentage: fraction of the samples that goes into training.
        shuffle: shuffle the samples (jointly for ``X`` and ``T``) first.

    Returns:
        Tuple ``(X_train, T_train, X_val, T_val)``.
    """
    if shuffle:
        # one shared permutation keeps every input paired with its target and
        # leaves the dtypes and dimensionality of X and T untouched
        order = np.random.permutation(X.shape[0])
        X = X[order]
        T = T[order]

    # split into training/validation at the requested fraction
    training_number = int(X.shape[0] * train_percentage)
    # slice from the split point: a negative offset of 0 would select the
    # complete data when the validation part is empty
    X_train, X_val = X[:training_number], X[training_number:]
    T_train, T_val = T[:training_number], T[training_number:]

    return X_train, T_train, X_val, T_val


# split with the defaults: 80% training data, shuffled
X_train, T_train, X_val, T_val = split_training_data(X=X, T=T)

#### Torch Datasets

In [None]:
# Fashion MNIST dataset
def f_mnist_datasets(transform):
    """Create the Fashion-MNIST training and test datasets.

    Both datasets are stored under ``./data`` (downloaded when missing) and
    share the given ``transform``.

    Returns:
        Tuple ``(trainset, testset)``.
    """
    shared = dict(root='./data', download=True, transform=transform)
    trainset = torchvision.datasets.FashionMNIST(train=True, **shared)
    testset = torchvision.datasets.FashionMNIST(train=False, **shared)

    # without a transform the samples are PIL.Image.Image objects
    return trainset, testset


# no transform: the datasets return the raw images
trainset, testset = f_mnist_datasets(transform=None)

##### Data Loaders
Data loaders simplify the interaction with data: they shuffle the samples, group them into batches, etc. They act as a bridge between the dataset and the model.

In [None]:
# batch size shared by both loaders
B = 512
# training batches are shuffled, test batches keep the dataset order
trainloader = torch.utils.data.DataLoader(trainset, batch_size=B, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=B, shuffle=False)

##### Data transform
Convert images to tensors. First resize, then crop, then convert to tensor and normalize values

In [None]:
# preprocessing pipeline, applied in the listed order: resize, center crop to 224,
# conversion to a tensor, per-channel normalization
# NOTE(review): mean/std are presumably the ImageNet channel statistics (going by the variable name) — confirm
imagenet_transform = tfs.Compose([
    tfs.Resize(256),
    tfs.CenterCrop(224),
    tfs.ToTensor(),
    tfs.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

##### Dataset loader - ImageFolder

In [None]:
# root directories of the training and test images
train_dir = './intel-image-classification/seg_train/seg_train/'
test_dir = './intel-image-classification/seg_test/seg_test/'

# both datasets read their images from disk and apply the same preprocessing
trainset = ImageFolder(
    root=train_dir,
    transform=imagenet_transform
)

testset = ImageFolder(
    root=test_dir,
    transform=imagenet_transform
)

### Normalize

In [None]:
# get min and max values of every row of X (axis=1 reduces over the second dimension);
# the minimum has to come from np.min and the maximum from np.max, otherwise
# the normalized values are mirrored
min_val = np.min(X, axis=1)
max_val = np.max(X, axis=1)

# assure to handle x_0 correctly: with (min, max) = (0, 1) the first row is left unscaled
min_val[0] = 0
max_val[0] = 1


def normalize(x, min_val, max_val):
    """Min-max scale ``x`` with one (min, max) pair per row.

    Args:
        x: array whose first dimension matches ``min_val`` and ``max_val``.
        min_val: value mapped to 0, per row.
        max_val: value mapped to 1, per row.

    Returns:
        Array shaped like ``x`` holding ``(x - min) / (max - min)``.
    """
    value_range = max_val - min_val
    # work on the transpose so that the per-row statistics broadcast along the last axis
    shifted = x.transpose() - min_val
    return np.transpose(shifted / value_range)


# Normalize our dataset with the per-row minimum and maximum computed above
X = normalize(X, min_val, max_val)

### Standardize

In [None]:
def standardize(X_train, X_val):
    """Standardize both sets with the statistics of the training set.

    Args:
        X_train: training inputs, one sample per row.
        X_val: validation inputs with the same number of columns.

    Returns:
        Tuple ``(X_train, X_val)`` where every column has the training mean
        subtracted and is divided by the training standard deviation.
        Columns that are constant in the training set are only centered.
    """
    # compute statistics
    mean = torch.mean(X_train, dim=0)
    std = torch.std(X_train, dim=0)
    # a constant feature has zero spread; dividing by 1 instead keeps the
    # result finite (0 for the training data) rather than NaN/inf
    std = torch.where(std == 0, torch.ones_like(std), std)

    # standardize both X_train and X_val
    X_train = (X_train - mean) / std
    X_val = (X_val - mean) / std
    return X_train, X_val


# both sets are scaled with the statistics of the training set
X_train, X_val = standardize(X_train=X_train, X_val=X_val)

### Accuracy
Accuracy check for categorical or binary classification

In [None]:
def accuracy(Z, T):
    """Return the fraction of correct predictions as a scalar tensor.

    Args:
        Z: network outputs.
        T: targets. A 2-dimensional ``T`` selects binary classification,
            anything else categorical classification.

    Returns:
        Mean of the element-wise comparison between predictions and ``T``.
    """
    is_binary = len(T.shape) == 2
    if is_binary:
        # binary: an output of at least 0 counts as class 1
        predicted = (Z >= 0).float()
    else:
        # categorical: the index of the largest output along dim 1
        predicted = torch.argmax(Z, dim=1)
    hits = (predicted == T).float()
    return torch.mean(hits)

### Network implementations

#### Activation functions

In [None]:
def logistic(x):
    """Logistic (sigmoid) function ``1 / (1 + exp(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

#### Loss Functions

##### Squared Loss
$\mathcal J^{L_2} = \frac1B \|\mathbf Y - \mathbf T\|_F^2$ for given network outputs $\mathbf Y$ and target values $\mathbf T$.

In [None]:
def loss(Y, T):
    """Squared loss: squared Frobenius norm of ``Y - T`` divided by ``T.shape[1]``.

    Args:
        Y: network outputs, a 2D array.
        T: targets with the same shape as ``Y``.
    """
    batch_size = T.shape[1]
    frobenius = np.linalg.norm(Y - T, "fro")
    return (1 / batch_size) * frobenius ** 2

#### Batch creation

In [5]:
# used in enumerate -> yield
def batch_with_shuffle(X, T, batch_size=16):
    """Endlessly yield shuffled mini-batches; samples are the columns of X and T.

    Every epoch uses a fresh permutation and consists of all full batches
    that fit into the data; a smaller remainder is dropped.

    Args:
        X: inputs, one sample per column.
        T: targets, one sample per column.
        batch_size: number of samples per batch.

    Yields:
        Tuple ``(X_batch, T_batch, new_epoch)`` where ``new_epoch`` is True
        for the first batch of every epoch.
    """
    num_of_samples = X.shape[1]
    shuffle_idx = np.random.permutation(num_of_samples)
    i = 0
    new_epoch = True
    while True:
        # shuffle dataset in each epoch: start over once the next batch would
        # run past the data. The comparison is strict so that the last full
        # batch is still used when num_of_samples is a multiple of batch_size.
        if (i + batch_size) > num_of_samples:
            shuffle_idx = np.random.permutation(num_of_samples)
            i = 0
            new_epoch = True
        # yield the batch
        selected = shuffle_idx[i:i + batch_size]
        yield X[:, selected], T[:, selected], new_epoch
        new_epoch = False
        i += batch_size

#### Network examples

In [4]:
# Network for a given input vector and parameters Theta
def simple_network(x, Theta):
    """Evaluate the two-layer network for one input vector.

    Args:
        x: input vector.
        Theta: pair ``(W1, w2)`` of first-layer matrix and second-layer vector.

    Returns:
        Tuple ``(y, h)``: the network output and the hidden vector, i.e. the
        logistic activations with a constant 1 inserted at index 0.
    """
    W1, w2 = Theta
    # hidden layer: linear combination followed by the logistic activation
    activated = logistic(np.dot(W1, x))
    # prepend the bias unit
    h = np.insert(activated, 0, 1)
    return np.dot(w2, h), h


# Multi-target network -> output matrix Y, hidden unit output H
def multi_target_network(X, Theta):
    """Evaluate the two-layer network for a whole batch.

    Args:
        X: inputs, combined with ``W1`` via ``np.dot(W1, X)``.
        Theta: pair ``(W1, W2)`` of weight matrices.

    Returns:
        Tuple ``(Y, H)``: output matrix and hidden-unit outputs. The first row
        of ``H`` is overwritten with 1 and serves as the bias unit.
    """
    first_layer, second_layer = Theta
    pre_activation = np.dot(first_layer, X)
    # logistic activation of the hidden units
    H = 1 / (1 + np.exp(-pre_activation))
    # hidden unit 0 is the bias
    H[0] = 1
    return np.dot(second_layer, H), H

#### Gradient implementations

##### Gradient from formula

For a given dataset $X$ the gradient of loss $J^{L_2}$ is defined as:
\begin{align}
  \frac{\partial \mathcal J}{\partial w_{kd}^{(1)}} &= \frac{2}{N} \sum\limits_{n=1}^N (y^{[n]}-t^{[n]}) w_{k}^{(2)} (1-h_{k}^{[n]}) h_{k}^{[n]} x_{d}^{[n]}\\
  \frac{\partial \mathcal J}{\partial w_{k}^{(2)}} &= \frac{2}{N} \sum\limits_{n=1}^N (y^{[n]}-t^{[n]}) h_{k}^{[n]}
\end{align}


In [3]:
def basic_gradient(X, Theta):
    """Compute the gradient of the squared loss by looping over the dataset.

    Args:
        X: dataset, an iterable of ``(x, t)`` pairs that supports ``len``.
        Theta: pair ``(W1, w2)`` of network parameters.

    Returns:
        Tuple ``(dW1, dw2)`` shaped like ``W1`` and ``w2``.
    """
    W1, w2 = Theta
    grad_first = np.zeros_like(W1)
    grad_second = np.zeros_like(w2)
    for sample, target in X:
        output, hidden = simple_network(sample, Theta)
        error = output - target
        # first layer: propagate the error back through the hidden units,
        # leaving out the bias entry at index 0
        units = hidden[1:]
        delta = error * w2[1:] * (1 - units) * units
        grad_first += 2 / len(X) * np.outer(delta, sample)
        # second layer
        grad_second += 2 / len(X) * error * hidden
    return grad_first, grad_second


def basic_gradient_descent(X, Theta, eta, epochs=10000, tolerance=1e-6):
    """Run plain gradient descent on the two-layer network.

    The weight arrays inside ``Theta`` are updated in place.

    Args:
        X: dataset passed on to ``basic_gradient``.
        Theta: pair ``(W1, w2)`` of network parameters.
        eta: learning rate.
        epochs: maximum number of gradient steps.
        tolerance: stop as soon as the norms of both gradients fall below it.

    Returns:
        The optimized parameters.
    """
    # perform iterative gradient descent
    for _ in range(epochs):
        # compute the gradient
        grad_W1, grad_w2 = basic_gradient(X, Theta)
        # update weights using gradient
        W1, w2 = Theta
        W1 -= eta * grad_W1
        w2 -= eta * grad_w2

        # update Theta with new weights
        Theta = (W1, w2)

        # stop only when both layers have converged; the first-layer gradient
        # alone is also zero whenever the second-layer weights are zero
        if max(np.linalg.norm(grad_W1), np.linalg.norm(grad_w2)) < tolerance:
            break

    # return optimized parameters
    return Theta


# train on dataset X1; the returned parameters are not stored here
basic_gradient_descent(X1, Theta, eta=0.25)

##### Gradient - clever implementation (stochastic with batches)

$\nabla_{\vec{w}^{(1)}}=\frac{2}{B}\sum\limits_{b=1}^{B}[(y^{[b]}-t^{[b]})\vec{w}^{(2)}\odot\vec{h}^{[b]}\odot(1-\vec{h}^{[b]})]\otimes\vec{x}^{[b]}$ \
$\nabla_{\vec{w}^{(2)}}=\frac{2}{B}\sum\limits_{b=1}^{B}(y^{[b]}-t^{[b]})\vec{h}^{[b]}$



In [None]:
def clever_gradient(X, T, Y, H, Theta):
    """Compute the gradient of the squared loss for a batch with matrix operations.

    Args:
        X: batch inputs.
        T: batch targets.
        Y: network outputs for the batch.
        H: hidden-unit outputs for the batch.
        Theta: pair ``(W1, W2)``; only ``W2`` is needed.

    Returns:
        Tuple ``(g1, g2)`` with the gradients for the first and second layer.
    """
    _, W2 = Theta
    scale = 2 / Y.shape[1]
    error = Y - T
    # first layer: error propagated through W2, times the derivative H * (1 - H)
    backpropagated = np.dot(W2.T, error) * H * (1 - H)
    g1 = scale * np.dot(backpropagated, X.T)
    # second layer
    g2 = scale * np.dot(error, H.T)

    return g1, g2


def gradient_descent_with_batches(X, T, Theta, B, eta=0.001, mu=None):
    """Train the two-layer network with mini-batch gradient descent.

    ``Theta`` is updated in place and must therefore be a mutable sequence.

    Args:
        X: inputs, one sample per column.
        T: targets, one sample per column.
        Theta: list ``[W1, W2]`` of weight matrices.
        B: batch size.
        eta: learning rate.
        mu: optional momentum factor; when set, ``mu`` times the previous
            weight change is added to every update.

    Returns:
        List with the loss of the first batch of every epoch.
    """
    loss_values = []

    max_epochs = 10000
    max_batches = T.shape[1] // B * max_epochs

    # weights before the previous update, needed for the momentum term
    previous = None

    # iterate over batches
    for index, (x, t, e) in enumerate(batch_with_shuffle(X=X, T=T, batch_size=B)):
        if index >= max_batches:
            break
        # compute network output
        y, h = multi_target_network(X=x, Theta=Theta)
        # compute and append loss
        if e:
            loss_values.append(loss(Y=y, T=t))  # append loss of first batch

        # compute gradient
        g1, g2 = clever_gradient(X=x, T=t, Y=y, H=h, Theta=Theta)

        # snapshot the weights by value: the updates below work in place, so a
        # plain reference to Theta would change along with them and the
        # momentum difference would always be zero
        current = [W.copy() for W in Theta] if mu else None

        # and apply gradient descent
        Theta[0] -= eta * g1
        Theta[1] -= eta * g2

        # apply momentum learning if desired: add mu times the previous step
        if mu and previous is not None:
            Theta[0] += mu * (current[0] - previous[0])
            Theta[1] += mu * (current[1] - previous[1])
        previous = current

    # return the obtained loss values at the end
    return loss_values


# Stochastic gradient descent -> small batch size (B=16); SGD holds the returned list of loss values
SGD = gradient_descent_with_batches(X=X, T=T, Theta=Theta, B=16)

### Classification

### PyTorch

#### Tensor
[torch.Tensor](https://pytorch.org/docs/stable/tensors.html)

Most important attributes:
* Initialize -> torch.Tensor()
* Tensor.T -> returns the tensor with reversed dimensions
* Tensor.shape -> returns the size (rows, columns)
* Tensor.dtype -> return datatype of data in tensor
* Tensor.device -> where the tensor is stored
* Tensor.requires_grad -> whether a tensor requires gradient calculation during back propagation
* Tensor.numel() -> total # of elements in tensor

#### Loss Function

##### Binary Cross-Entropy loss
Used for binary classification tasks

In [None]:
# loss module for binary classification; call it as bce_loss(network_output, target)
bce_loss = torch.nn.BCEWithLogitsLoss()

##### Cross-Entropy loss
Used for multi class classification tasks

In [None]:
# loss module for multi-class classification; call it as ce_loss(network_output, target)
ce_loss = torch.nn.CrossEntropyLoss()

#### Network Examples

In [None]:
# simple fully connected network with tanh
def SiimpleFullyConnectedNetwork(D, K, O):
    """Build a two-layer fully-connected network with a tanh activation.

    Args:
        D: number of inputs.
        K: number of hidden units.
        O: number of outputs.

    Returns:
        ``torch.nn.Sequential`` of Linear(D, K), Tanh, Linear(K, O).
    """
    layers = [
        torch.nn.Linear(in_features=D, out_features=K),
        torch.nn.Tanh(),
        torch.nn.Linear(in_features=K, out_features=O),
    ]
    return torch.nn.Sequential(*layers)

# one input per column of X, 10 hidden units, a single output
simple_fc_network = SiimpleFullyConnectedNetwork(X.shape[1], 10, 1)


# 3 fully connected layers with flatten and sigmoid
def TripleFullyConnectedNetwork(D, K1, K2, O):
    """Build a network of three fully-connected layers with sigmoid activations.

    The input is flattened first, so image batches can be passed directly.

    Args:
        D: number of inputs after flattening.
        K1: number of units in the first hidden layer.
        K2: number of units in the second hidden layer.
        O: number of outputs.
    """
    layers = [
        torch.nn.Flatten(),
        torch.nn.Linear(in_features=D, out_features=K1),
        torch.nn.Sigmoid(),
        torch.nn.Linear(in_features=K1, out_features=K2),
        torch.nn.Sigmoid(),
        torch.nn.Linear(in_features=K2, out_features=O),
    ]
    return torch.nn.Sequential(*layers)

# 28*28 inputs (flattened image), two hidden layers, 10 outputs
triple_fc_network = TripleFullyConnectedNetwork(D=28*28, K1=128, K2=64, O=10)


def DoubleConvoluionalNetwork(Q1, Q2, O):
    """Build a network of two convolution/pooling/sigmoid stages and a linear layer.

    The first convolution takes a single input channel, and the final layer
    expects ``5 * 5 * Q2`` flattened features.

    Args:
        Q1: number of output channels of the first convolution.
        Q2: number of output channels of the second convolution.
        O: number of outputs.
    """
    first_stage = [
        torch.nn.Conv2d(in_channels=1, out_channels=Q1, kernel_size=(7, 7), stride=1, padding=0),
        torch.nn.MaxPool2d(kernel_size=(2, 2), stride=2),
        torch.nn.Sigmoid(),
    ]
    second_stage = [
        torch.nn.Conv2d(in_channels=Q1, out_channels=Q2, kernel_size=(5, 5), stride=1, padding=2),
        torch.nn.MaxPool2d(kernel_size=(2, 2), stride=2),
        torch.nn.Sigmoid(),
    ]
    classifier = [
        torch.nn.Flatten(),
        torch.nn.Linear(5 * 5 * Q2, O),
    ]
    return torch.nn.Sequential(*first_stage, *second_stage, *classifier)

# 16 channels in both convolutions, 10 outputs
double_conv_network = DoubleConvoluionalNetwork(Q1=16, Q2=16, O=10)

#### Replace last network layer - Feature Extraction

In [None]:
def replace_last_layer(network, O=6):
    """Swap the ``fc`` layer of a network for a new linear layer with ``O`` outputs.

    Args:
        network: module whose last layer is stored in the attribute ``fc``.
        O: number of outputs of the new layer.

    Returns:
        The same network object, modified in place.
    """
    # the new layer keeps the input size of the layer it replaces
    network.fc = torch.nn.Linear(network.fc.in_features, O)
    return network

# Replace the last layer of the network (replace_last_layer defaults to 6 outputs).
# NOTE(review): simple_fc_network is a torch.nn.Sequential built from positional layers and has no
# `fc` attribute, so this call raises AttributeError — pass a network that exposes `.fc` instead.
# The original note refers to a "network_2", which is not visible in this file.
network_replaced_last_layer = replace_last_layer(simple_fc_network)

### Training Loops

**IMPORTANT:** *Do not forget to set the network to training mode with `network.train()` before training, and to evaluation mode with `network.eval()` before validation/testing.*

#### Basic
With Stochastic Gradient Optimizer

In [None]:
def basic_train(network, X_train, T_train, X_val, T_val, loss_function, learning_rate=0.1, epochs=10000):
    """Train a network on the complete training set with plain SGD.

    Every epoch performs one update on all training data and one evaluation
    on the validation data.

    Args:
        network: torch module to train.
        X_train, T_train: training inputs and targets.
        X_val, T_val: validation inputs and targets.
        loss_function: callable ``loss_function(output, target)`` returning a
            tensor that supports ``backward()``.
        learning_rate: SGD learning rate.
        epochs: number of updates.

    Returns:
        Tuple ``(train_loss, train_acc, val_loss, val_acc)`` of lists with one
        entry per epoch.
    """
    sgd = torch.optim.SGD(params=network.parameters(), lr=learning_rate)
    train_loss, train_acc = [], []
    val_loss, val_acc = [], []

    for _ in range(epochs):
        # one update step on the training data
        sgd.zero_grad()
        outputs = network(X_train)
        J = loss_function(outputs, T_train)
        J.backward()
        sgd.step()
        train_loss.append(J.item())
        train_acc.append(accuracy(outputs, T_train).item())

        # evaluation on the validation data, no gradients required
        with torch.no_grad():
            val_outputs = network(X_val)
            J_val = loss_function(val_outputs, T_val)
            val_loss.append(J_val.item())
            val_acc.append(accuracy(val_outputs, T_val).item())

    return train_loss, train_acc, val_loss, val_acc

# call basic train; the loss has to be a torch loss so that `loss.backward()` works inside
# basic_train — the numpy-based `loss` function cannot be used here. bce_loss fits the
# single output of simple_fc_network.
results = basic_train(network=simple_fc_network, X_train=X_train, T_train=T_train, X_val=X_val, T_val=T_val, loss_function=bce_loss)

#### With batch loss

In [None]:
def batch_train(network, epochs, eta, momentum):
    """Train a classifier on the batches of ``trainloader`` and validate on ``testloader``.

    Uses cross-entropy loss and SGD with momentum. The module-level names
    ``trainloader``, ``testloader`` and ``testset`` provide the data.

    Args:
        network: torch module to train; it is moved to the selected device.
        epochs: number of passes over the training data.
        eta: learning rate.
        momentum: SGD momentum.

    Returns:
        Tuple ``(val_loss, val_acc)`` of lists with one entry per epoch.
    """
    # select loss function and optimizer
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(network.parameters(), lr=eta, momentum=momentum)

    # use the GPU when there is one instead of failing on CPU-only machines
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    network = network.to(device)
    # collect loss values and accuracies over the training epochs
    val_loss, val_acc = [], []

    for epoch in range(epochs):
        print("Epoch ", epoch)
        # train network on training data
        network.train()
        for x, t in trainloader:
            # put data to device
            x, t = x.to(device), t.to(device)
            # train
            optimizer.zero_grad()
            J = loss(network(x), t)
            J.backward()
            optimizer.step()

        # test network on test data
        network.eval()
        with torch.no_grad():
            total_loss = 0
            correct = 0
            for x, t in testloader:
                # put data to device
                x, t = x.to(device), t.to(device)
                # compute validation loss
                z = network(x)
                J = loss(z, t)
                # compute validation accuracy
                correct += torch.sum(torch.argmax(z, dim=1) == t).item()
                # the loss is a batch mean, so weight it by the batch size
                total_loss += J.item() * len(t)
            val_loss.append(total_loss / len(testset))
            val_acc.append(correct / len(testset))

    # return loss and accuracy values
    return val_loss, val_acc

# call batch train for both networks with identical settings, so that the
# validation curves of the two can be compared
fc_loss, fc_acc = batch_train(network=triple_fc_network, epochs=100, eta=0.01, momentum=0.9)
cv_loss, cv_acc = batch_train(network=double_conv_network, epochs=100, eta=0.01, momentum=0.9)

#### Train with evaluation

In [None]:
def train_eval(network, epochs=5, lr=0.001, momentum=0.9):
    """Train a classifier, report metrics per epoch and collect the test predictions.

    The module-level names ``trainloader`` and ``testloader`` provide the data.
    Training loss and accuracy are computed from the outputs of the forward
    pass that is used for the update.

    Args:
        network: torch module to train; it is moved to the selected device.
        epochs: number of passes over the training data.
        lr: learning rate.
        momentum: SGD momentum.

    Returns:
        Tuple ``(pred, target)``: lists with one numpy array per test batch
        holding the predicted and the true class indices.
    """
    # prefer the "mps" backend, then CUDA, and fall back to the CPU
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    network.to(device)

    optimizer = torch.optim.SGD(network.parameters(), lr=lr, momentum=momentum)
    loss = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # training process
        network.train()
        # the counters start at zero in every epoch
        loss_sum, correct, seen = 0.0, 0, 0
        for x, t in trainloader:
            x, t = x.to(device), t.to(device)
            optimizer.zero_grad()
            z = network(x)
            J = loss(z, t)
            J.backward()
            optimizer.step()
            # the loss is a batch mean, so weight it by the batch size
            loss_sum += J.item() * x.size(0)
            # compare the predicted class with the target of each sample
            correct += (z.argmax(dim=1) == t).sum().item()
            seen += x.size(0)

        train_loss = loss_sum / max(seen, 1)
        train_acc = correct / max(seen, 1)

        print(f"Epoch {epoch + 1}/{epochs}:")
        print(f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}")

        # testing process
        network.eval()
        loss_sum, correct, seen = 0.0, 0, 0
        with torch.no_grad():
            for x, t in testloader:
                x, t = x.to(device), t.to(device)
                z = network(x)
                loss_sum += loss(z, t).item() * x.size(0)
                correct += (z.argmax(dim=1) == t).sum().item()
                seen += x.size(0)

        val_loss = loss_sum / max(seen, 1)
        val_acc = correct / max(seen, 1)
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

    # Save predictions and target labels of the test set after the last epoch
    pred, target = [], []  # Store only the test results
    network.eval()
    with torch.no_grad():
        for x, t in testloader:
            x, t = x.to(device), t.to(device)
            pred.append(network(x).argmax(dim=1).cpu().numpy())
            target.append(t.cpu().numpy())

    return pred, target

# train the network with the replaced last layer and keep its test predictions and targets
pred_unfrozen, targ_unfrozen = train_eval(network=network_replaced_last_layer)

### Plotting

#### Line chart with two variables

In [None]:
def plot_line_chart_two_variables_with_range(X, Theta, R):
    """Plot the samples of a dataset together with the network approximation.

    Args:
        X: dataset, a sequence of ``(x, t)`` pairs where ``x`` is ``[1, value]``.
        Theta: network parameters passed on to ``simple_network``.
        R: pair ``(lower, upper)`` bounding the range in which the network is
            evaluated.
    """
    # create arrays from the list of samples
    x_data = np.array([record[0] for record in X])
    t_data = np.array([record[1] for record in X])
    # first, plot data samples -> style 'x' as point
    pyplot.plot(x_data[:, 1], t_data, "rx", label="Data")
    # define 100 equidistant points from min (R[0]) to max (R[1]) to evaluate the network;
    # linspace takes the number of points, whereas arange would take 100 as the step
    x = np.linspace(R[0], R[1], 100)
    # compute the network outputs for these values
    y = [simple_network(np.array([1, x_]), Theta)[0] for x_ in x]
    # plot network approximation -> as a line
    pyplot.plot(x, y, "k-", label="network")
    pyplot.legend()


pyplot.subplot(131)
# pass the network parameters: the plot function evaluates simple_network, which cannot unpack None
plot_line_chart_two_variables_with_range(X1, Theta, [-1.5, 1.5])
pyplot.title('Dataset X1')

#### Train vs validation loss

In [None]:
def plot_train_vs_val_loss(train_loss, train_acc, val_loss, val_acc):
    """Plot training against validation curves: loss on the left, accuracy on the right.

    Args:
        train_loss, val_loss: loss values, one per epoch.
        train_acc, val_acc: accuracy values, one per epoch.
    """
    pyplot.figure(figsize=(10, 3))
    panels = (
        (121, train_loss, "Training set loss", val_loss, "Validation set loss"),
        (122, train_acc, "Training set accuracy", val_acc, "Validation set accuracy"),
    )
    for position, train_curve, train_label, val_curve, val_label in panels:
        ax = pyplot.subplot(position)
        # training in green, validation in blue
        ax.plot(train_curve, "g-", label=train_label)
        ax.plot(val_curve, "b-", label=val_label)
        ax.legend()

#### MNIST images plot

In [None]:
# show images with a gray colormap by default
pyplot.rcParams['image.cmap'] = 'gray'
fig, axes = pyplot.subplots(4, 10, figsize=(10, 4))

# 4 x 10 images: the first 40 samples of trainset, filled in row by row
# NOTE(review): imshow needs image-like data — presumably trainset was created without a tensor transform; confirm
index = 0
for i in range(4):
    for j in range(10):
        # the label of the sample is not needed
        img, _ = trainset[index]
        axes[i][j].imshow(img, cmap='gray')
        axes[i][j].axis("off")
        index += 1


#### Loss plots

In [None]:
# two panels side by side: validation loss (left) and validation accuracy (right)
pyplot.figure(figsize=(10, 3))
ax = pyplot.subplot(121)
# plot loss values of FC and CV network over epochs
ax.plot(fc_loss, "g-", label="Fully connected loss")
ax.plot(cv_loss, "b-", label="Convolutional loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
ax.legend()
ax.set_title("Validation Loss")

ax = pyplot.subplot(122)
# plot accuracy values of FC and CV network over epochs
ax.plot(fc_acc, "g-", label="Fully connected accuracy")
ax.plot(cv_acc, "b-", label="Convolutional accuracy")
ax.set_xlabel("Epoch")
ax.set_ylabel("Accuracy")
ax.legend()
ax.set_title("Validation Accuracy")

#### Confusion Matrix

In [None]:
# class names used as tick labels
classes = trainset.classes

# join the per-batch arrays into one vector each
pred_unfrozen_flat = np.concatenate(pred_unfrozen)
targ_unfrozen_flat = np.concatenate(targ_unfrozen)

# compute confusion matrix; the function expects (y_true, y_pred), so the targets come first
# and the rows of the matrix correspond to the true classes
matrix_unfrozen = confusion_matrix(targ_unfrozen_flat, pred_unfrozen_flat)  # Use predictions and target from the fine-tuned network without frozen layers

# plot confusion matrices
plot_conf_matrix = ConfusionMatrixDisplay(matrix_unfrozen, display_labels=classes)
plot_conf_matrix.plot(xticks_rotation="vertical")
plt.show()

## Theory

### Compute gradient
 -> See cheatsheet - [derivatives](Derivative%20Rules%20Cheatsheet.md)

### Convolutional Networks


#### Network output calculations

***Output size of a convolution or pooling layer -> floor((in + 2p - k) / s) + 1*** (in: input size, p: padding, k: kernel size, s: stride)

1. A [2D convolutional layer](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) with $Q_1$ channels, kernel size $7\times7$, stride 1 and padding 0.
2. A [2D maximum pooling](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) with pooling size $2\times2$ and stride 2.
3. A `Sigmoid` activation function.
4. A 2D convolutional layer with $Q_2$ channels, kernel size $5\times5$, stride 1 and padding 2.
5. A 2D maximum pooling with pooling size $2\times2$ and stride 2.
6. A `Sigmoid` activation function.
7. A flattening layer to turn the 3D feature map into a 1D vector.
8. A fully-connected layer with the appropriate number of inputs and $O$ outputs.

Consider the network as defined above.
Assume that the input is a $28\times28$ grayscale image.
How many input neurons does the final fully-connected layer need for a given number $Q_2$ of output channels of the second convolution?

(Write steps of computation.)

1. Input image size: $28\times28$
2. 1st Convolutional layer: $(28 + 2*0 - 7) / 1 + 1 = 22\times22$
3. 1st Max pooling layer: $(22/2)\times(22/2) = 11\times11$
4. 1st Sigmoid activation function: no influence
5. 2nd Convolutional layer: $(11 + 2*2 - 5) / 1 + 1 = 11\times11$
6. 2nd Max pooling layer: $(11/2)\times(11/2) = 5.5\times5.5$ -> round down (pooling uses floor by default): $5\times5$
7. 2nd Sigmoid activation function: no influence
8. Flattening layer: $5\times5\times Q_2 = 25\,Q_2$ inputs for the final fully-connected layer

#### Learnable parameters calculations

1. A `torch.nn.Flatten` layer to turn the $28\times28$ pixel image (2D) into a $28*28$ pixel vector (1D)
2. A fully-connected layer with D input neurons and K1 outputs.
3. A `Sigmoid` activation function.
4. A fully-connected layer with K1 input neurons and K2 outputs.
5. A `Sigmoid` activation function.
6. A fully-connected layer with K2 input neurons and O outputs.

fully_connected(D=28*28, K1=128, K2=64, O=10)

> #### Fully-connected Network:
>
> - first fully-connected layer: $(28*28+1)*128 =$ **100480**
> - second fully-connected layer: $(128+1)*64 =$ **8256**
> - third fully-connected layer: $(64+1)*10 =$ **650**
> - total: $100480 + 8256 + 650 =$ **109386**

1. A [2D convolutional layer](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) with $Q_1$ channels, kernel size $7\times7$, stride 1 and padding 0.
2. A [2D maximum pooling](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) with pooling size $2\times2$ and stride 2.
3. A `Sigmoid` activation function.
4. A 2D convolutional layer with $Q_2$ channels, kernel size $5\times5$, stride 1 and padding 2.
5. A 2D maximum pooling with pooling size $2\times2$ and stride 2.
6. A `Sigmoid` activation function.
7. A flattening layer to turn the 3D feature map into a 1D vector.
8. A fully-connected layer with the appropriate number of inputs and $O$ outputs.
 
convolutional(Q1=16, Q2=16, O=10)

> #### Convolutional Network:
> - first convolutional layer: $(7*7+1)*16$ = **800**
> - second convolutional layer: $(16*5*5+1)*16$ = **6416**
> - fully-connected layer: $(16*5*5+1)*10$ = **4010**
> - total: $800 + 6416 + 4010 =$ **11226**

***can check with torch.Tensor.numel()***