# Deep Learning Cheatsheet FS24


## Code snippets


#### Imports

In [1]:
import numpy as np
import os
import math
from matplotlib import pyplot
import copy
import zipfile
import csv
import io
import torch
import csv
import pandas as pd
import torchvision
import torchvision.models
import torchvision.transforms as tfs
from torchvision.datasets import ImageFolder
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

#### Device

In [None]:
# normal config: use the CUDA GPU when PyTorch reports one, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# apple silicon chip
# NOTE(review): this only creates the device handle, availability of the "mps" backend
# is not checked here - confirm it is available before moving tensors or networks to it
mps_device = torch.device("mps")

#### Target Vectors

Classes definition

* Known class indexes: (1, 4, 5, 8)
* negative class indexes: (0, 2, 3, 7)
* Unknown class indexes: (6,9)

Target vector definition
 $t^n = 1 : \vec t^n = (1,0,0,0)$

 $t^n = 4 : \vec t^n = (0,1,0,0)$

 $t^n = 5 : \vec t^n = (0,0,1,0)$

 $t^n = 8 : \vec t^n = (0,0,0,1)$

 else: $\vec t^n = (\frac14,\frac14,\frac14,\frac14)$


In [None]:
known_classes = (1, 4, 5, 8)
negative_classes = (0, 2, 3, 7)
unknown_classes = (6, 9)
O = len(known_classes)

# define one-hot vectors: row k of the O x O identity matrix belongs to known_classes[k].
# They are float tensors so that they share the dtype of label_unknown; mixing integer
# one-hot targets with float 1/O targets would give batches of inconsistent dtype.
labels_known = list(torch.eye(O))
# uniform 1/O target used for every class that is not a known class
label_unknown = torch.full((O,), 1 / O)


def target_vector(index):
    """Return the target vector for the class label ``index``.

    Args:
        index: original class label of the sample.

    Returns:
        The one-hot float tensor of length ``O`` when ``index`` is one of
        ``known_classes``, otherwise the uniform ``1/O`` tensor. The returned
        tensors are shared module-level objects and must not be modified in place.
    """
    # select correct one-hot vector for known classes, and the 1/O-vectors for unknown classes
    if index in known_classes:
        return labels_known[known_classes.index(index)]
    return label_unknown

### Data Tasks

#### Data Generation

Datasets: 

1. $X_1: t = \sin(3x)$ for $x\in[-1,1]$
2. $X_2: t = e^{-4x^2}$ for $x\in[-1,1]$
3. $X_3: t = x^5 + 3x^4 - 6x^3 -12x^2 + 5x + 129$ for $x\in[-4,2.5]$

Generate dataset $X_1$, for $N=60$ samples randomly drawn from range $x\in[-1,1]$. \
Generate data $X_2$ for $N=50$ samples randomly drawn from range $x\in[-1,1]$.  \
Generate dataset $X_3$ for $N=200$ samples randomly drawn from range $x\in[-4,2.5]$. \
Implement all three datasets as lists of tuples: $\{(\vec x^{[n]}, t^{[n]})\mid 1\leq n\leq N\}$. \

In [None]:
# 1D array with 5 + 1 uniform samples from [-1, 1) (result is not stored, shown as an example only)
np.random.uniform(low=-1, high=1, size=5 + 1)
# 2D array of shape (5, 10) with uniform samples from [-1, 1) (result is not stored either)
np.random.uniform(low=-1, high=1, size=(5, 10))

# special datasets: lists of (input, target) tuples.
# Every input is the vector (1, x); the leading 1 is the bias component x_0.
# X1: t = sin(3x), 60 samples
X1 = [(np.array([1, x]), np.sin(3 * x)) for x in np.random.uniform(low=-1, high=1, size=60)]
# X2: t = exp(-4x^2), 50 samples
X2 = [(np.array([1, x]), np.exp(-4 * x ** 2)) for x in np.random.uniform(low=-1, high=1, size=50)]
# X3: t = x^5 + 3x^4 - 6x^3 - 12x^2 + 5x + 129, 200 samples from [-4, 2.5)
X3 = [(np.array([1, x]), x ** 5 + 3 * x ** 4 - 6 * x ** 3 - 12 * x ** 2 + 5 * x + 129) for x in
      np.random.uniform(low=-4, high=2.5, size=200)]

#### Read Data Examples

In [6]:
def data_from_zip_as_np_array(course="por"):
    """Load inputs and targets of the student performance dataset as NumPy arrays.

    The archive ``student.zip`` is downloaded into the working directory when it
    does not exist yet. The file ``student-<course>.csv`` is then read directly
    from the archive without extracting it.

    Args:
        course: suffix of the CSV file inside the archive (``student-<course>.csv``).

    Returns:
        Tuple ``(X, T)``. ``X`` has shape ``(3, N)`` with rows bias (always 1),
        school (GP = 1, MS = -1) and absences; ``T`` has shape ``(1, N)`` and holds
        the value of column 32.

    Raises:
        KeyError: if the archive has no ``student-<course>.csv`` member, or a row
            contains a school code other than ``GP`` / ``MS``.
    """
    # download data file from URL
    dataset_zip_file = "student.zip"
    if not os.path.exists(dataset_zip_file):
        import urllib.request
        urllib.request.urlretrieve("https://archive.ics.uci.edu/ml/machine-learning-databases/00320/student.zip",
                                   dataset_zip_file)
        print("Downloaded datafile", dataset_zip_file)
    # collect inputs
    inputs = []
    targets = []
    school_codes = {"GP": 1., "MS": -1.}
    # read through dataset (without actually unzipping to a file);
    # the context managers close the archive and the member stream even on errors
    with zipfile.ZipFile(dataset_zip_file) as archive:
        with archive.open(F"student-{course}.csv", 'r') as raw:
            # ... convert bytes to text and read via CSV reader, using the correct delimiter
            reader = csv.reader(io.TextIOWrapper(raw, newline=""), delimiter=";")
            # ... skip header line
            next(reader)
            for splits in reader:
                # read input values
                inputs.append([
                    1.,  #### BIAS ####
                    school_codes[splits[0]],  # school
                    float(splits[29]),  # absences
                ])
                # read targets values
                targets.append([
                    float(splits[32]),  # grade for tertiary school
                ])
    print(F"Loaded dataset with {len(targets)} samples")
    # transpose so that samples are stored column-wise
    return np.array(inputs).transpose(), np.array(targets).transpose()


def dataset_from_file(dataset_file="winequality-red.csv", delimiter=";"):
    """Read a CSV file with a header line and split it into inputs and targets.

    Args:
        dataset_file: path of the CSV file; the last column holds the target.
        delimiter: column separator of the file.

    Returns:
        Tuple ``(X, T)``. ``X`` is a float tensor with all columns but the last.
        For ``"winequality-red.csv"`` ``T`` is a 1-D long tensor holding the last
        column minus 3; for any other file ``T`` is a float tensor of shape (N, 1).
    """
    with open(dataset_file, 'r') as handle:
        frame = pd.read_csv(filepath_or_buffer=handle, delimiter=delimiter, header=0)
    table = torch.tensor(frame.values)
    features = table[:, :-1].float()
    last_column = table[:, -1]
    if dataset_file != "winequality-red.csv":
        # generic case: float target as a column vector
        return features, last_column.reshape(-1, 1).float()
    # wine quality: integer class labels, shifted down by 3
    return features, last_column.long() - 3


def read_data_csv_pandas(datafile):
    """Load the ``Date`` and ``Close`` columns of a CSV file.

    Args:
        datafile: path (or buffer) accepted by ``pandas.read_csv``.

    Returns:
        Tuple ``(date, price)``: a NumPy ``datetime64`` array built from the
        ``Date`` column and a float32 tensor built from the ``Close`` column.
    """
    frame = pd.read_csv(datafile)
    closing_prices = frame['Close'].values
    date = np.array(frame['Date'], dtype=np.datetime64)
    price = torch.tensor(closing_prices, dtype=torch.float32)
    return date, price

# load the GAIL stock data as a (dates, closing prices) pair
gail_data = read_data_csv_pandas('./GAIL.csv')


# initialize data
# NOTE(review): data_from_zip_as_np_array opens "student-<course>.csv" inside the archive,
# so "my_dataset" is a placeholder - replace it with an existing course name (default: "por")
X, T = data_from_zip_as_np_array("my_dataset")

#### Data initialization examples

In [None]:
# data initialization
# hidden layer size; W1 below gets K + 1 rows and W2 gets K + 1 columns
K = 15
# length of the first axis of X
D = len(X)
# number of network outputs
# NOTE(review): this rebinds the module-level O that was set to len(known_classes) earlier in the file
O = 3
# Weight initialization Xavier method: uniform in [-1/sqrt(n), 1/sqrt(n)] with n = D for W1 and n = K for W2
W1 = np.random.uniform(low=-1 / np.sqrt(D), high=1 / np.sqrt(D), size=(K + 1, D))
W2 = np.random.uniform(low=-1 / np.sqrt(K), high=1 / np.sqrt(K), size=(O, K + 1))
# both weight matrices are collected in one parameter list
Theta = [W1, W2]

#### Data split

In [None]:
def split_training_data_with_shuffle(X, T, train_percentage=0.8, shuffle=True):
    """Split samples (stored row-wise) into a training and a validation part.

    Args:
        X: inputs, one sample per row.
        T: targets with the same number of rows as ``X``.
        train_percentage: fraction of the samples used for training.
        shuffle: whether to shuffle the samples (jointly for ``X`` and ``T``)
            before splitting.

    Returns:
        Tuple ``(X_train, T_train, X_val, T_val)``. The validation part is empty
        when ``train_percentage`` selects all samples.
    """
    if shuffle:
        # one shared permutation keeps every input aligned with its target and,
        # unlike concatenating X and T, neither changes dtypes nor requires 2-D targets
        order = np.random.permutation(X.shape[0])
        X, T = X[order], T[order]

    # split into training/validation; slicing from training_number onwards
    # (instead of using a negative offset) stays correct when no validation samples remain
    training_number = int(X.shape[0] * train_percentage)
    X_train, X_val = X[:training_number], X[training_number:]
    T_train, T_val = T[:training_number], T[training_number:]

    return X_train, T_train, X_val, T_val

# split the loaded data with the default 80/20 ratio and shuffling enabled
X_train, T_train, X_val, T_val = split_training_data_with_shuffle(X=X, T=T)


def train_test_split(stock_data, split_date='2018-01-01'):
    """Split a price series chronologically at ``split_date``.

    Args:
        stock_data: pair ``(dates, prices)``; ``dates`` must be sorted in
            ascending order because the split position is found by binary search.
        split_date: first date that belongs to the test part, given in any form
            accepted by ``np.datetime64``.

    Returns:
        Tuple ``(train_data, test_data)``: the prices before ``split_date`` and
        the prices from ``split_date`` onwards.
    """
    dates, prices = stock_data
    # index of the first date that is not earlier than the split date
    split_index = np.searchsorted(dates, np.datetime64(split_date))
    return prices[:split_index], prices[split_index:]

# chronological split of the GAIL (dates, prices) pair into training and test prices
gail_train, gail_test = train_test_split(gail_data)

#### Batch split
Utility function

This function needs to return three elements:
* First, the samples from the batch that belong to known classes.
* Second, the target vectors that belong to the known classes.
* Finally, the samples from the batch that belong to unknown classes.

In [None]:
def split_known_unknown(batch, targets):
    """Separate a batch into samples with known and with unknown targets.

    A row of ``targets`` counts as known when it contains a 1 and as unknown
    when all of its entries equal 0.25.

    Args:
        batch: tensor with one sample per row.
        targets: 2-D tensor with one target vector per row of ``batch``.

    Returns:
        Tuple ``(known samples, targets of the known samples, unknown samples)``.
    """
    is_known = (targets == 1).any(dim=1)
    is_unknown = (targets == 0.25).all(dim=1)
    return batch[is_known], targets[is_known], batch[is_unknown]

#### Torch Datasets

In [None]:
# Fashion MNIST dataset
def f_mnist_datasets(transform):
    """Create the Fashion MNIST training and test datasets under ``./data``.

    Args:
        transform: transform handed to both datasets (may be None).

    Returns:
        Tuple ``(trainset, testset)``.
    """
    trainset, testset = (
        torchvision.datasets.FashionMNIST(root='./data', train=is_train, download=True, transform=transform)
        for is_train in (True, False)
    )

    # returns PIL.Image.Image without transform
    return trainset, testset


# no transform is passed here, so the datasets are created without a tensor conversion
trainset, testset = f_mnist_datasets(transform=None)
# the test split doubles as the validation set
validationset = testset

##### Data transform
Convert images to tensors. First resize, then crop, then convert to tensor and normalize values

In [None]:
# images to tensors: the steps are applied in the listed order
imagenet_transform = tfs.Compose([
    # resize to size 256
    tfs.Resize(256),
    # cut out the central 224 x 224 region
    tfs.CenterCrop(224),
    # convert the image to a tensor
    tfs.ToTensor(),
    # normalize each of the three channels with the given mean and standard deviation
    tfs.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# to tensor transformer (conversion only, no resizing or normalization)
transform = torchvision.transforms.ToTensor()

##### Dataset loader - ImageFolder

In [None]:
# root folders of the training and test images
train_dir = './intel-image-classification/seg_train/seg_train/'
test_dir = './intel-image-classification/seg_test/seg_test/'

# training images, preprocessed with imagenet_transform
trainset = ImageFolder(
    root=train_dir,
    transform=imagenet_transform
)

# test images, preprocessed with the same transform as the training images
testset = ImageFolder(
    root=test_dir,
    transform=imagenet_transform
)

##### Dataset Constructor
A dataset class that derives from `torchvision.datasets.MNIST` in `PyTorch` and adapts some parts of it.

In [None]:
class DataSet(torchvision.datasets.MNIST):
    """MNIST restricted to the classes of one purpose, returning target vectors as labels."""

    def __init__(self, purpose="train"):
        """Load MNIST and keep only the samples of the classes valid for ``purpose``.

        Args:
            purpose: ``"train"`` loads the training split; any other value loads
                the test split. ``"train"`` and ``"valid"`` keep the known and
                negative classes, every other purpose keeps the known and
                unknown classes.
        """
        # the base class handles downloading and loading of the requested split
        super().__init__(
            root='./data',
            train=(purpose == "train"),
            download=True,
            transform=torchvision.transforms.ToTensor()
        )
        # select the valid classes based on the current purpose
        if purpose in ("train", "valid"):
            self.classes = known_classes + negative_classes
        else:
            self.classes = known_classes + unknown_classes

        # mask of the samples whose label is one of the selected classes
        keep = torch.isin(self.targets, torch.tensor(self.classes))
        # sub-select data and labels of the valid classes
        self.data = self.data[keep]
        self.targets = self.targets[keep]

    def __getitem__(self, index):
        """Return ``(image, target vector)`` for the sample at ``index``."""
        # the base class applies the ToTensor transform set in the constructor
        image, label = super().__getitem__(index)
        # the class label is turned into its target vector on access
        return image, target_vector(label)

##### Torch Dataset - Simple

In [None]:
class Dataset(torch.utils.data.Dataset):
    """Sequence dataset: every item is a window of the data and the value that follows it."""

    def __init__(self, data, S):
        """Build input sequences of length ``S`` and their targets from ``data``."""
        sequences, next_values = create_sequences_targets(data, S)
        self.X = sequences
        self.T = next_values

    def __getitem__(self, index):
        """Return input and target at ``index``, each with an extra trailing dimension."""
        sample, target = self.X[index], self.T[index]
        return sample.unsqueeze(-1), target.unsqueeze(-1)

    def __len__(self):
        """Return the number of stored sequences."""
        return len(self.X)


# instantiate dataset and data loader for a reasonable sequence length S
S = 14
# NOTE(review): placeholder only - Dataset passes this value on to create_sequences_targets,
# so it has to be replaced by the scaled training prices before this cell can run
train_gail_scaled = None
gail_train_dataset = Dataset(train_gail_scaled, S)
# shuffled batches of 256 sequences
gail_train_dataloader = torch.utils.data.DataLoader(gail_train_dataset, batch_size=256, shuffle=True)

##### Torch Dataset - MixedDataset

In [None]:
class MixedDataset(torch.utils.data.Dataset):
    """MNIST dataset that can be extended by FashionMNIST samples acting as anomalies."""

    def __init__(self, root='./data', purpose="train", transform=None, anomaly_size=2000):
        """Load the data for the given purpose.

        Args:
            root: folder in which the datasets are stored / downloaded.
            purpose: ``"train"`` loads the MNIST training split, any other value
                the MNIST test split; ``"anomaly_detection"`` additionally
                appends randomly chosen FashionMNIST test samples.
            transform: transform handed to the underlying datasets.
            anomaly_size: number of FashionMNIST samples drawn (without
                replacement) for ``"anomaly_detection"``.
        """
        # load MNIST dataset based on "purpose"
        self.mnist_dataset = torchvision.datasets.MNIST(root=root, train=(purpose == "train"), download=True,
                                                        transform=transform)

        # load FashionMNIST dataset when "purpose" is "anomaly_detection" and randomly select samples with size "anomaly_size"
        if purpose == "anomaly_detection":
            fashion_mnist_dataset = torchvision.datasets.FashionMNIST(root=root, train=False, download=True,
                                                                      transform=transform)
            indices = np.random.choice(len(fashion_mnist_dataset), anomaly_size, replace=False)
            self.fashion_mnist_dataset = torch.utils.data.Subset(fashion_mnist_dataset, indices)
            # MNIST samples come first, the anomalous samples are appended behind them
            self.data = torch.utils.data.ConcatDataset([self.mnist_dataset, self.fashion_mnist_dataset])
        else:
            self.data = self.mnist_dataset

    def __len__(self):
        """Return the number of samples of the dataset selected by the purpose."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(image, target, data_type)``; data_type is 1 for regular and -1 for anomalous samples."""
        # look the sample up once: every access to self.data[idx] goes through
        # the wrapped dataset's __getitem__ again
        image, target = self.data[idx]
        # indices below the MNIST length are regular samples, everything behind is anomalous
        data_type = 1 if idx < len(self.mnist_dataset) else -1

        return image, target, data_type

##### Data Loaders
Data loaders simplify interaction with data. Shuffle data, create batches, etc. A bridge between dataset and model

In [None]:
# batch size shared by the train, test and validation loaders
B = 512
# training batches are shuffled, test batches keep the dataset order
trainloader = torch.utils.data.DataLoader(trainset, batch_size=B, shuffle=True)
testloader = torch.utils.data.DataLoader(testset, batch_size=B, shuffle=False)

# instantiate anomaly detection dataset and data loader
anomaly_detection_dataset = MixedDataset(purpose="anomaly_detection", transform=transform, anomaly_size=2000)
anomaly_detection_loader = torch.utils.data.DataLoader(anomaly_detection_dataset, batch_size=1000, shuffle=True)

# validation batches keep the dataset order
validationloader = torch.utils.data.DataLoader(validationset, batch_size=B, shuffle=False)

##### Data Loader with MNIST data
MNIST data is initialized directly in the constructor. 

In [None]:
# training set and data loader: MNIST training split, converted to tensors, in shuffled batches of 64
train_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST('data',
                               train=True,
                               download=True,
                               transform=torchvision.transforms.ToTensor()),
    batch_size=64,
    shuffle=True)

# validation set and data loader: MNIST test split, converted to tensors, in batches of 64
# NOTE(review): the validation batches are shuffled as well - confirm this is intended
validation_loader = torch.utils.data.DataLoader(
    torchvision.datasets.MNIST('data',
                               train=False,
                               download=True,
                               transform=torchvision.transforms.ToTensor()),
    batch_size=64,
    shuffle=True
)

### Normalize

In [None]:
# get min and max values of every row of X (axis=1 reduces over the second axis)
min_val = np.min(X, axis=1)
max_val = np.max(X, axis=1)

# assure to handle x_0 correctly: with min 0 and max 1 the first row is left unchanged
# by (x - min) / (max - min), and a division by zero for a constant row is avoided
min_val[0] = 0
max_val[0] = 1

def normalize(x, min_val, max_val):
    """Min-max normalize ``x`` row by row.

    Args:
        x: 2-D array; ``min_val`` and ``max_val`` hold one value per row.
        min_val: values mapped to 0.
        max_val: values mapped to 1.

    Returns:
        Array of the same shape as ``x`` holding ``(x - min) / (max - min)``.
    """
    # work on the transposed data so that the per-row statistics broadcast over the last axis
    shifted = x.transpose() - min_val
    scaled = shifted / (max_val - min_val)
    return np.transpose(scaled)


# Normalize our dataset with the statistics computed above; X is replaced by its normalized version
X = normalize(X, min_val, max_val)


########### Normalize min-max scaler ###################
def min_max_scaler(train_data, test_data):
    """Scale training and test data with one shared minimum and maximum.

    The minimum and maximum are taken over both tensors together.

    Args:
        train_data: tensor with the training values.
        test_data: tensor with the test values.

    Returns:
        Tuple ``(train_data_scaled, test_data_scaled, min_val, max_val)``.
    """
    # statistics over the union of both parts
    min_val = torch.min(torch.min(train_data), torch.min(test_data))
    max_val = torch.max(torch.max(train_data), torch.max(test_data))

    # both parts are scaled with the same values
    train_data_scaled = (train_data - min_val) / (max_val - min_val)
    test_data_scaled = (test_data - min_val) / (max_val - min_val)
    return train_data_scaled, test_data_scaled, min_val, max_val


def inverse_min_max_scaler(scaled_data, min_val, max_val):
    """Map min-max scaled data back to the original value range.

    Args:
        scaled_data: data produced by ``(x - min_val) / (max_val - min_val)``.
        min_val: minimum used for scaling.
        max_val: maximum used for scaling.

    Returns:
        ``scaled_data * (max_val - min_val) + min_val``.
    """
    value_range = max_val - min_val
    return scaled_data * value_range + min_val

# scale and un-scale data
train_gail_scaled, test_gail_scaled, min_gail, max_gail = min_max_scaler(gail_train, gail_test)
# the scaling has to be reverted with the statistics returned by min_max_scaler for this data
reversed_gail_data = inverse_min_max_scaler(train_gail_scaled, min_gail, max_gail)

### Standardize

In [None]:
def standardize(X_train, X_val):
    """Standardize both sets with the statistics of the training set.

    Args:
        X_train: training tensor with one sample per row.
        X_val: validation tensor with the same columns.

    Returns:
        Tuple ``(X_train, X_val)`` after subtracting the per-column mean of
        ``X_train`` and dividing by its per-column standard deviation.
    """
    # statistics come from the training data only and are re-used for the validation data
    column_mean = torch.mean(X_train, dim=0)
    column_std = torch.std(X_train, dim=0)
    return (X_train - column_mean) / column_std, (X_val - column_mean) / column_std


# standardize the training and validation inputs with the training statistics
# NOTE(review): standardize uses torch.mean / torch.std - confirm X_train and X_val are torch tensors here
X_train, X_val = standardize(X_train=X_train, X_val=X_val)

### Accuracy
Accuracy check for categorical or binary classification

In [None]:
def accuracy(Z, T):
    """Compute the classification accuracy of the network outputs ``Z``.

    Args:
        Z: network outputs (logits).
        T: targets. A 2-D ``T`` is treated as binary classification (an output
            is counted as class 1 when its logit is >= 0); otherwise ``T`` holds
            class indices and the predicted class is the argmax over dim 1.

    Returns:
        Scalar tensor with the fraction of correct predictions.
    """
    is_binary = len(T.shape) == 2
    if is_binary:
        predicted = (Z >= 0).float()
    else:
        predicted = torch.argmax(Z, dim=1)
    return torch.mean((predicted == T).float())

### Confidence

The function computes the confidence value for a given batch of samples

In [None]:
def confidence(logits, targets):
    """Compute the summed confidence value for a batch of samples.

    Known samples contribute their maximum softmax probability. Unknown samples
    contribute ``1 - max probability + 1 / O``, where ``O`` is the number of
    outputs of the network.

    Args:
        logits: 2-D tensor of network outputs, one row per sample.
        targets: 2-D tensor of target vectors, used to tell known from unknown samples.

    Returns:
        Scalar tensor with the sum of the confidences of all samples.
    """
    # the number of outputs is taken from the logits instead of a global variable,
    # so the result does not depend on what a global O is currently bound to
    num_outputs = logits.shape[1]
    # compute softmax confidences
    conf = torch.nn.functional.softmax(logits, dim=1)
    # split between known and unknown
    batch_known, targets_known, batch_unknown = split_known_unknown(conf, targets)
    # compute confidence score for known targets
    conf_known = torch.sum(torch.max(batch_known, dim=1)[0])
    # compute confidence score for unknown targets
    conf_unknown = torch.sum(1 - torch.max(batch_unknown, dim=1)[0] + 1 / num_outputs)
    return conf_known + conf_unknown

### True Positive/Negative rate calculation

In [None]:
def compute_tpr_tnr(predictions, truth):
    """Compute the true positive rate and the true negative rate.

    Args:
        predictions: predicted labels.
        truth: ground-truth labels.

    Returns:
        Tuple ``(tpr, tnr)`` with ``tpr = tp / (tp + fn)`` and ``tnr = tn / (tn + fp)``.
    """
    # the confusion matrix is flattened into its four entries, so two classes are expected
    matrix = confusion_matrix(np.array(truth), np.array(predictions))
    tn, fp, fn, tp = matrix.ravel()
    return tp / (tp + fn), tn / (tn + fp)

### Create sequence

In [None]:
def create_sequences_targets(data: torch.Tensor, S):
    """Cut ``data`` into sliding windows of length ``S`` with the next element as target.

    Args:
        data: tensor that is indexed along its first dimension.
        S: length of each input sequence.

    Returns:
        Tuple ``(X, T)`` of stacked tensors: ``X[i]`` is ``data[i:i + S]`` and
        ``T[i]`` is ``data[i + S]``, for every ``i`` in ``range(len(data) - S)``.
    """
    starts = range(len(data) - S)
    windows = [data[start:start + S] for start in starts]
    followers = [data[start + S] for start in starts]
    return torch.stack(windows), torch.stack(followers)

### Item prediction
Requires a network that has been trained on the training data, and the test data, which it then uses to predict the next sequence items

In [None]:
def predict(network, test_dataloader):
    """Run the network on every batch of the data loader and collect its outputs.

    Args:
        network: network to evaluate; it is switched to evaluation mode.
        test_dataloader: iterable of ``(input, target)`` batches; targets are ignored.

    Returns:
        All network outputs concatenated along the first dimension, on the CPU.
    """
    network.eval()
    outputs = []

    # gradients are not needed for prediction
    with torch.no_grad():
        for inputs, _ in test_dataloader:
            outputs.append(network(inputs.to(device)))

    return torch.cat(outputs).cpu()

### Network implementations

#### Activation functions

In [None]:
def logistic(x):
    """Logistic (sigmoid) activation ``1 / (1 + e^(-x))``, applied element-wise."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator

#### Loss Functions

##### Squared Loss
$\mathcal J^{L_2} = \frac1B \|\mathbf Y - \mathbf T\|_F^2$ for given network outputs $\mathbf Y$ and target values $\mathbf T$.

In [None]:
def loss(Y, T):
    """Squared loss: squared Frobenius norm of ``Y - T`` divided by the number of columns of ``T``."""
    batch_size = T.shape[1]
    frobenius = np.linalg.norm(Y - T, "fro")
    return (1 / batch_size) * frobenius ** 2

#### Batch creation

In [5]:
# used in enumerate -> yield
def batch_with_shuffle(X, T, batch_size=16):
    """Endlessly yield shuffled batches of the column-wise stored samples.

    Args:
        X: inputs, one sample per column.
        T: targets, one sample per column.
        batch_size: number of samples per batch.

    Yields:
        Tuples ``(x, t, new_epoch)``; ``new_epoch`` is True for the first batch
        of every epoch. An epoch consists of ``N // batch_size`` full batches;
        when ``batch_size`` exceeds the number of samples, every batch holds all
        samples.
    """
    num_of_samples = X.shape[1]
    shuffle_idx = np.random.permutation(num_of_samples)
    i = 0
    new_epoch = True
    while True:
        # start a new epoch (with a fresh shuffle) only when the next batch would
        # run past the end; a strict comparison is required here, otherwise the
        # last full batch is dropped whenever N is a multiple of batch_size
        if i + batch_size > num_of_samples:
            shuffle_idx = np.random.permutation(num_of_samples)
            i = 0
            new_epoch = True
        # yield the batch
        batch_idx = shuffle_idx[i:i + batch_size]
        yield X[:, batch_idx], T[:, batch_idx], new_epoch
        new_epoch = False
        i += batch_size

#### Network examples

In [4]:
# Network for a given input vector and parameters Theta
def simple_network(x, Theta):
    """Evaluate the two-layer network for one input vector.

    Args:
        x: input vector.
        Theta: pair ``(W1, w2)`` of first-layer weights and second-layer weights.

    Returns:
        Tuple ``(y, h)``: the network output and the hidden vector, whose first
        entry is the constant bias 1.
    """
    W1, w2 = Theta
    # hidden activations of the first layer
    activated = logistic(np.dot(W1, x))
    # prepend the bias unit
    h = np.insert(activated, 0, 1)
    return np.dot(w2, h), h


# Multi-target network -> output matrix Y, hidden unit output H
def multi_target_network(X, Theta):
    """Evaluate the two-layer network for a whole batch.

    Args:
        X: inputs, one sample per column.
        Theta: pair ``(W1, W2)`` of weight matrices.

    Returns:
        Tuple ``(Y, H)``: network outputs and hidden unit outputs, both with one
        sample per column. The first row of ``H`` is overwritten with the bias 1.
    """
    first_layer, second_layer = Theta
    # logistic activation of the first layer
    H = 1 / (1 + np.exp(-np.dot(first_layer, X)))
    # the first hidden unit acts as the bias of the second layer
    H[0] = 1
    return np.dot(second_layer, H), H

#### Gradient implementations

##### Gradient from formula

For a given dataset $X$ the gradient of loss $J^{L_2}$ is defined as:
\begin{align}
  \frac{\partial \mathcal J}{\partial w_{kd}^{(1)}} &= \frac{2}{N} \sum\limits_{n=1}^N (y^{[n]}-t^{[n]}) w_{k}^{(2)} (1-h_{k}^{[n]}) h_{k}^{[n]} x_{d}^{[n]}\\
  \frac{\partial \mathcal J}{\partial w_{k}^{(2)}} &= \frac{2}{N} \sum\limits_{n=1}^N (y^{[n]}-t^{[n]}) h_{k}^{[n]}
\end{align}


In [3]:
def basic_gradient(X, Theta):
    """Compute the gradient of the squared loss over the whole dataset.

    Args:
        X: dataset as an iterable of ``(x, t)`` tuples that supports ``len``.
        Theta: pair ``(W1, w2)`` of network parameters.

    Returns:
        Tuple ``(dW1, dw2)`` with the gradients for both parameters, averaged
        over the dataset (including the factor 2 of the squared loss).
    """
    W1, w2 = Theta
    dW1, dw2 = np.zeros_like(W1), np.zeros_like(w2)
    for x, t in X:
        y, h = simple_network(x, Theta)
        error = y - t
        # the bias unit h[0] has no incoming weights, so it is left out for the first layer
        hidden = h[1:]
        delta = error * w2[1:] * (1 - hidden) * hidden
        dW1 += 2 / len(X) * np.outer(delta, x)
        # second layer
        dw2 += 2 / len(X) * error * h
    return dW1, dw2


def basic_gradient_descent(X, Theta, eta, epochs=10000, tolerance=1e-6):
    """Optimize the network parameters with full-batch gradient descent.

    The parameter arrays in ``Theta`` are updated in place.

    Args:
        X: dataset as a list of ``(x, t)`` tuples.
        Theta: pair ``(W1, w2)`` of network parameters.
        eta: learning rate.
        epochs: maximum number of gradient descent steps.
        tolerance: the loop stops early once the norm of the first-layer
            gradient falls below this value.

    Returns:
        The optimized parameters as a tuple ``(W1, w2)``.
    """
    W1, w2 = Theta
    # perform iterative gradient descent
    for _ in range(epochs):
        # compute the gradient
        dW1, dw2 = basic_gradient(X, (W1, w2))
        # update weights using gradient (in place, so callers holding Theta see the update)
        W1 -= eta * dW1
        w2 -= eta * dw2

        # stop once the first-layer gradient has (almost) vanished
        if np.linalg.norm(dW1) < tolerance:
            break

    # return optimized parameters
    return (W1, w2)


# run gradient descent on dataset X1; the return value is discarded here
# NOTE(review): Theta was built above with shapes (K + 1, D) and (O, K + 1), while the samples of X1
# have two entries - confirm that the parameter shapes match this dataset
basic_gradient_descent(X1, Theta, eta=0.25)

##### Gradient - clever implementation (stochastic with batches)

$\nabla_{\vec{w}^{(1)}}=\frac{2}{B}\sum\limits_{b=1}^{B}[(y^{[b]}-t^{[b]})\vec{w}^{(2)}\odot\vec{h}^{[b]}\odot(1-\vec{h}^{[b]})]\otimes\vec{x}^{[b]}$ \
$\nabla_{\vec{w}^{(2)}}=\frac{2}{B}\sum\limits_{b=1}^{B}(y^{[b]}-t^{[b]})\vec{h}^{[b]}$



In [None]:
def clever_gradient(X, T, Y, H, Theta):
    """Compute the gradients of the squared loss for a batch with matrix operations.

    Args:
        X: batch inputs, one sample per column.
        T: batch targets, one sample per column.
        Y: network outputs for the batch.
        H: hidden unit outputs for the batch.
        Theta: pair ``(W1, W2)``; only ``W2`` is used.

    Returns:
        Tuple ``(g1, g2)`` with the gradients for the first and second layer,
        averaged over the ``Y.shape[1]`` samples of the batch.
    """
    _, W2 = Theta
    scale = 2 / Y.shape[1]
    dy = Y - T
    # error propagated back through the second layer and the logistic activation
    back_propagated = np.dot(W2.T, dy) * H * (1 - H)
    g1 = scale * np.dot(back_propagated, X.T)
    g2 = scale * np.dot(dy, H.T)
    return g1, g2


def gradient_descent_with_batches(X, T, Theta, B, eta=0.001, mu=None):
    """Train the multi-target network with (stochastic) gradient descent on batches.

    The weight matrices in ``Theta`` are updated in place.

    Args:
        X: inputs, one sample per column.
        T: targets, one sample per column.
        Theta: list ``[W1, W2]`` of weight matrices.
        B: batch size.
        eta: learning rate.
        mu: momentum factor; no momentum is applied when it is None or 0.

    Returns:
        List with the loss of the first batch of every epoch.
    """
    loss_values = []

    max_epochs = 10000
    max_batches = T.shape[1] // B * max_epochs

    # parameters as they were before the previous update; needed for the momentum term
    Theta_previous = None

    # iterate over batches
    for index, (x, t, e) in enumerate(batch_with_shuffle(X=X, T=T, batch_size=B)):
        if index >= max_batches:
            break
        # compute network output
        y, h = multi_target_network(X=x, Theta=Theta)
        # compute and append loss
        if e:
            loss_values.append(loss(Y=y, T=t))  # append loss of first batch

        # compute gradient
        g1, g2 = clever_gradient(X=x, T=t, Y=y, H=h, Theta=Theta)

        # save previous theta for momentum; real copies are required because the
        # updates below modify the arrays in place (an alias would always give a zero difference)
        Theta_old = [W.copy() for W in Theta] if mu else None

        # and apply gradient descent
        Theta[0] -= eta * g1
        Theta[1] -= eta * g2

        # apply momentum learning if desired: add mu times the previous parameter change
        if mu:
            if Theta_previous is not None:
                Theta[0] += mu * (Theta_old[0] - Theta_previous[0])
                Theta[1] += mu * (Theta_old[1] - Theta_previous[1])
            Theta_previous = Theta_old

    # return the obtained loss values at the end
    return loss_values


# Stochastic gradient -> batch size
# trains with batches of 16 samples and the default learning rate; SGD holds the recorded loss values
SGD = gradient_descent_with_batches(X=X, T=T, Theta=Theta, B=16)

### Classification

### PyTorch

#### Tensor
[torch.Tensor](https://pytorch.org/docs/stable/tensors.html)

Most important attributes:
* Initialize -> torch.Tensor()
* Tensor.T -> returns the tensor with reversed dimensions
* Tensor.shape -> returns the size (rows, columns)
* Tensor.dtype -> return datatype of data in tensor
* Tensor.device -> where the tensor is stored
* Tensor.requires_grad -> whether a tensor requires gradient calculation during back propagation
* Tensor.numel() -> total # of elements in tensor

#### Loss Function

##### Binary Cross-Entropy loss
Used for binary classification tasks

In [None]:
# binary cross-entropy loss that expects logits as network output
bce_loss = torch.nn.BCEWithLogitsLoss()

##### Cross-Entropy loss
Used for multi class classification tasks

In [None]:
# categorical cross-entropy loss
ce_loss = torch.nn.CrossEntropyLoss()

##### Mean-Squared error loss
Best used when the goal is to predict continuous values (regression tasks)

In [None]:
# mean squared error loss
# NOTE(review): this rebinds the name `loss`, which earlier in this file is the NumPy squared-loss
# function called by gradient_descent_with_batches - code run after this line gets the MSELoss object
loss = torch.nn.MSELoss()

#### Network Examples

##### Sequential

In [None]:
# simple fully connected network with tanh
def SiimpleFullyConnectedNetwork(D, K, O):
    """Build a two-layer fully-connected network with a tanh activation.

    Args:
        D: number of inputs.
        K: number of hidden units.
        O: number of outputs.

    Returns:
        ``torch.nn.Sequential`` with the layers Linear(D, K), Tanh, Linear(K, O).
    """
    layers = [
        torch.nn.Linear(D, K),
        torch.nn.Tanh(),
        torch.nn.Linear(K, O),
    ]
    return torch.nn.Sequential(*layers)


# one input per column of X, 10 hidden units, a single output
simple_fc_network = SiimpleFullyConnectedNetwork(X.shape[1], 10, 1)


# 3 fully connected layers with flatten and sigmoid
def TripleFullyConnectedNetwork(D, K1, K2, O):
    """Build a three-layer fully-connected network with sigmoid activations.

    Args:
        D: number of inputs after flattening.
        K1: number of units of the first hidden layer.
        K2: number of units of the second hidden layer.
        O: number of outputs.

    Returns:
        ``torch.nn.Sequential`` with Flatten, Linear(D, K1), Sigmoid,
        Linear(K1, K2), Sigmoid and Linear(K2, O).
    """
    layers = [torch.nn.Flatten()]
    # both hidden layers are followed by a sigmoid activation
    for fan_in, fan_out in ((D, K1), (K1, K2)):
        layers.append(torch.nn.Linear(fan_in, fan_out))
        layers.append(torch.nn.Sigmoid())
    # the output layer has no activation
    layers.append(torch.nn.Linear(K2, O))
    return torch.nn.Sequential(*layers)


# 28 * 28 flattened inputs, hidden layers with 128 and 64 units, 10 outputs
triple_fc_network = TripleFullyConnectedNetwork(D=28 * 28, K1=128, K2=64, O=10)


def DoubleConvoluionalNetwork(Q1, Q2, O):
    """Build a network with two convolutional stages and one fully-connected layer.

    Each stage is a convolution followed by 2x2 max pooling and a sigmoid.

    Args:
        Q1: number of output channels of the first convolution.
        Q2: number of output channels of the second convolution.
        O: number of outputs.

    Returns:
        ``torch.nn.Sequential`` taking single-channel images; the final layer is
        Linear(5 * 5 * Q2, O).
    """
    first_stage = [
        torch.nn.Conv2d(in_channels=1, out_channels=Q1, kernel_size=(7, 7), stride=1, padding=0),
        torch.nn.MaxPool2d(kernel_size=(2, 2), stride=2),
        torch.nn.Sigmoid(),
    ]
    second_stage = [
        torch.nn.Conv2d(in_channels=Q1, out_channels=Q2, kernel_size=(5, 5), stride=1, padding=2),
        torch.nn.MaxPool2d(kernel_size=(2, 2), stride=2),
        torch.nn.Sigmoid(),
    ]
    classifier = [
        torch.nn.Flatten(),
        torch.nn.Linear(5 * 5 * Q2, O),
    ]
    return torch.nn.Sequential(*first_stage, *second_stage, *classifier)


# 16 channels in both convolutional layers, 10 outputs
double_conv_network = DoubleConvoluionalNetwork(Q1=16, Q2=16, O=10)

##### Module

The topology can be found in the following:
1. 2D convolutional layer with $Q_1$ channels, kernel size $7\times7$, stride 1 and padding 0
2. 2D maximum pooling layer with kernel size $2\times2$ and stride 2
3. activation function **PReLU**
4. 2D convolutional layer with $Q_2$ channels, kernel size $5\times5$, stride 1 and padding 2
5. 2D maximum pooling layer with kernel size $2\times2$ and stride 2
6. activation function **PReLU**
7. flatten layer to convert the convolution output into a vector
8. fully-connected layer with the correct number of inputs and $K$ outputs
9. fully-connected layer with $K$ inputs and $O$ outputs

In [None]:
class ModuleNetwork(torch.nn.Module):
    """Convolutional network that returns both its logits and its deep features."""

    def __init__(self, Q1, Q2, K, O):
        """Create the layers.

        Args:
            Q1: number of output channels of the first convolution.
            Q2: number of output channels of the second convolution.
            K: number of deep features (outputs of the first fully-connected layer).
            O: number of logits (outputs of the second fully-connected layer).
        """
        super().__init__()
        # convolutional layers
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=Q1, kernel_size=(7, 7), stride=1, padding=0)
        self.conv2 = torch.nn.Conv2d(in_channels=Q1, out_channels=Q2, kernel_size=(5, 5), stride=1, padding=2)
        # one pooling and one activation module are shared by both stages
        self.pool = torch.nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.act = torch.nn.PReLU()
        # fully-connected part
        self.flatten = torch.nn.Flatten()
        self.fc1 = torch.nn.Linear(in_features=Q2 * 5 * 5, out_features=K)
        self.fc2 = torch.nn.Linear(in_features=K, out_features=O)

    def forward(self, x):
        """Return ``(logits, deep_features)`` for the batch ``x``."""
        # two stages of convolution, pooling and activation
        first_stage = self.act(self.pool(self.conv1(x)))
        second_stage = self.act(self.pool(self.conv2(first_stage)))
        # deep features: output of the first fully-connected layer
        deep_features = self.fc1(self.flatten(second_stage))
        # logits: output of the second fully-connected layer
        return self.fc2(deep_features), deep_features


# initiate class: 32 channels in both convolutions, 20 deep features, 4 logits
module_network_adapted = ModuleNetwork(32, 32, 20, 4)

##### Encoder Network Module

(a) Encoder Network

*   2D convolutional layer with $Q_1$ output channels, kernel size $5\times5$, **stride 2** and padding 2
*   activation function ReLU
*   2D convolutional layer with $Q_2$ output channels, kernel size $5\times5$, **stride 2** and padding 2
*   flatten layer to convert the convolution output into a vector
*   activation function ReLU
*   fully-connected layer with the correct number of inputs and $K$ outputs

In [None]:
class Encoder(torch.nn.Module):
    """Convolutional encoder that maps an image to a deep feature vector of length K."""

    def __init__(self, Q1, Q2, K):
        """Create the layers.

        Args:
            Q1: number of output channels of the first convolution.
            Q2: number of output channels of the second convolution.
            K: length of the deep feature vector.
        """
        super().__init__()
        # both convolutions share kernel size, stride and padding
        conv_options = dict(kernel_size=5, stride=2, padding=2)
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=Q1, **conv_options)
        self.conv2 = torch.nn.Conv2d(in_channels=Q1, out_channels=Q2, **conv_options)
        # the activation is re-used for both stages
        self.act = torch.nn.ReLU()
        # fully-connected part
        self.flatten = torch.nn.Flatten()
        self.fc = torch.nn.Linear(Q2 * 7 * 7, K)

    def forward(self, x):
        """Return the deep feature representation of the batch ``x``."""
        first_stage = self.act(self.conv1(x))
        # the second convolution is flattened before its activation is applied
        flat = self.flatten(self.conv2(first_stage))
        return self.fc(self.act(flat))

(b) Decoder Network

*   fully-connected layer with $K$ inputs and the correct number of outputs
*   activation function ReLU
*   reshaping to convert the vector into a convolution input
*   2D **fractionally-strided convolutional** layer with $Q_2$ input channels, kernel size $5\times5$, stride 2 and padding 2
*   activation function ReLU
*   2D **fractionally-strided convolutional** layer with $Q_1$ input channels, kernel size $5\times5$, stride 2 and padding 2

In [None]:
class Decoder(torch.nn.Module):
    """Decoder that reconstructs an image from a deep feature vector of length K."""

    def __init__(self, Q1, Q2, K):
        """Create the layers.

        Args:
            Q1: number of input channels of the second transposed convolution.
            Q2: number of input channels of the first transposed convolution.
            K: length of the deep feature vector.
        """
        super().__init__()
        # fully-connected layer
        self.fc = torch.nn.Linear(K, Q2 * 7 * 7)
        # both transposed convolutions share kernel size, stride and paddings
        deconv_options = dict(kernel_size=5, stride=2, padding=2, output_padding=1)
        self.deconv1 = torch.nn.ConvTranspose2d(in_channels=Q2, out_channels=Q1, **deconv_options)
        self.deconv2 = torch.nn.ConvTranspose2d(in_channels=Q1, out_channels=1, **deconv_options)
        # activation function
        self.act = torch.nn.ReLU()
        # turns the vector back into Q2 feature maps of size 7 x 7
        self.unflatten = torch.nn.Unflatten(1, (Q2, 7, 7))

    def forward(self, x):
        """Return the reconstructed image for the deep features ``x``."""
        feature_maps = self.unflatten(self.act(self.fc(x)))
        upsampled = self.act(self.deconv1(feature_maps))
        # the sigmoid keeps the reconstruction in the range (0, 1)
        return torch.sigmoid(self.deconv2(upsampled))

Combine the two

In [None]:
class AutoEncoder(torch.nn.Module):
    """Auto-encoder that chains the Encoder and the Decoder."""

    def __init__(self, Q1, Q2, K):
        """Create encoder and decoder with matching channel numbers and feature length."""
        super().__init__()
        self.encoder = Encoder(Q1, Q2, K)
        self.decoder = Decoder(Q1, Q2, K)

    def forward(self, x):
        """Encode ``x`` into deep features and return their reconstruction."""
        return self.decoder(self.encoder(x))

##### Replace last network layer - Feature Extraction

In [None]:
def replace_last_layer(network, O=6):
    """Replace the final linear layer of a network by a new one with ``O`` outputs.

    Two kinds of networks are supported: networks that expose their final linear
    layer as the attribute ``fc``, and ``torch.nn.Sequential`` containers, for
    which the last ``torch.nn.Linear`` entry is replaced.

    Args:
        network: network to modify (in place).
        O: number of outputs of the new layer.

    Returns:
        The same network object with the new, freshly initialized layer.

    Raises:
        AttributeError: if no linear layer to replace can be found.
    """
    if hasattr(network, "fc"):
        # replace the last linear layer with the new layer
        network.fc = torch.nn.Linear(network.fc.in_features, O)
        return network
    if isinstance(network, torch.nn.Sequential):
        # search backwards for the last linear layer of the container
        for position in reversed(range(len(network))):
            if isinstance(network[position], torch.nn.Linear):
                network[position] = torch.nn.Linear(network[position].in_features, O)
                return network
    raise AttributeError("network has neither an 'fc' attribute nor a torch.nn.Linear layer to replace")


# NOTE(review): replace_last_layer has to be able to locate the final linear layer of the network
# passed here; simple_fc_network is a torch.nn.Sequential built above
network_replaced_last_layer = replace_last_layer(
    simple_fc_network)  # use simple_fc_network defined above and replace its last layer

##### LSTM

In [None]:
class LSTMModel(torch.nn.Module):
    """LSTM layer followed by dropout and a linear layer on the last sequence element."""

    def __init__(self, D, K, O):
        """Create the layers.

        Args:
            D: number of input features per sequence element.
            K: size of the LSTM hidden state.
            O: number of outputs.
        """
        super().__init__()

        # batch_first=True: the batch is the first input dimension
        self.lstm = torch.nn.LSTM(D, K, batch_first=True)
        self.dropout = torch.nn.Dropout(0.2)
        self.linear = torch.nn.Linear(K, O)

    def forward(self, x):
        """Return the network output computed from the last element of every sequence."""
        # hidden and cell states returned by the LSTM are not needed
        sequence_output, _ = self.lstm(x)
        # dropout is applied to the complete LSTM output
        dropped = self.dropout(sequence_output)
        # only the last element of each sequence is passed to the linear layer
        return self.linear(dropped[:, -1, :])
    
# NOTE(review): D, K and O are whatever the module-level names are bound to at this point -
# confirm that they match the sequence data this network is used with
lstm_gail_network = LSTMModel(D, K, O)

##### Small-scale Network
A network with two convolutional and two fully-connected layers.
The first convolutional layer has kernel size $7 \times 7$, stride $=1$, and padding $=0$. The second one has kernel size $5\times5$, stride $=1$, and padding $=2$. Both are followed by a $2\times2$ maximum pooling and a ReLU activation.

In [None]:
class Network(torch.nn.Module):
    """Small CNN: two convolution + max-pool + ReLU stages followed by two linear layers.

    ``Q1`` / ``Q2`` are the channel counts of the two convolutions, ``K`` the width
    of the first linear layer and ``O`` the number of outputs. The first linear
    layer is sized for a 5x5 feature map, i.e. for single-channel 28x28 inputs.
    """

    def __init__(self, Q1, Q2, K, O):
        super().__init__()
        # spatial size after a convolution: (input - kernel_size + 2 * padding) / stride + 1
        self.conv1 = torch.nn.Conv2d(in_channels=1, out_channels=Q1, kernel_size=7, stride=1, padding=0)
        self.conv2 = torch.nn.Conv2d(in_channels=Q1, out_channels=Q2, kernel_size=5, stride=1, padding=2)
        self.pool = torch.nn.MaxPool2d(kernel_size=2)
        self.act = torch.nn.ReLU()
        self.flatten = torch.nn.Flatten()
        self.fc1 = torch.nn.Linear(Q2 * 5 * 5, K)
        self.fc2 = torch.nn.Linear(K, O)

    def forward(self, x):
        # stage 1 on a 28x28 input: (28 - 7 + 0) / 1 + 1 = 22, pooled to 11
        stage1 = self.conv1(x)
        stage1 = self.act(self.pool(stage1))
        # stage 2: (11 - 5 + 4) / 1 + 1 = 11, pooled to 5 (11 / 2 is rounded down)
        stage2 = self.conv2(stage1)
        stage2 = self.act(self.pool(stage2))
        # flatten to Q2 * 5 * 5 values per sample
        flat = self.flatten(stage2)
        # NOTE(review): there is no activation between fc1 and fc2 — confirm this is intended
        hidden = self.fc1(flat)
        return self.fc2(hidden)

#### Validation loop

For a given network and loss function, this function iterates over the validation set and computes the classification accuracy on the original validation set samples.

For each batch, select the correctly classified images. For these, generate two types of adversarial samples, using FGS and FGV defined above, respectively.

Finally, compute how many of the adversarial samples are still classified as the original class by the network.

In [None]:
def validation_loop(network, loss, alpha_fgs=0.3, alpha_fgv=0.6):
    """Measure clean and adversarial (FGS / FGV) accuracy on the validation set.

    Iterates over the module-level ``validation_loader`` on the module-level
    ``device``. In every batch the correctly classified samples are selected,
    attacked with ``FGS`` and ``FGV``, and classified again.

    Args:
        network: classifier returning one row of logits per sample.
        loss: loss function handed on to the two attacks.
        alpha_fgs: step size passed to ``FGS``.
        alpha_fgv: step size passed to ``FGV``.

    Returns:
        Tuple ``(clean_accuracy, fgs_accuracy, fgv_accuracy)``. The clean accuracy
        is relative to all validation samples; the adversarial accuracies are
        relative to the samples that were classified correctly before the attack.
        A ratio whose denominator is zero is reported as ``0.0``.
    """
    total, correct_clean_count, correct_fgs_count, correct_fgv_count = 0, 0, 0, 0

    network = network.to(device)
    network.eval()

    # iterate over validation set samples
    for x, t in validation_loader:
        x, t = x.to(device), t.to(device)
        total += x.shape[0]

        # classify original samples
        with torch.no_grad():
            predicted = network(x).argmax(dim=1)

        # select the correctly classified samples
        correct_indices = predicted == t
        num_correct = int(correct_indices.sum().item())
        correct_clean_count += num_correct

        # nothing to attack in this batch
        if num_correct == 0:
            continue

        t_correct = t[correct_indices]
        # mask indexing creates a new tensor, so each attack receives its own input and a
        # gradient stored on the input by one attack cannot leak into the other
        x_attack_fgs = FGS(x[correct_indices], t_correct, network, loss, alpha=alpha_fgs)
        x_attack_fgv = FGV(x[correct_indices], t_correct, network, loss, alpha=alpha_fgv)

        # check how many adversarial samples keep their original (correct) class
        with torch.no_grad():
            predicted_fgs = network(x_attack_fgs).argmax(dim=1)
            predicted_fgv = network(x_attack_fgv).argmax(dim=1)
        correct_fgs_count += (predicted_fgs == t_correct).sum().item()
        correct_fgv_count += (predicted_fgv == t_correct).sum().item()

    # compute clean and adversarial accuracies; guard against empty denominators
    clean_accuracy = correct_clean_count / total if total else 0.0
    fgs_accuracy = correct_fgs_count / correct_clean_count if correct_clean_count else 0.0
    fgv_accuracy = correct_fgv_count / correct_clean_count if correct_clean_count else 0.0
    print("Clean acc: ", clean_accuracy, " , FGS acc: ", fgs_accuracy, " , FGV acc: ", fgv_accuracy)
    return clean_accuracy, fgs_accuracy, fgv_accuracy

#### Autograd

In [None]:
class AdaptedSoftMax(torch.autograd.Function):
    """Soft-target cross-entropy on logits with a hand-written backward pass.

    ``forward`` returns ``-sum(targets * log_softmax(logits, dim=1))``, summed over
    all samples and classes. ``backward`` uses the closed form
    ``softmax(logits) - targets``, which is the exact derivative when every target
    row sums to one (one-hot or uniform target vectors).
    """

    @staticmethod
    def forward(ctx, logits, targets):
        # compute the log probabilities via log_softmax
        log_probabilities = torch.nn.functional.log_softmax(logits, dim=1)
        # save required values for backward pass
        ctx.save_for_backward(log_probabilities, targets)
        # summed (not averaged) loss
        return -torch.sum(targets * log_probabilities)

    @staticmethod
    def backward(ctx, result):
        # get results stored from forward pass
        log_probabilities, targets = ctx.saved_tensors
        # derivative of the loss w.r.t. the logits
        dJ_dz = torch.exp(log_probabilities) - targets
        # chain rule: scale by the upstream gradient so that scaled or combined losses
        # (e.g. ``0.5 * loss``) receive correct gradients; no gradient for the targets
        return result * dJ_dz, None

#### Image manipulation
(should not be part of the exam)

##### Fast Gradient Sign (FGS)

In [None]:
def FGS(x, t, network, loss, alpha=0.3):
    """Create Fast Gradient Sign (FGS) adversarial samples.

    Args:
        x: input batch; it is left untouched (no ``requires_grad`` flag, no ``.grad``).
        t: targets passed to ``loss`` together with the network output.
        network: model that is attacked.
        loss: callable ``loss(z, t)`` returning a scalar.
        alpha: step size along the sign of the input gradient.

    Returns:
        Detached tensor ``clamp(x + alpha * sign(dJ/dx), 0, 1)`` with the shape of ``x``.
    """
    # work on a detached copy: the caller's tensor is not modified, and no stale
    # gradient from an earlier call can accumulate on it
    x_input = x.detach().clone().requires_grad_(True)
    # forward input and compute the loss
    J = loss(network(x_input), t)
    # differentiate w.r.t. the input only, leaving the parameter gradients untouched
    gradient, = torch.autograd.grad(J, x_input)
    # create FGS adversarial sample and keep it in the valid range
    adversarial_sample = x_input.detach() + alpha * torch.sign(gradient)
    return torch.clamp(adversarial_sample, min=0, max=1)

##### Fast Gradient Value

In [None]:
def FGV(x, t, network, loss, alpha=0.6):
    """Create Fast Gradient Value (FGV) adversarial samples.

    The input gradient of every sample is scaled by its largest absolute entry,
    so the perturbation of each element lies in ``[-alpha, alpha]``.

    Args:
        x: input batch whose first dimension indexes the samples; any number of
            further dimensions is supported. ``x`` itself is left untouched.
        t: targets passed to ``loss`` together with the network output.
        network: model that is attacked.
        loss: callable ``loss(z, t)`` returning a scalar.
        alpha: step size along the normalised gradient.

    Returns:
        Detached tensor ``clamp(x + alpha * g / max|g|, 0, 1)`` with the shape of
        ``x``. Samples whose gradient is entirely zero are returned unperturbed.
    """
    # work on a detached copy: the caller's tensor is not modified, and no stale
    # gradient from an earlier call can accumulate on it
    x_input = x.detach().clone().requires_grad_(True)
    # forward input and compute the loss
    J = loss(network(x_input), t)
    # differentiate w.r.t. the input only, leaving the parameter gradients untouched
    gradient, = torch.autograd.grad(J, x_input)

    # per-sample maximum absolute gradient, shaped (N, 1, ..., 1) to broadcast over any input rank
    broadcast_shape = (-1,) + (1,) * (gradient.dim() - 1)
    max_abs_gradient = gradient.abs().reshape(gradient.shape[0], -1).max(dim=1)[0].reshape(broadcast_shape)
    # an all-zero gradient would give 0 / 0 = NaN; divide by one instead (perturbation stays zero)
    safe_scale = torch.where(max_abs_gradient > 0, max_abs_gradient, torch.ones_like(max_abs_gradient))

    # create FGV adversarial sample and keep it in the valid range
    adversarial_sample = x_input.detach() + alpha * gradient / safe_scale
    return torch.clamp(adversarial_sample, min=0, max=1)

##### Noise

In [None]:
def noise(x, alpha=0.3):
    """Add random ``+alpha`` / ``-alpha`` to every element of ``x`` and clamp the result to [0, 1].

    ``x`` is moved to the module-level ``device`` first.
    """
    on_device = x.to(device)
    # draw 0/1 integers with the input's shape and map them to -1/+1
    signs = torch.randint(0, 2, x.shape).float().to(device) * 2 - 1
    perturbed = on_device + alpha * signs
    # keep the noisy sample in the valid value range
    return torch.clamp(perturbed, min=0, max=1)

### Training Loops

**IMPORTANT:** *Do not forget to set the network to training mode with `network.train()` before training, and to evaluation mode with `network.eval()` before validation or testing.*

#### Basic
With Stochastic Gradient Optimizer

In [None]:
def basic_train(network, X_train, T_train, X_val, T_val, loss_function, learning_rate=0.1, epochs=10000):
    """Train ``network`` with full-batch SGD and validate after every epoch.

    Each epoch performs exactly one parameter update on the complete training
    set, followed by one evaluation on the validation set. Accuracies are
    computed with the ``accuracy`` helper defined elsewhere in this file.

    Args:
        network: model to optimise (updated in place).
        X_train, T_train: training inputs and targets.
        X_val, T_val: validation inputs and targets.
        loss_function: callable ``loss_function(Z, T)`` returning a scalar tensor.
        learning_rate: SGD learning rate.
        epochs: number of full-batch updates.

    Returns:
        Four lists ``(train_loss, train_acc, val_loss, val_acc)`` with one entry per epoch.
    """
    optimizer = torch.optim.SGD(params=network.parameters(), lr=learning_rate)
    # collect loss and accuracy values
    train_loss, train_acc, val_loss, val_acc = [], [], [], []
    for _ in range(epochs):
        # training mode, so that layers such as dropout behave as intended during the update
        network.train()
        optimizer.zero_grad()
        # ... compute network output on training data
        Z = network(X_train)
        # ... compute loss from network output and target data
        loss = loss_function(Z, T_train)
        # ... perform parameter update
        loss.backward()
        optimizer.step()
        # ... remember loss
        train_loss.append(loss.item())
        # ... compute training set accuracy (detached: no graph is needed for a metric)
        train_acc.append(accuracy(Z.detach(), T_train).item())

        # evaluation mode for the validation pass
        network.eval()
        with torch.no_grad():
            # ... compute network output on validation data
            Z_v = network(X_val)
            # ... compute loss from network output and target data
            loss_v = loss_function(Z_v, T_val)
            # ... remember loss
            val_loss.append(loss_v.item())
            # ... compute validation set accuracy
            val_acc.append(accuracy(Z_v, T_val).item())

    # return the four lists of losses and accuracies
    return train_loss, train_acc, val_loss, val_acc


# call basic train with its default learning rate and number of epochs;
# `results` holds the returned tuple (train_loss, train_acc, val_loss, val_acc)
results = basic_train(network=simple_fc_network, X_train=X_train, T_train=T_train, X_val=X_val, T_val=T_val,
                      loss_function=loss)

#### With batch loss

In [None]:
def batch_train(network, epochs, eta, momentum):
    """Train ``network`` with mini-batch SGD and evaluate it after every epoch.

    Training batches come from the module-level ``trainloader``, evaluation
    batches from the module-level ``testloader``. The loss is cross-entropy.

    Args:
        network: model to optimise (updated in place).
        epochs: number of passes over ``trainloader``.
        eta: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        Two lists ``(val_loss, val_acc)`` with the sample-weighted average loss
        and the accuracy on ``testloader`` after each epoch.
    """
    # use the GPU when there is one, otherwise fall back to the CPU instead of crashing
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # move the model first, so the optimizer is constructed on the final parameters
    network = network.to(device)

    # select loss function and optimizer
    loss = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(network.parameters(), lr=eta, momentum=momentum)

    # collect loss values and accuracies over the training epochs
    val_loss, val_acc = [], []

    for epoch in range(epochs):
        print("Epoch ", epoch)
        # train network on training data
        network.train()
        for x, t in trainloader:
            # put data to device
            x, t = x.to(device), t.to(device)
            # train
            optimizer.zero_grad()
            J = loss(network(x), t)
            J.backward()
            optimizer.step()

        # test network on test data
        network.eval()
        total_loss, correct, num_samples = 0., 0, 0
        with torch.no_grad():
            for x, t in testloader:
                # put data to device
                x, t = x.to(device), t.to(device)
                # compute validation loss
                z = network(x)
                J = loss(z, t)
                # compute validation accuracy
                correct += torch.sum(torch.argmax(z, dim=1) == t).item()
                # the batch loss is a mean, so weight it by the batch size
                total_loss += J.item() * len(t)
                num_samples += len(t)

        # normalise by the number of samples actually evaluated (guarded for an empty loader)
        val_loss.append(total_loss / max(num_samples, 1))
        val_acc.append(correct / max(num_samples, 1))

    # return loss and accuracy values
    return val_loss, val_acc


# call batch train for the fully-connected and the convolutional network with identical
# hyper-parameters; each call returns the per-epoch (val_loss, val_acc) lists
fc_loss, fc_acc = batch_train(network=triple_fc_network, epochs=100, eta=0.01, momentum=0.9)
cv_loss, cv_acc = batch_train(network=double_conv_network, epochs=100, eta=0.01, momentum=0.9)

#### Train with evaluation

In [None]:
def train_eval(network, epochs=5, lr=0.001, momentum=0.9):
    """Train ``network`` with SGD, print per-epoch metrics and collect test predictions.

    Training batches come from the module-level ``trainloader``, evaluation
    batches from the module-level ``testloader``. The loss is cross-entropy.

    Args:
        network: model to optimise (updated in place).
        epochs: number of passes over ``trainloader``.
        lr: SGD learning rate.
        momentum: SGD momentum.

    Returns:
        ``(pred, target)``: two lists with one NumPy array per test batch, holding
        the predicted class indices and the target labels after the last epoch.
    """
    # prefer Apple's MPS backend, but fall back to CUDA / CPU instead of crashing
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        device = torch.device("mps")
    elif torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        device = torch.device("cpu")
    network.to(device)

    optimizer = torch.optim.SGD(network.parameters(), lr=lr, momentum=momentum)
    loss = torch.nn.CrossEntropyLoss()

    for epoch in range(epochs):
        # training process; all statistics are reset at the start of every epoch
        network.train()
        epoch_loss, epoch_correct, epoch_samples = 0., 0, 0
        for x, t in trainloader:
            x, t = x.to(device), t.to(device)
            optimizer.zero_grad()
            z = network(x)
            J = loss(z, t)
            J.backward()
            optimizer.step()
            # the batch loss is a mean, so weight it by the batch size
            epoch_loss += J.item() * x.size(0)
            # compare the predictions with the targets, reusing the logits of this step
            epoch_correct += (z.argmax(dim=1) == t).sum().item()
            epoch_samples += x.size(0)

        train_loss = epoch_loss / max(epoch_samples, 1)
        train_acc = epoch_correct / max(epoch_samples, 1)

        print(f"Epoch {epoch + 1}/{epochs}:")
        print(f"Training Loss: {train_loss:.4f}, Training Accuracy: {train_acc:.4f}")

        # testing process
        network.eval()
        total_val_loss, total_val_correct, total_val_samples = 0., 0, 0
        with torch.no_grad():
            for x, t in testloader:
                x, t = x.to(device), t.to(device)
                z = network(x)
                total_val_loss += loss(z, t).item() * x.size(0)
                total_val_correct += (z.argmax(dim=1) == t).sum().item()
                total_val_samples += x.size(0)

        val_loss = total_val_loss / max(total_val_samples, 1)
        val_acc = total_val_correct / max(total_val_samples, 1)
        print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_acc:.4f}")

    # Save predictions and target labels of the test set after the last epoch
    pred, target = [], []  # Store only the test results
    network.eval()
    with torch.no_grad():
        for x, t in testloader:
            x, t = x.to(device), t.to(device)
            pred.append(network(x).argmax(dim=1).cpu().numpy())
            target.append(t.cpu().numpy())

    return pred, target


# train the network with the replaced last layer using the default hyper-parameters;
# train_eval returns the per-batch predictions and targets of the test set
pred_unfrozen, targ_unfrozen = train_eval(network=network_replaced_last_layer)

#### Training with confidence

In [None]:
def train(network, epochs, eta, momentum, loss_function, V1=True):
    """Train ``network`` with SGD and print the average confidence after every epoch.

    Batches come from the module-level ``trainloader`` and ``testloader``; the
    network must return a ``(logits, deep_features)`` pair. Confidences are
    accumulated with the ``confidence`` helper defined elsewhere in this file.

    Args:
        network: model to optimise; it is moved to the module-level ``device``.
        epochs: number of passes over ``trainloader``.
        eta: SGD learning rate.
        momentum: SGD momentum.
        loss_function: a ``torch.autograd.Function`` (used via ``.apply``) when
            ``V1`` is true, otherwise a plain callable ``loss_function(logits, t)``.
        V1: selects how ``loss_function`` is invoked (see above).

    Returns:
        The trained network.
    """
    # Set GPU
    network = network.to(device)
    # SGD optimizer with appropriate learning rate
    optimizer = torch.optim.SGD(network.parameters(), lr=eta, momentum=momentum)

    for epoch in range(epochs):
        # evaluate average confidence for training and validation set
        train_conf = validation_conf = 0.
        train_samples = validation_samples = 0
        network.train()
        for x, t in trainloader:
            # move the batch to the device once
            x, t = x.to(device), t.to(device)
            optimizer.zero_grad()
            # extract logits (and deep features) from network
            logits, deep_features = network(x)
            # compute loss
            if V1:
                loss = loss_function.apply(logits, t)
            else:
                loss = loss_function(logits, t)
            # perform weight update
            loss.backward()
            optimizer.step()

            # compute training confidence on detached logits, so the running sum
            # does not keep the autograd graph of every batch alive
            train_conf += confidence(logits.detach(), t)
            train_samples += len(t)

        network.eval()
        # compute validation confidence
        # NOTE(review): this iterates testloader although the value is reported as
        # "val" — confirm which loader is intended
        with torch.no_grad():
            for x, t in testloader:
                x, t = x.to(device), t.to(device)
                # extract logits (and deep features)
                logits, deep_features = network(x)
                # compute validation confidence
                validation_conf += confidence(logits, t)
                validation_samples += len(t)

        # print average confidence, normalised by the number of samples actually seen
        print(
            f"\rEpoch {epoch}; train: {train_conf / max(train_samples, 1):1.5f}, val: {validation_conf / max(validation_samples, 1):1.5f}")

    return network

#### Simple train
Optimizer is a parameter passed to the function

In [None]:
def simple_train(network, train_dataloader, optimizer, loss, device, epochs=50):
    """Train ``network`` with a caller-supplied optimizer and print the loss per epoch.

    Args:
        network: model to optimise; it is moved to ``device``.
        train_dataloader: iterable yielding ``(x, t)`` batches.
        optimizer: optimizer already constructed on ``network.parameters()``.
        loss: callable ``loss(y, t)`` returning a scalar tensor (batch mean).
        device: device the batches are moved to.
        epochs: number of passes over ``train_dataloader``.

    Returns:
        List with the sample-weighted average training loss of every epoch
        (``nan`` for an epoch in which the dataloader yielded no samples).
    """
    network.to(device)

    history = []
    for epoch in range(epochs):
        train_loss = 0.0
        total_sample = 0
        network.train()
        for x, t in train_dataloader:
            x = x.to(device)
            t = t.to(device)
            optimizer.zero_grad()
            y = network(x)
            J = loss(y, t)
            J.backward()
            optimizer.step()
            # the batch loss is a mean, so weight it by the batch size
            train_loss += J.item() * x.size(0)
            total_sample += x.size(0)

        # average training loss of this epoch; an empty dataloader must not divide by zero
        epoch_loss = train_loss / total_sample if total_sample else float("nan")
        history.append(epoch_loss)
        print(f"\rEpoch {epoch + 1}; train loss: {epoch_loss:1.5f}")

    return history

# mean squared error loss and an Adam optimizer on the LSTM parameters
loss = torch.nn.MSELoss()
epochs = 50
optimizer_gail = torch.optim.Adam(lstm_gail_network.parameters(), lr=0.05)
# simple_train is the loop that accepts a dataloader and a ready-made optimizer;
# train() expects (network, epochs, eta, momentum, loss_function) and would fail with these arguments
simple_train(lstm_gail_network, gail_train_dataloader, optimizer_gail, loss, device, epochs)

### Plotting

#### Line chart with two variables

In [None]:
def plot_line_chart_two_variables_with_range(X, Theta, R):
    """Plot the data samples together with the network approximation over a range.

    Args:
        X: sequence of records; ``record[0]`` is the input vector (its element at
            index 1 is plotted on the x-axis) and ``record[1]`` the target value.
        Theta: network parameters handed to ``simple_network``.
        R: pair ``(min, max)`` delimiting the x-range in which the network is evaluated.
    """
    # create arrays from the list of (input, target) records
    x_data = np.array([record[0] for record in X])
    t_data = np.array([record[1] for record in X])
    # first, plot data samples -> style 'x' as point
    pyplot.plot(x_data[:, 1], t_data, "rx", label="Data")
    # 100 equidistant points from min (R[0]) to max (R[1]) to evaluate the network;
    # np.arange(R[0], R[1], 100) would use a *step* of 100 and yield a single point
    x = np.linspace(R[0], R[1], 100)
    # compute the network outputs for these values (the leading 1 is prepended to every input)
    y = [simple_network(np.array([1, x_]), Theta)[0] for x_ in x]
    # plot network approximation -> as a line
    pyplot.plot(x, y, "k-", label="network")
    pyplot.legend()


# draw into the first panel of a 1 x 3 subplot grid
pyplot.subplot(131)
# NOTE(review): Theta is passed as None here — presumably a placeholder for the trained parameters; confirm
plot_line_chart_two_variables_with_range(X1, None, [-1.5, 1.5])
pyplot.title('Dataset X1')

#### Train vs validation loss

In [None]:
def plot_train_vs_val_loss(train_loss, train_acc, val_loss, val_acc):
    """Plot training vs. validation curves: losses in the left panel, accuracies in the right one."""
    pyplot.figure(figsize=(10, 3))

    # (subplot position, training curve, its label, validation curve, its label)
    panels = (
        (121, train_loss, "Training set loss", val_loss, "Validation set loss"),
        (122, train_acc, "Training set accuracy", val_acc, "Validation set accuracy"),
    )
    for position, train_curve, train_label, val_curve, val_label in panels:
        ax = pyplot.subplot(position)
        # training curve in green, validation curve in blue
        ax.plot(train_curve, "g-", label=train_label)
        ax.plot(val_curve, "b-", label=val_label)
        ax.legend()

#### MNIST images plot

In [None]:
pyplot.rcParams['image.cmap'] = 'gray'
fig, axes = pyplot.subplots(4, 10, figsize=(10, 4))

# show the first 40 training images in a 4 x 10 grid, filled row by row
for index, axis in enumerate(axes.flat):
    img, _ = trainset[index]
    axis.imshow(img, cmap='gray')
    # hide ticks and frame around each image
    axis.axis("off")


#### Loss plots

In [None]:
pyplot.figure(figsize=(10, 3))

# left panel: validation loss, right panel: validation accuracy of the FC and CV network over epochs
# (subplot position, FC curve, FC label, CV curve, CV label, y-axis label, title)
for position, fc_curve, fc_label, cv_curve, cv_label, y_label, title in (
    (121, fc_loss, "Fully connected loss", cv_loss, "Convolutional loss", "Loss", "Validation Loss"),
    (122, fc_acc, "Fully connected accuracy", cv_acc, "Convolutional accuracy", "Accuracy",
     "Validation Accuracy"),
):
    ax = pyplot.subplot(position)
    ax.plot(fc_curve, "g-", label=fc_label)
    ax.plot(cv_curve, "b-", label=cv_label)
    ax.set_xlabel("Epoch")
    ax.set_ylabel(y_label)
    ax.legend()
    ax.set_title(title)

#### Confusion Matrix

In [None]:
classes = trainset.classes

# merge the per-batch arrays into one flat array each
pred_unfrozen_flat = np.concatenate(pred_unfrozen)
targ_unfrozen_flat = np.concatenate(targ_unfrozen)

# compute confusion matrix for the fine-tuned network without frozen layers;
# scikit-learn expects (y_true, y_pred): rows are the true labels, columns the predictions,
# which is also how ConfusionMatrixDisplay labels its axes
matrix_unfrozen = confusion_matrix(targ_unfrozen_flat,
                                   pred_unfrozen_flat)

# plot confusion matrices
plot_conf_matrix = ConfusionMatrixDisplay(matrix_unfrozen, display_labels=classes)
plot_conf_matrix.plot(xticks_rotation="vertical")
plt.show()

#### Feature Magnitude Plot

In [None]:
def plot_features(network):
    """Plot histograms of deep-feature magnitudes for known, negative and unknown samples.

    Magnitudes are collected from the module-level ``validationloader`` and
    ``testloader``. The known samples of both loaders are pooled; the remaining
    samples of the validation loader are plotted as "Negative", those of the
    test loader as "Unknown". ``network`` must return ``(logits, deep_features)``.
    """

    def _collect_norms(loader):
        # gather the feature norms of one loader, split into known and remaining samples
        known_norms, other_norms = [], []
        for x, t in loader:
            # extract deep features (and logits)
            logits, deep_features = network(x)
            # one norm per sample
            norms = torch.norm(deep_features, dim=1)
            # split between known and unknown
            batch_known, targets_known, batch_unknown = split_known_unknown(norms, t)
            known_norms.extend(batch_known)
            other_norms.extend(batch_unknown)
        return known_norms, other_norms

    with torch.no_grad():
        validation_known, negative = _collect_norms(validationloader)
        test_known, unknown = _collect_norms(testloader)
    # known samples of the validation set first, then those of the test set
    known = validation_known + test_known

    # plot the norms as histograms
    pyplot.figure(figsize=(4, 2))

    # keep the same maximum magnitude for all three histograms
    max_mag = 20
    for magnitudes, colour, name in (
        (known, "g", "Known"),
        (negative, "b", "Negative"),
        (unknown, "r", "Unknown"),
    ):
        pyplot.hist(magnitudes, bins=100, range=(0, max_mag), density=True, color=colour, histtype="step",
                    label=name)

    # beautify plot
    pyplot.legend()
    pyplot.xlabel("Deep Feature Magnitude")
    pyplot.ylabel("Density")


# draw the feature-magnitude histograms for module_network_adapted (not defined in this cell)
plot_features(module_network_adapted)

#### Stock Plots

In [None]:
# per stock: (dates, scaled training series, scaled test series)
stock_data = {
    'GAIL': (gail_data[0], train_gail_scaled, test_gail_scaled),  # gail_data[0] contains dates for GAIL
   #  'NTPC': (ntpc_data[0], train_ntpc_scaled, test_ntpc_scaled)  # ntpc_data[0] contains dates for NTPC
}

# Figure with two panels side by side (1 row, 2 columns)
fig, ax = plt.subplots(1, 2, figsize=(16, 4))

# x-position of the vertical reference line at 2018-01-01
date = mdates.date2num(np.datetime64('2018-01-01'))

for idx, (stock_name, series) in enumerate(stock_data.items()):
    dates, train_data, test_data = series
    # the first len(train_data) dates belong to the training part, the rest to the test part
    split = len(train_data)
    panel = plt.subplot(1, 2, idx + 1)
    panel.plot(dates[:split], train_data, label='Training')
    panel.plot(dates[split:], test_data, label='Test')
    panel.axvline(date, color='black', ls='--')
    # labels, title, legend and grid of the panel
    panel.set_xlabel('Time')
    panel.set_ylabel('Price')
    panel.set_title(stock_name)
    panel.legend()
    panel.grid(True, linestyle='--', linewidth=0.5, color='gray')

plt.tight_layout()
plt.show()

## Theory

### Compute gradient
 -> See cheatsheet - [derivatives](Derivative%20Rules%20Cheatsheet.md)

### Convolutional Networks


#### Network output calculations

***Output size of a convolution or pooling layer -> floor((in + 2p - k) / s) + 1***

1. A [2D convolutional layer](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) with $Q_1$ channels, kernel size $7\times7$, stride 1 and padding 0.
2. A [2D maximum pooling](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) with pooling size $2\times2$ and stride 2.
3. A `Sigmoid` activation function.
4. A 2D convolutional layer with $Q_2$ channels, kernel size $5\times5$, stride 1 and padding 2.
5. A 2D maximum pooling with pooling size $2\times2$ and stride 2.
6. A `Sigmoid` activation function.
7. A flattening layer to turn the 3D feature map into a 1D vector.
8. A fully-connected layer with the appropriate number of inputs and $O$ outputs.

Consider the network as defined above
Assume that the input is a $28\times28$ grayscale image.
How many input neurons does the final fully-connected layer need for a given number $Q_2$ of output channels of the second convolution?

(Write steps of computation.)

1. Input image size: $28\times28$
2. 1st Convolutional layer: $(28 + 2*0 - 7) / 1 + 1 = 22\times22$
3. 1st Max pooling layer: $(22/2)\times(22/2) = 11\times11$
4. 1st Sigmoid activation function: no influence
5. 2nd Convolutional layer: $(11 + 2*2 - 5) / 1 + 1 = 11\times11$
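6. 2nd Max pooling layer: $(11/2)\times(11/2) = 5.5\times5.5$ -> round down (PyTorch's `MaxPool2d` floors by default, `ceil_mode=False`): $5\times5$
7. 2nd Sigmoid activation function: no influence
8. Flattening layer: $Q_2\times5\times5 = 25\,Q_2$ inputs for the fully-connected layer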

#### Learnable parameters calculations

1. A `torch.nn.Flatten` layer to turn the $28\times28$ pixel image (2D) into a $28*28$ pixel vector (1D)
2. A fully-connected layer with D input neurons and K1 outputs.
3. A `Sigmoid` activation function.
4. A fully-connected layer with K1 input neurons and K2 outputs.
5. A `Sigmoid` activation function.
6. A fully-connected layer with K2 input neurons and O outputs.

fully_connected(D=28*28, K1=128, K2=64, O=10)

> #### Fully-connected Network:
>
> - first fully-connected layer: $(28*28+1)*128 =$ **100480**
> - second fully-connected layer: $(128+1)*64 =$ **8256**
> - third fully-connected layer: $(64+1)*10 =$ **650**
> - total: $100480 + 8256 + 650 =$ **109386**

1. A [2D convolutional layer](https://pytorch.org/docs/stable/generated/torch.nn.Conv2d.html) with $Q_1$ channels, kernel size $7\times7$, stride 1 and padding 0.
2. A [2D maximum pooling](https://pytorch.org/docs/stable/generated/torch.nn.MaxPool2d.html) with pooling size $2\times2$ and stride 2.
3. A `Sigmoid` activation function.
4. A 2D convolutional layer with $Q_2$ channels, kernel size $5\times5$, stride 1 and padding 2.
5. A 2D maximum pooling with pooling size $2\times2$ and stride 2.
6. A `Sigmoid` activation function.
7. A flattening layer to turn the 3D feature map into a 1D vector.
8. A fully-connected layer with the appropriate number of inputs and $O$ outputs.
 
convolutional(Q1=16, Q2=16, O=10)

> #### Convolutional Network:
> - first convolutional layer: $(7*7+1)*16$ = **800**
> - second convolutional layer: $(16*5*5+1)*16$ = **6416**
> - fully-connected layer: $(16*5*5+1)*10$ = **4010**
> - total: $800 + 6416 + 4010 =$ **11226**

***The totals can be checked with `sum(p.numel() for p in network.parameters())`, which uses `torch.Tensor.numel()`.***