<font size=25>Laboratory 4 summary</font>

In this lab you will gain debugging experience by fixing some of the most common deep learning bugs.

There are 8 exercises, each one with a corresponding cell. Run the cell, inspect the error and fix the code. The bugs can be fixed by editing one or two lines of code.

# **Bugs everywhere**

In [None]:
from __future__ import print_function, division
import os
import torch
import random
from typing import Iterator, List, Callable, Tuple
from functools import partial
import warnings
from math import *
import zipfile
from tqdm import tqdm
from PIL import Image

# Sklearn
from sklearn.datasets import load_digits
# Numpy
import numpy as np
# Pandas
import pandas as pd

# PyTorch packages
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.utils.data import RandomSampler, Sampler
from torchvision import transforms, utils, datasets
from torchvision.transforms import ToTensor, ToPILImage
import torch.nn as nn

# matplotlib
from matplotlib import rc, cm
rc('animation', html='jshtml')
import matplotlib.pyplot as plt
from mpl_toolkits import mplot3d
import matplotlib.animation as animation
%matplotlib notebook
#warnings.filterwarnings("ignore")
plt.ion()   # interactive mode

## Exercise 1: Getting started


In [None]:
class MLP(nn.Module):
    """Minimal multilayer perceptron: linear -> activation -> linear.

    The output layer always produces 2 scores per example.

    Args:
        input_size: number of features of each input example.
        hidden_size: number of units in the hidden layer.
        activation_fn: callable applied to the hidden layer's output.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 activation_fn: Callable):
        super().__init__()
        # remember the sizes the network was built with
        self.input_size, self.hidden_size = input_size, hidden_size
        # layers are created in the same order they are applied in forward()
        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, 2)
        self.activation_fn = activation_fn

    def forward(self, x):
        """Return the 2 output scores for every example in ``x``."""
        hidden = self.activation_fn(self.hidden_layer(x))
        return self.output_layer(hidden)

# Build a network whose first layer expects 100 features per example.
model = MLP(input_size=100, hidden_size=256, activation_fn=nn.ReLU())

# issue: a batch with 200 features per example does not match input_size=100
# x = torch.rand(32, 200)

# solution: the feature dimension of the batch must equal input_size
x = torch.rand((32, 100))

# forward pass, then check that we got 2 scores for each of the 32 examples
y = model(x)
assert tuple(y.shape) == (32, 2), "Wrong output shape"

## Exercise 2: Getting in shape

In [None]:
class MLP(nn.Module):
    """One-hidden-layer perceptron with a fixed 2-unit output layer.

    Args:
        input_size: number of features of each input example.
        hidden_size: width of the hidden layer.
        activation_fn: non-linearity (any callable) applied after the hidden
            layer.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 activation_fn: Callable):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # input -> hidden projection, followed by hidden -> 2 scores
        self.hidden_layer = nn.Linear(in_features=input_size,
                                      out_features=hidden_size)
        self.output_layer = nn.Linear(in_features=hidden_size,
                                      out_features=2)
        self.activation_fn = activation_fn

    def forward(self, x):
        """Compute the 2 output scores for each row of ``x``."""
        activated = self.activation_fn(self.hidden_layer(x))
        scores = self.output_layer(activated)
        return scores

# 784 input features = 28 * 28 pixels of one flattened MNIST digit
model = MLP(input_size=784, hidden_size=256, activation_fn=nn.ReLU())
mnist_trainset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor())
# pick a single (image, label) example from the training set
x, l = mnist_trainset[10]

# issue: the shape of x is 1x28x28 (channel x height x width), but the
# network's first layer takes 784 features per example
# solution: flatten x before feeding it to the network
# when -1 is passed as an argument, the actual dimension is inferred from the
# remaining dimensions
# 1x28x28 -> 1x784
x = x.view(1, -1)
y = model(x)
assert y.shape[0] == 1 and y.shape[1] == 2, "Wrong output shape"

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw



## Exercise 3: It's the little things

In [None]:
class MLP(nn.Module):
    """Perceptron with one hidden layer and 2 output scores.

    Args:
        input_size: number of features of each input example.
        hidden_size: number of hidden units.
        activation_fn: callable applied to the hidden layer's output.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 activation_fn: Callable):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, 2)
        self.activation_fn = activation_fn

    def forward(self, x):
        """Return the 2 output scores for every example in ``x``."""
        pre_activation = self.hidden_layer(x)

        # issue: the activation was accidentally applied to the raw input x
        # instead of the hidden pre-activation => shape mismatch in the
        # output layer
        # h = self.activation_fn(x)

        # solution: activate the output of the hidden layer
        post_activation = self.activation_fn(pre_activation)

        return self.output_layer(post_activation)

# issue: the activation was accidentally passed as the class itself (nn.ReLU)
# instead of an instance of it (nn.ReLU())
# model = MLP(input_size=784, hidden_size=256, activation_fn=nn.ReLU)

# solution: instantiate the activation module before handing it to the model
model = MLP(input_size=784, hidden_size=256, activation_fn=nn.ReLU())

# random batch of 32 examples with 784 features each
x = torch.rand((32, 784))
y = model(x)
assert tuple(y.shape) == (32, 2), "Wrong output shape"

## Exercise 4: No one left behind

In [None]:
class MLP(nn.Module):
    """Perceptron with one hidden layer and 10 output scores.

    Inputs of any shape are flattened to ``input_size`` features per row
    before the first layer, so batches of any size are accepted.

    Args:
        input_size: number of features of each (flattened) example.
        hidden_size: number of hidden units.
        batch_size: nominal batch size; stored for reference only, forward()
            does not rely on it.
        activation_fn: callable applied to the hidden layer's output.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 batch_size: int,
                 activation_fn: Callable):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, 10)
        self.batch_size = batch_size
        self.activation_fn = activation_fn

    def forward(self, x):
        """Flatten ``x`` to rows of ``input_size`` features and score them."""
        # issue: for batch_size=32, we always expected tensors of size
        # 32 x 1 x 28 x 28. However, the last batch in the dataset was 16,
        # so the view function below reshaped the 16 x 1 x 28 x 28 tensor to
        # 32 x 392 (16 * 784 / 32 = 392). This new dimension (392) turned out
        # to be incompatible with the first layer of the network
        # -
        # input x has shape: batch_size x 1 x 28 x 28
        # we resize it to:   batch_size x 784
        # x = x.view(self.batch_size, -1)

        # solution 2 (alternative):
        # We extract the actual batch size first and then resize the tensor
        # accordingly.
        # x = x.view(x.shape[0], -1)

        # solution 3 (used here):
        # We resize the tensor based on the 1st layer's input dimension and
        # let view() infer the batch dimension. Using self.input_size rather
        # than a literal 784 keeps the model usable with other input sizes.
        x = x.view(-1, self.input_size)

        h = self.hidden_layer(x)
        h = self.activation_fn(h)
        out = self.output_layer(h)

        return out

# instantiate model
BATCH_SIZE = 32
model = MLP(
    input_size=784,
    hidden_size=256,
    activation_fn=nn.ReLU(),
    batch_size=BATCH_SIZE
)

# instantiate MNIST dataset
val_dataset = datasets.MNIST(
    root='./data',
    train=False,
    download=True,
    transform=transforms.ToTensor())
print("validation dataset size = ", len(val_dataset))

# issue: dataset has 10000 examples, so the last batch has only 16 elements
# instead of 32. The forward method in the MLP class had a bug.
# It accidentally reshaped the last batch Tensor from 16 x 1 x 28 x 28
# to 32 x 392 (16 * 784 / 32 = 392). Therefore, the new tensor shape was
# incompatible with the first layer of the model.
# val_dataloader = DataLoader(
#     val_dataset,
#     batch_size=BATCH_SIZE,
#     shuffle=True
# )

# solution 1 (alternative): set the drop_last argument in the DataLoader to
# True. This drops the last batch (which may have a different shape) so that
# all the batches have the same number of examples.
# It is left at False here: forward() in the MLP class above already copes
# with a smaller last batch, so all 10000 examples are evaluated.
val_dataloader = DataLoader(
    val_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    drop_last=False
)

loss_crt = nn.CrossEntropyLoss()
epoch_loss = 0.0
# this loop only evaluates the model, so gradient tracking is switched off
with torch.no_grad():
    for batch_images, batch_labels in val_dataloader:
        # batch_size x 10
        out = model(batch_images)
        loss = loss_crt(out, batch_labels)
        epoch_loss += loss.item()

# average of the per-batch losses
epoch_loss /= len(val_dataloader)
print("Validation loss = ", epoch_loss)

## Exercise 5: Left to their own devices


In [None]:
class MLP(nn.Module):
    """Perceptron with one hidden layer and 10 output scores.

    forward() moves its input to ``device`` and flattens it to rows of
    ``input_size`` features, so batches of any size are accepted.

    Args:
        input_size: number of features of each (flattened) example.
        hidden_size: number of hidden units.
        batch_size: nominal batch size; stored for reference only, forward()
            does not rely on it.
        device: device the input tensor is moved to in forward().
        activation_fn: callable applied to the hidden layer's output.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 batch_size: int,
                 device: torch.device,
                 activation_fn: Callable):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, 10)
        self.activation_fn = activation_fn
        self.device = device
        self.batch_size = batch_size

    def forward(self, x):
        """Move ``x`` to the model's device, flatten it and score it."""
        # issue: Tensor.to() is not an in-place operation, so tensor x
        # remains on CPU
        # -
        # move input data to GPU (if available)
        # x.to(self.device)

        # solution
        x = x.to(self.device)

        # reshape tensor to: actual_batch_size x input_size
        # The batch dimension is inferred (-1) instead of being forced to
        # self.batch_size, so a smaller last batch is reshaped correctly.
        x = x.view(-1, self.input_size)

        h = self.hidden_layer(x)
        h = self.activation_fn(h)
        out = self.output_layer(h)

        return out

BATCH_SIZE = 32
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("device = ", device)

# instantiate model
model = MLP(
    input_size=784, hidden_size=256, activation_fn=nn.ReLU(),
    batch_size=BATCH_SIZE, device=device
)

# move model to GPU (Module.to() is an in-place operation)
model.to(device)

# instantiate MNIST dataset
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor())
print("train dataset size = ", len(train_dataset))

# instantiate dataloader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

loss_crt = nn.CrossEntropyLoss()
epoch_loss = 0.0
for batch_images, batch_labels in train_dataloader:
    # issue: Tensor.to() is not an in-place operation (though Module.to() is)
    # therefore batch_labels remains on CPU
    # -
    # move labels to GPU (if available)
    # batch_labels.to(device)

    # solution: rebind the name to the tensor returned by .to(), so that the
    # labels live on the same device as the model's output
    batch_labels = batch_labels.to(device)

    # batch_size x 10
    # feedforward (the model moves batch_images to the device itself)
    out = model(batch_images)

    # compute loss
    loss = loss_crt(out, batch_labels)
    epoch_loss += loss.item()

device =  cuda
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

train dataset size =  60000


RuntimeError: ignored

## Exercise 6: Not exactly my type

In [None]:
class MLP(nn.Module):
    """Perceptron with one hidden layer and 10 output scores.

    Args:
        input_size: number of features of each input example.
        hidden_size: number of hidden units.
        device: device the input tensor is moved to in forward().
        activation_fn: callable applied to the hidden layer's output.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 device: torch.device,
                 activation_fn: Callable):
        super().__init__()
        self.input_size, self.hidden_size = input_size, hidden_size
        self.hidden_layer = nn.Linear(input_size, hidden_size)
        self.output_layer = nn.Linear(hidden_size, 10)
        self.activation_fn = activation_fn
        self.device = device

    def forward(self, x):
        """Move ``x`` to the model's device and return 10 scores per row."""
        # Tensor.to() returns the moved tensor, so it is consumed directly
        on_device = x.to(self.device)
        hidden = self.activation_fn(self.hidden_layer(on_device))
        return self.output_layer(hidden)

BATCH_SIZE = 32
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("device = ", device)

# instantiate model
model = MLP(
    input_size=64, hidden_size=256, activation_fn=nn.ReLU(), device=device
)

# move model to GPU (Module.to() is an in-place operation)
model.to(device)

# load the 1797 images from the Digits dataset:
# https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html
# Images are grayscale digits from 0 to 9, stored as arrays of size 64 (8x8).
# Both images and labels are stored as NumPy arrays, so we need to convert
# them to Tensors.
x = load_digits()

# issue: NumPy arrays are 64bit floating point type, so torch.tensor() converts
# them to Tensors of type Double (torch.float64) by default. However,
# the default type of Tensors is torch.float32, so when we instantiated the
# model, its parameters have type torch.float32 as well. This leads to a
# type mismatch when feeding a torch.float64 input to the first layer.
# -
# 1797 x 64, 1797
# images, labels = torch.tensor(x.data), torch.tensor(x.target)

# solution 1 (used here): specify dtype argument in the torch.tensor()
# constructor
images, labels = torch.tensor(x.data, dtype=torch.float32), torch.tensor(x.target)

# solution 2 (alternative): use Tensor.to(dtype=torch.float32) to convert to
# Float type. Tensor.to() is not in-place, so the result must be assigned.
# images = torch.tensor(x.data).to(dtype=torch.float32)

# solution 3 (alternative): call .float() on the tensor to convert it to
# Float type. Like .to(), it returns a new tensor.
# images = torch.tensor(x.data).float()

# we create a TensorDataset, which is a type of Dataset that wraps Tensors.
# https://pytorch.org/docs/stable/data.html#torch.utils.data.TensorDataset
# Examples are indexed over the first dimension, so the first dimension of
# the Tensors must be the same (1797 in our case)
train_dataset = TensorDataset(images, labels)

# instantiate dataloader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

loss_crt = nn.CrossEntropyLoss()
epoch_loss = 0.0
for batch_images, batch_labels in train_dataloader:
    # labels must live on the same device as the model's output
    batch_labels = batch_labels.to(device)

    # batch_size x 10
    # feedforward
    out = model(batch_images)

    # compute loss
    loss = loss_crt(out, batch_labels)
    epoch_loss += loss.item()

device =  cuda


RuntimeError: ignored

In [None]:
# 1-D tensor holding two elements
x = torch.tensor([3,4])

In [None]:
# index -5 is out of range for the 2-element tensor x, so this raises an
# IndexError (the kind of error Exercise 7 is about)
x[-5]

IndexError: ignored

In [None]:
# NOTE(review): `m` is not defined anywhere in the visible notebook, so this
# looks like a leftover scratch cell. Confirm and remove if unused.
x = m(y)

## Exercise 7: Out of bounds
The [Wheat Seeds](https://archive.ics.uci.edu/ml/datasets/seeds) dataset ([Kaggle link](https://www.kaggle.com/jmcaro/wheat-seedsuci)) is a classification task with 3 classes, which contains 209 examples. Each example contains 7 geometrical properties of wheat seeds belonging to 3 varieties of wheat. 

**Hint 1:** When training on GPUs, CUDA errors may be less helpful. Usually, errors such as "`RuntimeError: CUDA error: device-side assert triggered`" indicate a problem with an index, which may be too large. To get a more accurate error message, move the model and dataset to CPU, check the error again and try to fix it.

**Hint 2:** After fixing the code responsible for a CUDA error, you may still encounter the error when running on GPU. Try restarting the Colab Notebook (`Runtime` -> `Restart runtime`) and run the cells again.


In [None]:
class MLP(nn.Module):
    """Perceptron with one hidden layer and 3 output scores.

    Args:
        input_size: number of features of each input example.
        hidden_size: number of hidden units.
        device: device the input tensor is moved to in forward().
        activation_fn: callable applied to the hidden layer's output.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size: int,
                 device: torch.device,
                 activation_fn: Callable):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        # input -> hidden projection, followed by hidden -> 3 class scores
        self.hidden_layer = nn.Linear(in_features=input_size,
                                      out_features=hidden_size)
        self.output_layer = nn.Linear(in_features=hidden_size,
                                      out_features=3)
        self.activation_fn = activation_fn
        self.device = device

    def forward(self, x):
        """Move ``x`` to the model's device and return 3 scores per row."""
        features = x.to(self.device)
        activated = self.activation_fn(self.hidden_layer(features))
        scores = self.output_layer(activated)
        return scores

BATCH_SIZE=32

# if you encounter a vague CUDA error message, move the operations to CPU then
# run the code again. The error message is usually more helpful.
#device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')
print("device = ", device)

# instantiate model
model = MLP(
    input_size=7, hidden_size=128, activation_fn=nn.ReLU(), device=device
)

# move model to GPU (Module.to() is an in-place operation)
model.to(device)

# download Wheat Seeds dataset
!wget --no-check-certificate \
https://raw.githubusercontent.com/jbrownlee/Datasets/master/wheat-seeds.csv \
-O /tmp/wheat.csv

# read Wheat Seeds dataset from csv
# Dataset has 209 examples. Each example has 7 attributes (features).
# It's a classification task with 3 classes (1, 2 and 3)
data = pd.read_csv("/tmp/wheat.csv")

# issue: the labels read from the CSV are 1, 2 and 3. However, the scores that
# the model outputs in the `out` tensor are indexed from 0 to 2, leading to
# an index error when trying to access out[:, 3]
#x = torch.tensor(data.values, dtype=torch.float32)
#data, labels = x[:,:-1], x[:,-1].long()

# solution: subtract 1 from the `labels` tensor
x = torch.tensor(data.values, dtype=torch.float32)
data, labels = x[:,:-1], x[:,-1].long()-1

# we create a TensorDataset, which is a type of Dataset that wraps Tensors.
# https://pytorch.org/docs/stable/data.html#torch.utils.data.TensorDataset
# Examples are indexed over the first dimension, so the first dimension of 
# the Tensors must be the same (209 in our case)
validation_dataset = TensorDataset(data, labels)

# instantiate dataloader
validation_dataloader = DataLoader(
    validation_dataset,
    batch_size=BATCH_SIZE
)

loss_crt = nn.CrossEntropyLoss()
epoch_loss = 0.0
for batch_images, batch_labels in validation_dataloader:
    batch_labels=batch_labels.to(device)
    
    # feedforward
    # batch_size x 3
    out = model(batch_images)
    
    # compute loss 
    loss = loss_crt(out, batch_labels)
    epoch_loss += loss.item()

epoch_loss /= len(validation_dataloader)
print("Validation loss = ", epoch_loss)

device =  cpu
--2022-03-08 14:17:50--  https://raw.githubusercontent.com/jbrownlee/Datasets/master/wheat-seeds.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 9301 (9.1K) [text/plain]
Saving to: ‘/tmp/wheat.csv’


2022-03-08 14:17:51 (33.8 MB/s) - ‘/tmp/wheat.csv’ saved [9301/9301]



IndexError: ignored

## Exercise 8: I have no memory of that

**Hint 1:** The error will appear after ~2 epochs

**Hint 2:** You do NOT need to modify the model's size to fix the memory bug

**Hint 3:** After getting the error message, you have to restart the machine:
  - restart Colab: `Runtime` -> `Restart runtime`
  - run the cell that imports packages 
  - run the cell below

In [None]:
class MLP(nn.Module):
    """Perceptron with four hidden layers and 10 output scores.

    forward() moves its input to ``device`` and flattens it to rows of
    ``input_size`` features before applying the layers.

    Args:
        input_size: number of features of each (flattened) example.
        hidden_size_1: number of units in the 1st hidden layer.
        hidden_size_2: number of units in the 2nd hidden layer.
        hidden_size_3: number of units in the 3rd hidden layer.
        hidden_size_4: number of units in the 4th hidden layer.
        device: device the input tensor is moved to in forward().
        activation_fn: callable applied after every hidden layer.
    """

    def __init__(self,
                 input_size: int,
                 hidden_size_1: int,
                 hidden_size_2: int,
                 hidden_size_3: int,
                 hidden_size_4: int,
                 device: torch.device,
                 activation_fn: Callable):
        super().__init__()
        self.input_size = input_size
        # keep every layer size (previously only the first two were stored)
        self.hidden_size_1 = hidden_size_1
        self.hidden_size_2 = hidden_size_2
        self.hidden_size_3 = hidden_size_3
        self.hidden_size_4 = hidden_size_4
        self.hidden_layer_1 = nn.Linear(input_size, hidden_size_1)
        self.hidden_layer_2 = nn.Linear(hidden_size_1, hidden_size_2)
        self.hidden_layer_3 = nn.Linear(hidden_size_2, hidden_size_3)
        self.hidden_layer_4 = nn.Linear(hidden_size_3, hidden_size_4)
        self.output_layer = nn.Linear(hidden_size_4, 10)
        self.activation_fn = activation_fn
        self.device = device

    def forward(self, x):
        """Move ``x`` to the model's device, flatten it and score it."""
        # move input data to GPU (if available)
        x = x.to(self.device)

        # reshape tensor to: batch_size x input_size
        # (the batch dimension is inferred by view)
        x = x.view(-1, self.input_size)

        h1 = self.activation_fn(self.hidden_layer_1(x))
        h2 = self.activation_fn(self.hidden_layer_2(h1))
        h3 = self.activation_fn(self.hidden_layer_3(h2))
        h4 = self.activation_fn(self.hidden_layer_4(h3))
        out = self.output_layer(h4)

        return out

BATCH_SIZE = 32
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("device = ", device)

# instantiate model
model = MLP(
    input_size=784,
    hidden_size_1=4096,
    hidden_size_2=4096,
    hidden_size_3=4096,
    hidden_size_4=4096,
    activation_fn=nn.ReLU(),
    device=device
)

# move model to GPU (Module.to() is an in-place operation)
model.to(device)

# instantiate MNIST dataset
train_dataset = datasets.MNIST(
    root='./data',
    train=True,
    download=True,
    transform=transforms.ToTensor())
print("train dataset size = ", len(train_dataset))

# instantiate dataloader
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

loss_crt = nn.CrossEntropyLoss()
num_batches = len(train_dataloader)
for epoch in range(20):
    # restart the accumulator for every epoch; otherwise the previous epoch's
    # average would leak into the next epoch's sum
    epoch_loss = 0.0
    for idx, (batch_images, batch_labels) in enumerate(train_dataloader):
        if idx % 50 == 0:
            print("epoch %d, batch %d/%d" % (epoch, idx, num_batches))

        # move labels to GPU (if available)
        batch_labels = batch_labels.to(device)

        # batch_size x 10
        # feedforward
        out = model(batch_images)

        # compute loss
        loss = loss_crt(out, batch_labels)

        # issue:
        # loss is a Tensor, which contains the whole computational graph
        # of the model. When adding it to the epoch_loss counter, the
        # computational graph of each training loop is accumulated in the
        # epoch_loss variable, instead of being discarded. Thus, the memory
        # usage continues to increase until the out of memory error appears.
        # you can also see this issue discussed here:
        # https://pytorch.org/docs/stable/notes/faq.html#my-model-reports-cuda-runtime-error-2-out-of-memory
        # -
        # epoch_loss += loss

        # solution:
        # extract the scalar from the loss Tensor with .item()
        # since the loss variable is not further used, the memory required
        # for the computational graph is freed after every iteration
        epoch_loss += loss.item()

    # average of the per-batch losses of this epoch
    epoch_loss /= num_batches
    print("epoch loss = ", epoch_loss)