
# Using Transformers for Programming Language Classification

In this laboratory we will implement a self-attention module and use it to perform programming language classification. The input to the system is a program in some programming language, and the model has to predict the programming language used.

In [1]:
import torch
import datetime
from torch.utils.tensorboard import SummaryWriter
print(torch.__version__) 

torch.manual_seed(1212) # set seed to replicate results
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # use GPU if available
print(device)

! pip install tokenizers
from tokenizers import Tokenizer

# Download data

! wget -O mlnn_lab5.4.zip https://ehubox.ehu.eus/s/xFtYB7zikesHw5r/download
! unzip mlnn_lab5.4.zip
! ls



1.13.0+cu116
cuda
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m53.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.2
--2023-01-08 14:31:00--  https://ehubox.ehu.eus/s/xFtYB7zikesHw5r/download
Resolving ehubox.ehu.eus (ehubox.ehu.eus)... 158.227.0.95
Connecting to ehubox.ehu.eus (ehubox.ehu.eus)|158.227.0.95|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5249522 (5.0M) [application/zip]
Saving to: ‘mlnn_lab5.4.zip’


2023-01-08 14:31:09 (656 KB/s) - ‘mlnn_lab5.4.zip’ saved [5249522/5249522]

Archive:  mlnn_lab5.4.zip
  inflating: mlnn_lab5.4/dev.txt     
  inflating: mlnn_lab5.4/labels.txt  
  inflating: mlnn_lab5.4/test.txt    
  inflati

## Setting the hyperparameters

In [2]:
# Hyper-parameters

LEARNING_RATE = 0.0001 # should be right
HIDDEN_DIM = 128 # The dimension of the hidden state of the Self attention module
NUM_LAYERS = 4 # number of encoder layers
DROPOUT_PROB = 0.1 # dropout probability
BATCH_SIZE = 256 # batch size
NUM_EPOCHS = 50 # number of epochs
N_LABELS = 21 # nmber of classes (see 'mlnn_lab5.4/labels.txt')
SEQ_LEN = 150 # sequence max length


## The dataset class

As always, we use `Dataset`and the `torch.utils.data.Dataloader`, which will create the batches for us. Note that the dataset receives a tokenizer as input, which will split the input text into (sub)tokens.

Each instance is composed by a triplet $(x^{(i)}, m^{(i)}, y^{(i)})$ where:

- $x^{(i)}$ is a tensor with `seq_len` (sub)token ids. If the instance has less than `seq_len` tokens, the tensor is padded (and the corresponding mask is zero). The first element corresponds to the special `[CLS]`token.
- $m^{(i)}$ is a mask tensor of `seq_len` 0/1 values. If zero, the corresponding (sub)token is masked.
- $y^{(i)}$ is a 1 dimension tensor with the class label for the sequence.


In [3]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, fname, tokenizer):
        super(Dataset, self).__init__()
        self.data = []
        for i,line in enumerate(open(fname)):
            x, lbl = line.strip().split('\t')
            x = tokenizer.encode(x)
            if len(x) > SEQ_LEN:
                continue
            self.data.append([x.ids, x.attention_mask, [int(lbl)]])

    def __getitem__(self, idx):
        '''Select a tuple that is further passed to collate_fn'''
        sent = self.data[idx][0]
        mask = self.data[idx][1]
        lbl = self.data[idx][2]
        return torch.LongTensor(sent), torch.LongTensor(mask), torch.LongTensor(lbl)

    def __len__(self):
        '''Return the length of the dataset.'''
        return len(self.data)


# Tokenizer class

The tokenizer converts input strings to a list of tokens, and returns the following. Look at the code below to understand how it works. **NOTE** how the tokenizer prepends the special `[CLS]` token at the beginning of the sequence.

In [4]:
tok = Tokenizer.from_file("mlnn_lab5.4/tokenizer.json")
tok.enable_padding(direction='right',length=SEQ_LEN) # max sequence is 150
enc = tok.encode('int main(int argc, char**argv[])')
print(enc.ids)
print(enc.attention_mask)
print(enc.tokens)


[1, 1965, 2200, 12, 1965, 4434, 16, 2143, 14, 14, 3707, 63, 65, 13, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
['[CLS]', 'int', 'main', '(', 'int', 'argc', ',', 'char', '*', '*', 'argv

## Class to encode input embeddings

We calculate input embeddings by summing up two embedding types (of class `torch.nn.Embedding`)

- token embeddings (`vocab_size` $\times$ `embedding_dim`)
- positional embeddings (`vocab_size` $\times$ `embedding_dim`)

In [5]:
class InputEmbedding(torch.nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(InputEmbedding, self).__init__()
        self.embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        self.positional_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)

    def forward(self, x):
        return self.embeddings(x) + self.positional_embeddings(x)


## Self Attention module

This module perform a self-attention step on the input tokens in
$x$.

The self-attention is performed in the usual way:

$\begin{aligned}
  & Q=X\cdot W_Q\\
  & K=X\cdot W_K\\
  & V=X\cdot W_V\\
  & \mathrm{Attention}(Q,K,V) = \mathrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V
\end{aligned}$

where $d_k$ is the dimension of the hidden vectors. However, instead
of matrix multiplication we will use linear layers (`torch.nn.Linear`)
for representing all $W_Q, W_K$ and $W_V$

## `forward` method. It receives two inputs:

- `x` (size `batch_size` $\times$ `seq_len` $\times$ `hidden_dim`): subtoken embeddings.
- `mask` (size `batch_size` $\times$ `seq_len`): a mask tensor where $m_{bi} == 0$ if the $i$th word of instance $b$ is masked. If so, the value of the logit has to be set to $-1e^{10}$. See the comments below.

The output is a tensor of size (`batch_size` $\times$ `seq_len` $\times$ `hidden_dim`)

**EXERCISE**: complete the `forward` function below (see the comments in the function).



In [6]:
class SelfAttention(torch.nn.Module):
    def __init__(self, hidden_dim):
        super(SelfAttention, self).__init__()
        # hidden dimension has to be multiple of number of heads
        self.Mq = torch.nn.Linear(hidden_dim, hidden_dim)
        self.Mk = torch.nn.Linear(hidden_dim, hidden_dim)
        self.Mv = torch.nn.Linear(hidden_dim, hidden_dim)
        self.scale = float(hidden_dim) ** 0.5 # torch.sqrt(torch.tensor(hidden_dim))

    def forward(self, x, mask):
        # TODO: implement the forward pass of the self-attention module. Given an input
        # x of shape (batch_size, seq_len, input_dim), apply self-attention to obtain
        # a new vector y of shape (batch_size, seq_len, hidden_dim).

        q = self.Mq(x) # q = [batch_size, seq_len, hidden_dim]
        k = self.Mk(x) # k = [batch_size, seq_len, hidden_dim]
        v = self.Mv(x) # v = [batch_size, seq_len, hidden_dim]
        out = None

        ### WRITE YOUR CODE HERE ###
        #
        # 1. Obtain attention weights
        # 2. Mask attention weights, and apply softmax
        # 3. Obtain out vector by applying attention on v
        #
        # Useful functions:
        #
        # - torch.matmul(x,y): matrix multiplication
        # - torch.softmax(x, dim): apply softmax in the required dimension.
        #
        # Useful Tensor methods:
        #
        # - permute(shape): permute tensor to the new shape. Shape parameter
        #   has to be complatible with tensor shape.
        # - masked_fill_(condition, value): set value to elements that fullfil
        #   contition. Typical usage:
        #
        #   att = att.masked_fill(mask == 0, -1e10)
        #
        #   IMPORTANT: do this BEFORE the softmax step.
        #
        # (~5 lines of code)

        att = torch.matmul(q, k.permute(0,2,1)) / self.scale # [batch_size, seq_len, seq_len]
        # Apply mask: set to a very low value if mask is zero.
        if mask is not None:
            mask = mask.unsqueeze(-1)
            att = att.masked_fill(mask == 0, -1e10)
        att = torch.nn.functional.softmax(att, dim=-1)
        out = torch.matmul(att, v) # [batch_size, seq_len, hidden_dim]

        ########################

        return out # [batch_size, seq_len, hidden_dim]


## Test your single head self-attention module

Below you have a code snippet and the expected output.

In [7]:
torch.manual_seed(1212)
self_att = SelfAttention(HIDDEN_DIM)
emb_layer = InputEmbedding(tok.get_vocab_size(), HIDDEN_DIM)
enc = tok.encode('int main(int argc, char**argv[])')
with torch.no_grad():
  embs = emb_layer(torch.tensor(enc.ids)).unsqueeze(0)
  mask = torch.tensor(enc.attention_mask).unsqueeze(0)
  yhat = self_att(embs,mask)
  print(yhat[0])


tensor([[ 0.2640, -0.7938,  0.1397,  ..., -0.5985, -0.2393, -0.7274],
        [ 0.3177, -0.9336,  0.1001,  ..., -0.6601, -0.2226, -0.7175],
        [ 0.2549, -0.8340,  0.0670,  ..., -0.5941, -0.1998, -0.6733],
        ...,
        [ 0.3393, -0.9621,  0.1189,  ..., -0.6528, -0.2242, -0.7558],
        [ 0.3393, -0.9621,  0.1189,  ..., -0.6528, -0.2242, -0.7558],
        [ 0.3393, -0.9621,  0.1189,  ..., -0.6528, -0.2242, -0.7558]])


## Multi-head attention module

This module perform a multi-head self-attention on the input tokens in
$x$. Is very similar to the previous class, but using multi-head.

**IMPORTANT: DO NOT IMPLEMENT THIS CLASS YET**. Do it once you have successfully trained and evaluated a model using single-head self-attention layer.


A multihead self-attention with $k$ heads is performed in the usual way:

$\begin{aligned}
  & \mathrm{MultiHead}(Q,K,V) = \mathrm{Concat}(\mathrm{head}_1,\ldots,\mathrm{head}_k)\\
  & \mathrm{where}\ 
\mathrm{head}_i=\mathrm{Attention}(Q W^{Q}_i,K W^{K}_i,V W^V_i) = \mathrm{softmax}(\frac{QK^T}{\sqrt{d_k}})V
\end{aligned}$

and $d_k$ is the dimension of the hidden vectors.

We won't use different $W_i$ matrices though. Instead, the module will
contain three linear layers as in the single head case, and change the
`forward` method to perform the multi-head. See below.

## `forward` method. It receives two inputs:

- `x` (size `batch_size` $\times$ `seq_len` $\times$ `hidden_dim`): ids of the subtokens. If the sequence of an instance $i$ is less than `seq_len`, the vector is padded.
- `mask` (size `batch_size` $\times$ `seq_len`): a mask tensor where $m_{bi} == 0$ if the $i$th word of instance $b$ is masked. If so, the value of the logit has to be set to $-1e^{10}$.

The output is a tensor of size (`batch_size` $\times$ `seq_len` $\times$ `hidden_dim`)

**EXERCISE**: complete the `forward` function below (see the comments in the function). **REMEMBER THAT DO SHOULD IMPLEMENT THIS CLASS IN A SECOND STEP**, once you have successfully trained and evaluated a model using single-head self-attention layer.

In [None]:
class MultiHeadSelfAttention(torch.nn.Module):
    def __init__(self, hidden_dim, n_heads):
        super(MultiHeadSelfAttention, self).__init__()
        # hidden dimension has to be multiple of number of heads
        assert hidden_dim % n_heads == 0, f'hidden_dim ({hidden_dim}) is not multiple of n_heads ({n_heads})'
        self.n_heads = n_heads
        self.head_dim = hidden_dim // n_heads
        self.hidden_dim = hidden_dim
        self.Mq = torch.nn.Linear(hidden_dim, hidden_dim)
        self.Mk = torch.nn.Linear(hidden_dim, hidden_dim)
        self.Mv = torch.nn.Linear(hidden_dim, hidden_dim)
        self.scale = float(self.head_dim) ** 0.5 # torch.sqrt(torch.tensor(hidden_dim))

    def forward(self, x, mask):
        '''Forward pass.

        Perform a forward pass of the self-attention module. Given an input
        x of shape (bsz, seq_len, input_dim), apply self-attention to obtain
        a new vector y of shape (bsz, seq_len, hidden_dim).
        '''
        batch_size = x.size(0)
        q = self.Mq(x) # [batch_size, seq_len, hidden_dim]
        k = self.Mk(x) # [batch_size, seq_len, hidden_dim]
        v = self.Mv(x) # [batch_size, seq_len, hidden_dim]
        out = None

        ### WRITE YOUR CODE HERE ###
        #
        #
        # 1. view q,v,k as [batch_size, n_heads, seq_len, head_dim] tensor
        # 2. obtain attention weights of size [batch size, n_heads, seq_len, seq_len]
        # 3. apply mask to attention weights
        # 4. apply softmax on the attention weights
        # 5. obtain out by applying attention over v to obtain a tensor of shape [batch_size, n_heads, seq_len, head_dim]
        # 6. set out dimensions as [batch_size, seq_len, hidden_dim]
        #
        # Useful functions:
        #
        # - torch.matmul(x,y): batch matrix multiplication
        # - torch.softmax(x, dim): apply softmax in the required dimension.
        #
        # Useful Tensor methods:
        #
        # - view(shape): return a new view of a tensor. For more
        #   information refer to the documentation:
        #   https://pytorch.org/docs/stable/generated/torch.Tensor.view.html#torch.Tensor.view
        #
        # - permute(shape): permute tensor to the new shape. Shape
        #   parameter has to be complatible with tensor shape.
        #   https://pytorch.org/docs/stable/generated/torch.permute.html
        #
        # - contiguous(): Returns a contiguous in memory tensor containing the same
        #   data as self tensor. https://pytorch.org/docs/stable/generated/torch.Tensor.contiguous.html
        #
        # - masked_fill(condition, value): set value to elements that
        #   fullfil contition. Typical usage:
        #
        #   att = att.masked_fill(mask == 0, -1e10)
        #
        #   IMPORTANT: do this BEFORE the softmax step.
        # (~14 lines of code)

        q = q.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) # [batch_size, n_heads, seq_len, head_dim]
        k = k.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) # [batch_size, n_heads, seq_len, head_dim]
        v = v.view(batch_size, -1, self.n_heads, self.head_dim).permute(0, 2, 1, 3) # [batch_size, n_heads, seq_len, head_dim]
        att = torch.matmul(q, k.permute(0,1,3,2)) / self.scale # [batch size, n_heads, seq_len, seq_len]
        # Apply mask: set to a very low value if mask is zero.
        if mask is not None:
            # mask = [batch_size, seq_len]
            mask = mask.unsqueeze(1).unsqueeze(1)
            att = att.masked_fill(mask == 0, -1e10)
        att = torch.softmax(att, dim=-1)
        out = torch.matmul(att, v) # [batch_size, n_heads, seq_len, head_dim]
        out = out.permute(0, 2, 1, 3).contiguous() # [batch_size, seq_len, n_heads, head_dim]
        out = out.view(batch_size, -1, self.hidden_dim) # [batch_size, seq_len, hidden_dim]

        ########################

        return out # [batch_size, seq_len, hidden_dim]


## Test your multihead self-attention module

Below you have a code snippet and the expected output.111

In [None]:
self_att = MultiHeadSelfAttention(HIDDEN_DIM,4)
emb_layer = InputEmbedding(tok.get_vocab_size(), HIDDEN_DIM)
enc = tok.encode('int main(int argc, char**argv[])')
with torch.no_grad():
  embs = emb_layer(torch.tensor(enc.ids)).unsqueeze(0)
  mask = torch.tensor(enc.attention_mask).unsqueeze(0)
  yhat = self_att(embs,mask)
  print(yhat[0])


tensor([[ 0.7262, -0.1723, -0.0943,  ...,  0.5582,  0.3762,  0.2817],
        [ 0.6340, -0.1306, -0.0251,  ...,  0.4880,  0.3915,  0.2166],
        [ 0.6994, -0.1688, -0.0754,  ...,  0.4718,  0.3981,  0.2035],
        ...,
        [ 0.6982, -0.1652, -0.0636,  ...,  0.5276,  0.3882,  0.2533],
        [ 0.6982, -0.1652, -0.0636,  ...,  0.5276,  0.3882,  0.2533],
        [ 0.6982, -0.1652, -0.0636,  ...,  0.5276,  0.3882,  0.2533]])



## Encoder block

We will implement the encoder like this:
![Encoder architecture](https://ehubox.ehu.eus/s/jRXmMpKDCTaX4tb/download)

It consists of two submodules.

1. The first submodule consists of:
   - A self-attention module (probably multi-head)
   - A dropout module (`torch.nn.Dropout`)
   - A Layer Normalization module (`torch.nn.LayerNorm`)
2. The second submodule consists of:
   - A linear layer (`torch.nn.Linear`)
   - A dropout module (`torch.nn.Dropout`)
   - A Layer Normalization module (`torch.nn.LayerNorm`)


### `forward` function:

This function applies the encoder step to the input. See the comments in the function.

**EXERCISE**: complete the `forward` function below.


In [None]:
class EncoderBlock(torch.nn.Module):
    def __init__(self, hidden_dim, n_heads, dropout):
        super(EncoderBlock, self).__init__()

        # First sublayer: self-attention, dropout and layer normalization
        if n_heads == 1:
            self.self_attn = SelfAttention(hidden_dim)
        else:
            self.self_attn = MultiHeadSelfAttention(hidden_dim, n_heads)
        self.dropout1 = torch.nn.Dropout(dropout)
        self.norm1 = torch.nn.LayerNorm(hidden_dim)

        # Second sublayer: linear, dropout and layer normalization
        self.linear = torch.nn.Linear(hidden_dim, hidden_dim)
        self.ReLU = torch.nn.ReLU()
        self.dropout2 = torch.nn.Dropout(dropout)
        self.norm2 = torch.nn.LayerNorm(hidden_dim)

    def forward(self, x, mask):

        out = None
        ### WRITE YOUR CODE HERE ###
        #
        #
        # You have to program the following:
        #
        # s1 = LayerNorm(x + Dropout(Attention(x)))
        # s2 = s2 + Dropout(ReLU(Linear(s1)))
        # return LayerNorm(s2)
        #
        # (~6 lines of code)

        s1 = self.self_attn(x, mask)
        s1 = x + self.dropout1(s1)
        s1 = self.norm1(s1)

        s2 = self.ReLU(self.linear(s1))
        s2 = s2 + self.dropout2(s2)
        out = self.norm2(s2)

        ########################

        return out


## PLClassifier

This is the main class for Programming Language Classification. It is
composed of

- an embedding layer (`InputEmbedding`)
- a dropout layer
- an Encoder (`Encoder`) of L layers
- a linear layer to classify the first embedding of each example
  (corresponding to the `[CLS]` special token) to the corresponding
  class.


### `__init__` function:

Look carefully to the input parameters. Create the components of the
PLClassifier class.


### `forward` function:

This function sends the input to the encoder, and then pools the output to classify it.

The input is as follow:

- `x` (size `batch_size` $\times$ `seq_len`): ids of the (sub)tokens. If the sequence of an instance $i$ is less than `seq_len`, the vector is padded.
- `mask` (size `batch_size` $\times$ `seq_len`): a mask tensor where $m_{bi} == 0$ if the $i$th word of instance $b$ is masked.

The output is a tensor of size (`batch_size` $\times$ `n_classes`)

To produce the output, the function does the following:

1. Obtain embeddings input embeddings for x
2. Apply a dropout
3. Send the input to the encoder
4. Pool the output to obtain the first embedding first embedding of
   each example (corresponding to the `[CLS]` special token), and send
   it to the linear module.

**EXERCISE**: complete the `__init__` and `forward` functions below.


In [None]:
class PLClassifier(torch.nn.Module):
    def __init__(self, n_layers, hidden_dim, heads,
                 vocab_size, n_classes, dropout = DROPOUT_PROB):
        super(PLClassifier, self).__init__()

        self.embeddings = None
        self.dropout = None
        self.encoders = None
        self.fc = None

        ### WRITE YOUR CODE HERE ###
        # Note:
        # 1. You should use torch.nn.ModuleList for self.encoders
        # 2. self.encoders has n_layers EncoderBlock layers
        # 3. You should use torch.nn.Dropout for self.dropout
        #
        # (~5 lines of code)

        self.embeddings = InputEmbedding(vocab_size, hidden_dim)
        self.encoders = torch.nn.ModuleList([EncoderBlock(hidden_dim, heads, dropout)
                                             for _ in range(n_layers)])
        self.fc = torch.nn.Linear(hidden_dim, n_classes)
        self.dropout = torch.nn.Dropout(dropout)

        ########################

    def forward(self, x, mask):

        # x [batch_size, seq_len]

        out = None
        ### WRITE YOUR CODE HERE ###
        # Note:
        # 1. Obtain embeddings input embeddings for x
        # 2. Apply a dropout
        # 3. Send the input to the encoder
        # 4. Use the first embedding of each instance (corresp. to
        #   '[CLS]') and send it to the linear layer
        #
        # (~6 lines of code)

        x = self.embeddings(x)
        x = self.dropout(x)
        for encoder in self.encoders:
            x = encoder(x, mask)
        # get pooled embedding (CLS)
        x = x[:,0,:]
        out = self.fc(x)

        ########################

        return out # (batch_size, n_classes)


## Training the model

Nothing special here, just a typical training loop.

**EXERCISE**: complete the `training_step` function below.


In [None]:
def training_step(model, train_loader, loss_fn, optimizer, epoch, log_every):
    '''Train the model'''

    model.train() # training mode (for batchnorm, dropout, etc)
    for i, (sents, masks, labels) in enumerate(train_loader):
        sents = sents.to(device)
        masks = masks.to(device)
        labels = labels.to(device)

        ############# Your code here ############
        # Note:
        # 1. Zero grad the optimizer
        # 2. Feed the data into the model
        # 4. Feed the output and label to loss_fn
        # (~3 lines of code)

        optimizer.zero_grad()
        outputs = model(sents, masks)
        loss = loss_fn(outputs, labels.squeeze(-1))

        ########################
        # backward and optimize
        loss.backward()
        optimizer.step()
        if (i+1) % log_every == 0:
            print (f'Epoch [{epoch+1}/{NUM_EPOCHS}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item():.8f}')


In [None]:
def evaluate(model, loader):
    ok = 0
    total = 0
    loss_list = []
    criterion = torch.nn.CrossEntropyLoss() # combines LogSoftmax and NLLLoss
    model.eval()
    with torch.no_grad(): # do not store gradients (reduces memory consumption)
        for i, (sents, masks, labels) in enumerate(loader):
            sents = sents.to(device)
            masks = masks.to(device)
            labels = labels.to(device)
            outputs = model(sents, masks)
            loss = criterion(outputs, labels.squeeze(-1))
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            loss_list.append(loss.item())
            ok += (predicted == labels.squeeze(-1)).sum().item()
    model.train() # return to training mode (for batchnorm, dropout, etc)
    return sum(loss_list)/len(loss_list), 100*ok/total

def train(model, num_epochs, train_loader, val_loader, test_model, writer = None):
    '''Train the model'''

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
    loss_fn = torch.nn.CrossEntropyLoss() # combines LogSoftmax and NLLLoss
    steps_per_epoch = len(train_loader)
    log_every = min(15, steps_per_epoch)
    for epoch in range(num_epochs):
        training_step(model, train_loader, loss_fn, optimizer, epoch, log_every)
        current_step = (epoch + 1) * steps_per_epoch
        train_loss, train_acc = evaluate(model, train_loader)
        val_loss, val_acc = evaluate(model, val_loader)
        test_loss, test_acc = evaluate(model, test_loader)
        if writer is not None:
            writer.add_scalar('training loss', train_loss, current_step)
            writer.add_scalar('training acc', train_acc, current_step)
            writer.add_scalar('validation loss', val_loss, current_step)
            writer.add_scalar('validation acc', val_acc, current_step)
        print(f'Evaluating model in epoch {epoch+1} Val loss/acc: {val_loss:.8f}|{val_acc:.4f} Test loss/acc: {test_loss:.8f}|{test_acc:.4f}')

def predict(model, loader):
    model.eval()
    predictions = []
    with torch.no_grad(): # do not store gradients (reduces memory consumption)
        for i, (sents, masks, labels) in enumerate(loader):
            sents = sents.to(device)
            masks = masks.to(device)
            labels = labels.to(device)
            outputs = model(sents, masks)
            _, predicted = torch.max(outputs.data, 1)
            predictions.extend(predicted.tolist())
    return predictions

# load dataset

tok = Tokenizer.from_file("mlnn_lab5.4/tokenizer.json")
tok.enable_padding(direction='right',length=SEQ_LEN) # max sequence is 150

train_dataset = Dataset('mlnn_lab5.4/train.txt', tok)
val_dataset = Dataset('mlnn_lab5.4/dev.txt', tok)
test_dataset = Dataset('mlnn_lab5.4/test.txt',tok)

# dataloaders automatically shuffle the data and create batches, using the 'collate_fn' function
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE,
                                           shuffle=True)
val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE,
                                         shuffle=False)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE,
                                          shuffle=False)

## Create a single head model, train and evaluate.

 You should obtain a test accuracy of
 - ~33.6% with 1 epoch
 - ~46.8% with 2 epochs
 - ~69.7% with 20 epochs
 - ~72.5% with 50 epochs

Compute only the first two epochs during class hours (you can stop the execution afterwards), as it takes almost an hour to train the model for 50 epochs and you should see if the network is learning correctly the classification task by the end of the second epoch.

In [None]:
model=PLClassifier(NUM_LAYERS, HIDDEN_DIM, 1, tok.get_vocab_size(), N_LABELS)
model=model.to(device)

logdir = "logs/singlehead_" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
writer = SummaryWriter(logdir)

train(model, NUM_EPOCHS, train_loader, val_loader, test_loader, writer)
writer.flush()

# evaluate the model
test_loss, test_acc = evaluate(model, test_loader)
print(f'Test loss/accuracy: {test_loss:.4f}/{test_acc:.4f}')


Epoch [1/50], Step [15/164], Loss: 3.07382131
Epoch [1/50], Step [30/164], Loss: 3.00515079
Epoch [1/50], Step [45/164], Loss: 2.99736524
Epoch [1/50], Step [60/164], Loss: 2.92004418
Epoch [1/50], Step [75/164], Loss: 2.79229522
Epoch [1/50], Step [90/164], Loss: 2.75135708
Epoch [1/50], Step [105/164], Loss: 2.68686366
Epoch [1/50], Step [120/164], Loss: 2.65278411
Epoch [1/50], Step [135/164], Loss: 2.56336188
Epoch [1/50], Step [150/164], Loss: 2.44470954
Evaluating model in epoch 1 Val loss/acc: 2.25454913|34.6925 Test loss/acc: 2.26346034|33.6668
Epoch [2/50], Step [15/164], Loss: 2.31197572
Epoch [2/50], Step [30/164], Loss: 2.31117678
Epoch [2/50], Step [45/164], Loss: 2.16237593
Epoch [2/50], Step [60/164], Loss: 2.12400150
Epoch [2/50], Step [75/164], Loss: 2.14129162
Epoch [2/50], Step [90/164], Loss: 2.13652730
Epoch [2/50], Step [105/164], Loss: 1.99963379
Epoch [2/50], Step [120/164], Loss: 1.97851324
Epoch [2/50], Step [135/164], Loss: 1.96936226
Epoch [2/50], Step [150/

## STEP 2: Create a multi head model, train and evaluate

Go and complete the `MultiHeadSelfAttention` class. Then, train and evaluate the model using 4 heads.
 
 You should obtain a test accuracy of
 - ~31.0% with 1 epoch
 - ~43.6% with 2 epochs
 - ~71.0% with 20 epochs
 - ~73.2% with 50 epochs

Compute only the first two epochs during class hours (you can stop the execution afterwards), as it takes almost an hour to train the model for 50 epochs and you should see if the network is learning correctly the classification task by the end of the second epoch.


In [None]:
model=PLClassifier(NUM_LAYERS, HIDDEN_DIM, 4, tok.get_vocab_size(), N_LABELS)
model=model.to(device)

logdir = "logs/multihead_" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
writer = SummaryWriter(logdir)

train(model, NUM_EPOCHS, train_loader, val_loader, test_loader, writer)
writer.flush()

# evaluate the model
test_loss, test_acc = evaluate(model, test_loader)
print(f'Test loss/accuracy: {test_loss:.4f}/{test_acc:.4f}')


Epoch [1/50], Step [15/164], Loss: 3.06091404
Epoch [1/50], Step [30/164], Loss: 2.98493552
Epoch [1/50], Step [45/164], Loss: 2.94785380
Epoch [1/50], Step [60/164], Loss: 2.92106366
Epoch [1/50], Step [75/164], Loss: 2.81041551
Epoch [1/50], Step [90/164], Loss: 2.70765924
Epoch [1/50], Step [105/164], Loss: 2.58929300
Epoch [1/50], Step [120/164], Loss: 2.50426817
Epoch [1/50], Step [135/164], Loss: 2.40064144
Epoch [1/50], Step [150/164], Loss: 2.35465741
Evaluating model in epoch 1 Val loss/acc: 2.22974254|36.2326 Test loss/acc: 2.23652828|35.3177
Epoch [2/50], Step [15/164], Loss: 2.31760406
Epoch [2/50], Step [30/164], Loss: 2.21484375
Epoch [2/50], Step [45/164], Loss: 2.21931481
Epoch [2/50], Step [60/164], Loss: 2.12833667
Epoch [2/50], Step [75/164], Loss: 2.30586791
Epoch [2/50], Step [90/164], Loss: 2.09697914
Epoch [2/50], Step [105/164], Loss: 2.20498276
Epoch [2/50], Step [120/164], Loss: 2.12840843
Epoch [2/50], Step [135/164], Loss: 2.05285311
Epoch [2/50], Step [150/

# Ploting at the learning curves

In [None]:
%load_ext tensorboard
%tensorboard --logdir logs