# Installation and imports



In [1]:
%pip install transformers datasets



In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from datasets import load_dataset


from transformers import AutoTokenizer

## data

In [3]:
dataset = load_dataset("dangne/processed-wikipedia-20220301.simple")["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


dataset_infos.json:   0%|          | 0.00/808 [00:00<?, ?B/s]

data/train-00000-of-00001-b1221355b76071(…):   0%|          | 0.00/122M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2588992 [00:00<?, ? examples/s]

In [4]:
dataset = dataset.train_test_split(test_size=10_000, seed=0)

In [5]:

dataset = {k: v["text"] for k, v in dataset.items()}

for k, v in dataset.items():
    print(k, len(v))

train 2578992
test 10000


In [6]:
print(dataset["train"][8])

It is found in the region Basse-Normandie in the Calvados department in the northwest of France.


## tokenization

Like in the previous PW, we rely on GPT-2 BPE

In [7]:
tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")

tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [8]:
print(tokenizer.tokenize(dataset["train"][8]))

['It', 'Ġis', 'Ġfound', 'Ġin', 'Ġthe', 'Ġregion', 'ĠBas', 'se', '-', 'Norm', 'and', 'ie', 'Ġin', 'Ġthe', 'ĠCal', 'v', 'ados', 'Ġdepartment', 'Ġin', 'Ġthe', 'Ġnorthwest', 'Ġof', 'ĠFrance', '.']


In [9]:
text_batch = dataset["train"][:8]

huggingface's `transformers` provides a convenient way to tokenize text, it also takes care of padding the text so that we can wrap all examples of a batch in the same `Tensor`

In [10]:
seq_len=64

In [11]:
input_ids = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True, max_length=seq_len)['input_ids']

In [12]:
input_ids

tensor([[ 1544,   373,   351,   262,  3232,  4141, 12700,   878,  5033,   257,
         17091, 18382,   290,  3230,  1743,    13, 50256, 50256, 50256, 50256,
         50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
        [  464, 13478,   373,  6198,  1900,   355, 41598,   286, 31100,   287,
          9656,   290,   355, 19408,  2256, 41598,   286, 31100,   422,  9656,
           284,  4793,    13, 50256, 50256, 50256, 50256, 50256, 50256, 50256],
        [ 1544,   373, 11343,   262,  2321, 20715, 15895,   287, 23123,   351,
         14465,  2113, 30848,   329,   366,  2833,    12, 13395, 11992,  5050,
           326,  7139, 15964,   290, 17512,   286,  1981, 14821,  3341,  1911],
        [   51,   377,   541,   371,   528, 49484, 44487, 25011,   357,    26,
          4642,  1467,  2693, 14489,     8,   318,   257,  3517,  7179,  3615,
           763,    12, 27173, 14971,    13, 50256, 50256, 50256, 50256, 50256],
        [ 1026,   635,   468,  3544,   284,  435

In [13]:
input_ids.shape

torch.Size([8, 30])

Notice the padding: small texts are padded by `tokenizer.eos_token_id`

In [14]:
tokenizer.eos_token_id

50256

# Embedding


Use `nn.Embedding` to embed token indices into vectors of dimension `hidden_size=10`.

![image.png](attachment:c31f4598-d792-40ee-81d0-59208e15afc5.png)


In [15]:
hidden_size=10

In [16]:

batch_size = 8
seq_len = 30
vocab_size = 50257
max_position = 512
token_embeddings = nn.Embedding(vocab_size,hidden_size)

Combine these with position encoding

In [17]:

# Positional embedding
pos_embedding = nn.Embedding(max_position, hidden_size)


tok_embeds = token_embeddings(input_ids)


position_ids = torch.arange(seq_len).unsqueeze(0)
position_ids = position_ids.expand(batch_size, seq_len)


pos_embed = pos_embedding(position_ids)

# Combine: token + position
embeddings = tok_embeds + pos_embed
embeddings



tensor([[[ 1.0477, -1.8240,  0.8495,  ...,  0.0665,  1.5255, -0.2185],
         [-0.2184, -0.2203, -1.8651,  ..., -1.9044, -1.3931, -1.2776],
         [ 1.6267,  0.4036, -0.0120,  ..., -1.3021, -0.8347, -1.7434],
         ...,
         [ 1.1205,  2.9404, -0.2779,  ..., -2.1764, -2.3874, -2.5236],
         [ 2.2858,  0.0599, -1.1025,  ...,  0.3163, -2.6223,  0.7756],
         [ 2.6008, -1.0092,  0.6806,  ..., -1.0468, -1.1065, -0.9184]],

        [[ 1.0092, -1.0455,  0.7341,  ..., -1.4823,  0.2810, -2.6945],
         [-0.5657, -0.6164, -1.7095,  ..., -2.1836,  0.0193, -1.1992],
         [-0.5122,  0.6567, -0.9221,  ..., -2.3895, -2.9860, -1.5825],
         ...,
         [ 1.1205,  2.9404, -0.2779,  ..., -2.1764, -2.3874, -2.5236],
         [ 2.2858,  0.0599, -1.1025,  ...,  0.3163, -2.6223,  0.7756],
         [ 2.6008, -1.0092,  0.6806,  ..., -1.0468, -1.1065, -0.9184]],

        [[ 1.0477, -1.8240,  0.8495,  ...,  0.0665,  1.5255, -0.2185],
         [-0.2184, -0.2203, -1.8651,  ..., -1

# Attention
We're now ready to feed these embeddings to an attention layer.

Note the attention happens at every layer in the transformer, not only after the input embedding.


Attention is a crucial component in the transformer, it allows to capture dependencies between different positions of two sequence of elements. In our case, and in most cases in NLP applications, sequences are sentences and elements are (sub)words.
It is a powerful operation that allows to learn an alignment between each element in two sequences. It generates a score of how related each element in sequence1 and sequence2 are between each other.
Understanding how attention works and being able to implement it are essential for anyone working with transformers.

Given a query ($Q$), key ($K$), and value ($V$) tensors, the attention mechanism computes a weighted sum of the value tensor based on the similarity between the query and key tensors as shown in the following equation:

$$
\text{Attention}(Q,K,V) = \text{softmax}\Big(\frac{QK^T}{\sqrt{d_k}}\Big)V
$$

where
- $Q$ represents the query tensor.
- $K$ represents the key tensor.
- $V$ represents the value tensor.
- $d_k$ represents the dimensionality of the key tensor.

This is the image that was in the [original Transformer paper](https://proceedings.neurips.cc/paper_files/paper/2017/hash/3f5ee243547dee91fbd053c1c4a845aa-Abstract.html) and that shows the computations used in the attention.

Forget about the right part, we'll get back to that later in the lab.

![image](https://miro.medium.com/v2/resize:fit:1270/1*LpDpZojgoKTPBBt8wdC4nQ.png)



## Building a Simple Self-Attention Function

In self-attention, a single sequence acts as the query $Q$, key $K$, and value $V$, allowing attention to be computed within the sequence itself. This can be useful for syntactic where an attention head can model the relationship between part of speech like subjects and verbs.


Given an input sequence $S$ and the transformation weights $W_Q$, $W_K$ and $W_V$, complete the `self_attention` function in the cell below.

You need to implement the following:
- Calculate the query, key, and value projections using linear transformations.
- Compute the attention scores by performing the dot product between the query and key tensors.
- Apply softmax activation to the attention scores to obtain the attention weights.

$$\text{Softmax}(x_{i}) = \frac{\exp(x_i)}{\sum_j \exp(x_j)}$$

- Multiply the attention weights with the value tensor to get the attended values.
- Return the attended values.



In [18]:
import math
def softmax(x):
    return np.exp(x) / np.sum(np.exp(x), axis=0)

def self_attention(S, W_Q, W_K, W_V):
    #raise NotImplementedError()
    Q = torch.matmul(S, W_Q)
    K = torch.matmul(S, W_K)
    V = torch.matmul(S, W_V)

    # attention scores
    d_k = Q.size(-1)
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(d_k)

    # Softmax
    attention_weights = F.softmax(scores, dim=-1)

    #  Weights
    weighted_values = torch.matmul(attention_weights, V)

    return attention_weights, weighted_values

In [19]:
# Projections
W_Q = torch.rand((hidden_size, 2))  # Query weights
W_K = torch.rand((hidden_size, 2))  # Key weights
W_V = torch.rand((hidden_size, 2))  # Value weights

# Perform self-attention
attended_values = self_attention(embeddings, W_Q, W_K, W_V)

### Multi-Head

However, the relations present even in a single sentence are more than one. Think about number and gender agreement as one, the semantic relation between subject and object, the functional aspect that verb arguments have etc. All this cannot be modeled by a single head.

For this reason, we are going to extend the single-head attention function to **multi-head attention**. In the previous implementation, we had one set of weights for the input query, resulting in a single type of _relationship between the the source and target sequence_. With multi-head attention, we can utilize _multiple parallel single-head attention modules_ to obtain diverse relationships between the query and the values. The attention operation works by projecting the sequences through a multiplication with a projection matrix, and then computing the alignment score. These are are all operation that can be parallelized since there's no interdependency between each each head. For this reasons, each head could learn to model a different linguistic intereation useful for many downstream tasks, be it syntactic, semantic or generation-based..


As we've seen in class, this can be done simply by reshaping queries, keys and values.

Project back the results using $W_O$

In [20]:
def multi_head_attention(S, W_Q, W_K, W_V, W_O, num_heads=2):
    Batch_size, sequence_length, Hidden_size = S.shape
    head_dim = Hidden_size // num_heads


    Q = S @ W_Q
    K = S @ W_K
    V = S @ W_V

    # Reshape into heads
    Q = Q.view(Batch_size, sequence_length, num_heads, head_dim).permute(0, 2, 1, 3)
    K = K.view(Batch_size, sequence_length, num_heads, head_dim).permute(0, 2, 1, 3)
    V = V.view(Batch_size, sequence_length, num_heads, head_dim).permute(0, 2, 1, 3)


    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(head_dim)
    attn_weights = F.softmax(scores, dim=-1)

    #  Weights
    context = torch.matmul(attn_weights, V)

    # Merge
    context = context.permute(0, 2, 1, 3).contiguous()
    context = context.view(Batch_size, sequence_length, Hidden_size)

    # Output
    out = context @ W_O

    return attn_weights, out

In [21]:

# Projections
W_Q = torch.rand((10, 10))  # Query weights
W_K = torch.rand((10, 10))  # Key weights
W_V = torch.rand((10, 10))  # Value weights
W_O = torch.rand((10, 10))  # Output proj

attended_values = multi_head_attention(embeddings, W_Q, W_K, W_V, W_O)

### Causal Mask

GPT uses a version of self-attention called causal self-attention. When training our models for tasks like language modeling and machine translation, in practice we feed the entire train sequence to the model but, at every timestep, we want to prevent it to compute the alignment with future tokens. For this reason we use a mask that we incrementally lift at every timestep. For instance, we have a sentence that says "Libson is a great city to live in". At time 0, we feed the entire sentence to the model masking everything but the first token. Using the strikethrough format as masking, this will be what the model sees at step 0:

- Time 0: Libson ~is a great city to live in~

We then let the model generate a token a and move to step 1 where we are masking everything but the first two tokens

- Time 1: Libson is ~a great city to live in~

and so on...

- Time 2: Libson is a ~great city to live in~
- Time 3: Libson is a great ~city to live in~
- Time 4: Libson is a great city ~to live in~
- Time 5: Libson is a great city to ~live in~
- Time 6: Libson is a great city to live ~in~


![transformer](https://paullerner.github.io/aivancity_nlp/_static/attention_mask.png)

Apply mask on attention using `torch.tril` and `masked_fill`

In [22]:
def causal_multi_head_attention(S, W_Q, W_K, W_V, W_O, num_heads=2):
    Batch_size, sequence_length, Hidden_size = S.shape
    head_dim = Hidden_size // num_heads

    # projections
    Q = S @ W_Q
    K = S @ W_K
    V = S @ W_V


    Q = Q.view(Batch_size, sequence_length, num_heads, head_dim).permute(0, 2, 1, 3)
    K = K.view(Batch_size, sequence_length, num_heads, head_dim).permute(0, 2, 1, 3)
    V = V.view(Batch_size, sequence_length, num_heads, head_dim).permute(0, 2, 1, 3)

    # attention scores
    scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(head_dim)

    #causal mask
    mask = torch.triu(torch.ones(sequence_length, sequence_length, device=S.device), diagonal=1).bool()
    scores = scores.masked_fill(mask, float('-inf'))

    # Softmax
    attn = F.softmax(scores, dim=-1)

    # Weights
    context = torch.matmul(attn, V)

    # Merge
    context = context.permute(0, 2, 1, 3).contiguous().view(Batch_size, sequence_length, Hidden_size)

    # Output
    out = context @ W_O

    return attn, out

In [23]:
attended_values = causal_multi_head_attention(embeddings, W_Q, W_K, W_V, W_O)


We can now look back at the attention figure from the paper. Hopefully, you are now able to understand also the right side of the figure.

![image](https://miro.medium.com/v2/resize:fit:1270/1*LpDpZojgoKTPBBt8wdC4nQ.png)

### Pytorch Module

The last modification involves wrapping our functions into a PyTorch module. As you may have noticed, in the previous exercise, we passed the transformation weights as inputs to the function. In a real-world scenario, these matrices are learned, and PyTorch can keep track of them for us.

- Complete the missing lines on the initialization of the module and the forward pass.
- add dropout on the attention weights and the output


In [24]:
class CausalSelfAttention(nn.Module):
    def __init__(self, hidden_size=8, num_heads=2, dropout=0.1, seq_len=3):
        super().__init__()
        raise NotImplementedError()

    def forward(self, x):
        raise NotImplementedError()

In [27]:
attention_module = CausalSelfAttention()

NotImplementedError: 

In [28]:
attended_values = attention_module(embeddings)

NameError: name 'attention_module' is not defined

# Transformer


![transformer](https://paullerner.github.io/aivancity_nlp/_static/transformer_decoder.png)

## Attention is almost all you need: feedforward neural network

Simple Neural network of two layers with a ReLU activation in-between and dropout at output. The intermediate dimension should be 4 times `hidden_size`

In [29]:
class FeedForward(nn.Module):
    def __init__(self, hidden_size, dropout):
        super().__init__()
        raise NotImplementedError()

    def forward(self, x):
        raise NotImplementedError()

## Transformer Block

- stack CausalSelfAttention and FeedForward
- add residual connections
- add layer norms

In [30]:
class Block(nn.Module):
    def __init__(self, hidden_size=10, num_heads=2, dropout=0.1, seq_len=128):
        super().__init__()
        raise NotImplementedError()

    def forward(self, x):
        raise NotImplementedError()

In [32]:
block = Block()
output = block(embeddings)

NotImplementedError: 

## Complete Transformer
- word embeddings
- position embeddings
- as many blocks as you like to stack
- output layer back to the vocabulary (no need for softmax)

In [33]:
class Transformer(nn.Module):

    def __init__(self, vocab_size, hidden_size=8, num_heads=2, dropout=0.1, seq_len=3, num_layers=2):
        super().__init__()
        raise NotImplementedError()

    def forward(self, input_ids):
        raise NotImplementedError()


In [34]:
transformer = Transformer(vocab_size=?)

SyntaxError: invalid syntax (ipython-input-2957738489.py, line 1)

In [None]:
logits = transformer(input_ids)

# Training



![lm](https://paullerner.github.io/aivancity_nlp/_static/lm.png)

A language model estimates the probability of a sequence of words $w$:
$$P(w)=\prod_t^{|w|} P(w_t | w_{<t}) = P(w_1)  P(w_2|w_1)  P(w_3 | w_1 w_2)...$$

See how this turns into a sequence of classification problem:
- first $P(w_1)$
- then $P(w_2|w_1)$
- etc.

The model "predicts the next word" given a context

## Self-supervision

Remember the greatest thing about Language Modeling: we don't need to annotate data!

The model should predict the next word given the context so we just need to shift the input by 1 to get the labels!

Compute the loss on one batch using `nn.CrossEntropyLoss`. Be careful about the padding! We don't want our model to learn to predict padding at the end of text!

Remember to flatten the batch dimension with the sequence dimension

In [None]:
raise NotImplementedError()

## Training loop


In [None]:
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.mps.is_available()
    else "cpu"
)
print(f"{device} is available make sure to put your model on this device")

In [None]:
%load_ext tensorboard

In [None]:
import torch
from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter("logs/pw4")

Run tensorboard before training. Refresh during training.

In [None]:
%tensorboard --logdir logs/pw4

In [None]:
# if you don't have a GPU, reduce MAX_STEPS
MAX_STEPS = 1000
seq_len=16
transformer = Transformer(vocab_size=tokenizer.vocab_size, hidden_size=32, num_layers=2, num_heads=2, dropout=0.1, seq_len=seq_len).to(device)

optimizer = torch.optim.AdamW(transformer.parameters(), lr=0.01)

batch_size = 256
train_loader = torch.utils.data.DataLoader(dataset["train"], batch_size=batch_size, shuffle=True)
validation_loader = torch.utils.data.DataLoader(dataset["test"], batch_size=batch_size, shuffle=False)

steps = 0
while steps < MAX_STEPS:
    for text_batch in train_loader:
        input_ids = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True, max_length=seq_len)['input_ids'].to(device)
        logits = transformer(input_ids)
        raise NotImplementedError("compute loss as above")
        loss =
        writer.add_scalar("Loss/train", loss.item(), steps)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        steps += 1

        # validation
        if steps % (MAX_STEPS//2) == 0:
            with torch.no_grad():
                transformer.eval()
                valid_loss = 0
                valid_batches = 0
                for text_batch in validation_loader:
                    input_ids = tokenizer(text_batch, return_tensors='pt', padding=True, truncation=True, max_length=seq_len)['input_ids'].to(device)
                    logits = transformer(input_ids)
                    raise NotImplementedError("compute loss as above")
                    loss =
                    valid_loss += loss.item()
                    valid_batches += 1
                transformer.train()
                writer.add_scalar("Loss/validation", valid_loss/valid_batches, steps)

        if steps > MAX_STEPS:
            break

Save model

In [None]:
torch.save(transformer.state_dict(), "transformer.bin")

## Generate text

Use the previous PW decoding method to generate text

In [None]:
# load previously saved model
#transformer.load_state_dict(torch.load("transformer.bin", map_location=device))

In [None]:
dataset["test"][2]

In [None]:
prompt = "Montirat is"


In [None]:
input_ids = tokenizer([prompt], return_tensors='pt', padding=True, truncation=True, max_length=seq_len)['input_ids'].to(device)
output = generate(transformer, input_ids, greedy)

tokenizer.batch_decode(output)

# Bonus: Visualize Attentions

Now that we understand the basic mechanisms of attention, we can check the activated attention patterns in a pretrained BERT model (Devlin et al. 2018). Recall that BERT is an encoder-based transformer model which is based on a stack of self-attention blocks.

In [None]:
from transformers import BertTokenizer, BertModel
from bertviz import head_view

# Define a sample input text
text = "I will go for a run and will jump into a lake."

# Instantiate the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Tokenize the input text
tokens = tokenizer.tokenize(text)

# Convert tokens to token IDs
token_ids = tokenizer.convert_tokens_to_ids(tokens)

# Create attention mask
attention_mask = [1] * len(token_ids)

# Convert token IDs and attention mask to tensors
input_ids = torch.tensor([token_ids])
attention_mask = torch.tensor([attention_mask])

# Generate the transformer output
outputs = model(input_ids, attention_mask=attention_mask, output_attentions=True)

# Extract attentions and check the shape
outputs.attentions[0].shape

As you can see, we extracted an attention from the first layer. The first dimension is the bach, the second one is the number of heads used in the first layer, and the last two dimensions are the sequence length. Given that this was a self attention block the last two numbers are equal.

We can now use a method from the [bertviz library](https://github.com/jessevig/bertviz) and plot all the heads.

You'll see a dropdown menu that allows you the select a layer of the model (BERT has 12). You'll then see a color for every head used in that layer (BERT has 12 head per layer). By default all heads are shown, click on a color to activate/disactivate that head. It can help starting by activating only one head and checking the learned relation learn by that self attentino head. By hovering over each word you can see the attention weigths that linked that words to all the others.

**Question** Do you notice any interesting (linguistic) pattern?

In [None]:
head_view(outputs.attentions, tokens=tokens)