# Introduction
In this laboratory we will get our hands dirty working with Large Language Models (e.g. GPT and BERT) to do various useful things. I you haven't already, it is highly recommended to:

+ Read the [Attention is All you Need](https://arxiv.org/abs/1706.03762) paper, which is the basis for all transformer-based LLMs.
+ Watch (and potentially *code along*) with this [Andrej Karpathy video](https://www.youtube.com/watch?v=kCc8FmEb1nY) which shows you how to build an autoregressive GPT model from the ground up.

# Exercise 1: Warming Up
In this first exercise you will train a *small* autoregressive GPT model for character generation (the one used by Karpathy in his video) to generate text in the style of Dante Aligheri. Use [this file](https://archive.org/stream/ladivinacommedia00997gut/1ddcd09.txt), which contains the entire text of Dante's Inferno (**note**: you will have to delete some introductory text at the top of the file before training). Train the model for a few epochs, monitor the loss, and generate some text at the end of training. Qualitatively evaluate the results 

In [1]:
# Your code here.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import wandb
import matplotlib.pyplot as plt
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Data proeprocessing

In [2]:
from os.path import expanduser

class TextDS(Dataset):
    def __init__(self, block_size: int, path = expanduser('~') + "/datasets/commedia.txt", start: float = 0, stop: float= 0.7) -> None:
        super().__init__()
        with open(path, 'r') as f:
            self.text = f.read()
        l = len(self.text)
        self.text = self.text[int(start * l): int(stop * l)]
        self.vocab = sorted(list(set(self.text)))
        self.vocab_size = len(self.vocab)
        self.stoi = { ch: i for i, ch in enumerate(self.vocab) }
        self.itos = { i: ch for i, ch in enumerate(self.vocab) }
        self.encode = lambda s: [self.stoi[c] for c in s]
        self.decode = lambda l: ''.join([self.itos[i] for i in l])
        self.block_size = block_size

        self.data = torch.Tensor(self.encode(self.text)).type(torch.LongTensor)

    def __len__(self): return len(self.text) - (self.block_size + 1)

    def __getitem__(self, index):
        x = self.data[index: index + self.block_size]
        y = self.data[index + 1: index + self.block_size + 1]
        return x, y

In [3]:
block_size = 32
batch_size = 8
train_ds = TextDS(block_size, start=0, stop=0.3)
train_dl = DataLoader(train_ds, batch_size, True)
val_ds = TextDS(block_size, start=0.7, stop=0.9)
val_dl = DataLoader(val_ds, batch_size, True)
test_ds = TextDS(block_size, start=0.9, stop=1)
train_dl = DataLoader(train_ds, batch_size, True)

for x, y in train_dl:
    print(x.shape, y.shape)
    break


torch.Size([8, 32]) torch.Size([8, 32])


In [4]:
class AttentionHead(nn.Module):
    # take in input an embedding

    def __init__(self, input_size: int, head_size: int, block_size: int, masked: bool = True) -> None:
        super(AttentionHead, self).__init__()
        self.Q = nn.Linear(input_size, head_size)
        self.K = nn.Linear(input_size, head_size)
        self.V = nn.Linear(input_size, head_size)
        self.dropout = nn.Dropout()
        self.d = head_size
        self.masked = masked
        if self.masked:
            self.tril = torch.tril(torch.ones((block_size, block_size))).to(device)

    def forward(self, X):
        # X [B T C]
        # B, T, C = X.shape
        q = self.Q(X) # [B T D]
        k = self.K(X)
        v = self.V(X)

        qk: torch.Tensor = (q @ k.transpose(-1, -2)) / (self.d ** 0.5) # [B T D] @ [B D T] = [B T T]      
        if self.masked:
            qk = self.dropout(qk)
            qk = qk.masked_fill(self.tril == 0, float('-inf'))
        # qk = F.softmax(qk, dim=-1)
        # print("a", qk.isnan().any())
        return F.softmax(qk, dim=-1) @ v # [B T T] @ [B T D] = [B T D]
        
class MultiHeadAttention(nn.Module):

    def __init__(self, embedding_size: int, head_size: int, num_heads: int, block_size: int, masked: bool = True) -> None:
        super(MultiHeadAttention, self).__init__()
        self.heads = nn.ModuleList([AttentionHead(embedding_size, head_size, block_size, masked) for _ in range(num_heads)])
        self.projection = nn.Linear(head_size * num_heads, embedding_size)
        self.dropout = nn.Dropout()

    def forward(self, X):
        # each head has an output of [B T T], stacking at [B T (T*N)]
        concat = torch.cat([head(X) for head in self.heads], dim=-1) # last dimension
        out = self.projection(concat)
        out = self.dropout(out)
        return out

class FeedFoward(nn.Module):

    def __init__(self, n_embd):
        super(FeedFoward, self).__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(0.5)
        )

    def forward(self, x):
        return self.net(x)

class TransformerBlock(nn.Module):

    def __init__(self, embedding_size, num_heads, block_size) -> None:
        super(TransformerBlock, self).__init__()
        head_size = embedding_size // num_heads
        self.mha = MultiHeadAttention(embedding_size, head_size, num_heads, block_size)
        self.feed_forward = FeedFoward(embedding_size)
        self.ln1 = nn.LayerNorm(embedding_size)
        self.ln2 = nn.LayerNorm(embedding_size)

    def forward(self, X):
        X = X + self.mha(self.ln1(X))
        X = X + self.feed_forward(self.ln2(X))
        return X
    
class BLM(nn.Module):
    def __init__(self, vocab_size: int, embedding_size: int, num_heads: int, block_size: int, num_layers: int) -> None:
        super(BLM, self).__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_size)
        self.position_embedding_table = nn.Embedding(block_size, embedding_size)
        self.blocks = nn.Sequential(*[TransformerBlock(embedding_size, num_heads, block_size) for _ in range(num_layers)]) 
        self.ln_f = nn.LayerNorm(embedding_size) # final layer norm
        self.lm_head = nn.Linear(embedding_size, vocab_size)

    def forward(self, X):
        T = X.shape[-1]
        token_embedding = self.token_embedding_table(X)
        position_embedding = self.position_embedding_table(torch.arange(T, device=device))
        X = token_embedding + position_embedding
        X = self.blocks(X)
        X = self.ln_f(X)
        X = self.lm_head(X)
        return X

    def generate(self, X, max_new_token: int):
        # NOTE: Assume the input to be a single element, not a batch!
        # X is [T]
        T = X.shape[-1]
        for _ in range(max_new_token):
            # use last T tokens
            logits = self(X[-T:]) # [T C]
            # get the last timestep
            logits = logits[-1, :]
            # get the distribution of the next element
            probs = F.softmax(logits, dim=-1) # softmax over the last dimensiont
            next_token = torch.multinomial(probs, num_samples=1)
            X = torch.cat((X, next_token), dim=0)
        return X

# Train & Validation

In [5]:
@torch.no_grad()
def validation(model, dataloader, loss_fn):
    model.eval()
    loss = 0
    acc = 0
    for x, y in tqdm(dataloader, "Validation: ", leave=False):
        x, y = x.to(device), y.to(device)
        prediction_logits = model(x)

        B, T, C = prediction_logits.shape
        prediction_logits = prediction_logits.view(B*T, C)
        y = y.view(B*T)

        loss += loss_fn(prediction_logits, y).item()
        acc += (prediction_logits.argmax(1) == y).float().sum().item()
    return loss / len(dataloader), acc / len(dataloader.dataset)

def training(model, train_dataloader, validation_dataloader, loss_fn, optimizer, epochs, validation_freq, log):
    losses, accs = [], []
    for t in range(1, epochs + 1):
        model.train()
        for x, y in tqdm(train_dataloader, f"Epoch #{t}: ", leave=False):
            x, y = x.to(device), y.to(device)
            prediction_logits = model(x)

            B, T, C = prediction_logits.shape
            prediction_logits = prediction_logits.view(B*T, C)
            y = y.view(B*T)
            
            loss = loss_fn(prediction_logits, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        if t % validation_freq == 0:
            lss, acc = validation(model, validation_dataloader, loss_fn)
            losses.append(lss)
            accs.append(acc)
            log_dict = {"loss": lss, "accuracy": acc}
            if log:
                wandb.log(log_dict)

    return losses, accs

In [6]:
def plot_training_result(losses, accs, freq):
    x = [i * freq for i in range(1, len(losses) + 1)]
    fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 4))
    ax1.plot(x, losses)
    ax1.set_title("Loss")
    ax2.plot(x, accs)
    ax2.set_title("Accuracy")
    fig.show()

In [10]:
epochs = 10
embedding_size = 512
num_head = 16
num_layers = 6
validation_freq = 5
blm = BLM(train_ds.vocab_size, embedding_size, num_head, block_size, num_layers).to(device)
optimizer = torch.optim.Adam(blm.parameters(), lr=0.001)
loss = F.cross_entropy

In [11]:
blm.eval()
generated_tokens = blm.generate(train_ds[100][0].to(device),max_new_token=50)
generated_tokens = generated_tokens.tolist()
generated_text = train_ds.decode(generated_tokens)
print(generated_text)


  che' la diritta via era smarr`cMQVEfEXoRf'laso"NMDIpPTr"TPz>FGDpidloDOo>G?XrZlD


In [15]:
losses, accuracies = training(blm, train_dl, val_dl, loss, optimizer, epochs, validation_freq, False)
torch.save(blm.state_dict(), "lab2.1_blm.pth")

                                                               

In [14]:
blm.load_state_dict(torch.load("lab2.1_blm.pth"))

<All keys matched successfully>

In [15]:
blm.eval()
generated_tokens = blm.generate(train_ds[100][0].to(device),max_new_token=50)
generated_tokens = generated_tokens.tolist()
generated_text = train_ds.decode(generated_tokens)
print(generated_text)


  che' la diritta via era smarra,
coma acquastono.r giosade: dinolla me che mi va


# Exercise 2: Working with Real LLMs

Our toy GPT can only take us so far. In this exercise we will see how to use the [Hugging Face](https://huggingface.co/) model and dataset ecosystem to access a *huge* variety of pre-trained transformer models.

## Exercise 2.1: Installation and text tokenization

First things first, we need to install the [Hugging Face transformer library](https://huggingface.co/docs/transformers/index):

    conda install -c huggingface -c conda-forge transformers
    
The key classes that you will work with are `GPT2Tokenizer` to encode text into sub-word tokens, and the `GPT2LMHeadModel`. **Note** the `LMHead` part of the class name -- this is the version of the GPT2 architecture that has the text prediction heads attached to the final hidden layer representations (i.e. what we need to **generate** text). 

Instantiate the `GPT2Tokenizer` and experiment with encoding text into integer tokens. Compare the length of input with the encoded sequence length.

**Tip**: Pass the `return_tensors='pt'` argument to the togenizer to get Pytorch tensors as output (instead of lists).

In [16]:
# Your code here.
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

In [17]:
text = "My first GPT2 encoded text"
encoded_text = tokenizer.encode(text, return_tensors='pt')
print(encoded_text.shape) # expected shape [1 X]

torch.Size([1, 7])


In [18]:
print(tokenizer.decode(encoded_text[0]))
for i in range(encoded_text.shape[1]):
    print(tokenizer.decode(encoded_text[0, i]))

My first GPT2 encoded text
My
 first
 G
PT
2
 encoded
 text


In [19]:
coded = tokenizer.encode(["<SOS>", "<EOS>", "<PAD>", "<UNK>", "<RANDOM>"])
print(coded)
print(tokenizer.decode(coded))

[50256, 50256, 50256, 50256, 50256]
<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|>


## Exercise 2.2: Generating Text

There are a lot of ways we can, given a *prompt* in input, sample text from a GPT2 model. Instantiate a pre-trained `GPT2LMHeadModel` and use the [`generate()`](https://huggingface.co/docs/transformers/v4.27.2/en/main_classes/text_generation#transformers.GenerationMixin.generate) method to generate text from a prompt.

**Note**: The default inference mode for GPT2 is *greedy* which might not results in satisfying generated text. Look at the `do_sample` and `temperature` parameters.

In [20]:
# Your code here.
from transformers import GPT2LMHeadModel
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [21]:
generated = model.generate()
print(generated)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.


tensor([[50256,   198,   464,   717,   640,   314,  2497,   262,   649,  2196,
           286,   262,   983,    11,   314,   373,   523,  6568,    13,   314]])


In [22]:
print(tokenizer.decode(generated[0]))

<|endoftext|>
The first time I saw the new version of the game, I was so excited. I


# Exercise 3: Reusing Pre-trained LLMs (choose one)

Choose **one** of the following exercises (well, *at least* one). In each of these you are asked to adapt a pre-trained LLM (`GPT2Model` or `DistillBERT` are two good choices) to a new Natural Language Understanding task. A few comments:

+ Since GPT2 is a *autoregressive* model, there is no latent space aggregation at the last transformer layer (you get the same number of tokens out that you give in input). To use a pre-trained model for a classification or retrieval task, you should aggregate these tokens somehow (or opportunistically select *one* to use).

+ BERT models (including DistillBERT) have a special [CLS] token prepended to each latent representation in output from a self-attention block. You can directly use this as a representation for classification (or retrieval).

+ The first *two* exercises below can probably be done *without* any fine-tuning -- that is, just training a shallow MLP to classify or represent with the appropriate loss function.

# Exercise 3.1: Training a Text Classifier (easy)

Peruse the [text classification datasets on Hugging Face](https://huggingface.co/datasets?task_categories=task_categories:text-classification&sort=downloads). Choose a *moderately* sized dataset and use a LLM to train a classifier to solve the problem.

**Note**: A good first baseline for this problem is certainly to use an LLM *exclusively* as a feature extractor and then train a shallow model.

# Exercise 3.2: Training a Question Answering Model (harder)

Peruse the [multiple choice question answering datasets on Hugging Face](https://huggingface.co/datasets?task_categories=task_categories:multiple-choice&sort=downloads). Chose a *moderately* sized one and train a model to answer contextualized multiple-choice questions. You *might* be able to avoid fine-tuning by training a simple model to *rank* the multiple choices (see margin ranking loss in Pytorch).

# Exercise 3.3: Training a Retrieval Model (hardest)

The Hugging Face dataset repository contains a large number of ["text retrieval" problems](https://huggingface.co/datasets?task_categories=task_categories:text-retrieval&p=1&sort=downloads). These tasks generally require that the model measure *similarity* between text in some metric space -- naively, just a cosine similarity between [CLS] tokens can get you pretty far. Find an interesting retrieval problem and train a model (starting from a pre-trained LLM of course) to solve it.

**Tip**: Sometimes identifying the *retrieval* problems in these datasets can be half the challenge. [This dataset](https://huggingface.co/datasets/BeIR/scifact) might be a good starting point.

# Exercise 1:
## Dataset exploration
In qeusto caso uso distilbert, in quanto prevede un token di padding, che facilita l'uso di frasi di lunghezza diversa

In [33]:
from datasets import load_dataset
dataset = load_dataset("rotten_tomatoes", data_dir="~/datasets")
model_name = "distilbert-base-uncased"

Found cached dataset rotten_tomatoes (/home/dl23emacas/.cache/huggingface/datasets/rotten_tomatoes/default-data_dir=~%2Fdatasets/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46)


  0%|          | 0/3 [00:00<?, ?it/s]

In [34]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
def map_dataset(examples): return tokenizer(examples['text'], return_tensors='pt', padding="max_length", truncation=True)

In [35]:
dataset['train'][0]['text']
tokenized_dataset = dataset.map(map_dataset, batched=True)

Loading cached processed dataset at /home/dl23emacas/.cache/huggingface/datasets/rotten_tomatoes/default-data_dir=~%2Fdatasets/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/cache-e7e5012ced87c54a.arrow
Loading cached processed dataset at /home/dl23emacas/.cache/huggingface/datasets/rotten_tomatoes/default-data_dir=~%2Fdatasets/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/cache-0bd91201817c20fa.arrow
Loading cached processed dataset at /home/dl23emacas/.cache/huggingface/datasets/rotten_tomatoes/default-data_dir=~%2Fdatasets/1.0.0/40d411e45a6ce3484deed7cc15b82a53dad9a72aafd9f86f8f227134bec5ca46/cache-86c9364705de6c40.arrow


Vediamo i primi e ultimi 10 token di una frase. Ci possiamo aspettare che gli ultimi token, essendo di padding, siano uguali

In [36]:
tokenized_dataset['train'][0]['input_ids'][:10], tokenized_dataset['train'][0]['input_ids'][-10:]

([101, 1996, 2600, 2003, 16036, 2000, 2022, 1996, 7398, 2301],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

Vediamo come appare il token di padding

In [37]:
tokenizer.decode(0)

'[PAD]'

Qui viene effettuato un po' di preprocessing del dataset, dato che i modelli di huggingface si aspettano un formato particolare dei dati, che corrisponde ad un dizionario conentenente la label della sequenza e la lista dei token che la definisce, padding inclusi

In [38]:
tokenized_dataset = tokenized_dataset.remove_columns("text")
tokenized_dataset = tokenized_dataset.rename_column("label", "labels")

In [40]:
tokenized_dataset['train'][0]

{'labels': 1,
 'input_ids': [101,
  1996,
  2600,
  2003,
  16036,
  2000,
  2022,
  1996,
  7398,
  2301,
  1005,
  1055,
  2047,
  1000,
  16608,
  1000,
  1998,
  2008,
  2002,
  1005,
  1055,
  2183,
  2000,
  2191,
  1037,
  17624,
  2130,
  3618,
  2084,
  7779,
  29058,
  8625,
  13327,
  1010,
  3744,
  1011,
  18856,
  19513,
  3158,
  5477,
  4168,
  2030,
  7112,
  16562,
  2140,
  1012,
  102,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
 

Dopo aver impostato il formato del dataset su quello di pytorch, il resto è identico ad un normale flow di training

In [42]:
from torch.utils.data import DataLoader
tokenized_dataset.set_format("torch")
bsz = 8
train_dl = DataLoader(tokenized_dataset['train'], shuffle=True, batch_size=bsz)
val_dl = DataLoader(tokenized_dataset['validation'], shuffle=True, batch_size=bsz)
test_dl = DataLoader(tokenized_dataset['test'], shuffle=True, batch_size=bsz)

In [43]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [44]:
inp = tokenized_dataset['train'][:2]

Adesso gli input sono dizionari

In [45]:

inp = {k: v.to(device) for k, v in inp.items()}

In [46]:
lab = inp.pop("labels")


In [47]:
lab

tensor([1, 1], device='cuda:0')

In [21]:
out = model(**inp)

In [22]:
val = out.logits
print(val)

tensor([[ 0.0437, -0.0704],
        [-0.0142, -0.1273]], device='cuda:0', grad_fn=<AddmmBackward0>)


In [23]:
epochs = 10
print(device)
for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dl, f"Epoch #{epoch+1}/{epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

    # validation
    model.eval()
    with torch.no_grad():
        val_loss = 0
        val_accuracy = 0
        for batch in tqdm(val_dl, f"Epoch #{epoch+1} Validation", leave=False):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            val_loss += outputs.loss
            val_accuracy += (outputs.logits.argmax(1) == batch['labels']).sum().float().item()
        print(val_loss / len(val_dl), val_accuracy / len(val_dl.dataset))
torch.save(model.state_dict(), "lab2.1_version_a.pth")

cuda


Epoch #1/10:   0%|          | 0/1067 [00:00<?, ?it/s]

Epoch #1/10: 100%|██████████| 1067/1067 [06:32<00:00,  2.72it/s]
                                                                      

tensor(0.6931, device='cuda:0') 0.5


Epoch #2/10: 100%|██████████| 1067/1067 [06:32<00:00,  2.72it/s]
                                                                      

tensor(0.6931, device='cuda:0') 0.5


Epoch #3/10: 100%|██████████| 1067/1067 [06:30<00:00,  2.73it/s]
                                                                      

tensor(0.6931, device='cuda:0') 0.5


Epoch #4/10: 100%|██████████| 1067/1067 [06:38<00:00,  2.68it/s]
                                                                      

tensor(0.6932, device='cuda:0') 0.5


Epoch #5/10: 100%|██████████| 1067/1067 [06:38<00:00,  2.68it/s]
                                                                      

tensor(0.6932, device='cuda:0') 0.5


Epoch #6/10: 100%|██████████| 1067/1067 [06:30<00:00,  2.73it/s]
                                                                      

tensor(0.6932, device='cuda:0') 0.5


Epoch #7/10: 100%|██████████| 1067/1067 [06:31<00:00,  2.73it/s]
                                                                      

tensor(0.6932, device='cuda:0') 0.5


Epoch #8/10: 100%|██████████| 1067/1067 [06:33<00:00,  2.71it/s]
                                                                      

tensor(0.6931, device='cuda:0') 0.5


Epoch #9/10: 100%|██████████| 1067/1067 [06:31<00:00,  2.72it/s]
                                                                      

tensor(0.6932, device='cuda:0') 0.5


Epoch #10/10: 100%|██████████| 1067/1067 [06:30<00:00,  2.73it/s]
                                                                       

tensor(0.6932, device='cuda:0') 0.5




In [24]:
model.load_state_dict(torch.load("lab2.1_version_a.pth"))

## Hybrid Implementation
In questo caso invece di usare un modello end-to-end per la classificazione nen usiamo uno adatto ad eseguire il solo encoding, usando la sua codifica come input di una testa linear di classificazione. L'aggregazione dell'embedding di tutti gli elementi della sequenza viene fatta per concatenazione

In [25]:
import torch.nn as nn
from transformers import AutoModel
class SentenceClassifier(nn.Module):

    def __init__(self, n_classes: int = 2) -> None:
        super(SentenceClassifier, self).__init__()
        self.encoder = AutoModel.from_pretrained(model_name) # sequence of 512 tokens with embeddings with d=768
        self.head = nn.Sequential(nn.Linear(512 * 768, 768), nn.ReLU(), nn.Linear(768, n_classes))
        self.flatten = nn.Flatten()

    def forward(self, X):
        # X is expected to be a dictionary with keys {embeddings, mask_ids}
        with torch.no_grad():
            X = self.encoder(**X).last_hidden_state
        return self.head(self.flatten(X))

In [48]:
model = SentenceClassifier().to(device)
adam = torch.optim.Adam(model.parameters(), lr=1e-3)

NameError: name 'SentenceClassifier' is not defined

In [28]:
epochs = 10
for epoch in range(epochs):
    model.train()
    for batch in tqdm(train_dl, f"Epoch #{epoch+1}/{epochs}"):
        batch = {k: v.to(device) for k, v in batch.items()}
        labels = batch.pop("labels")
        outputs = model(batch)
        loss = F.cross_entropy(outputs, labels)
        
        adam.zero_grad()
        loss.backward()
        adam.step()

    # validation
    with torch.no_grad():
        val_loss = 0
        val_accuracy = 0
        for batch in tqdm(val_dl, f"Epoch #{epoch+1} Validation", leave=False):
            batch = {k: v.to(device) for k, v in batch.items()}
            labels = batch.pop("labels")
            outputs = model(batch)
            val_loss += F.cross_entropy(outputs, labels)
            val_accuracy += (outputs.argmax(1) == labels).sum().float().item()
        print(val_loss / len(val_dl), val_accuracy / len(val_dl.dataset))
torch.save(model.state_dict(), "lab2.1_version_b.pth")

Epoch #1/10: 100%|██████████| 1067/1067 [04:00<00:00,  4.43it/s]
                                                                      

tensor(0.4779, device='cuda:0') 0.7776735459662288


Epoch #2/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.36it/s]
                                                                      

tensor(0.4691, device='cuda:0') 0.7786116322701688


Epoch #3/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.37it/s]
                                                                      

tensor(0.5618, device='cuda:0') 0.7833020637898687


Epoch #4/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.37it/s]
                                                                      

tensor(0.4591, device='cuda:0') 0.7964352720450282


Epoch #5/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.37it/s]
                                                                      

tensor(0.5078, device='cuda:0') 0.797373358348968


Epoch #6/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.37it/s]
                                                                      

tensor(0.4981, device='cuda:0') 0.798311444652908


Epoch #7/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.37it/s]
                                                                      

tensor(0.5408, device='cuda:0') 0.7954971857410882


Epoch #8/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.37it/s]
                                                                      

tensor(0.6787, device='cuda:0') 0.7879924953095685


Epoch #9/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.36it/s]
                                                                      

tensor(0.7164, device='cuda:0') 0.7833020637898687


Epoch #10/10: 100%|██████████| 1067/1067 [04:04<00:00,  4.37it/s]
                                                                       

tensor(0.9151, device='cuda:0') 0.774859287054409




In [None]:
model.load_state_dict(torch.load("lab2.1_version_b.pth"))