In [None]:
!pip install datasets transformers tokenizers torch scikit-learn -q

# LAB 2: SI700NLP

**Elaborado por**: Andres Felipe Restrepo Acevedo

In [None]:
pip install accelerate -q # pip install transformers[torch]

Note: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from datasets import load_dataset
from transformers import AutoTokenizer, TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import Whitespace
from tokenizers import Tokenizer, models, trainers, pre_tokenizers, processors

In [4]:
from transformers import PreTrainedTokenizerFast

In [5]:
# Load AG_NEWS dataset using the datasets library
dataset = load_dataset("ag_news")
dataset_train = dataset["train"]
dataset_test = dataset["test"]

In [6]:
len(dataset_train)

120000

In [7]:
# Initialize a BPE tokenizer
tokenizer = Tokenizer(models.BPE(unk_token="[UNK]"))

# Set the pre-tokenizer to split on whitespace
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

# Set up the trainer with special tokens
trainer = trainers.BpeTrainer(
    vocab_size=1000,
    special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
)

# Define a generator to yield batches of text
def batch_iterator(batch_size=1000):
    for i in range(0, len(dataset_train), batch_size):
        yield dataset_train[i:i + batch_size]["text"]

# Train the tokenizer
tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

# Set the post-processor to add special tokens
tokenizer.post_processor = processors.TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", tokenizer.token_to_id("[CLS]")),
        ("[SEP]", tokenizer.token_to_id("[SEP]")),
    ],
)
# Enable truncation and padding
tokenizer.enable_truncation(max_length=128)
tokenizer.enable_padding(pad_id=tokenizer.token_to_id("[PAD]"), pad_token="[PAD]", length=128)






In [8]:
tokenizer.save("custom_tokenizer.json")

In [9]:
fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="custom_tokenizer.json")

In [10]:
def preprocess_data(examples):
    # Tokenize the text
    encodings = tokenizer.encode_batch(examples["text"])

    # Extract input IDs and attention masks
    input_ids = [encoding.ids for encoding in encodings]
    attention_masks = [encoding.attention_mask for encoding in encodings]

    return {
        "input_ids": input_ids,
        "attention_mask": attention_masks,
        "label": examples["label"]
    }


In [11]:
# Apply preprocessing
ds_train = dataset_train.map(preprocess_data, batched=True)
ds_test = dataset_test.map(preprocess_data, batched=True)

In [12]:
ds_train[0]

{'text': "Wall St. Bears Claw Back Into the Black (Reuters) Reuters - Short-sellers, Wall Street's dwindling\\band of ultra-cynics, are seeing green again.",
 'label': 2,
 'input_ids': [1,
  54,
  150,
  202,
  16,
  33,
  996,
  34,
  699,
  33,
  215,
  157,
  99,
  95,
  923,
  215,
  11,
  219,
  12,
  219,
  15,
  383,
  692,
  15,
  78,
  245,
  133,
  14,
  54,
  150,
  202,
  104,
  124,
  10,
  78,
  63,
  82,
  286,
  71,
  101,
  58,
  61,
  112,
  103,
  745,
  144,
  15,
  62,
  84,
  73,
  605,
  14,
  192,
  151,
  64,
  101,
  66,
  104,
  96,
  362,
  16,
  2,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3,
  3],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  

In [35]:
# Define Basic Embedding Model
class BasicEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=3)
        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        embedded = self.embedding(input_ids)
        embedded = torch.mean(embedded, dim=1)  # Averaging embeddings
        logits = self.fc(embedded)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

In [36]:
# Model setup
embed_dim = 100
num_classes = 4
vocab_size = 1000
model = BasicEmbeddingModel(vocab_size, embed_dim, num_classes)
# Show model summary
print(model)

BasicEmbeddingModel(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


In [37]:
# Define compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=1)
    acc = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average='weighted')
    return {"accuracy": acc, "f1": f1}

In [41]:
# Define Training Arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=1e-2,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=5,
)

In [42]:
# Define Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    processing_class=fast_tokenizer,
    compute_metrics=compute_metrics
)

In [43]:
# Train model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4211,0.428026,0.848421,0.84799
2,0.4064,0.423046,0.849079,0.848574
3,0.4029,0.416045,0.851316,0.850995
4,0.3885,0.414268,0.854211,0.854063
5,0.3816,0.4137,0.8525,0.852185


TrainOutput(global_step=18750, training_loss=0.3984001985677083, metrics={'train_runtime': 124.6024, 'train_samples_per_second': 4815.315, 'train_steps_per_second': 150.479, 'total_flos': 0.0, 'train_loss': 0.3984001985677083, 'epoch': 5.0})

# Lab: Enhancing a Basic Embedding Model with Positional Encoding and Multi-Head Attention

## Objective
In this lab, you will modify a basic sentiment analysis model by adding two key components from transformer-based architectures: **positional encoding** and **multi-head attention**. These modifications will improve the model’s ability to capture word order and token interactions, making it more expressive.

---

## Task Overview
You are provided with a basic embedding model that performs sentiment analysis by:
- Mapping token IDs to embeddings.
- Averaging embeddings across tokens.
- Passing the result through a fully connected layer for classification.

Your tasks:
1. **Implement positional encoding** to enrich word embeddings with information about token positions.
2. **Replace the mean pooling operation** with a **multi-head self-attention layer** to allow tokens to attend to each other.

---

## Step 1: Implement Positional Encoding
Transformers lack recurrence, so they rely on positional encodings to incorporate token order. You will implement **sinusoidal positional encoding**, as described in Vaswani et al. (2017), and add it to the input embeddings.

### Requirements
- Implement a function to generate sinusoidal positional encodings.
- Ensure the encoding is applied correctly to all input embeddings.

---

## Step 2: Implement Multi-Head Self-Attention
Instead of simply averaging word embeddings, you will replace this operation with **multi-head self-attention** to allow interactions between tokens.

### Requirements
- Implement a multi-head self-attention layer.
- Replace the `torch.mean(embedded, dim=1)` operation with the **attention mechanism**.
- Ensure that the implementation correctly processes padded tokens.

---


## Starter Code
Below is the base implementation of the model. You must complete the missing parts and train the model.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class BasicEmbeddingModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=3)
        
        # Task 1: Implement positional encoding
        self.positional_encoding = PositionalEncoding(embed_dim)

        # Task 2: Implement Multi-Head Attention (replace mean pooling)
        # self.multihead_attn = ???

        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        embedded = self.embedding(input_ids)
        
        # Apply positional encoding
        embedded = self.positional_encoding(embedded)

        # Task: Replace mean pooling with Multi-Head Attention
        # attn_output, _ = ???(embedded, embedded, embedded, key_padding_mask=attention_mask)
        
        # Pooling: Extract the first token's representation (CLS token equivalent)
        # pooled_output = attn_output[:, 0, :]

        logits = self.fc(pooled_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

# Students must implement this class
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=5000):
        super().__init__()
        self.encoding = self.create_positional_encoding(embed_dim, max_len)
        
    def create_positional_encoding(self, embed_dim, max_len):
        # Task: Implement sinusoidal positional encoding here
        pass

    def forward(self, x):
        # Task: Add positional encoding to embeddings
        pass
```

## Evaluation Criteria

To verify your implementation:

- Ensure that positional encoding correctly modifies embeddings.

- Confirm that the self-attention layer replaces mean pooling effectively.

- Train the modified model on a small sentiment dataset and compare results.

## Extra Challenges

For advanced students:

- Experiment with different pooling strategies (e.g., max pooling, CLS token extraction).

- Compare learned positional embeddings vs. sinusoidal encodings.

- Add layer normalization after attention.


## Step 1: Implement Positional Encoding

In [None]:
# Model setup
embed_dim = 100
num_classes = 4
vocab_size = 1000

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Students must implement this class
class PositionalEncoding(nn.Module):
    def __init__(self, embed_dim, max_len=500):
        super().__init__()
        self.encoding = self.create_positional_encoding(embed_dim, max_len)
        
    def create_positional_encoding(self, embed_dim, max_len):
        # Task: Implement sinusoidal positional encoding here
        position = np.arange(max_len)[:, np.newaxis]
        div_term = np.exp(np.arange(0, embed_dim, 2) * -(np.log(10000.0) / embed_dim))
        pe = np.zeros((max_len, embed_dim))
        pe[:, 0::2] = np.sin(position * div_term)
        pe[:, 1::2] = np.cos(position * div_term)
        pe = torch.from_numpy(pe).float()
        pe = pe.unsqueeze(0)  # Add batch dimension
        return pe

    def forward(self, x):
        self.encoding = self.encoding.to(x.device)
        # Task: Add positional encoding to embeddings
        x = x + self.encoding[:, :x.size(1)]
        return x

### Test Positional Encoding

In [55]:
def test_positional_encoding():
    # Parameters
    batch_size = 2
    seq_length = 5
    embed_dim = 10

    pos_enc = PositionalEncoding(embed_dim, max_len=seq_length)
    
    embeddings = torch.randn(batch_size, seq_length, embed_dim)
    
    encoded_embeddings = pos_enc(embeddings)

    # Validate dimensions
    assert embeddings.shape == encoded_embeddings.shape, "Dimensionality mismatch"

    # Check that same positional indices across sequences have the same encoding added
    test_embeddings = torch.ones(batch_size, seq_length, embed_dim)
    encoded_test_embeddings = pos_enc(test_embeddings)

    # Verify that positional encodings are similar for same positions
    pos_encoding_diff = (encoded_test_embeddings[0] - encoded_test_embeddings[1]).abs().max().item()
    assert pos_encoding_diff < 1e-5, f"Positional encodings vary across identical positions: {pos_encoding_diff}"
    
    print("All tests passed successfully.")

# Ejecutamos la prueba
test_positional_encoding()

All tests passed successfully.


## Step 2: Implement Multi-Head Self-Attention


In [None]:
class BasicEmbeddingModelInproved(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, num_heads):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=3)
        
        # Task 1: Implement positional encoding
        self.positional_encoding = PositionalEncoding(embed_dim)

        # Task 2: Implement Multi-Head Attention (replace mean pooling)
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)

        self.fc = nn.Linear(embed_dim, num_classes)

    def forward(self, input_ids, attention_mask, labels=None):
        embedded = self.embedding(input_ids)
        
        # Apply positional encoding
        embedded = self.positional_encoding(embedded)

        # Transpose needed for MultiheadAttention: (seq_len, batch_size, embed_dim)
        embedded = embedded.transpose(0, 1) #new code
        
        # attention_mask needs to be inverted for key_padding_mask
        key_padding_mask = ~attention_mask.bool() # new code

        # Task: Replace mean pooling with Multi-Head Attention
        attn_output, _ = self.multihead_attn(embedded, embedded, embedded, key_padding_mask=key_padding_mask)

        # Switching back to (batch_size, seq_len, embed_dim)
        attn_output = attn_output.transpose(0, 1) #new code

        # Pooling: Extract the first token's representation (CLS token equivalent)

        pooled_output = attn_output[:, 0, :]

        logits = self.fc(pooled_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

## Train the modified model on a small sentiment dataset and compare results.

In [108]:
from transformers import EarlyStoppingCallback
def train_model(num_heads, ds_train, ds_test, fast_tokenizer):
    
    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir="./results_{}".format(num_heads),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=50,
        load_best_model_at_end=True, 
    )
    # Model setup

    model_2 = BasicEmbeddingModelInproved(vocab_size, embed_dim, num_classes,num_heads=num_heads)
    # Show model summary
    print(model_2)

    # Define Trainer
    trainer = Trainer(
        model=model_2,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_test,
        processing_class=fast_tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train model
    trainer.train()

    return trainer

In [110]:
for num_heads in [1, 2, 10,25,50,100,500]:
    print(f"Training model with {num_heads} heads")
    train_model(num_heads, ds_train, ds_test, fast_tokenizer)

Training model with 1 heads
BasicEmbeddingModelInproved(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncoding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4883,0.491868,0.825,0.824607
2,0.4317,0.433494,0.8475,0.846552
3,0.4122,0.426875,0.852105,0.852274
4,0.3953,0.420548,0.851447,0.851536
5,0.3952,0.418065,0.850526,0.850885
6,0.3824,0.405483,0.851711,0.851218
7,0.3952,0.41062,0.855921,0.855792
8,0.3728,0.419779,0.852632,0.851679
9,0.3889,0.406382,0.855395,0.855294


Training model with 2 heads
BasicEmbeddingModelInproved(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncoding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4836,0.507013,0.823158,0.822999
2,0.4335,0.440952,0.843947,0.842958
3,0.4143,0.439832,0.848816,0.848987
4,0.3985,0.431093,0.847763,0.847863
5,0.3977,0.426572,0.850921,0.85102
6,0.3845,0.413845,0.854211,0.853682
7,0.3978,0.414575,0.853684,0.853629
8,0.3742,0.42723,0.849605,0.848686
9,0.3915,0.414728,0.854737,0.854449


Training model with 10 heads
BasicEmbeddingModelInproved(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncoding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4796,0.509977,0.816316,0.816793
2,0.4368,0.446779,0.837632,0.836522
3,0.4169,0.442946,0.842237,0.842489
4,0.3938,0.429543,0.844079,0.844349
5,0.3909,0.415029,0.8525,0.852808
6,0.3807,0.412223,0.850789,0.84994
7,0.3901,0.413172,0.854342,0.854034
8,0.365,0.422173,0.851447,0.850972
9,0.3759,0.4096,0.856184,0.85576
10,0.3519,0.399591,0.855395,0.855103


Training model with 25 heads
BasicEmbeddingModelInproved(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncoding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4662,0.469954,0.832237,0.831516
2,0.4222,0.427284,0.847763,0.847482
3,0.4042,0.422783,0.849868,0.85005
4,0.3814,0.422321,0.851053,0.851234
5,0.3715,0.416003,0.856316,0.856419
6,0.3582,0.412502,0.850395,0.849333
7,0.3664,0.39745,0.856447,0.856433
8,0.3397,0.398116,0.858158,0.857771
9,0.3507,0.411088,0.859342,0.858953
10,0.3254,0.397682,0.856579,0.856275


Training model with 50 heads
BasicEmbeddingModelInproved(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncoding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4557,0.457944,0.831711,0.830976
2,0.4164,0.41977,0.846842,0.846238
3,0.3954,0.413358,0.849605,0.84986
4,0.3751,0.411707,0.851579,0.852138
5,0.3614,0.405729,0.857237,0.857253
6,0.3441,0.405391,0.853289,0.852385
7,0.346,0.40365,0.854211,0.853848
8,0.325,0.408529,0.852763,0.852122
9,0.3223,0.420796,0.8525,0.851982
10,0.2973,0.41223,0.855658,0.855308


Training model with 100 heads
BasicEmbeddingModelInproved(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncoding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4492,0.45463,0.837895,0.837819
2,0.408,0.418975,0.847895,0.847473
3,0.3872,0.419678,0.851974,0.852338
4,0.3639,0.412922,0.851974,0.852375
5,0.352,0.406112,0.863289,0.863149
6,0.3259,0.415946,0.855789,0.855329
7,0.3292,0.405048,0.858026,0.858276
8,0.3014,0.425316,0.853816,0.853594
9,0.3025,0.43136,0.858684,0.858658
10,0.2718,0.429966,0.853947,0.854021


Training model with 500 heads


AssertionError: embed_dim must be divisible by num_heads

### conclusión

Pese a agregar el enconder posicional y las capas de multihead attention, no se logro una mejora representativa en las metricas. Esto se puede deber a que se requieren mas datos de entrenamiento. Se adecuo una parada de epocas anticipada para evitar overfiting del modelo.

## Extra Challenges

* ### Experiment with different pooling strategies (e.g., max pooling, CLS token extraction).

In [111]:
class BasicEmbeddingModelInproved2(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, num_heads, pooling_strategy="cls"):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=3)
        
        # Task 1: Implement positional encoding
        self.positional_encoding = PositionalEncoding(embed_dim)

        # Task 2: Implement Multi-Head Attention (replace mean pooling)
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)

        self.fc = nn.Linear(embed_dim, num_classes)

         # Define la estrategia de pooling ("cls" o "max")
        if pooling_strategy not in ["cls", "max"]:
            raise ValueError("Pooling strategy must be either 'cls' or 'max'")
        self.pooling_strategy = pooling_strategy

    def forward(self, input_ids, attention_mask, labels=None):
        embedded = self.embedding(input_ids)
        
        # Apply positional encoding
        embedded = self.positional_encoding(embedded)

        # Transpose needed for MultiheadAttention: (seq_len, batch_size, embed_dim)
        embedded = embedded.transpose(0, 1) #new code
        
        # attention_mask needs to be inverted for key_padding_mask
        key_padding_mask = ~attention_mask.bool() # new code

        # Task: Replace mean pooling with Multi-Head Attention
        attn_output, _ = self.multihead_attn(embedded, embedded, embedded, key_padding_mask=key_padding_mask)

        # Switching back to (batch_size, seq_len, embed_dim)
        attn_output = attn_output.transpose(0, 1) #new code

        # Pooling: Extract the first token's representation (CLS token equivalent)

        # Estrategia de pooling
        if self.pooling_strategy == "cls":
            # Extraer la representación del primer token
            pooled_output = attn_output[:, 0, :]
        elif self.pooling_strategy == "max":
            # Max Pooling a través de la dimensión de secuencia
            pooled_output = torch.max(attn_output, dim=1)[0]

        logits = self.fc(pooled_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

In [115]:
from transformers import EarlyStoppingCallback
def train_model(pooling_strategy, ds_train, ds_test, fast_tokenizer):
    
    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir="./results_{}".format(num_heads),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=50,
        load_best_model_at_end=True, 
    )
    # Model setup

    model3 = BasicEmbeddingModelInproved2(vocab_size, embed_dim, num_classes, num_heads=2, pooling_strategy=pooling_strategy)
    # Show model summary
    print(model3)

    # Define Trainer
    trainer = Trainer(
        model=model3,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_test,
        processing_class=fast_tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train model
    trainer.train()

    return trainer

In [117]:
for pooling_strategy in ["cls", "max"]:
    print(f"Training model with pooling_strategy {pooling_strategy}")
    train_model(pooling_strategy, ds_train, ds_test, fast_tokenizer)

Training model with pooling_strategy cls
BasicEmbeddingModelInproved2(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncoding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4848,0.479975,0.83,0.829435
2,0.4319,0.430264,0.844737,0.8436
3,0.4117,0.432386,0.848289,0.848414
4,0.3966,0.42119,0.851316,0.851448
5,0.3958,0.418295,0.854211,0.854139
6,0.3815,0.404786,0.856974,0.856601
7,0.393,0.410053,0.857368,0.857286
8,0.3719,0.421451,0.851579,0.850849
9,0.3871,0.405998,0.855,0.854819


Training model with pooling_strategy max
BasicEmbeddingModelInproved2(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncoding()
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4822,0.486686,0.821053,0.821056
2,0.4301,0.449983,0.833684,0.832066
3,0.3983,0.417241,0.851842,0.852279
4,0.3676,0.411708,0.852763,0.852626
5,0.3529,0.400288,0.858553,0.858537
6,0.3363,0.396893,0.859868,0.859686
7,0.3337,0.391645,0.861842,0.86177
8,0.309,0.423764,0.852237,0.851271
9,0.308,0.407651,0.861842,0.861832
10,0.2775,0.414494,0.859737,0.859191


* ### Compare learned positional embeddings vs. sinusoidal encodings.

In [133]:
import torch
import torch.nn as nn
import math

class PositionalEncodingInproved(nn.Module):
    def __init__(self, embed_dim, max_len=5000, method="sinusoidal"):
        super().__init__()
        self.method = method
        assert method in ["sinusoidal", "learned"], "Method should be 'sinusoidal' or 'learned'"
        
        if method == "sinusoidal":
            # Create a constant positional encoding matrix with sinusoidal values
            position = torch.arange(0, max_len).unsqueeze(1).float()
            div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * -(math.log(10000.0) / embed_dim))
            pe = torch.zeros(max_len, 1, embed_dim)
            pe[:, 0, 0::2] = torch.sin(position * div_term)
            pe[:, 0, 1::2] = torch.cos(position * div_term)
            self.register_buffer('pe', pe)
        elif method == "learned":
            # Use nn.Embedding to learn positional encodings
            self.position_embedding = nn.Embedding(max_len, embed_dim)
        
    def forward(self, x):
        if self.method == "sinusoidal":
            x = x + self.pe[:x.size(1)]
        elif self.method == "learned":
            positions = torch.arange(x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1)
            x = x + self.position_embedding(positions)
        return x

class BasicEmbeddingModelInproved3(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, num_heads, max_len=5000, pooling_strategy="cls", pe_method="sinusoidal"):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=3)
        self.positional_encoding = PositionalEncodingInproved(embed_dim, max_len, method=pe_method)
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
        self.fc = nn.Linear(embed_dim, num_classes)
        
        assert pooling_strategy in ["cls", "max"], "pooling_strategy debe ser 'cls' o 'max'"
        self.pooling_strategy = pooling_strategy

    def forward(self, input_ids, attention_mask, labels=None):
        embedded = self.embedding(input_ids)
        embedded = self.positional_encoding(embedded)
        embedded = embedded.transpose(0, 1)

        key_padding_mask = ~attention_mask.bool()
        attn_output, _ = self.multihead_attn(embedded, embedded, embedded, key_padding_mask=key_padding_mask)
        attn_output = attn_output.transpose(0, 1)

        if self.pooling_strategy == "cls":
            pooled_output = attn_output[:, 0, :]
        elif self.pooling_strategy == "max":
            pooled_output = torch.max(attn_output, dim=1)[0]

        logits = self.fc(pooled_output)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}


In [146]:
from transformers import EarlyStoppingCallback
def train_model(ds_train, ds_test,fast_tokenizer, max_len=5000, pooling_strategy="max", pe_method="learned",num_heads=2):
    
    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir="./results_{}".format(num_heads),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=50,
        load_best_model_at_end=True, 
    )
    # Model setup

   
    model4 = BasicEmbeddingModelInproved3(vocab_size, embed_dim, num_classes, max_len = max_len, num_heads=num_heads, 
                                    pooling_strategy=pooling_strategy, pe_method=pe_method)
    # Show model summary
    print(model4)

    # Define Trainer
    trainer = Trainer(
        model=model4,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_test,
        processing_class=fast_tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train model
    trainer.train()

    return trainer

In [147]:
train_model(ds_train, ds_test, fast_tokenizer,max_len=5,pooling_strategy="max", pe_method="learned",num_heads=2)

BasicEmbeddingModelInproved3(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncodingInproved(
    (position_embedding): Embedding(5, 100)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4494,0.454042,0.839211,0.838656
2,0.4096,0.419236,0.850658,0.850111
3,0.3824,0.408171,0.856316,0.856793
4,0.3587,0.403927,0.857105,0.857138
5,0.3493,0.404277,0.861974,0.861823
6,0.3295,0.389227,0.861579,0.861091
7,0.3273,0.395465,0.864605,0.864582
8,0.3009,0.415779,0.857105,0.856392
9,0.303,0.41341,0.863553,0.863179


<transformers.trainer.Trainer at 0x335d1aab0>

In [148]:
train_model(ds_train, ds_test, fast_tokenizer,max_len=5,pooling_strategy="max", pe_method="learned",num_heads=10)

BasicEmbeddingModelInproved3(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncodingInproved(
    (position_embedding): Embedding(5, 100)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4391,0.43755,0.842632,0.841923
2,0.3911,0.403842,0.853816,0.853051
3,0.3677,0.392838,0.856842,0.856735
4,0.3362,0.390869,0.865,0.865433
5,0.3187,0.405102,0.862237,0.862278
6,0.2945,0.395169,0.861184,0.860743
7,0.2839,0.416794,0.858421,0.858639


<transformers.trainer.Trainer at 0x36ab5df10>

In [149]:
train_model(ds_train, ds_test, fast_tokenizer,max_len=10,pooling_strategy="max", pe_method="learned",num_heads=2)

BasicEmbeddingModelInproved3(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncodingInproved(
    (position_embedding): Embedding(10, 100)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.466,0.468439,0.830263,0.829341
2,0.4125,0.427626,0.847763,0.847307
3,0.3882,0.411525,0.851974,0.851678
4,0.3586,0.41042,0.853553,0.854024
5,0.3501,0.408897,0.859342,0.859048
6,0.3269,0.402689,0.859079,0.858843
7,0.3371,0.40484,0.861711,0.861642
8,0.305,0.423911,0.855921,0.855372
9,0.3043,0.419663,0.861053,0.860986


<transformers.trainer.Trainer at 0x335dc5970>

* ### Add layer normalization after attention.

In [150]:
import torch
import torch.nn as nn
import math

class PositionalEncodingInproved(nn.Module):
    def __init__(self, embed_dim, max_len=500, method="learned"):
        super().__init__()
        self.method = method
        assert method in ["sinusoidal", "learned"], "Method should be 'sinusoidal' or 'learned'"
        
        if method == "sinusoidal":
            # Create a constant positional encoding matrix with sinusoidal values
            position = torch.arange(0, max_len).unsqueeze(1).float()
            div_term = torch.exp(torch.arange(0, embed_dim, 2).float() * -(math.log(10000.0) / embed_dim))
            pe = torch.zeros(max_len, 1, embed_dim)
            pe[:, 0, 0::2] = torch.sin(position * div_term)
            pe[:, 0, 1::2] = torch.cos(position * div_term)
            self.register_buffer('pe', pe)
        elif method == "learned":
            # Use nn.Embedding to learn positional encodings
            self.position_embedding = nn.Embedding(max_len, embed_dim)
        
    def forward(self, x):
        if self.method == "sinusoidal":
            x = x + self.pe[:x.size(1)]
        elif self.method == "learned":
            positions = torch.arange(x.size(1), device=x.device).unsqueeze(0).expand(x.size(0), -1)
            x = x + self.position_embedding(positions)
        return x

class BasicEmbeddingModelInproved4(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_classes, num_heads, max_len=5000, pooling_strategy="cls", pe_method="learned"):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=3)
        self.positional_encoding = PositionalEncodingInproved(embed_dim, max_len, method=pe_method)
        self.multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
        # Add LayerNorm after attention
        self.layer_norm = nn.LayerNorm(embed_dim)
        self.fc = nn.Linear(embed_dim, num_classes)
        
        assert pooling_strategy in ["cls", "max"], "pooling_strategy debe ser 'cls' o 'max'"
        self.pooling_strategy = pooling_strategy

    def forward(self, input_ids, attention_mask, labels=None):
        embedded = self.embedding(input_ids)
        embedded = self.positional_encoding(embedded)
        embedded = embedded.transpose(0, 1)

        key_padding_mask = ~attention_mask.bool()
        attn_output, _ = self.multihead_attn(embedded, embedded, embedded, key_padding_mask=key_padding_mask)
        attn_output = attn_output.transpose(0, 1)

        if self.pooling_strategy == "cls":
            pooled_output = attn_output[:, 0, :]
        elif self.pooling_strategy == "max":
            pooled_output = torch.max(attn_output, dim=1)[0]

        logits = self.fc(pooled_output)
        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {"loss": loss, "logits": logits} if loss is not None else {"logits": logits}

In [151]:
from transformers import EarlyStoppingCallback
def train_model(ds_train, ds_test,fast_tokenizer, max_len=50, pooling_strategy="max", pe_method="learned",num_heads=2):
    
    # Define Training Arguments
    training_args = TrainingArguments(
        output_dir="./results_{}".format(num_heads),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=1e-3,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=32,
        num_train_epochs=50,
        load_best_model_at_end=True, 
    )
    # Model setup

   
    model4 = BasicEmbeddingModelInproved4(vocab_size, embed_dim, num_classes, max_len = max_len, num_heads=num_heads, 
                                    pooling_strategy=pooling_strategy, pe_method=pe_method)
    # Show model summary
    print(model4)

    # Define Trainer
    trainer = Trainer(
        model=model4,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_test,
        processing_class=fast_tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )

    # Train model
    trainer.train()

    return trainer

In [152]:
train_model(ds_train, ds_test, fast_tokenizer,max_len=50,pooling_strategy="max", pe_method="learned",num_heads=2)

BasicEmbeddingModelInproved4(
  (embedding): Embedding(1000, 100, padding_idx=3)
  (positional_encoding): PositionalEncodingInproved(
    (position_embedding): Embedding(50, 100)
  )
  (multihead_attn): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=100, out_features=100, bias=True)
  )
  (layer_norm): LayerNorm((100,), eps=1e-05, elementwise_affine=True)
  (fc): Linear(in_features=100, out_features=4, bias=True)
)


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4814,0.469952,0.828421,0.827804
2,0.4291,0.432803,0.843553,0.843225
3,0.403,0.422411,0.8475,0.84776
4,0.3788,0.411985,0.854737,0.854619
5,0.3674,0.405253,0.855132,0.855046
6,0.3539,0.403804,0.858158,0.85743
7,0.3576,0.396494,0.859211,0.858997
8,0.3353,0.443442,0.851579,0.85072
9,0.3351,0.404788,0.863816,0.863706
10,0.3048,0.414321,0.859868,0.859464


<transformers.trainer.Trainer at 0x36ab49d30>

### conclusion: 

Durante las pruebas, se experimentó con varias configuraciones del modelo para mejorar la precisión y el puntaje F1. Se implementaron diferentes estrategias de pooling, incluyendo max pooling y extracción del token CLS, pero no se observaron mejoras significativas en el rendimiento. Además, se añadió normalización por capas (Layer Normalization) después de la capa de atención, y se compararon distintos métodos de codificación posicional, como learned positional embeddings VS codificaciones sinusoidales.

En los resultados no se observa un aumento notable en la precisión ni en el puntaje F1. Esto sugiere que podría ser necesario realizar un ajuste más fino de los hiperparámetros. Variables como la tasa de aprendizaje, el número de cabezas de atención (num_heads), y el tamaño de los embeddings (embed_dim) deben ser exploradas con técnicas grid search o random search para encontrar configuraciones más óptimas.

Otra posibilidad es que los datos de entrenamiento actuales no sean suficientes para que el modelo aprenda de manera efectiva.

In [171]:
import random
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback

# Definir el espacio de búsqueda de hiperparámetros
param_grid = {
    'max_len': [32, 64, 96, 128],
    'learning_rate': [1e-4, 5e-3, 1e-2],
    'pooling_strategy': ["max", "cls"],
    'pe_method': ["learned", "sinusoidal"],
    'batch_size': [8, 16, 32, 64]
}

# Ajuste inicial para embed_dim y num_heads
possible_heads = [2, 4, 8]  
embed_dim_choices = [64, 128, 256, 512]  

# Combinaciones válidas
valid_combinations = [(embed_dim, num_heads) for embed_dim in embed_dim_choices for num_heads in possible_heads if embed_dim % num_heads == 0]

# Definir la métrica de mejor resultado
best_accuracy = 0
best_params = {}

# Conjunto para almacenar combinaciones ya probadas
tested_combinations = set()

# Realizar Random Search con un número determinado de intentos
n_trials = 20

while len(tested_combinations) < n_trials:
    # Seleccionar aleatoriamente los hiperparámetros
    max_len = random.choice(param_grid['max_len'])
    learning_rate = random.choice(param_grid['learning_rate'])
    pooling_strategy = random.choice(param_grid['pooling_strategy'])
    pe_method = random.choice(param_grid['pe_method'])
    batch_size = random.choice(param_grid['batch_size'])

    # Selecciona una combinación válida de embed_dim y num_heads
    embed_dim, num_heads = random.choice(valid_combinations)

    # Crear una tupla de la combinación actual
    current_combination = (
        max_len, 
        learning_rate, 
        pooling_strategy, 
        pe_method, 
        batch_size, 
        embed_dim, 
        num_heads
    )
    
    # Verificar si la combinación ya ha sido probada
    if current_combination in tested_combinations:
        continue
    
    # Añadir la combinación actual al conjunto de combinaciones probadas
    tested_combinations.add(current_combination)

    print({
        'max_len': max_len,
        'learning_rate': learning_rate,
        'num_heads': num_heads,
        'pooling_strategy': pooling_strategy,
        'pe_method': pe_method,
        'batch_size': batch_size
    })
    
    # Configuración del entrenamiento
    training_args = TrainingArguments(
        output_dir=f"./results_{num_heads}",
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=learning_rate,
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=50,
        load_best_model_at_end=True, 
    )
    
    # Configuración del modelo
    model4 = BasicEmbeddingModelInproved4(
        vocab_size=vocab_size, 
        embed_dim=embed_dim, 
        num_classes=num_classes,
        max_len=max_len, 
        num_heads=num_heads, 
        pooling_strategy=pooling_strategy, 
        pe_method=pe_method
    )
    
    # Definir Trainer
    trainer = Trainer(
        model=model4,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_test,
        processing_class=fast_tokenizer,
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]
    )
    
    try:
        # Entrenar y evaluar el modelo
        trainer.train()
        eval_results = trainer.evaluate()
        
        # Comparar y guardar los mejores parámetros
        if eval_results["eval_accuracy"] > best_accuracy:
            best_accuracy = eval_results["eval_accuracy"]
            best_params = {
                'max_len': max_len,
                'learning_rate': learning_rate,
                'num_heads': num_heads,
                'pooling_strategy': pooling_strategy,
                'pe_method': pe_method,
                'batch_size': batch_size
            }
    except Exception as e:
        print(f"Error en la combinación {current_combination}: {e}")
        continue

# Imprimir los mejores hiperparámetros encontrados
print("Mejores hiperparámetros:", best_params)
print("Mejor precisión:", best_accuracy)

{'max_len': 32, 'learning_rate': 0.0001, 'num_heads': 8, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 16}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.6568,0.647464,0.758947,0.757945
2,0.5425,0.544942,0.805,0.804037
3,0.4894,0.50493,0.820132,0.819745
4,0.4739,0.490307,0.827632,0.82752
5,0.4257,0.478359,0.829211,0.828759
6,0.4392,0.464823,0.834342,0.833787
7,0.4078,0.461696,0.838421,0.838432
8,0.4112,0.463692,0.837895,0.836944
9,0.3901,0.449961,0.846053,0.845673
10,0.3966,0.44451,0.844342,0.844242


{'max_len': 32, 'learning_rate': 0.01, 'num_heads': 2, 'pooling_strategy': 'max', 'pe_method': 'sinusoidal', 'batch_size': 8}
Error en la combinación (32, 0.01, 'max', 'sinusoidal', 8, 64, 2): The size of tensor a (8) must match the size of tensor b (32) at non-singleton dimension 0
{'max_len': 32, 'learning_rate': 0.0001, 'num_heads': 8, 'pooling_strategy': 'max', 'pe_method': 'learned', 'batch_size': 16}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4447,0.438722,0.840658,0.840125
2,0.3742,0.398508,0.858553,0.85805
3,0.325,0.397276,0.864474,0.86472
4,0.2786,0.435335,0.8625,0.862825
5,0.2114,0.476803,0.862895,0.862645
6,0.1778,0.543318,0.860526,0.860192


{'max_len': 128, 'learning_rate': 0.0001, 'num_heads': 8, 'pooling_strategy': 'cls', 'pe_method': 'sinusoidal', 'batch_size': 8}
Error en la combinación (128, 0.0001, 'cls', 'sinusoidal', 8, 64, 8): The size of tensor a (8) must match the size of tensor b (128) at non-singleton dimension 0
{'max_len': 128, 'learning_rate': 0.005, 'num_heads': 8, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 16}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.1389,1.183061,0.536316,0.525861
2,1.0026,0.991661,0.645395,0.638543
3,0.9403,1.065222,0.624605,0.600784
4,0.9083,1.000313,0.663947,0.664833
5,0.8536,0.856131,0.701184,0.701491
6,0.8408,0.842988,0.692105,0.690517
7,0.7984,0.755851,0.717368,0.715295
8,0.7948,0.73047,0.726316,0.72475
9,0.7613,0.8031,0.713553,0.713791
10,0.766,0.764248,0.729079,0.727999


{'max_len': 32, 'learning_rate': 0.0001, 'num_heads': 8, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 32}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5692,0.546288,0.797895,0.79685
2,0.4854,0.48743,0.820526,0.81937
3,0.4621,0.458695,0.834605,0.834347
4,0.4273,0.448725,0.834474,0.834396
5,0.4206,0.437889,0.840132,0.839511
6,0.4045,0.426958,0.843553,0.84307
7,0.4184,0.423854,0.846711,0.846718
8,0.395,0.425619,0.845789,0.844932
9,0.4034,0.417834,0.845,0.844739
10,0.3807,0.413559,0.849342,0.849376


{'max_len': 96, 'learning_rate': 0.0001, 'num_heads': 4, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 64}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.0585,0.947974,0.617105,0.614189
2,0.7956,0.768107,0.700789,0.70014
3,0.675,0.666767,0.743158,0.742378
4,0.6113,0.610994,0.765921,0.765237
5,0.5763,0.573831,0.784079,0.782902
6,0.5428,0.547347,0.796711,0.795842
7,0.5188,0.527102,0.805789,0.805263
8,0.4967,0.514855,0.811711,0.811062
9,0.4875,0.50334,0.817763,0.817541
10,0.4767,0.495343,0.819211,0.818858


{'max_len': 96, 'learning_rate': 0.01, 'num_heads': 2, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 8}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.5147,1.428622,0.305526,0.294696
2,1.2497,1.192906,0.538553,0.534223
3,1.0208,1.148587,0.571053,0.536576
4,1.0314,1.070923,0.647763,0.648199
5,1.0014,0.949492,0.656447,0.655149
6,0.9641,1.005667,0.657368,0.652653
7,1.4842,1.475851,0.251711,0.209561
8,1.3814,1.415657,0.351579,0.33227


{'max_len': 64, 'learning_rate': 0.005, 'num_heads': 2, 'pooling_strategy': 'max', 'pe_method': 'learned', 'batch_size': 8}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,9.518,8.991953,0.462368,0.449918
2,11.6118,13.836452,0.429605,0.403024
3,10.9751,10.022132,0.450658,0.437897
4,9.8745,8.6762,0.501053,0.494855
5,8.6556,9.81011,0.404211,0.381302
6,9.038,8.522888,0.460789,0.449429
7,8.1195,8.452899,0.470263,0.467847
8,7.4559,8.0046,0.460921,0.462154
9,6.7767,5.420787,0.476184,0.476683
10,6.1802,6.192521,0.498289,0.479332


{'max_len': 96, 'learning_rate': 0.005, 'num_heads': 8, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 32}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.9078,0.864747,0.6475,0.647391
2,0.8068,0.822606,0.688553,0.686
3,0.7994,0.823475,0.694605,0.692401
4,0.7834,0.881375,0.691447,0.687911
5,0.7607,0.774867,0.719342,0.71676
6,0.7537,0.768359,0.704474,0.701818
7,0.7882,0.818145,0.7075,0.705641
8,0.7563,0.788734,0.705,0.703921
9,0.762,0.7186,0.729211,0.728436
10,0.7153,0.78672,0.722105,0.71709


{'max_len': 64, 'learning_rate': 0.01, 'num_heads': 8, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 16}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,6.2143,5.772493,0.318947,0.31303
2,6.703,5.650516,0.390526,0.380913
3,5.8633,4.616207,0.355,0.347532
4,5.7318,5.987925,0.451184,0.452596
5,5.4992,5.177403,0.528553,0.526618
6,4.869,4.935938,0.489868,0.473886


{'max_len': 64, 'learning_rate': 0.0001, 'num_heads': 4, 'pooling_strategy': 'cls', 'pe_method': 'sinusoidal', 'batch_size': 32}
Error en la combinación (64, 0.0001, 'cls', 'sinusoidal', 32, 512, 4): The size of tensor a (32) must match the size of tensor b (64) at non-singleton dimension 0
{'max_len': 64, 'learning_rate': 0.01, 'num_heads': 2, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 16}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,1.4429,1.43129,0.285,0.223953
2,1.4093,1.380262,0.325263,0.324974
3,1.1887,1.216639,0.460789,0.464598
4,1.2295,1.260209,0.425395,0.409814
5,1.0889,1.094133,0.532105,0.513743
6,1.0616,1.074172,0.549605,0.549586
7,0.9281,0.966581,0.641842,0.639473
8,0.8925,0.87412,0.667763,0.666184
9,0.8929,0.901799,0.657895,0.655298
10,1.0295,1.042571,0.582237,0.57812


{'max_len': 96, 'learning_rate': 0.005, 'num_heads': 2, 'pooling_strategy': 'cls', 'pe_method': 'learned', 'batch_size': 16}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,4.1538,4.205808,0.258816,0.25756
2,5.022,4.924616,0.247368,0.233346
3,2.6604,2.486738,0.263816,0.237793
4,4.1675,4.545487,0.281974,0.254505
5,3.8831,3.735919,0.266053,0.264213
6,3.7391,4.222735,0.297895,0.298176


{'max_len': 96, 'learning_rate': 0.0001, 'num_heads': 8, 'pooling_strategy': 'cls', 'pe_method': 'sinusoidal', 'batch_size': 64}
Error en la combinación (96, 0.0001, 'cls', 'sinusoidal', 64, 64, 8): The size of tensor a (64) must match the size of tensor b (96) at non-singleton dimension 0
{'max_len': 96, 'learning_rate': 0.0001, 'num_heads': 2, 'pooling_strategy': 'max', 'pe_method': 'sinusoidal', 'batch_size': 64}
Error en la combinación (96, 0.0001, 'max', 'sinusoidal', 64, 512, 2): The size of tensor a (64) must match the size of tensor b (96) at non-singleton dimension 0
{'max_len': 128, 'learning_rate': 0.005, 'num_heads': 4, 'pooling_strategy': 'max', 'pe_method': 'learned', 'batch_size': 64}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.4991,0.496634,0.821711,0.820616
2,0.4438,0.439033,0.840263,0.840155
3,0.4307,0.459899,0.840132,0.840436
4,0.4209,0.454686,0.840658,0.841095
5,0.4216,0.441907,0.847237,0.846838


{'max_len': 32, 'learning_rate': 0.0001, 'num_heads': 2, 'pooling_strategy': 'max', 'pe_method': 'learned', 'batch_size': 64}


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,0.5199,0.47767,0.825789,0.824804
2,0.4244,0.432626,0.842237,0.84132
3,0.3745,0.408702,0.851316,0.85087
4,0.3305,0.411346,0.854211,0.853792
5,0.2939,0.417913,0.854342,0.853897
6,0.2591,0.427439,0.858947,0.858294


{'max_len': 32, 'learning_rate': 0.005, 'num_heads': 8, 'pooling_strategy': 'cls', 'pe_method': 'sinusoidal', 'batch_size': 32}


Epoch,Training Loss,Validation Loss


Error en la combinación (32, 0.005, 'cls', 'sinusoidal', 32, 256, 8): The size of tensor a (16) must match the size of tensor b (32) at non-singleton dimension 0
{'max_len': 32, 'learning_rate': 0.01, 'num_heads': 4, 'pooling_strategy': 'max', 'pe_method': 'sinusoidal', 'batch_size': 32}


Epoch,Training Loss,Validation Loss


Error en la combinación (32, 0.01, 'max', 'sinusoidal', 32, 64, 4): The size of tensor a (16) must match the size of tensor b (32) at non-singleton dimension 0
Mejores hiperparámetros: {'max_len': 32, 'learning_rate': 0.0001, 'num_heads': 8, 'pooling_strategy': 'max', 'pe_method': 'learned', 'batch_size': 16}
Mejor precisión: 0.8644736842105263


In [174]:
mejora_porcentual = ((0.8644736842105263-0.852500)/0.852500)*100
print(mejora_porcentual)

1.4045377373051318


### conclusion ajuste hiperparametros
Con un ajuste de hiperparametros , se optiene una mejora en la precision de de 1.4% , lo cual no es relevante versus el aumento de complejidad del modelo. Y confirma que se deben aumentar la cantidad de datos para poder alcancer una metrica mayor.
Mejores hiperparámetros: {'max_len': 32, 'learning_rate': 0.0001, 'num_heads': 8, 'pooling_strategy': 'max', 'pe_method': 'learned', 'batch_size': 16}
Mejor precisión: 0.8644736842105263