## Dependencies

This codebase was inherited from 2022 and depends on torchtext, which has since been deprecated, so we have to install older, mutually compatible versions of torch, torchtext and torchvision.

In [None]:
!pip install --upgrade torch==2.3.0
!pip install --upgrade torchtext==0.18.0
!pip install --upgrade torchvision==0.18.0

Collecting torch==2.3.0
  Downloading torch-2.3.0-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch==2.3.0)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch==2.3.0)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch==2.3.0)
  Downloading nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch==2.3.0)
  Downloading nvidia_cufft_cu12-11.0.2.54-py3-none-manylin

In [None]:
import warnings
from sklearn.exceptions import UndefinedMetricWarning
warnings.filterwarnings("ignore", category=UndefinedMetricWarning)

## Shared codebase

The following cell defines the shared helper functions, the model definitions, and everything else that is not expected to change between experiments.

In [None]:
import torch


import time
import pandas as pd
import torch.optim as optim
from torch.utils.data import Dataset
from collections import Counter
from torch.nn.utils.rnn import pad_sequence
import torch.nn as nn
import torch.nn.functional as F
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import vocab as torch_vocab
from torch.utils.data import DataLoader


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

from sklearn.metrics import f1_score, precision_score, recall_score

def build_vocab_from_corpus(dataset, tokenizer):
    """
    Build a Vocab object from every (headline, body, label) item in `dataset`.

    Each headline is joined to its body with a blank line, tokenized with
    `tokenizer`, and the resulting token counts are handed to `torch_vocab`
    together with the special tokens "BOH", "EOH", "BOP" and "EOP".
    """
    token_counts = Counter(
        token
        for headline, body, _label in dataset
        for token in tokenizer(headline + "\n\n" + body)
    )
    return torch_vocab(token_counts, specials=["BOH", "EOH", "BOP", "EOP"])

def calculate_metrics(predictions, y_true):
    """
    Compute macro-averaged precision, recall and F1 for one batch.

    Both arguments hold one row per sample; the class index of each row is
    taken with argmax over dim 1 before the scores are computed.
    Returns the tuple (precision, recall, f1).
    """
    predicted_classes = predictions.argmax(dim=1)
    true_classes = y_true.argmax(dim=1)
    precision, recall, f1 = (
        metric(true_classes, predicted_classes, average='macro')
        for metric in (precision_score, recall_score, f1_score)
    )
    return precision, recall, f1

def training_step(model, dataloader, optimizer, criterion, device):
    """
    Run one training epoch over `dataloader`.

    Args:
        model: module called as `model(headlines, bodies)`.
        dataloader: iterable of `(labels, headlines, bodies)` batches; it must
            support `len()`. The target class of each sample is taken with
            argmax over dim 1 of `labels`.
        optimizer: optimizer that updates the parameters of `model`.
        criterion: loss called as `criterion(predictions, class_indices)`.
        device: device the batches are moved to before the forward pass.

    Returns:
        Tuple `(loss, precision, recall, f1)`, each averaged over the batches.

    Raises:
        ZeroDivisionError: if `dataloader` is empty.
    """
    model.train()

    epoch_loss = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f1 = 0

    for labels, headlines, bodies in dataloader:
        # Move the batch to the training device (usually GPU/TPU)
        labels = labels.to(device)
        headlines = headlines.to(device)
        bodies = bodies.to(device)

        # Reset gradients left over from the previous step
        optimizer.zero_grad()

        predictions = model(headlines, bodies)
        loss = criterion(predictions, torch.argmax(labels, dim=1))

        loss.backward()
        optimizer.step()

        # Metrics run on CPU copies that are detached from the autograd graph;
        # the input batches are not needed there, so they stay on the device.
        precision, recall, f1 = calculate_metrics(
            predictions.detach().cpu(), labels.cpu())

        epoch_loss += loss.item()
        epoch_precision += precision
        epoch_recall += recall
        epoch_f1 += f1

    n_batches = len(dataloader)
    return (epoch_loss / n_batches, epoch_precision / n_batches,
            epoch_recall / n_batches, epoch_f1 / n_batches)

def evaluate(model, dataloader, criterion, device=None):
    """
    Evaluate `model` over `dataloader` without tracking gradients.

    Args:
        model: module called as `model(headlines, bodies)`.
        dataloader: iterable of `(labels, headlines, bodies)` batches; it must
            support `len()`. The target class of each sample is taken with
            argmax over dim 1 of `labels`.
        criterion: loss called as `criterion(predictions, class_indices)`.
        device: device the batches are moved to. When omitted, the device of
            the model's first parameter is used (CPU if it has no parameters),
            so the batches always land where the model lives.

    Returns:
        Tuple `(loss, precision, recall, f1)`, each averaged over the batches.

    Raises:
        ZeroDivisionError: if `dataloader` is empty.
    """
    if device is None:
        first_param = next(model.parameters(), None)
        device = first_param.device if first_param is not None else torch.device('cpu')

    epoch_loss = 0
    epoch_precision = 0
    epoch_recall = 0
    epoch_f1 = 0

    model.eval()

    # Gradients are not needed during evaluation
    with torch.no_grad():
        for labels, headlines, bodies in dataloader:
            labels = labels.to(device)
            headlines = headlines.to(device)
            bodies = bodies.to(device)

            predictions = model(headlines, bodies)

            # Cross-entropy of the predictions against the true class indices
            loss = criterion(predictions, torch.argmax(labels, dim=1))

            # Metrics are computed on CPU copies; the inputs stay on the device
            precision, recall, f1 = calculate_metrics(predictions.cpu(), labels.cpu())

            epoch_loss += loss.item()
            epoch_precision += precision
            epoch_recall += recall
            epoch_f1 += f1

    n_batches = len(dataloader)
    return (epoch_loss / n_batches, epoch_precision / n_batches,
            epoch_recall / n_batches, epoch_f1 / n_batches)


def run_experiment(model, model_name, criterion, train_dataloader, valid_dataloader, n_epochs, device,
                   optimizer=None, save_dir='/content/drive/MyDrive/Tesis'):
    """
    Train `model` for `n_epochs` epochs, validating after each one.

    Whenever the validation loss improves on the best value seen so far, the
    model's `state_dict` is saved to `<save_dir>/<model_name>.pt`. Per-epoch
    timing, loss, F1, precision and recall are printed for both splits.

    Args:
        model: module to train; it is moved to `device`.
        model_name: file stem used for the saved checkpoint.
        criterion: loss module; it is moved to `device`.
        train_dataloader: batches passed to `training_step`.
        valid_dataloader: batches passed to `evaluate`.
        n_epochs: number of epochs to run.
        device: device used for training.
        optimizer: optimizer for the parameters of `model`. When omitted, the
            module-level `optimizer` is used, which is how earlier call sites
            supplied it.
        save_dir: directory the checkpoint is written to.

    Raises:
        ValueError: if no optimizer is passed and no module-level `optimizer` exists.
    """
    if optimizer is None:
        # Backward compatibility: fall back to the global the function used to read.
        optimizer = globals().get('optimizer')
        if optimizer is None:
            raise ValueError(
                'No optimizer given: pass `optimizer=` or define a module-level `optimizer`.')

    model.to(device)
    criterion.to(device)
    best_valid_loss = float('inf')

    for epoch in range(n_epochs):

        start_time = time.time()

        # Train on every batch of the training dataloader
        train_loss, train_precision, train_recall, train_f1 = training_step(
            model, train_dataloader, optimizer, criterion, device)

        print("Finishing training epoch")

        # Evaluate on the validation dataloader
        valid_loss, valid_precision, valid_recall, valid_f1 = evaluate(
            model, valid_dataloader, criterion)

        end_time = time.time()

        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        # Keep the weights of the best epoch on disk so they can be reloaded later,
        # even if training is interrupted early.
        if valid_loss < best_valid_loss:
            best_valid_loss = valid_loss
            torch.save(model.state_dict(), '{}/{}.pt'.format(save_dir, model_name))

        print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
        print(
            f'\tTrain Loss: {train_loss:.3f} | Train f1: {train_f1:.2f} | Train precision: {train_precision:.2f} | Train recall: {train_recall:.2f}'
        )
        print(
            f'\t Val. Loss: {valid_loss:.3f} |  Val. f1: {valid_f1:.2f} |  Val. precision: {valid_precision:.2f} | Val. recall: {valid_recall:.2f}'
        )


def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole (minutes, seconds)."""
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - (whole_minutes * 60))
    return whole_minutes, leftover_seconds


class FakeNewsDataset(Dataset):
    """
    Dataset backed by a single CSV file located at `<path>/<split>.csv`.

    The file must provide the columns `headline`, `body` and `label`;
    each item is returned as the tuple (headline, body, label).
    """
    def __init__(self, path='data/clickbait', split='train'):
        if split in ('train', 'test', 'valid'):
            self.split = split
        else:
            raise ValueError(f'Dataset {split} not found, it must be train, valid or test')
        frame = pd.read_csv(f'{path}/{split}.csv')
        # Columns are materialised as plain lists so items can be indexed directly.
        self.headlines, self.bodies, self.labels = (
            list(frame[column]) for column in ('headline', 'body', 'label')
        )

    def __len__(self):
        """Number of rows read from the CSV file."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the (headline, body, label) tuple stored at position `idx`."""
        return tuple(column[idx] for column in (self.headlines, self.bodies, self.labels))


class ClickbaitDataset(Dataset):
    """
    Dataset class for the Webis Clickbait 2017 dataset.

    Reads `<path>/<folder>/<folder>.csv`, where `<folder>` equals the split
    name except that the 'valid' split is stored under 'validation'.
    The file must provide the columns `headline`, `body` and `truthClass`;
    each item is returned as the tuple (headline, body, truthClass).
    """
    def __init__(self, path='data/clickbait', split='train'):
        if split in ('train', 'test', 'valid'):
            self.split = split
        else:
            raise ValueError(f'Dataset {split} not found, it must be train, valid or test')
        # Only the on-disk folder name differs for validation; `self.split` keeps 'valid'.
        folder = 'validation' if split == 'valid' else split
        frame = pd.read_csv(f'{path}/{folder}/{folder}.csv')
        self.headlines, self.bodies, self.labels = (
            list(frame[column]) for column in ('headline', 'body', 'truthClass')
        )

    def __len__(self):
        """Number of rows read from the CSV file."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the (headline, body, truthClass) tuple stored at position `idx`."""
        return tuple(column[idx] for column in (self.headlines, self.bodies, self.labels))


class IncongruenceDataset(Dataset):
    """
    Dataset for the FNC-1 dataset, read from `<path>/<split>.csv`.

    The file must provide the columns `headline`, `body` and `label`;
    each item is returned as the tuple (headline, body, label).
    """

    def __init__(self, path='../data/fnc-1', split='train'):
        if split in ('train', 'test', 'valid'):
            self.split = split
        else:
            raise ValueError(f'Dataset {split} not found, it must be train, valid or test')
        frame = pd.read_csv(f'{path}/{split}.csv')
        self.headlines, self.bodies, self.labels = (
            list(frame[column]) for column in ('headline', 'body', 'label')
        )

    def __len__(self):
        """Number of rows read from the CSV file."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the (headline, body, label) tuple stored at position `idx`."""
        return tuple(column[idx] for column in (self.headlines, self.bodies, self.labels))


class BiDualEncoder(nn.Module):
    """
    Two independent bidirectional LSTM encoders (headline and body) that share
    one embedding layer, followed by a linear classifier.

    Args:
        vocab_size: number of embedding rows; only used when `vectors` is None.
        vectors: optional pretrained embedding matrix. When given, the embedding
            layer is built from it and `vocab_size` is not used for the layer.
        embed_dim: embedding size; must match the width of `vectors` if given.
        hidden_dim: hidden size of each LSTM direction.
        hidden_layers: number of stacked LSTM layers.
        output_dim: number of output classes.
        freeze_embed: whether pretrained embeddings are kept frozen.
        dropout: dropout probability applied before the classifier.
    """

    def __init__(
            self, vocab_size,
            vectors=None,
            embed_dim=300,
            hidden_dim=128,
            hidden_layers=2,
            output_dim=2,
            freeze_embed=True,
            dropout=0.3,
        ):

        super().__init__()

        self.hidden_dim = hidden_dim

        # Shared embedding layer. `from_pretrained` is a classmethod, so it is called
        # on the class: building a (vocab_size x embed_dim) layer first would only
        # allocate and randomly initialise a matrix that is thrown away immediately.
        if vectors is not None:
            self.embedding_layer = nn.Embedding.from_pretrained(vectors, freeze=freeze_embed)
        else:
            self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # Encoders
        self.headline_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=hidden_layers, bidirectional=True)
        self.body_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=hidden_layers, bidirectional=True)

        # Prediction
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        # Not used by `forward`, which returns raw scores; kept so the attribute still exists.
        self.softmax = nn.Softmax(dim=0)

    def forward(self, headline, body):
        """
        Score a batch of (headline, body) pairs.

        Both inputs are token-id tensors laid out as (sequence, batch), since the
        LSTMs are built without `batch_first`. Returns the unnormalised class
        scores produced by the linear layer, one row per sample.
        """
        # Representations
        headline_embedding = self.embedding_layer(headline)
        body_embedding = self.embedding_layer(body)

        # Headline latent representation
        headline_lstm, _ = self.headline_lstm(headline_embedding)

        # Body latent representation
        body_lstm, _ = self.body_lstm(body_embedding)

        # Average the last-time-step outputs of both encoders. Only the first
        # `hidden_dim` features are read, i.e. the forward-direction half of each
        # bidirectional output; the backward half is not used.
        output_lstm = (headline_lstm[-1, :, :self.hidden_dim] + body_lstm[-1, :, :self.hidden_dim]) / 2
        output = self.fc(self.dropout(output_lstm))
        return output


class BiConditionalDualEncoder(nn.Module):
    """
    Conditional dual encoder: the headline is encoded first and its final LSTM
    state is used as the initial state of the body encoder. Both encoders are
    bidirectional and share one embedding layer.

    Args:
        vocab_size: number of embedding rows; only used when `vectors` is None.
        vectors: optional pretrained embedding matrix. When given, the embedding
            layer is built from it and `vocab_size` is not used for the layer.
        embed_dim: embedding size; must match the width of `vectors` if given.
        hidden_dim: hidden size of each LSTM direction.
        hidden_layers: number of stacked LSTM layers.
        output_dim: number of output classes.
        freeze_embed: whether pretrained embeddings are kept frozen.
        dropout: dropout probability used between LSTM layers and before the classifier.
    """

    def __init__(
            self, vocab_size,
            vectors=None,
            embed_dim=300,
            hidden_dim=128,
            hidden_layers=2,
            output_dim=2,
            freeze_embed=True,
            dropout=0.3,
        ):

        super().__init__()

        self.hidden_dim = hidden_dim

        # Shared embedding layer. `from_pretrained` is a classmethod, so it is called
        # on the class: building a (vocab_size x embed_dim) layer first would only
        # allocate and randomly initialise a matrix that is thrown away immediately.
        if vectors is not None:
            self.embedding_layer = nn.Embedding.from_pretrained(vectors, freeze=freeze_embed)
        else:
            self.embedding_layer = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_dim)

        # Encoders. Inter-layer dropout only exists with more than one layer; passing
        # it to a single-layer LSTM has no effect other than a warning, so it is
        # disabled in that case (the same guard LTSM_Encoder uses).
        lstm_dropout = dropout if hidden_layers > 1 else 0
        self.headline_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=hidden_layers, bidirectional=True, dropout=lstm_dropout)
        self.body_lstm = nn.LSTM(embed_dim, hidden_dim, num_layers=hidden_layers, bidirectional=True, dropout=lstm_dropout)

        # Prediction
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, headline, body):
        """
        Score a batch of (headline, body) pairs.

        Both inputs are token-id tensors laid out as (sequence, batch), since the
        LSTMs are built without `batch_first`. Returns the unnormalised class
        scores produced by the linear layer, one row per sample.
        """
        # Representations
        headline_embedding = self.embedding_layer(headline)
        body_embedding = self.embedding_layer(body)

        # Headline latent representation: only the final (hidden, cell) state is kept
        _, (h_hidden, h_cell) = self.headline_lstm(headline_embedding)

        # Body latent representation, conditioned on the headline's final state
        body_lstm, _ = self.body_lstm(body_embedding, (h_hidden, h_cell))

        # Average the two direction halves of the body output at the last time step
        output_lstm = (body_lstm[-1, :, :self.hidden_dim] + body_lstm[-1, :, self.hidden_dim:]) / 2
        output = self.fc(self.dropout(output_lstm))
        return output


class LTSM_Encoder(nn.Module):
    """
    Two independent LSTM encoders (headline and body) over frozen pretrained
    embeddings; their last-time-step outputs are concatenated and classified.

    Args:
        input_dim: vocabulary size. Not used to build the layer: the embedding
            matrix is taken from `vectors`.
        embedding_dim: embedding size; must match the width of `vectors`.
        hidden_dim: hidden size of each LSTM direction.
        output_dim: number of output classes.
        num_layers: number of stacked LSTM layers.
        padding_idx: accepted for interface compatibility; currently unused.
        vectors: pretrained embedding matrix, loaded frozen.
        bidirectional: whether both LSTMs run in both directions.
        dropout: dropout probability used between LSTM layers (when there is
            more than one) and before the classifier.
    """

    def __init__(self, input_dim, embedding_dim, hidden_dim, output_dim, num_layers, padding_idx, vectors, bidirectional=False, dropout=0.3):
        super().__init__()
        # Embedding layer. `from_pretrained` is a classmethod, so it is called on the
        # class: building an (input_dim x embedding_dim) layer first would only
        # allocate and randomly initialise a matrix that is thrown away immediately.
        self.embed_layer = nn.Embedding.from_pretrained(vectors, freeze=True)
        # LSTM layers
        self.hidden_dim = hidden_dim
        lstm_dropout = dropout if num_layers > 1 else 0
        self.lstm_headline = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=lstm_dropout)
        self.lstm_body = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, bidirectional=bidirectional, dropout=lstm_dropout)
        # Output layer: headline and body outputs are concatenated, and each one is
        # twice as wide when the LSTMs are bidirectional.
        self.fc = nn.Linear(hidden_dim * 4 if bidirectional else hidden_dim * 2, output_dim)
        # Dropout
        self.dropout = nn.Dropout(dropout)

    def forward(self, headline, body):
        """
        Score a batch of (headline, body) pairs.

        Both inputs are token-id tensors laid out as (sequence, batch), since the
        LSTMs are built without `batch_first`. Returns the unnormalised class
        scores produced by the linear layer, one row per sample.
        """
        # Embed text
        headline_embed = self.embed_layer(headline)
        body_embed = self.embed_layer(body)
        # LSTM layers; only the per-step outputs are used, not the final states
        outputs_heads, _ = self.lstm_headline(headline_embed)
        outputs_bodies, _ = self.lstm_body(body_embed)
        # Keep the output of the last time step of each encoder
        outputs = torch.cat((outputs_heads[-1], outputs_bodies[-1]), 1)
        # Make predictions
        predictions = self.fc(self.dropout(outputs))
        return predictions

## Tokenization and collation

Ideally this code should rarely change, but we prefer to keep it separate from the shared, static codebase above.



In [None]:
import re
from torchtext.data.utils import get_tokenizer

tokenizer = get_tokenizer('basic_english')

def vocab_transform(x):
    """
    Tokenize the text `x` and map every token to its id in the global `vocab`.

    Tokens that are not in `vocab` are mapped to index 0. Both `tokenizer` and
    `vocab` are looked up when the function is called, not when it is defined.
    """
    return [vocab[token] if token in vocab else 0 for token in tokenizer(x)]

def tensor_transform(x):
    """
    Convert a sequence of token ids into a 1-D int64 tensor.

    The dtype is forced because an empty list would otherwise become a float32
    tensor, which cannot be used to index an embedding layer.
    """
    return torch.tensor(x, dtype=torch.long)

def label_transform(label):
    """
    Map a stance label to its two-class one-hot vector.

    'agree' and 'discuss' share the vector [1, 0]; 'disagree' maps to [0, 1].

    Raises:
        ValueError: if `label` is not one of the three known stances. Returning
            None here would only fail later, when the batch labels are turned
            into a tensor, with a far less helpful message.
    """
    one_hot_by_label = {
        'agree': (1, 0),
        'disagree': (0, 1),
        'discuss': (1, 0),
    }
    try:
        # A fresh list per call, so callers can never mutate the table.
        return list(one_hot_by_label[label])
    except (KeyError, TypeError):
        raise ValueError(
            f"Unknown stance label {label!r}; expected one of {sorted(one_hot_by_label)}"
        ) from None

def collate_batch(batch):
    """
    Turn a list of (headline, body, label) samples into padded tensors.

    Headlines and bodies are converted to token-id tensors (bodies are cut to
    their first 300 tokens), padded per batch with `pad_sequence`, and the
    headline tensor is then padded along its first dimension to the same
    length as the body tensor. Returns `(labels, headlines, articles)`.
    """
    headline_ids = [tensor_transform(vocab_transform(headline)) for headline, _, _ in batch]
    body_ids = [tensor_transform(vocab_transform(body)[:300]) for _, body, _ in batch]
    targets = torch.tensor([label_transform(label) for _, _, label in batch])

    padded_headlines = pad_sequence(headline_ids)
    padded_bodies = pad_sequence(body_ids)

    # Extend the headlines with zeros so both tensors share the same first dimension
    extra_rows = padded_bodies.shape[0] - padded_headlines.shape[0]
    padded_headlines = F.pad(padded_headlines, pad=(0, 0, 0, extra_rows), value=0)
    return targets, padded_headlines, padded_bodies

In [None]:
# NOTE(review): this listing reads from Google Drive, which is mounted by the `drive.mount`
# cell that follows — presumably that cell has to run first on a fresh runtime; confirm and reorder.
!ls /content/drive/MyDrive/Tesis/Tesis/incongruence/fnc-1/

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Both splits are read from the same folder on the mounted drive.
FNC_DATA_DIR = "/content/drive/MyDrive/Tesis/Tesis/incongruence/fnc-1/original/two"

train_dataset = FakeNewsDataset(path=FNC_DATA_DIR, split="train")
valid_dataset = FakeNewsDataset(path=FNC_DATA_DIR, split="valid")

In [None]:
from torchtext.vocab import GloVe
# Load the 300-dimensional GloVe 6B vectors (downloaded and cached on first use).
vectors = GloVe(name='6B', dim=300)
# NOTE(review): `torch_vocab` is torchtext's `vocab` factory, which presumably treats the mapping
# values as token frequencies. Here the values are GloVe row indices and five specials are added,
# so the resulting token ids may not line up with the rows of `vectors.vectors` that the
# GloVe-initialised models index with them — TODO confirm and, if so, build ids that match the rows.
vocab = torch_vocab(vectors.stoi, specials=["BOH", "EOH", "BOP", "EOP", "<UNK>"])
# Default index for out-of-vocabulary lookups. `vocab_transform` checks membership before
# indexing, so this value is not reached through it.
vocab.set_default_index(-1)

.vector_cache/glove.6B.zip: 862MB [02:50, 5.06MB/s]                           
100%|█████████▉| 399999/400000 [00:54<00:00, 7305.99it/s]


In [None]:
# Each dataset is materialised into a list, then batched and collated with `collate_batch`.
train_dataloader = DataLoader(
    list(train_dataset),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)
valid_dataloader = DataLoader(
    list(valid_dataset),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)

In [None]:
# Hyperparameters for this experiment
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 32
OUTPUT_DIM = 2
NUM_LAYERS = 2

# LSTM encoder built on the GloVe vectors loaded above
model = LTSM_Encoder(
    input_dim=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    num_layers=NUM_LAYERS,
    padding_idx=0,
    vectors=vectors.vectors,
)
model_name = "LTSMEncoderGloveFNC2ClassEXP1"

# Adam with its default settings and a cross-entropy loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# Train the model built in the previous cell for `n_epochs` epochs.
# NOTE: no optimizer is passed here, so `run_experiment` uses the module-level `optimizer`
# created in the previous cell — re-run that cell whenever `model` is rebuilt.
n_epochs = 10

run_experiment(
    model=model, model_name=model_name,
    criterion=criterion,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    n_epochs=n_epochs,
    device=device
)

Finishing training epoch
Epoch: 01 | Epoch Time: 0m 20s
	Train Loss: 0.245 | Train f1: 0.55 | Train precision: 0.53 | Train recall: 0.56
	 Val. Loss: 0.236 |  Val. f1: 0.53 |  Val. precision: 0.52 | Val. recall: 0.55
Finishing training epoch
Epoch: 02 | Epoch Time: 0m 17s
	Train Loss: 0.213 | Train f1: 0.56 | Train precision: 0.55 | Train recall: 0.57
	 Val. Loss: 0.213 |  Val. f1: 0.55 |  Val. precision: 0.55 | Val. recall: 0.56
Finishing training epoch
Epoch: 03 | Epoch Time: 0m 19s
	Train Loss: 0.193 | Train f1: 0.61 | Train precision: 0.63 | Train recall: 0.61
	 Val. Loss: 0.204 |  Val. f1: 0.61 |  Val. precision: 0.64 | Val. recall: 0.61
Finishing training epoch
Epoch: 04 | Epoch Time: 0m 17s
	Train Loss: 0.180 | Train f1: 0.62 | Train precision: 0.64 | Train recall: 0.62
	 Val. Loss: 0.196 |  Val. f1: 0.64 |  Val. precision: 0.66 | Val. recall: 0.64
Finishing training epoch
Epoch: 05 | Epoch Time: 0m 19s
	Train Loss: 0.172 | Train f1: 0.66 | Train precision: 0.70 | Train recall: 

In [None]:
# Hyperparameters for this experiment
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 32
OUTPUT_DIM = 2
NUM_LAYERS = 2

# Dual encoder with embeddings learned from scratch (no pretrained vectors passed)
model = BiDualEncoder(
    vocab_size=INPUT_DIM,
    embed_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    hidden_layers=NUM_LAYERS,
)
model_name = "BiDualEncoderFNC2ClassExp2"

# Adam with its default settings and a cross-entropy loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# Train the model built in the previous cell for `n_epochs` epochs.
# NOTE: no optimizer is passed here, so `run_experiment` uses the module-level `optimizer`
# created in the previous cell — re-run that cell whenever `model` is rebuilt.
n_epochs = 10

run_experiment(
    model=model, model_name=model_name,
    criterion=criterion,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    n_epochs=n_epochs,
    device=device
)

Finishing training epoch
Epoch: 01 | Epoch Time: 0m 33s
	Train Loss: 0.238 | Train f1: 0.55 | Train precision: 0.53 | Train recall: 0.57
	 Val. Loss: 0.212 |  Val. f1: 0.56 |  Val. precision: 0.54 | Val. recall: 0.58
Finishing training epoch
Epoch: 02 | Epoch Time: 0m 32s
	Train Loss: 0.180 | Train f1: 0.57 | Train precision: 0.56 | Train recall: 0.58
	 Val. Loss: 0.186 |  Val. f1: 0.63 |  Val. precision: 0.66 | Val. recall: 0.62
Finishing training epoch
Epoch: 03 | Epoch Time: 0m 32s
	Train Loss: 0.147 | Train f1: 0.64 | Train precision: 0.67 | Train recall: 0.64
	 Val. Loss: 0.175 |  Val. f1: 0.62 |  Val. precision: 0.66 | Val. recall: 0.61
Finishing training epoch
Epoch: 04 | Epoch Time: 0m 34s
	Train Loss: 0.131 | Train f1: 0.64 | Train precision: 0.68 | Train recall: 0.64
	 Val. Loss: 0.175 |  Val. f1: 0.63 |  Val. precision: 0.67 | Val. recall: 0.63
Finishing training epoch
Epoch: 05 | Epoch Time: 0m 32s
	Train Loss: 0.120 | Train f1: 0.67 | Train precision: 0.71 | Train recall: 

In [None]:
# Hyperparameters for this experiment
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 32
OUTPUT_DIM = 2
NUM_LAYERS = 2

# Dual encoder initialised from the GloVe vectors loaded above
model = BiDualEncoder(
    vocab_size=INPUT_DIM,
    embed_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    hidden_layers=NUM_LAYERS,
    vectors=vectors.vectors,
)
model_name = "BiDualEncoderGloveFNC2ClassEXP4"

# Adam with its default settings and a cross-entropy loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# Train the model built in the previous cell for `n_epochs` epochs.
# NOTE: no optimizer is passed here, so `run_experiment` uses the module-level `optimizer`
# created in the previous cell — re-run that cell whenever `model` is rebuilt.
n_epochs = 10

run_experiment(
    model=model, model_name=model_name,
    criterion=criterion,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    n_epochs=n_epochs,
    device=device
)

Finishing training epoch
Epoch: 01 | Epoch Time: 0m 20s
	Train Loss: 0.244 | Train f1: 0.56 | Train precision: 0.54 | Train recall: 0.57
	 Val. Loss: 0.235 |  Val. f1: 0.56 |  Val. precision: 0.54 | Val. recall: 0.58
Finishing training epoch
Epoch: 02 | Epoch Time: 0m 21s
	Train Loss: 0.215 | Train f1: 0.55 | Train precision: 0.53 | Train recall: 0.56
	 Val. Loss: 0.213 |  Val. f1: 0.54 |  Val. precision: 0.53 | Val. recall: 0.56
Finishing training epoch
Epoch: 03 | Epoch Time: 0m 21s
	Train Loss: 0.193 | Train f1: 0.57 | Train precision: 0.57 | Train recall: 0.59
	 Val. Loss: 0.206 |  Val. f1: 0.52 |  Val. precision: 0.51 | Val. recall: 0.54
Finishing training epoch
Epoch: 04 | Epoch Time: 0m 19s
	Train Loss: 0.183 | Train f1: 0.64 | Train precision: 0.67 | Train recall: 0.64
	 Val. Loss: 0.195 |  Val. f1: 0.62 |  Val. precision: 0.66 | Val. recall: 0.63
Finishing training epoch
Epoch: 05 | Epoch Time: 0m 20s
	Train Loss: 0.175 | Train f1: 0.65 | Train precision: 0.69 | Train recall: 

In [None]:
import torch.optim as optim

# Hyperparameters for this experiment
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 32
OUTPUT_DIM = 2
NUM_LAYERS = 2

# Conditional dual encoder with embeddings learned from scratch (no pretrained vectors passed)
model = BiConditionalDualEncoder(
    vocab_size=INPUT_DIM,
    embed_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    hidden_layers=NUM_LAYERS,
)
model_name = "BiConditionalDualEncoderFNC2ClassEXP5"

# Adam with its default settings and a cross-entropy loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# Train the model built in the previous cell for `n_epochs` epochs.
# NOTE: no optimizer is passed here, so `run_experiment` uses the module-level `optimizer`
# created in the previous cell — re-run that cell whenever `model` is rebuilt.
n_epochs = 10

run_experiment(
    model=model, model_name=model_name,
    criterion=criterion,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    n_epochs=n_epochs,
    device=device
)

Finishing training epoch
Epoch: 01 | Epoch Time: 0m 32s
	Train Loss: 0.216 | Train f1: 0.56 | Train precision: 0.56 | Train recall: 0.57
	 Val. Loss: 0.160 |  Val. f1: 0.61 |  Val. precision: 0.65 | Val. recall: 0.61
Finishing training epoch
Epoch: 02 | Epoch Time: 0m 34s
	Train Loss: 0.133 | Train f1: 0.68 | Train precision: 0.72 | Train recall: 0.67
	 Val. Loss: 0.137 |  Val. f1: 0.67 |  Val. precision: 0.71 | Val. recall: 0.67
Finishing training epoch
Epoch: 03 | Epoch Time: 0m 32s
	Train Loss: 0.096 | Train f1: 0.78 | Train precision: 0.82 | Train recall: 0.77
	 Val. Loss: 0.135 |  Val. f1: 0.71 |  Val. precision: 0.75 | Val. recall: 0.70
Finishing training epoch
Epoch: 04 | Epoch Time: 0m 34s
	Train Loss: 0.070 | Train f1: 0.85 | Train precision: 0.87 | Train recall: 0.86
	 Val. Loss: 0.141 |  Val. f1: 0.73 |  Val. precision: 0.77 | Val. recall: 0.73
Finishing training epoch
Epoch: 05 | Epoch Time: 0m 32s
	Train Loss: 0.053 | Train f1: 0.89 | Train precision: 0.90 | Train recall: 

In [None]:
# Hyperparameters for this experiment
INPUT_DIM = len(vocab)
EMBEDDING_DIM = 300
HIDDEN_DIM = 32
OUTPUT_DIM = 2
NUM_LAYERS = 2

# Conditional dual encoder initialised from the GloVe vectors loaded above
model = BiConditionalDualEncoder(
    vocab_size=INPUT_DIM,
    embed_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    hidden_layers=NUM_LAYERS,
    vectors=vectors.vectors,
)
model_name = "BiConditionalDualEncoderGloveFNC2ClassEXP6"

# Adam with its default settings and a cross-entropy loss
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

In [None]:
# Train the model built in the previous cell for `n_epochs` epochs.
# NOTE: no optimizer is passed here, so `run_experiment` uses the module-level `optimizer`
# created in the previous cell — re-run that cell whenever `model` is rebuilt.
n_epochs = 10

run_experiment(
    model=model, model_name=model_name,
    criterion=criterion,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader,
    n_epochs=n_epochs,
    device=device
)

Finishing training epoch
Epoch: 01 | Epoch Time: 0m 23s
	Train Loss: 0.244 | Train f1: 0.55 | Train precision: 0.54 | Train recall: 0.57
	 Val. Loss: 0.198 |  Val. f1: 0.56 |  Val. precision: 0.56 | Val. recall: 0.57
Finishing training epoch
Epoch: 02 | Epoch Time: 0m 21s
	Train Loss: 0.170 | Train f1: 0.61 | Train precision: 0.63 | Train recall: 0.61
	 Val. Loss: 0.165 |  Val. f1: 0.64 |  Val. precision: 0.68 | Val. recall: 0.63
Finishing training epoch
Epoch: 03 | Epoch Time: 0m 22s
	Train Loss: 0.142 | Train f1: 0.70 | Train precision: 0.74 | Train recall: 0.69
	 Val. Loss: 0.154 |  Val. f1: 0.63 |  Val. precision: 0.67 | Val. recall: 0.63
Finishing training epoch
Epoch: 04 | Epoch Time: 0m 23s
	Train Loss: 0.119 | Train f1: 0.75 | Train precision: 0.79 | Train recall: 0.74
	 Val. Loss: 0.145 |  Val. f1: 0.67 |  Val. precision: 0.70 | Val. recall: 0.67
Finishing training epoch
Epoch: 05 | Epoch Time: 0m 22s
	Train Loss: 0.103 | Train f1: 0.79 | Train precision: 0.82 | Train recall: 

## Discard from here

## BERT

In [None]:
!pip install transformers &> /dev/null
!pip install sentencepiece &> /dev/null
!pip install torchmetrics & > /dev/null

Collecting torchmetrics
  Downloading torchmetrics-1.6.0-py3-none-any.whl.metadata (20 kB)
Collecting lightning-utilities>=0.8.0 (from torchmetrics)
  Downloading lightning_utilities-0.11.9-py3-none-any.whl.metadata (5.2 kB)
Downloading torchmetrics-1.6.0-py3-none-any.whl (926 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m926.4/926.4 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightning_utilities-0.11.9-py3-none-any.whl (28 kB)
Installing collected packages: lightning-utilities, torchmetrics
Successfully installed lightning-utilities-0.11.9 torchmetrics-1.6.0


BERT comes with its own tokenizer

In [None]:
from transformers import BertTokenizer
# NOTE: this rebinds the global `tokenizer` that `vocab_transform` reads at call time, so the
# LSTM collate pipeline defined earlier stops using the 'basic_english' tokenizer once this cell has run.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

We need to change our collate_batch function to adapt our data to BERT

In [None]:
# Stance label -> class index; 'agree' and 'discuss' share class 0.
label_dict = {
    'agree': 0,
    'disagree': 1,
    'discuss': 0,
}

def collate_batch(batch, max_len=512):
    """
    Turn a list of (headline, body, label) samples into padded BERT inputs.

    Each sample becomes `[CLS] body [SEP] headline [SEP]`, with segment id 0 for
    `[CLS] body [SEP]` and 1 for `headline [SEP]`. The sequence never exceeds
    `max_len` tokens: the headline is kept first (cut to `max_len - 3` tokens
    if needed) and the body is cut to the remaining token budget. The budget is
    applied to tokens, not to characters of the raw text, so the body can fill
    all the remaining room.

    Args:
        batch: list of (headline, body, label) tuples; labels are keys of `label_dict`.
        max_len: maximum number of tokens per sample, special tokens included.

    Returns:
        `(token_ids, mask_ids, segment_ids, labels)`; the first three are padded
        with zeros to the longest sample in the batch (batch first).
    """
    n_special = 3  # [CLS] plus two [SEP]
    token_ids, mask_ids, segment_ids, labels = [], [], [], []

    for (_headline, _body, _label) in batch:
        _headline_id = tokenizer.encode(_headline, add_special_tokens=False)[:max(0, max_len - n_special)]
        _headline_len = len(_headline_id)

        # Never negative, so a very long headline cannot turn the slice into "cut from the end"
        _body_budget = max(0, max_len - _headline_len - n_special)
        _body_id = tokenizer.encode(_body, add_special_tokens=False)[:_body_budget]
        _body_len = len(_body_id)

        _pair_token_ids = [tokenizer.cls_token_id] + _body_id + [tokenizer.sep_token_id] + _headline_id + [tokenizer.sep_token_id]

        _segment_ids = torch.tensor([0] * (_body_len + 2) + [1] * (_headline_len + 1))
        _attn_mask_ids = torch.tensor([1] * (_body_len + _headline_len + n_special))

        token_ids.append(torch.tensor(_pair_token_ids))
        segment_ids.append(_segment_ids)
        mask_ids.append(_attn_mask_ids)

        labels.append(label_dict[_label])

    token_ids = pad_sequence(token_ids, batch_first=True)
    segment_ids = pad_sequence(segment_ids, batch_first=True)
    mask_ids = pad_sequence(mask_ids, batch_first=True)
    labels = torch.tensor(labels)

    return token_ids, mask_ids, segment_ids, labels

# Rebuild both dataloaders so they use the BERT `collate_batch` defined above.
train_dataloader = DataLoader(
    list(train_dataset),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)
valid_dataloader = DataLoader(
    list(valid_dataset),
    batch_size=32,
    shuffle=True,
    collate_fn=collate_batch,
)

In [None]:
# NOTE(review): `transformers.AdamW` is deprecated upstream in favour of `torch.optim.AdamW`.
# It is kept here because `correct_bias=False` has no direct equivalent there — revisit when upgrading.
from transformers import BertForSequenceClassification, AdamW

model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
model.to(device)

param_optimizer = list(model.named_parameters())
# Parameters excluded from weight decay: biases and LayerNorm weights. The previous
# 'gamma'/'beta' entries matched no parameter name in this model.
no_decay = ['bias', 'LayerNorm.weight']
# The per-group key AdamW reads is `weight_decay`. With the old `weight_decay_rate` key
# the setting was ignored and no decay was applied to any parameter.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=2e-5, correct_bias=False)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
import time
from torchmetrics import F1Score

def multi_acc(y_pred, y_test):
    """
    Accuracy of a batch: the fraction of rows of `y_pred` whose highest-scoring
    class (over dim 1) equals the matching entry of `y_test`. Returns a tensor.
    """
    predicted_classes = torch.log_softmax(y_pred, dim=1).argmax(dim=1)
    n_correct = (predicted_classes == y_test).sum().float()
    return n_correct / float(y_test.size(0))

# The metric gets its own name: binding it to `f1_score` would shadow the scikit-learn
# function of the same name that `calculate_metrics` calls.
# NOTE(review): with task="binary" the `average` argument is presumably ignored and the score
# is the F1 of class 1 only — confirm against the torchmetrics documentation.
bert_f1_metric = F1Score(task="binary", average='macro')
bert_f1_metric = bert_f1_metric.to(device)

def train_bert(model, train_loader, val_loader, optimizer, n_epochs,
               save_path='/content/drive/MyDrive/Tesis/bert-fnc-2class.pt'):
    """
    Fine-tune `model` for `n_epochs` epochs, validating after each one.

    Each batch is `(token_ids, mask_ids, segment_ids, labels)`. The model is
    called with `labels=`, and its output must expose `"loss"` and `"logits"`.
    Whenever the validation F1 improves on the best value so far, the model's
    `state_dict` is saved to `save_path`. Losses and F1 scores are printed per epoch.

    Args:
        model: sequence classifier to fine-tune, already on the global `device`.
        train_loader: training batches; must support `len()`.
        val_loader: validation batches; must support `len()`.
        optimizer: optimizer that updates the parameters of `model`.
        n_epochs: number of epochs to run.
        save_path: file the best checkpoint is written to.
    """
    best_f1 = 0
    for epoch in range(n_epochs):
        start = time.time()

        # ---- training ----
        model.train()
        total_train_loss = 0
        total_train_acc = 0
        total_train_f1 = 0
        for pair_token_ids, mask_ids, seg_ids, y in train_loader:
            optimizer.zero_grad()
            pair_token_ids = pair_token_ids.to(device)
            mask_ids = mask_ids.to(device)
            seg_ids = seg_ids.to(device)
            labels = y.to(device)

            outputs = model(pair_token_ids,
                            token_type_ids=seg_ids,
                            attention_mask=mask_ids,
                            labels=labels)
            # Read the fields by name instead of relying on the order of `.values()`
            loss, prediction = outputs["loss"], outputs["logits"]

            acc = multi_acc(prediction, labels)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()
            total_train_acc += acc.item()
            total_train_f1 += bert_f1_metric(torch.argmax(prediction, dim=1), labels).item()

        train_acc = total_train_acc / len(train_loader)
        train_loss = total_train_loss / len(train_loader)
        train_f1 = total_train_f1 / len(train_loader)

        # ---- validation ----
        model.eval()
        total_val_acc = 0
        total_val_loss = 0
        with torch.no_grad():
            y_preds = torch.tensor(()).int().to(device)
            y_tests = torch.tensor(()).int().to(device)
            for pair_token_ids, mask_ids, seg_ids, y in val_loader:
                pair_token_ids = pair_token_ids.to(device)
                mask_ids = mask_ids.to(device)
                seg_ids = seg_ids.to(device)
                labels = y.to(device)

                outputs = model(pair_token_ids,
                                token_type_ids=seg_ids,
                                attention_mask=mask_ids,
                                labels=labels)
                loss, prediction = outputs["loss"], outputs["logits"]

                # The argmax of the logits is the predicted class; no softmax needed
                y_pred = prediction.argmax(dim=1)
                y_preds = torch.cat([y_preds, y_pred])
                y_tests = torch.cat([y_tests, labels])

                total_val_loss += loss.item()
                # Accuracy of this validation batch (not the last training batch)
                total_val_acc += multi_acc(prediction, labels).item()

        # F1 over the whole validation set at once
        val_f1 = bert_f1_metric(y_preds, y_tests).item()
        if val_f1 > best_f1:
            best_f1 = val_f1
            torch.save(model.state_dict(), save_path)
        val_acc = total_val_acc / len(val_loader)
        val_loss = total_val_loss / len(val_loader)
        end = time.time()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)

        print(f'Epoch {epoch+1}: train_loss: {train_loss:.4f} train_f1: {train_f1:.4f} | val_loss: {val_loss:.4f} val_f1: {val_f1:.4f}')
        print("{:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

In [None]:
# Fine-tune for 6 epochs with the BERT `model`, `optimizer` and dataloaders built in the cells above.
train_bert(model, train_dataloader, valid_dataloader, optimizer, 6)

Epoch 1: train_loss: 0.1794 train_f1: 0.0900 | val_loss: 0.1363 val_f1: 0.5936
00:05:02.24
Epoch 2: train_loss: 0.0798 train_f1: 0.5727 | val_loss: 0.0907 val_f1: 0.7582
00:05:05.28
Epoch 3: train_loss: 0.0404 train_f1: 0.7451 | val_loss: 0.0843 val_f1: 0.8000
00:05:05.27
Epoch 4: train_loss: 0.0162 train_f1: 0.7692 | val_loss: 0.0538 val_f1: 0.8612
00:05:04.72
Epoch 5: train_loss: 0.0087 train_f1: 0.8329 | val_loss: 0.0656 val_f1: 0.8565
00:05:03.93
Epoch 6: train_loss: 0.0074 train_f1: 0.8405 | val_loss: 0.0702 val_f1: 0.8690
00:05:04.53


In [None]:
# Presumably releases the Colab runtime once training has finished, so it stops consuming
# resources — TODO confirm against the google.colab documentation.
from google.colab import runtime
runtime.unassign()