In [6]:
import datasets
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification

In [7]:
# Path of the dataset loading script handed to `datasets.load_dataset`.
DATASET_SCRIPT = "../datasets/semeval_2010_task_8.py"

# Checkpoint name used for both the tokenizer and the classification model.
PRETRAINED_MODEL_NAME = "bert-base-uncased"

# Marker tokens that are registered with the tokenizer as additional special tokens.
# NOTE: `mark_arguments` carries the same four strings as its default marker
# arguments; keep both places in sync when changing them.
HEAD_START_MARKER = "[HEAD]"
HEAD_END_MARKER = "[/HEAD]"
TAIL_START_MARKER = "[TAIL]"
TAIL_END_MARKER = "[/TAIL]"

In [8]:
def mark_arguments(examples,
                   tokenizer,
                   max_length,
                   truncation = "longest_first",
                   argument_marker_mode= "mark",
                   append_separator = "[SEP]",
                   head_start_marker = "[HEAD]",
                   head_end_marker = "[/HEAD]",
                   tail_start_marker = "[TAIL]",
                   tail_end_marker = "[/TAIL]"):
    """Wrap the head and tail argument of every example in marker tokens, then tokenize.

    Args:
        examples: Batch in column form. Must provide the keys "tokens",
            "head_start", "head_end", "tail_start" and "tail_end", each holding
            one entry per example. Start/end values are used as slice bounds
            into the example's token list (end exclusive).
        tokenizer: Callable invoked once with the marked token lists via the
            keyword arguments `text`, `is_split_into_words`, `padding`,
            `max_length` and `truncation`.
        max_length: Forwarded to `tokenizer`; padding is always "max_length".
        truncation: Forwarded to `tokenizer`.
        argument_marker_mode: Accepted for interface compatibility; not read by
            this function.
        append_separator: Accepted for interface compatibility; not read by
            this function.
        head_start_marker: Token inserted before the head argument.
        head_end_marker: Token inserted after the head argument.
        tail_start_marker: Token inserted before the tail argument.
        tail_end_marker: Token inserted after the tail argument.

    Returns:
        Whatever `tokenizer` returns for the marked token lists.

    Raises:
        ValueError: If the two argument spans of an example overlap, are
            reversed (start > end), or fall outside the token list. Such spans
            would otherwise silently duplicate or drop tokens in the output.
    """
    marked_tokens = []
    columns = zip(
        examples["tokens"],
        examples["head_start"],
        examples["head_end"],
        examples["tail_start"],
        examples["tail_end"],
    )
    for tokens, head_start, head_end, tail_start, tail_end in columns:
        head = (head_start, head_end, head_start_marker, head_end_marker)
        tail = (tail_start, tail_end, tail_start_marker, tail_end_marker)

        # The sequence is rebuilt left to right, so process whichever argument
        # starts first; on equal starts the tail is treated as the first one.
        first, second = (head, tail) if head_start < tail_start else (tail, head)
        first_start, first_end, first_open, first_close = first
        second_start, second_end, second_open, second_close = second

        # The slicing below only yields a faithful copy of `tokens` when the
        # spans are ordered, disjoint and inside the token list.
        if not 0 <= first_start <= first_end <= second_start <= second_end <= len(tokens):
            raise ValueError(
                "Invalid argument spans: "
                f"head=({head_start}, {head_end}), tail=({tail_start}, {tail_end}) "
                f"for a sequence of {len(tokens)} tokens; spans must be ordered, "
                "non-overlapping and within bounds."
            )

        marked_tokens.append(
            tokens[:first_start]
            + [first_open] + tokens[first_start:first_end] + [first_close]
            + tokens[first_end:second_start]
            + [second_open] + tokens[second_start:second_end] + [second_close]
            + tokens[second_end:]
        )

    return tokenizer(
        text=marked_tokens,
        is_split_into_words=True,
        padding="max_length",
        max_length=max_length,
        truncation=truncation,
    )

In [9]:
# Load only the "train" split through the local dataset script and wrap it in a DatasetDict.
dataset = datasets.DatasetDict(train=datasets.load_dataset(path=DATASET_SCRIPT, split="train"))

# Rename the "label" column to "labels"; the call's return value is not used, the dataset is changed in place.
# Presumably the model expects the targets under the keyword "labels" -- TODO confirm.
# NOTE(review): the trailing-underscore method may not exist in newer `datasets` releases; verify against the installed version.
dataset.rename_column_(original_column_name="label", new_column_name="labels")

Using custom data configuration default
Reusing dataset sem_eval2010_task8 (/home/christoph/.cache/huggingface/datasets/sem_eval2010_task8/default/1.0.0/aa27127323f78f643e734263370540be63f5f10ba9545fa11e66eea7a7c671d4)


In [10]:
# Display the available splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'head_start', 'head_end', 'tail_start', 'tail_end', 'labels'],
        num_rows: 8000
    })
})

In [11]:
# Inspect the first raw training example (token list, argument offsets, label id).
dataset["train"][0]

{'head_end': 13,
 'head_start': 12,
 'labels': 3,
 'tail_end': 16,
 'tail_start': 15,
 'tokens': ['The',
  'system',
  'as',
  'described',
  'above',
  'has',
  'its',
  'greatest',
  'application',
  'in',
  'an',
  'arrayed',
  'configuration',
  'of',
  'antenna',
  'elements',
  '.']}

In [12]:
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Register the four argument markers as additional special tokens.
# The call stays the last expression of the cell so its return value is displayed.
tokenizer.add_special_tokens(
    dict(
        additional_special_tokens=[
            HEAD_START_MARKER, HEAD_END_MARKER,
            TAIL_START_MARKER, TAIL_END_MARKER,
        ]
    )
)

4

In [13]:
# Demo: show how the tokenizer splits a sample sentence into (sub-)tokens.
tokenizer.tokenize("ABCDEFG lives in Y.")

['abc', '##de', '##f', '##g', 'lives', 'in', 'y', '.']

In [14]:
# Encode the same sentence with the tokenizer's call interface and display the full result.
tokenizer_output = tokenizer("ABCDEFG lives in Y.")
tokenizer_output

{'input_ids': [101, 5925, 3207, 2546, 2290, 3268, 1999, 1061, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [15]:
# Map the encoded ids back to token strings to see what the call interface added around the sentence.
tokenizer.convert_ids_to_tokens(tokenizer_output["input_ids"])

['[CLS]', 'abc', '##de', '##f', '##g', 'lives', 'in', 'y', '.', '[SEP]']

In [16]:
def preprocess_function(examples):
    """Run `mark_arguments` on `examples` with the notebook-level `tokenizer` and max_length=64."""
    return mark_arguments(examples, tokenizer=tokenizer, max_length=64)

# batched=True: `preprocess_function` receives column-wise batches, which is the
# layout `mark_arguments` iterates over.
preprocessed_dataset = dataset.map(preprocess_function, batched=True)

Loading cached processed dataset at /home/christoph/.cache/huggingface/datasets/sem_eval2010_task8/default/1.0.0/aa27127323f78f643e734263370540be63f5f10ba9545fa11e66eea7a7c671d4/cache-dc8d8bb83a31d46a.arrow


In [17]:
# Display the columns after preprocessing.
preprocessed_dataset

DatasetDict({
    train: Dataset({
        features: ['attention_mask', 'head_end', 'head_start', 'input_ids', 'labels', 'tail_end', 'tail_start', 'token_type_ids', 'tokens'],
        num_rows: 8000
    })
})

In [18]:
# Keep the first preprocessed training example for inspection.
example = preprocessed_dataset["train"][0]

In [19]:
# Decode the example's ids back to tokens to check marker placement and padding.
tokenizer.convert_ids_to_tokens(example["input_ids"])

['[CLS]',
 'the',
 'system',
 'as',
 'described',
 'above',
 'has',
 'its',
 'greatest',
 'application',
 'in',
 'an',
 'array',
 '##ed',
 '[HEAD]',
 'configuration',
 '[/HEAD]',
 'of',
 'antenna',
 '[TAIL]',
 'elements',
 '[/TAIL]',
 '.',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]']

In [20]:
# Switch the output format to "pt" and restrict the returned columns to the listed ones.
# `train_model` unpacks every batch into the model as keyword arguments, so the column
# selection determines what the model receives.
preprocessed_dataset.set_format("pt", columns=["input_ids", "token_type_ids", "attention_mask", "labels"])

In [21]:
# Inspect the first training example after the format change.
preprocessed_dataset["train"][0]

  return torch.tensor(x, **format_kwargs)


{'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'input_ids': tensor([  101,  1996,  2291,  2004,  2649,  2682,  2038,  2049,  4602,  4646,
          1999,  2019,  9140,  2098, 30522,  9563, 30523,  1997, 13438, 30524,
          3787, 30525,  1012,   102,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0]),
 'labels': tensor(3),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 

In [22]:
from torch.utils.data.dataloader import DataLoader


def train_model(model, dataset, optimizer, num_epochs, cuda_device=-1, batch_size=2, log_every_n_batches=10):
    """Train `model` on the training split and print validation loss / micro-F1 after every epoch.

    Args:
        model: Callable module whose forward call accepts a batch as keyword
            arguments and returns an object exposing `.loss` and `.logits`.
        dataset: Mapping from split name to dataset. Must contain "train"; when
            "validation" is absent, 10% of "train" is held out via
            `train_test_split` and used for validation.
        optimizer: Optimizer used for the parameter updates (`zero_grad` / `step`).
        num_epochs: Number of passes over the training split.
        cuda_device: CUDA device index; any value not greater than -1 selects the CPU.
        batch_size: Batch size used for both the training and validation loader.
        log_every_n_batches: The running mean training loss is printed every
            this many batches.

    Returns:
        None. Progress is reported through `print`.
    """
    device = torch.device("cuda", cuda_device) if cuda_device > -1 else torch.device("cpu")

    if "validation" in dataset:
        train_dataset = dataset["train"]
        validation_dataset = dataset["validation"]
    else:
        # No seed is passed here, so presumably the held-out 10% differs between
        # calls -- TODO confirm against the installed `datasets` version.
        split_train_dataset = dataset["train"].train_test_split(test_size=0.1)
        train_dataset = split_train_dataset["train"]
        validation_dataset = split_train_dataset["test"]

    # Build both loaders once instead of once per epoch. shuffle=True makes the
    # training loader draw a new sample order on every pass, so epochs no longer
    # replay identical batches in identical order.
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    validation_dataloader = DataLoader(validation_dataset, batch_size=batch_size)

    model = model.to(device)
    for _ in range(num_epochs):

        model.train()
        train_loss = 0.
        for batch_idx, train_batch in enumerate(train_dataloader, start=1):
            optimizer.zero_grad()

            train_batch = {key: tensor.to(device) for key, tensor in train_batch.items()}

            output = model(**train_batch)

            loss = output.loss

            loss.backward()

            optimizer.step()

            # Accumulate as a Python float so the running mean does not keep the graph alive.
            train_loss += loss.item()

            if (batch_idx % log_every_n_batches) == 0:
                print(f"[{batch_idx}/{len(train_dataloader)}] train loss: {train_loss / batch_idx}")

        model.eval()
        val_loss = 0.
        # A fresh metric object per epoch keeps the epochs' predictions separate.
        val_f1_score = datasets.load_metric("f1")
        with torch.no_grad():
            for validation_batch in validation_dataloader:
                validation_batch = {key: tensor.to(device) for key, tensor in validation_batch.items()}

                output = model(**validation_batch)

                logits = output.logits

                # Predicted class = index of the largest logit along the last dimension.
                pred_labels = logits.argmax(dim=-1)
                true_labels = validation_batch["labels"]

                val_f1_score.add_batch(predictions=pred_labels, references=true_labels)
                val_loss += output.loss.item()

            print("val loss: ", val_loss / len(validation_dataloader))
            print("val f1:", val_f1_score.compute(average="micro"))

In [23]:
# CUDA device index handed to `train_model`, which uses the CPU for any value not greater than -1.
CUDA_DEVICE = 0  # if you have a GPU, otherwise set to -1

In [24]:
# Size the classification head from the number of classes of the "labels" feature.
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=preprocessed_dataset["train"].features["labels"].num_classes)

# Take the embedding size from the tokenizer itself: len(tokenizer) counts the base
# vocabulary plus every added token, so it stays correct even if tokens are added
# through another route or a marker already exists in the vocabulary.
new_vocab_size = len(tokenizer)
model.resize_token_embeddings(new_vocab_size)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

train_model(model, preprocessed_dataset, optimizer, num_epochs=5, batch_size=16, cuda_device=CUDA_DEVICE)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

[10/450] train loss: 2.9193800687789917
[20/450] train loss: 2.8977689146995544
[30/450] train loss: 2.8806257009506226
[40/450] train loss: 2.8622309446334837
[50/450] train loss: 2.848922486305237
[60/450] train loss: 2.8355388164520265
[70/450] train loss: 2.8195022446768623
[80/450] train loss: 2.8023325741291045
[90/450] train loss: 2.789883515569899
[100/450] train loss: 2.774534707069397
[110/450] train loss: 2.7645813660188154
[120/450] train loss: 2.7528449575106304
[130/450] train loss: 2.7404092917075524
[140/450] train loss: 2.7257826157978604
[150/450] train loss: 2.7105794858932497
[160/450] train loss: 2.701285018026829
[170/450] train loss: 2.677043244417976
[180/450] train loss: 2.6637300491333007
[190/450] train loss: 2.6513121567274394
[200/450] train loss: 2.635449838638306
[210/450] train loss: 2.6247129213242304
[220/450] train loss: 2.608935646577315
[230/450] train loss: 2.5923818178798843
[240/450] train loss: 2.5733977988362313
[250/450] train loss: 2.55611368