Libraries that we might use

In [1]:
import torch
import random
import evaluate
import numpy as np
from datasets import Dataset
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, BertTokenizerFast, Trainer, TrainingArguments, AutoConfig

  from .autonotebook import tqdm as notebook_tqdm


DATA LOADING

In [2]:
def read_conll_file(path):
    """
    read in conll file
    :param path: path to read from
    :returns: list with sequences of words and labels for each sentence
    """
    data = []
    current_words = []
    current_tags = []

    for line in open(path, encoding = "utf-8"):
        line = line.strip()
        if line:
            if line[0] == "#":
                continue # skip comments
            tok = line.split("\t")
            current_words.append(tok[1])
            current_tags.append(tok[2])
        else:
            if current_words: # skip empty lines
                data.append((current_words, current_tags))
            current_words = []
            current_tags = []

    # check for last one
    if current_tags != []:
        data.append((current_words, current_tags))
    return data

In [3]:
train_data = read_conll_file("en_ewt-ud-train.iob2")

In [4]:
dev_data = read_conll_file("en_ewt-ud-dev.iob2")

In [5]:
test_data = read_conll_file("en_ewt-ud-test-masked.iob2")

LABEL MAPPING

In [6]:
unique_lables = list(set(lab for sen, labs in train_data for lab in labs))

In [7]:
lab2idx = {lab: idx for idx, lab in enumerate(unique_lables)}

In [8]:
idx2lab = {idx: lab for lab, idx in lab2idx.items()}

TOKENIZER AND ALIGNMENT

In [9]:
model_name = "google-bert/bert-base-cased"

In [10]:
# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name, num_labels = len(unique_lables), id2label = idx2lab, label2id = lab2idx)

In [11]:
def tokenize_and_align_labels(examples):
    """
    For each example, tokenize the list of tokens and align the original labels 
    to the resulting subwords. Tokens can be split into multiple subwords, so we mark 
    the "extra" subwords with -100 to ignore them in the loss.
    """
    tokenized_inputs = tokenizer(
        examples["token"],
        max_length = 128,
        padding = False,
        truncation = True, 
        is_split_into_words = True
    )

    all_labels = []

    for batch_index, labels in enumerate(examples["NER_int"]):
        word_ids = tokenized_inputs.word_ids(batch_index = batch_index)
        label_ids = []
        prev_word_id = None
        for word_id in word_ids:
            if word_id is None:
                label_ids.append(-100)
            elif word_id == prev_word_id:
                continue
            else:
                label_ids.append(labels[word_id])
            prev_word_id = word_id
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels

    return tokenized_inputs

DATASET

In [12]:
train_dataset = Dataset.from_dict({"token": [x[0] for x in train_data], "NER": [x[1] for x in train_data], "NER_int":[[lab2idx[x] for x in x[1]] for x in train_data]})

In [13]:
dev_dataset = Dataset.from_dict({"token": [x[0] for x in dev_data], "NER": [x[1] for x in dev_data], "NER_int":[[lab2idx[x] for x in x[1]] for x in dev_data]})

In [14]:
test_dataset = Dataset.from_dict({"token": [x[0] for x in test_data], "NER": [x[1] for x in test_data], "NER_int":[[lab2idx[x] for x in x[1]] for x in test_data]})

In [15]:
processed_train_dataset = train_dataset.map(
    tokenize_and_align_labels,
    batched = True,
    remove_columns = ["token", "NER", "NER_int"],
    desc = "Running tokenizer on dataset"
)

processed_dev_dataset = dev_dataset.map(
    tokenize_and_align_labels,
    batched = True,
    remove_columns = ["token", "NER", "NER_int"],
    desc = "Running tokenizer on dataset"
)

processed_test_dataset = test_dataset.map(
    tokenize_and_align_labels,
    batched = True,
    remove_columns = ["token", "NER", "NER_int"],
    desc = "Running tokenizer on dataset"
)

Running tokenizer on dataset: 100%|██████████| 12543/12543 [00:00<00:00, 17225.80 examples/s]
Running tokenizer on dataset: 100%|██████████| 2001/2001 [00:00<00:00, 11525.63 examples/s]
Running tokenizer on dataset: 100%|██████████| 2077/2077 [00:00<00:00, 22800.20 examples/s]


In [16]:
for index in random.sample(range(len(processed_train_dataset)), 3):
    print(f"Sample {index} of the training set: {processed_train_dataset[index]}")

Sample 5226 of the training set: {'input_ids': [101, 6564, 13396, 102], 'token_type_ids': [0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1], 'labels': [-100, 3, 4, -100]}
Sample 2534 of the training set: {'input_ids': [101, 146, 1486, 7424, 1104, 6581, 1164, 1142, 117, 1133, 2140, 1122, 1225, 183, 112, 189, 1494, 119, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [-100, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, -100]}
Sample 9676 of the training set: {'input_ids': [101, 146, 1138, 1151, 5497, 7642, 1186, 1111, 1593, 1139, 2072, 1297, 1105, 146, 112, 1396, 1579, 2065, 1106, 1103, 7642, 1186, 2844, 1107, 1588, 185, 6690, 1183, 1105, 1228, 1103, 171, 6094, 23403, 2881, 1105, 1256, 1103, 1168, 1141, 1107, 5144, 10024, 6540, 117, 1133, 1165, 178, 1793, 1142, 185, 5114, 1282, 117, 1122, 8390, 1103, 1168, 185, 5114, 2725, 1283, 106, 106, 102], 'token_type_ids': [0, 0, 0, 0

MODEL AND OPTIMIZER

In [17]:
model = AutoModelForTokenClassification.from_pretrained(model_name, config = config)
data_collator = DataCollatorForTokenClassification(tokenizer)
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at google-bert/bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12

In [18]:
training_args = TrainingArguments(
    output_dir = "output",
    evaluation_strategy = "epoch"
)



In [19]:
metric = evaluate.load("seqeval")

In [20]:
def convert_int_to_labels(preds):
    logits, labels = preds
    predictions = np.argmax(logits, axis = -1)
    true_labels = [[idx2lab[label] for label in label_sequence if label != -100] for label_sequence in labels]
    true_predictions = [[idx2lab[pred] for pred, label in zip(preds_sequence, labels_sequence) if label != -100] for preds_sequence, labels_sequence in zip(predictions, labels)]
    return true_labels, true_predictions

In [21]:
def compute_metrics(preds):
    true_labels, true_predictions = convert_int_to_labels(preds)
    results = metric.compute(predictions = true_predictions, references = true_labels)
    return {
        "Precision": results["overall_precision"],
        "Recall": results["overall_recall"],
        "F1": results["overall_f1"],
        "Accuracy": results["overall_accuracy"]
    }

In [22]:
trainer = Trainer(model = model,
                  args = training_args,
                  train_dataset = processed_train_dataset,
                  eval_dataset = processed_dev_dataset,
                  tokenizer =  tokenizer,
                  compute_metrics = compute_metrics,
                  data_collator = data_collator
                  )
trainer.train()

  trainer = Trainer(model = model,


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1024,0.112411,0.594841,0.405797,0.482462,0.960396
2,0.0608,0.11023,0.616337,0.515528,0.561443,0.965724
3,0.0411,0.124164,0.592551,0.543478,0.566955,0.965486


TrainOutput(global_step=4704, training_loss=0.07930757987255953, metrics={'train_runtime': 1099.2451, 'train_samples_per_second': 34.232, 'train_steps_per_second': 4.279, 'total_flos': 863782522947762.0, 'train_loss': 0.07930757987255953, 'epoch': 3.0})

In [23]:
results = trainer.evaluate()

For trainer evaluation the dev data was used, but as the results are satisfactory, no further hyperparameter tuning was performed and we predict on our final test data set.

In [24]:
predictions, labels, metrics = trainer.predict(processed_test_dataset)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
labels, predictions = convert_int_to_labels((predictions, labels))

In [26]:
final_format = []
for i in range(len(predictions)):
    final_format.append((test_data[i][0], predictions[i]))

In [None]:
def write_conll_file(data, path):
    """
    Write data back to a CoNLL file format.
    :param data: list of tuples with words and labels
    :param path: path to write to
    """
    with open(path, "w", encoding = "utf-8") as f:
        for sentence in data:
            words, labels = sentence
            for idx, (word, label) in enumerate(zip(words, labels), start = 1):
                f.write(f"{idx}\t{word}\t{label}\t-\t-\n")
            f.write("\n")

write_conll_file(final_format, "test_output.iob2")