Libraries that we might use

In [None]:
import torch
from datasets import Dataset
from torch.utils.data import DataLoader
import numpy as np
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification, BertTokenizerFast, BertForTokenClassification, Trainer, TrainingArguments, AutoConfig
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report
import evaluate
from tqdm.auto import tqdm

DATA LOADING

In [None]:
def read_conll_file(path):
    """
    Read a tab-separated CoNLL/IOB2 file into per-sentence word and tag lists.

    Lines whose first character is "#" are skipped as comments, and a blank
    line closes the current sentence. On every other line, column 1 is taken
    as the word and column 2 as the tag (column 0 is ignored).

    :param path: path to read from
    :returns: list of (words, tags) tuples, one tuple per sentence
    :raises IndexError: if a non-comment line has fewer than three columns
    """
    data = []
    current_words = []
    current_tags = []

    # Use a context manager so the file handle is closed even if parsing raises.
    with open(path, encoding = "utf-8") as conll_file:
        for line in conll_file:
            line = line.strip()
            if line:
                if line[0] == "#":
                    continue # skip comments
                tok = line.split("\t")
                current_words.append(tok[1])
                current_tags.append(tok[2])
            else:
                if current_words: # ignore runs of consecutive blank lines
                    data.append((current_words, current_tags))
                current_words = []
                current_tags = []

    # The file may not end with a blank line; flush the last sentence.
    if current_words:
        data.append((current_words, current_tags))
    return data

In [None]:
# Training split: list of (words, tags) tuples, one per sentence.
train_data = read_conll_file("en_ewt-ud-train.iob2")

In [None]:
# Development split, used below as the Trainer's evaluation set.
dev_data = read_conll_file("en_ewt-ud-dev.iob2")

In [None]:
# Test split; the file name says "masked", so presumably the tag column holds
# placeholders rather than gold labels -- TODO confirm before scoring on it.
test_data = read_conll_file("en_ewt-ud-test-masked.iob2")

In [None]:
train_data[0]

In [None]:
dev_data[0]

In [None]:
test_data[0]

LABEL MAPPING

In [None]:
# Collect every tag seen in the training data. The set is sorted because
# string hashing is randomised per process, so plain set iteration order (and
# therefore every label id derived from it) would change between kernel
# restarts and stop matching previously saved checkpoints.
unique_lables = sorted({lab for sen, labs in train_data for lab in labs})
unique_lables

In [None]:
# Tag string -> integer id, numbered by position in unique_lables.
lab2idx = dict(zip(unique_lables, range(len(unique_lables))))
lab2idx

In [None]:
# Inverse of lab2idx: integer id -> tag string.
idx2lab = dict(zip(lab2idx.values(), lab2idx.keys()))
idx2lab

TOKENIZER AND ALIGNMENT

In [None]:
# Hyperparameters
# Intended cap on the tokenized sequence length (in subword tokens).
max_length = 128
# Checkpoint identifier passed to every from_pretrained call below.
model_name = "google-bert/bert-base-cased"

In [None]:
# Tokenizer
tokenizer = BertTokenizerFast.from_pretrained(model_name)
# Model config carrying the number of tags and both id<->tag mappings, so the
# classification head is sized to the tag set found in the training data.
config = AutoConfig.from_pretrained(model_name, num_labels = len(unique_lables), id2label = idx2lab, label2id = lab2idx)

In [None]:
def tokenize_and_align_labels(examples):
    """
    Tokenize a batch of pre-split sentences and align word-level labels to subwords.

    A word can be split into several subword tokens. Only the first subword of
    each word keeps the word's label; special tokens and every continuation
    subword get -100 so the loss ignores them. The resulting label list has
    exactly one entry per token in ``input_ids``.

    :param examples: batch dict with "token" (list of word lists) and
        "NER_int" (list of integer label lists, parallel to "token")
    :returns: the tokenizer output with an added "labels" entry
    """
    tokenized_inputs = tokenizer(
        examples["token"],
        max_length = max_length, # shared hyperparameter instead of a second hardcoded 128
        padding = False,
        truncation = True,
        is_split_into_words = True
    )

    all_labels = []

    for batch_index, labels in enumerate(examples["NER_int"]):
        word_ids = tokenized_inputs.word_ids(batch_index = batch_index)
        label_ids = []
        prev_word_id = None
        for word_id in word_ids:
            if word_id is None or word_id == prev_word_id:
                # Special token or continuation subword: keep a placeholder so
                # the labels stay position-aligned with input_ids. Skipping the
                # entry would shift every later label onto the wrong token once
                # the collator right-pads the shorter label list.
                label_ids.append(-100)
            else:
                label_ids.append(labels[word_id])
            prev_word_id = word_id
        all_labels.append(label_ids)

    tokenized_inputs["labels"] = all_labels

    return tokenized_inputs

DATASET

In [None]:
# Training Dataset with three parallel columns: words, string tags, integer tags.
train_dataset = Dataset.from_dict({
    "token": [words for words, _ in train_data],
    "NER": [tags for _, tags in train_data],
    "NER_int": [[lab2idx[tag] for tag in tags] for _, tags in train_data],
})

In [None]:
train_dataset

In [None]:
# Dev Dataset with the same three columns; tags are encoded with the mapping
# built from the training data.
dev_dataset = Dataset.from_dict({
    "token": [words for words, _ in dev_data],
    "NER": [tags for _, tags in dev_data],
    "NER_int": [[lab2idx[tag] for tag in tags] for _, tags in dev_data],
})

In [None]:
dev_dataset

In [None]:
# Test set (?)

In [None]:

# Tokenize both splits with identical settings (train first, then dev). The raw
# columns are dropped so only the tokenizer outputs and "labels" remain.
processed_train_dataset, processed_dev_dataset = (
    split.map(
        tokenize_and_align_labels,
        batched = True,
        remove_columns = ["token", "NER", "NER_int"],
        desc = "Running tokenizer on dataset"
    )
    for split in (train_dataset, dev_dataset)
)

In [None]:
processed_train_dataset

In [None]:
processed_dev_dataset

In [None]:
import random

In [None]:
# Spot-check three randomly chosen tokenized training examples. No seed is
# set, so a different trio is printed on every run.
for index in random.sample(range(len(processed_train_dataset)), 3):
    print(f"Sample {index} of the training set: {processed_train_dataset[index]}")

In [None]:
# Spot-check three randomly chosen tokenized dev examples (unseeded, so the
# selection differs between runs).
for index in random.sample(range(len(processed_dev_dataset)), 3):
    print(f"Sample {index} of the dev set: {processed_dev_dataset[index]}")

MODEL AND OPTIMIZER

In [None]:
# Model
# Pretrained encoder with a token-classification head sized by `config`.
model = AutoModelForTokenClassification.from_pretrained(model_name, config = config)
# Collator that pads each batch at collation time (tokenization used padding = False).
data_collator = DataCollatorForTokenClassification(tokenizer)
# Use the GPU when one is visible, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Optimizer
# optimizer = torch.optim.AdamW(model.parameters())

In [None]:
# Evaluate and checkpoint once per epoch, then reload the checkpoint with the
# best dev F1 when training finishes. metric_for_best_model alone does not
# restore the best checkpoint: load_best_model_at_end must be enabled and the
# save strategy has to match the evaluation strategy.
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version.
training_args = TrainingArguments(
    output_dir = "output",
    evaluation_strategy = "epoch",
    save_strategy = "epoch",
    load_best_model_at_end = True,
    metric_for_best_model = "F1", # key returned by compute_metrics
    greater_is_better = True
)

In [None]:
# seqeval metric object; compute_metrics calls metric.compute on tag-string sequences.
metric = evaluate.load("seqeval")

In [None]:
def convert_int_to_labels(preds):
    """
    Decode model output and gold label ids into tag strings.

    Positions whose gold label is -100 are dropped from both the gold and the
    predicted sequences.

    :param preds: pair (logits, labels); the predicted id is the argmax of
        logits over the last axis
    :returns: tuple (true_labels, true_predictions), each a list of tag-string
        lists with one inner list per sequence
    """
    logits, labels = preds
    predictions = np.argmax(logits, axis = -1)

    # Gold tags, keeping only the positions that carry a real label.
    true_labels = []
    for label_sequence in labels:
        kept_gold = []
        for label in label_sequence:
            if label != -100:
                kept_gold.append(idx2lab[label])
        true_labels.append(kept_gold)

    # Predicted tags at exactly the same positions.
    true_predictions = []
    for preds_sequence, labels_sequence in zip(predictions, labels):
        kept_pred = []
        for pred, label in zip(preds_sequence, labels_sequence):
            if label != -100:
                kept_pred.append(idx2lab[pred])
        true_predictions.append(kept_pred)

    return true_labels, true_predictions

In [None]:
def compute_metrics(preds):
    """
    Score predictions with the seqeval metric and return the overall figures.

    :param preds: pair (logits, labels) as accepted by convert_int_to_labels
    :returns: dict with keys "Precision", "Recall", "F1" and "Accuracy"
    """
    gold_tags, predicted_tags = convert_int_to_labels(preds)
    scores = metric.compute(predictions = predicted_tags, references = gold_tags)

    # Reported name -> key in the seqeval result dict.
    reported = (
        ("Precision", "overall_precision"),
        ("Recall", "overall_recall"),
        ("F1", "overall_f1"),
        ("Accuracy", "overall_accuracy"),
    )
    return {name: scores[key] for name, key in reported}

In [None]:
# Wire model, arguments, tokenized splits, collator and metric function into a
# Trainer. The dev split is the evaluation set.
# NOTE(review): recent transformers releases deprecate the `tokenizer` argument
# in favour of `processing_class` -- confirm against the installed version.
trainer = Trainer(model = model,
                  args = training_args,
                  train_dataset = processed_train_dataset,
                  eval_dataset = processed_dev_dataset,
                  tokenizer =  tokenizer,
                  compute_metrics = compute_metrics,
                  data_collator = data_collator
                  )

# Fine-tune; this is the expensive cell of the notebook.
trainer.train()

In [None]:
# Evaluate on the eval_dataset given to the Trainer (the dev split) and show the metrics.
results = trainer.evaluate()
results

In [None]:
# Raw model outputs, gold label ids and metrics for the dev split.
predictions, labels, metrics = trainer.predict(processed_dev_dataset)

In [None]:
# Decode ids to tag strings. Note the helper returns (gold, predicted), hence
# the swapped order on the left. This rebinds `predictions` and `labels` from
# arrays to lists of strings, so the cell cannot be re-run without first
# re-running the trainer.predict cell.
labels, predictions = convert_int_to_labels((predictions, labels))

In [None]:
predictions

In [None]:
labels

In [None]:
# Gold sentences to compare with predictions[:2]; the predictions were
# produced from the dev split, so the matching reference is dev_data.
dev_data[:2]

In [None]:
predictions[:2]

In [None]:
# Pair every dev sentence's words with its predicted tags for writing to disk.
final = []
for i in range(len(predictions)):
    words = dev_data[i][0]
    tags = list(predictions[i])
    # A sentence truncated by the tokenizer yields fewer predictions than
    # words, and the writer would then silently drop the trailing tokens.
    # Pad with the outside tag so each word keeps a line in the output.
    # Assumes "O" is the outside tag of this IOB2 tag set.
    tags.extend(["O"] * (len(words) - len(tags)))
    final.append((words, tags))

In [None]:
final

In [None]:
def write_conll_file(data, path):
    """
    Serialise sentences to a tab-separated CoNLL-style file.

    Each token becomes one line "<index>\\t<word>\\t<label>\\t-\\t-" with a
    1-based index that restarts for every sentence; every sentence is followed
    by a blank line. Words and labels are paired positionally, so the longer
    of the two lists is cut to the length of the shorter one.

    :param data: list of tuples with words and labels
    :param path: path to write to
    """
    with open(path, "w", encoding = "utf-8") as f:
        for words, labels in data:
            token_rows = [
                f"{idx}\t{word}\t{label}\t-\t-\n"
                for idx, (word, label) in enumerate(zip(words, labels), start = 1)
            ]
            f.writelines(token_rows)
            f.write("\n") # sentence separator

# Persist the dev-set predictions; an existing project.conll is overwritten.
write_conll_file(final, "project.conll")

In [None]:
# final_output = []

# for (sentence, tags), new_tags in zip(train_data, predictions):
#     sentence = list([f"{sentence} {new_tag}" for sentence, new_tag in zip(sentence, new_tags)])
#     final_output.append("\n".join(sentence))

In [None]:
# final_output

In [None]:
# iob2_text = "\n\n".join(final_output)

In [None]:
# iob2_text

In [None]:
# with open("iob2_text.iob2", "w") as f:
#     f.write(iob2_text)

TUNING

Evaluation using span_f1.py