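### Notebook overview
1. Read in the data (function)
    - Makes sure that the data is in the correct format
    - The function converts each label to its label id using the label-to-id mapping

2. Transform the data into Hugging Face `Dataset` objects

3. Tokenize and align labels

4. Train the model

5. Predict on the dev set and check the output with span_f1.py

6. Predict on the test set and write the predictions to a file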

In [1]:
# imports
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoConfig, RobertaTokenizerFast, DataCollatorForTokenClassification
import numpy as np
from datasets import Dataset
import torch
from evaluate import load 
from span_f1 import readNlu

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# locations of the train, dev and (masked) test splits in IOB2 format
path_train, path_dev, path_test = (
    "en_ewt-ud-train.iob2",
    "en_ewt-ud-dev.iob2",
    "en_ewt-ud-test-masked.iob2",
)

In [3]:
# Hugging Face Hub id of the pretrained checkpoint; the same name is reused
# below for the tokenizer, the config and the model weights
model_name = 'deepset/roberta-base-squad2'

### Getting the data

In [4]:
# creating the label to id mapping
data_labels = readNlu(path_train)  # reads in label column

# collect every distinct label that occurs in the training data
label_set = set()

for labels in data_labels:
    label_set.update(labels)

num_labels = len(label_set)

# Sort before enumerating: the iteration order of a set of strings can differ
# between interpreter runs (string hash randomisation), which would give the
# same label a different id every time the notebook is re-run.
label2id = {label: idx for idx, label in enumerate(sorted(label_set))}

# inverse mapping, used to turn predicted ids back into label strings
id2label = {idx: label for label, idx in label2id.items()}


In [5]:
# function for loading iob2 data (from solution for assignment 5)
def read_iob2_file(path, label_map=None):
    '''
    Read a tab-separated IOB2 file into one dictionary per sentence.

    Lines starting with '#' are treated as comments and skipped. An empty line
    ends the current sentence. For every other line the second column is taken
    as the word and the third column as its tag.

    Parameters:
    - path: path to read from
    - label_map: optional dictionary that maps a tag string to its integer id.
      When omitted, the notebook-level `label2id` mapping is used.

    Returns:
    - list with dictionaries for each sentence where the keys are 'tokens', 'ner_tags', and 'tag_ids' and
      the values are lists that hold the tokens, ner_tags, and tag_ids.

    Raises:
    - KeyError: if a tag in the file is not present in the label mapping.
    - IndexError: if a non-comment line has fewer than three tab-separated columns.
    '''
    if label_map is None:
        label_map = label2id

    data = []
    current_words = []
    current_tags = []
    current_tag_ids = []

    # 'with' makes sure the file handle is closed, also when a line fails to parse
    with open(path, encoding='utf-8') as iob2_file:
        for line in iob2_file:
            line = line.strip()  # removes any leading and trailing whitespaces from the line

            if not line:
                # an empty line marks the end of a sentence
                if current_words:
                    data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})

                # start over with fresh lists for the next sentence
                current_words = []
                current_tags = []
                current_tag_ids = []
                continue

            if line[0] == '#':
                continue  # skip comments

            # splitting at 'tab', as the data is tab separated
            tok = line.split('\t')

            current_words.append(tok[1])               # second column: the word
            current_tags.append(tok[2])                # third column: the tag
            current_tag_ids.append(label_map[tok[2]])  # the tag mapped to its id (int)

    # the last sentence is not followed by an empty line if the file ends without one
    if current_words:
        data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})

    return data

In [6]:
# parse the three splits into lists of sentence dictionaries
train_data, dev_data, test_data = (
    read_iob2_file(split_path)
    for split_path in (path_train, path_dev, path_test)
)

In [7]:
# wrap each split in a Hugging Face Dataset (one row per sentence dictionary)
train_dataset, dev_dataset, test_dataset = map(
    Dataset.from_list, (train_data, dev_data, test_data)
)

### Tokenize and align labels

In [8]:
# load the tokenizer that belongs to the pretrained checkpoint
# add_prefix_space is set because the sentences are later passed to the
# tokenizer already split into words (is_split_into_words = True)
tokenizer = RobertaTokenizerFast.from_pretrained(
    model_name, 
    use_fast = True, 
    add_prefix_space = True
)

In [9]:
# tokenization + label alignment, applied batch-wise through Dataset.map
def tokenize_and_align_labels(data):
    '''
    Tokenize pre-split sentences and align the word-level label ids to the subwords.

    A word can be split into several subwords. Only the first subword of a word
    keeps the word's label id; all following subwords of the same word, and every
    position that does not belong to a word (e.g. special tokens), get -100.
    -100 is the value that pred2label filters out again.

    Parameters:
        - data : a batch with the columns "tokens" and "tag_ids", as handed over by
          Dataset.map(..., batched = True).

    Returns:
        - the tokenizer output with an extra "labels" entry that holds the aligned
          label ids for every sentence.
    '''

    encoded = tokenizer(
        data["tokens"],
        is_split_into_words=True,  # every list item is already a separate word
        truncation=True,           # cut sentences that exceed max_length
        max_length=128,
        padding=False,             # padding is left to a later step
    )

    aligned = []

    for sentence_index, sentence_labels in enumerate(data["tag_ids"]):
        sentence_aligned = []
        last_word = None

        # word_ids() has one entry per subword: the index of the word the subword
        # came from, or None when it does not belong to any word
        for word in encoded.word_ids(batch_index=sentence_index):
            starts_new_word = word is not None and word != last_word
            sentence_aligned.append(sentence_labels[word] if starts_new_word else -100)
            last_word = word

        aligned.append(sentence_aligned)

    encoded["labels"] = aligned

    return encoded

In [10]:
# tokenize every split with the same settings; the original columns are
# removed so that only the output of tokenize_and_align_labels remains
tokenized_train_dataset, tokenized_dev_dataset, tokenized_test_dataset = (
    split.map(
        tokenize_and_align_labels,
        batched=True,
        remove_columns=split.column_names,
    )
    for split in (train_dataset, dev_dataset, test_dataset)
)

Map: 100%|██████████| 12543/12543 [00:00<00:00, 24330.26 examples/s]
Map: 100%|██████████| 2001/2001 [00:00<00:00, 35771.74 examples/s]
Map: 100%|██████████| 2077/2077 [00:00<00:00, 37582.43 examples/s]


### Model training

In [11]:
# defining the model and config
# the config carries the number of labels and both label <-> id mappings
config = AutoConfig.from_pretrained(
    model_name, 
    num_labels = num_labels, 
    id2label = id2label, 
    label2id = label2id
)

# token-classification model built from the pretrained checkpoint and the config above
# (the classifier weights are newly initialized, see the message printed by this cell)
model = AutoModelForTokenClassification.from_pretrained(
    model_name, 
    torch_dtype = 'auto', 
    config = config
)

# collator that turns the tokenized examples into batches for the Trainer
# NOTE(review): presumably this pads inputs and labels per batch, since the
# tokenizer is called with padding = False — confirm in the transformers docs
data_collator = DataCollatorForTokenClassification(tokenizer)


Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at deepset/roberta-base-squad2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
# pick the best available accelerator instead of hard-coding the Mac setup:
# CUDA first (e.g. on an HPC node), then MPS (Apple Silicon), otherwise the CPU
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

# move the model parameters to the selected device
model.to(device)

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
            

In [13]:
# defining the training arguments
args = TrainingArguments(
    output_dir = "output_trainer",     # also the directory the model and tokenizer are saved to after training
    eval_strategy = 'epoch',           # evaluate on the eval dataset once per epoch
    save_strategy = "no",              # no checkpoints during training; the model is saved explicitly afterwards
    learning_rate = 2e-5,
    per_device_train_batch_size = 2,
    per_device_eval_batch_size = 2,
    num_train_epochs = 1,              # a single pass over the training data
    weight_decay = 0.01
)

In [None]:
# turn model outputs and gold label ids into lists of label strings
def pred2label(predictions):
    '''
    Convert logits and gold label ids into label strings, sentence by sentence.

    Positions whose gold label id is -100 are dropped from both the gold and the
    predicted sequence, so the two returned sequences of a sentence always have
    the same length.

    Parameters:
    - predictions: a pair (logits, labels). The predicted id of a position is the
      argmax over the last axis of logits; labels holds the gold label ids.

    Returns:
    - (true_labels, pred_labels): two lists with one list of label strings per sentence.
    '''
    logits, labels = predictions

    # the predicted label id is the index of the highest logit
    preds = np.argmax(logits, axis=-1)

    true_labels, pred_labels = [], []

    for pred_seq, label_seq in zip(preds, labels):
        # keep only the positions that carry a real gold label
        kept = [(pred, gold) for pred, gold in zip(pred_seq, label_seq) if gold != -100]

        true_labels.append([id2label[gold] for _, gold in kept])
        pred_labels.append([id2label[pred] for pred, _ in kept])

    return true_labels, pred_labels


In [15]:
# the seqeval metric is loaded once and reused for every evaluation
metric = load("seqeval")

# metric callback that is handed to the Trainer
def compute_metrics(predictions):
    '''
    Compute the overall seqeval precision, recall, f1 and accuracy.

    Parameters:
    - predictions: a pair (logits, labels) that is passed on to pred2label.

    Returns:
    - dictionary with the keys "Precision", "Recall", "F1-score" and "Accuracy".
    '''
    gold_tags, predicted_tags = pred2label(predictions)

    scores = metric.compute(predictions=predicted_tags, references=gold_tags)

    # reported name -> key in the seqeval result
    reported = {
        "Precision": "overall_precision",
        "Recall": "overall_recall",
        "F1-score": "overall_f1",
        "Accuracy": "overall_accuracy",
    }

    return {name: scores[key] for name, key in reported.items()}

In [16]:
# define parameters for trainer
trainer = Trainer(
    model = model,
    args = args,
    train_dataset = tokenized_train_dataset,
    eval_dataset = tokenized_dev_dataset,    # the dev set is used for the evaluation during training
    compute_metrics = compute_metrics,       # reports Precision / Recall / F1-score / Accuracy
    data_collator = data_collator
)

In [17]:
# train the model; the dev set is evaluated once per epoch (eval_strategy = 'epoch')
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1-score,Accuracy
1,0.035,0.081626,0.793037,0.848861,0.82,0.985646


TrainOutput(global_step=6272, training_loss=0.06814865691929448, metrics={'train_runtime': 2706.5149, 'train_samples_per_second': 4.634, 'train_steps_per_second': 2.317, 'total_flos': 174280056032742.0, 'train_loss': 0.06814865691929448, 'epoch': 1.0})

In [18]:
# Save the model and the tokenizer to the same directory ("output_trainer")
model.save_pretrained("output_trainer")
tokenizer.save_pretrained("output_trainer")

('output_trainer/tokenizer_config.json',
 'output_trainer/special_tokens_map.json',
 'output_trainer/vocab.json',
 'output_trainer/merges.txt',
 'output_trainer/added_tokens.json',
 'output_trainer/tokenizer.json')

### Predicting on the dev set
- Used to check the model's performance and to verify that the output file has the format span_f1.py expects

In [19]:
# predicting on the dev set; the result is unpacked into the model outputs and
# the gold label ids, the third value is not used
dev_preds, true_labels, _ = trainer.predict(tokenized_dev_dataset) 

In [20]:
# predict max logit and convert to strings
# pred2label returns (gold labels, predicted labels), one list per sentence
dev_labels, dev_predictions = pred2label((dev_preds, true_labels))

In [25]:
def write_iob2_file(data, predictions = None, path = None, gold = False, pad_label = "O"):
    '''
    Write sentences and their labels to a tab-separated IOB2 file.

    Every token becomes one line with five columns: the 1-based position in the
    sentence, the token, the label, and two '-' placeholder columns. Sentences
    are separated by an empty line.

    Parameters:
    - data: list of sentence dictionaries with the key 'tokens' (and 'ner_tags' when gold = True).
    - predictions: one list of label strings per sentence, in the same order as data.
      Required unless gold = True.
    - path: path of the file to write. Required.
    - gold: if True, the 'ner_tags' stored in data are written instead of predictions.
    - pad_label: label written for tokens that have no label. This happens when a
      sentence has fewer predictions than tokens (e.g. it was truncated during
      tokenization); without padding those tokens would silently be dropped and
      the file would no longer line up with the input token by token.

    Raises:
    - TypeError: if path is missing, or if predictions is missing while gold = False.
    '''
    if path is None:
        raise TypeError("write_iob2_file() needs a 'path' to write to")
    if not gold and predictions is None:
        raise TypeError("write_iob2_file() needs 'predictions' unless gold = True")

    # build all lines first, so that no half-written file is left behind on an error
    lines = []

    for i, sentence in enumerate(data):
        words = sentence['tokens']
        labels = list(sentence['ner_tags'] if gold else predictions[i])

        # make sure every token gets a label
        missing = len(words) - len(labels)
        if missing > 0:
            labels.extend([pad_label] * missing)

        for idx, (word, label) in enumerate(zip(words, labels), start = 1):
            lines.append(f"{idx}\t{word}\t{label}\t-\t-\n")

        # empty line = sentence boundary
        lines.append("\n")

    with open(path, "w", encoding = "utf-8") as f:
        f.writelines(lines)

In [26]:
# writing the iob2 file with the gold labels for dev-set
# (gold = True writes the 'ner_tags' stored in dev_data, no predictions needed)
write_iob2_file(dev_data,  path = "dev_gold.iob2", gold = True)

# writing the iob2 file with the predicted labels for dev-set
write_iob2_file(dev_data, predictions = dev_predictions, path = "dev_output.iob2")

In [35]:
# run the span_f1.py script on the gold file and the prediction file written above
!python span_f1.py dev_gold.iob2 dev_output.iob2

recall:    0.849896480331263
precision: 0.8072763028515241
slot-f1:   0.8280383257690368

unlabeled
ul_recall:    0.8933747412008282
ul_precision: 0.8485742379547689
ul_slot-f1:   0.870398386283409

loose (partial overlap with same label)
l_recall:    0.8944099378881988
l_precision: 0.8534906588003933
l_slot-f1:   0.8734713273421012


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


### Predict on test set

In [30]:
# predict on the test set; as for the dev set, the third returned value is not used
# NOTE(review): the test file is the "masked" one, so test_labels presumably
# do not hold real gold labels — confirm before using them for evaluation
test_preds, test_labels, _ = trainer.predict(tokenized_test_dataset)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [33]:
# predict max logit and convert to strings
# only the predicted labels are kept; the first return value (labels from the test file) is discarded
_, test_predictions = pred2label((test_preds, test_labels))

In [34]:
# write output file for predictions on test data (same IOB2 layout as the dev output)
write_iob2_file(test_data, predictions = test_predictions, path = "test_predictions.iob2")