### Notebook overview
1. Read in the data (function)
    - Make sure that the data is in the correct format
    - The function converts each label to its label id using the label-to-id mapping

2. Transform the data into a Hugging Face `Dataset` object

In [1]:
# imports
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer, AutoTokenizer, AutoConfig, RobertaTokenizerFast, DataCollatorForTokenClassification
import numpy as np
import tqdm as notebook_tqdm
from datasets import Dataset, DatasetDict
import torch

from span_f1 import readNlu, toSpans, getBegEnd, getLooseOverlap, getUnlabeled


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Relative locations of the IOB2 data files for the train / dev / test splits
# (the test file is the "masked" variant, per its filename).
path_train, path_dev, path_test = (
    "en_ewt-ud-train.iob2",
    "en_ewt-ud-dev.iob2",
    "en_ewt-ud-test-masked.iob2",
)

### Getting the data

In [3]:
# Build the label <-> id mappings from the labels that occur in the training file.
# NOTE(review): assumes readNlu returns one iterable of string labels per sentence — confirm.
data_labels = readNlu(path_train)

# Collect every distinct label across all training sentences.
label_set = set()
for labels in data_labels:
    label_set.update(labels)

num_labels = len(label_set)

# Sort before enumerating: the iteration order of a set of strings changes between
# interpreter runs (string hash randomization), so enumerating the raw set would
# hand out different ids to the same label after every kernel restart and make
# saved models / predictions impossible to decode consistently.
label2id = {label: idx for idx, label in enumerate(sorted(label_set))}

# Inverse mapping, used to turn predicted ids back into label strings.
id2label = {idx: label for label, idx in label2id.items()}

In [4]:
# from assignment 5
# function for loading iob2 data (from solution for assignment 5)
def read_iob2_file(path, label_map=None):
    '''
    Read a tab-separated IOB2 file and group its token lines into sentences.

    Lines starting with '#' are skipped as comments and blank lines mark the end
    of a sentence. Every other line is split on tabs; the second column is taken
    as the token and the third column as its tag.

    Parameters:
    - path: path to read from
    - label_map: optional dict mapping a tag string to its integer id; when left
      as None the notebook-level `label2id` mapping is used

    Returns:
    - list with one dictionary per sentence, with the keys 'tokens' (list of
      words), 'ner_tags' (list of tag strings) and 'tag_ids' (list of the tags
      mapped through the label mapping); the three lists have equal length

    Raises:
    - KeyError: if a tag in the file is not present in the label mapping
    - IndexError: if a non-comment line has fewer than three tab-separated columns
    '''
    if label_map is None:
        # fall back to the mapping built earlier in the notebook
        label_map = label2id

    data = []
    current_words = []
    current_tags = []
    current_tag_ids = []

    # the context manager guarantees the file handle is closed, even on errors
    with open(path, encoding='utf-8') as iob2_file:
        for line in iob2_file:
            line = line.strip()  # removes any leading and trailing whitespaces from the line

            if line:
                if line[0] == '#':
                    continue  # skip comments

                # splitting at 'tab', as the data is tab separated
                tok = line.split('\t')

                # second column is the word, third column is its tag
                current_words.append(tok[1])
                current_tags.append(tok[2])

                # the tag mapped to the corresponding id (int)
                current_tag_ids.append(label_map[tok[2]])

            else:  # a blank line closes the current sentence
                if current_words:
                    data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})

                # start over with fresh lists; the id list must be reset as well,
                # otherwise every sentence would share one ever-growing list of ids
                current_words = []
                current_tags = []
                current_tag_ids = []

    # the file may not end with a blank line, so flush the last sentence
    if current_tags != []:
        data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})

    return data

In [5]:
# Parse each split into a list of per-sentence dicts (see read_iob2_file).
# The reads are kept as three separate statements so that a failure on a later
# split still leaves the earlier results defined in the kernel.
# NOTE(review): read_iob2_file looks every tag up in label2id, which is built from
# the training file only; a tag that appears only in dev/test (e.g. in the
# "masked" test file) would raise KeyError — confirm the files are compatible.
train_data = read_iob2_file(path_train)
dev_data = read_iob2_file(path_dev)
test_data = read_iob2_file(path_test)

In [None]:
# Convert the list of per-sentence dicts into a Hugging Face Dataset
# (one row per sentence, one column per dict key).
train_dataset = Dataset.from_list(train_data)
# TODO: the dev and test conversions below are currently disabled; only the
# training split is converted in this cell.
#dev_dataset = Dataset.from_list(dev_data)
#test_dataset = Dataset.from_list(test_data)