In [23]:
# imports
#import numpy as np

In [24]:
# path to the data files
path_train = "../data/da_news_train.tsv"
path_dev = "../data/da_news_dev.tsv"
path_test = "../data/da_news_test.tsv"

In [25]:
# taken from span_f1.py provided and modified
def readNlu(path, target_column = 1): # default to index 1 (thats where DaN+ labels are)
    # reads labels from target_column, assumes conll-like file
    # with 1 word per line, tab separation, and empty lines
    # for sentence splits. 
    annotations = []
    cur_annotation = []
    for line in open(path, encoding='utf-8'):
        line = line.strip()
        if line == '':
            annotations.append(cur_annotation)
            cur_annotation = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            cur_annotation.append(line.split('\t')[target_column])
    return annotations

In [26]:
data_labels = readNlu(path_train) # reads in label column


In [27]:
# mapping funciton 
def mapping(path):
    '''
    
    '''
    data_labels = readNlu(path) 

    label_set = set()

    for labels in data_labels:
        filtered = [label for label in labels if 'part' not in label.lower() and 'deriv' not in label.lower()]
        label_set.update(filtered)

    num_labels = len(label_set)

    label2id = {label: id for id, label in enumerate(label_set)}

    id2label = {id: label for label, id in label2id.items()}

    return label2id, id2label
    

In [None]:
label2id, id2label = mapping(path_train)
print(label2id)
print(id2label)


{'B-PER': 0, 'I-ORG': 1, 'B-ORG': 2, 'B-MISC': 3, 'I-MISC': 4, 'O': 5, 'I-LOC': 6, 'B-LOC': 7, 'I-PER': 8}
{0: 'B-PER', 1: 'I-ORG', 2: 'B-ORG', 3: 'B-MISC', 4: 'I-MISC', 5: 'O', 6: 'I-LOC', 7: 'B-LOC', 8: 'I-PER'}


In [29]:
# load data function
# function for loading iob2 data (from solution for assignment 5)
def read_tsv_file(path):
    '''
    This function reads tsv files
    
    Parameters:
    - path: path to read from

    Returns:
    - list with dictionaries for each sentence where the keys are 'tokens', 'ner_tags', and 'tag_ids' and 
      the values are lists that hold the tokens, ner_tags, and tag_ids.
    '''

    data = []
    current_words = []
    current_tags = []
    current_tag_ids = []
    #counter = 0

    for line in open(path, encoding='utf-8'):

       # print("new line")

        line = line.strip() # removes any leading and trailing whitespaces from the line

       # print("line ", line)

        if line:
            if line[0] == '#': 
                continue # skip comments

            #print("line ", line)

            # splitting at 'tab', as the data is tab separated 
            tok = line.split('\t')

           # print("line ", line)
            #print("tok ", tok)

            # add the entry in the second colun (the word) to current_words
            current_words.append(tok[0]) 

            #print("current_words ", current_words)

            if tok[1] not in label2id:
                continue

            # add the current tag 
            current_tags.append(tok[1]) 

            #print("current_tags ", current_tags)
            #print("current_tag_ids ", current_tag_ids)
            # add the current tag mapped to the corresponding id (int)
            current_tag_ids.append(label2id[tok[1]]) 

            #print("current_tag_ids ", current_tag_ids)
        
        else: # skip empty lines
            if current_words: # if current_words is not empty

                # add entry to dict where tokens and ner_tags are keys and the values are lists
                data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})

            # start over  
            current_words = []
            current_tags = []
            current_tag_ids = []
           # counter += 1
    # check for last one
    if current_tags != []:
        data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})
    
    #if counter < 5:
         #   print(f"\n--- Example {counter + 1} (Final) ---")
          #  print("Tokens:   ", current_words)
          #  print("NER Tags: ", current_tags)
           # print("Tag IDs:  ", current_tag_ids)
    return data

In [30]:
data_train = read_tsv_file(path_train)
# Now collect all unique labels/tags
all_tags = set()

for entry in data_train:
    all_tags.update(entry["ner_tags"])

# Print the unique tags
print("Unique NER Tags:", sorted(all_tags))

Unique NER Tags: ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']
