In [280]:
# imports
#import numpy as np

In [281]:
# path to the data files
path_train = "../data/da_news_train.tsv"
path_dev = "../data/da_news_dev.tsv"
path_test = "../data/da_news_test.tsv"

In [282]:
# taken from span_f1.py (modified)
def readNlu(path, target_column = 1): # default to index 1 (thats where DaN+ labels are)

    # reads labels from target_column, assumes conll-like file
    # with 1 word per line, tab separation, and empty lines
    # for sentence splits. 
    
    annotations = []
    cur_annotation = []
    for line in open(path, encoding='utf-8'):
        line = line.strip()
        if line == '':
            annotations.append(cur_annotation)
            cur_annotation = []
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        else:
            cur_annotation.append(line.split('\t')[target_column])
    return annotations

In [283]:
data_labels = readNlu(path_train) # reads in label column

In [284]:
print(len(data_labels)) # number of sentences
print(data_labels[0]) # tags in first sentence

4382
['O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'B-ORGpart', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [285]:
# mapping funciton 
def mapping(path):
    '''
    
    '''
    data_labels = readNlu(path) 

    label_set = set()

    # filtering out labels with "part" or "deriv"
    for labels in data_labels:
        filtered = [label for label in labels if 'part' not in label.lower() and 'deriv' not in label.lower()]
        label_set.update(filtered)

    num_labels = len(label_set)

    label2id = {label: id for id, label in enumerate(label_set)}

    id2label = {id: label for label, id in label2id.items()}

    return label2id, id2label
    

In [286]:
label2id, id2label = mapping(path_train)
print(label2id)
print(id2label)

{'I-LOC': 0, 'B-PER': 1, 'B-LOC': 2, 'I-ORG': 3, 'O': 4, 'B-ORG': 5, 'I-MISC': 6, 'B-MISC': 7, 'I-PER': 8}
{0: 'I-LOC', 1: 'B-PER', 2: 'B-LOC', 3: 'I-ORG', 4: 'O', 5: 'B-ORG', 6: 'I-MISC', 7: 'B-MISC', 8: 'I-PER'}


In [287]:
# load data function
# function for loading iob2 data (from solution for assignment 5)
def read_tsv_file(path):
    '''
    This function reads tsv files
    
    Parameters:
    - path: path to read from

    Returns:
    - list with dictionaries for each sentence where the keys are 'tokens', 'ner_tags', and 'tag_ids' and 
      the values are lists that hold the tokens, ner_tags, and tag_ids.
    '''

    data = []
    current_words = []
    current_tags = []
    current_tag_ids = []

    #counter = 0

    for line in open(path, encoding='utf-8'):

        line = line.strip() # removes any leading and trailing whitespaces from the line

        if line:
            if line[0] == '#': 
                continue # skip comments

            # splitting at 'tab', as the data is tab separated 
            tok = line.split('\t')

            word = tok[0]
            tag = tok[1]

            # add the entry in the second colun (the word) to current_words
            current_words.append(word) 

            if tag in label2id:
                current_tags.append(tag) # add the current tag 
                current_tag_ids.append(label2id[tag]) # add the current tag mapped to the corresponding id (int)

            else: # if it contains "part" or "deriv" add "O"
                current_tags.append("O")
                current_tag_ids.append(label2id["O"])
        
        else: # skip empty lines
            if current_words: # if current_words is not empty

                # add entry to dict where tokens and ner_tags are keys and the values are lists
                data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})

            # start over  
            current_words = []
            current_tags = []
            current_tag_ids = []

           # counter += 1
           
    # check for last one
    if current_tags != []:
        data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})
    
    #if counter < 5:
           # print(f"\n--- Example {counter + 1} (Final) ---")
           # print("Tokens:   ", current_words)
           # print("NER Tags: ", current_tags)
           # print("Tag IDs:  ", current_tag_ids)
           
    return data

In [288]:
# reading in the data
train_data = read_tsv_file(path_train)
dev_data = read_tsv_file(path_dev)
test_data = read_tsv_file(path_test)

In [289]:
print(len(train_data)) # no. of sentences

# first sentence
print(train_data[0]["tokens"])
print(train_data[0]["ner_tags"])
print(train_data[0]["tag_ids"])

4383
['På', 'fredag', 'har', 'SID', 'inviteret', 'til', 'reception', 'i', 'SID-huset', 'i', 'anledning', 'af', 'at', 'formanden', 'Kjeld', 'Christensen', 'går', 'ind', 'i', 'de', 'glade', 'tressere', '.']
['O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 8, 4, 4, 4, 4, 4, 4, 4]


In [290]:
# collecting all unique tags
all_tags = set()

for sent in train_data:
    all_tags.update(sent["ner_tags"])

print("Unique NER Tags:", sorted(all_tags))

Unique NER Tags: ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [291]:
# checking if all sequences are alligned
for i, sent in enumerate(train_data):
    tokens = sent["tokens"]
    tags = sent["ner_tags"]
    tag_ids = sent["tag_ids"]
    
    if not (len(tokens) == len(tags) == len(tag_ids)):
        print(f"Mismatch found in sentence {i}:")
        print(f"  Tokens ({len(tokens)}): {tokens}")
        print(f"  Tags   ({len(tags)}): {tags}")
        print(f"  IDs    ({len(tag_ids)}): {tag_ids}")

In [292]:
def extract_entities(dataset, exclude_label = "O", include_label_pair=False):
    '''
    This function extracts tokens from a dataset that have a string label different from `exclude_label`.
    Optionally, it can return the (token, label) pairs instead of just tokens.

    Parameters:
        dataset (List[dict]): The token-tagged dataset.
        exclude_label (str): The label to ignore (default is 'O').
        include_label_pair (bool): Whether to include the (token, label) pairs in the result (default is False).
        
    Returns:
         Set[str] or Set[Tuple[str, str]]: 
            - A set of tokens with meaningful (non-O) labels if `include_label_pair` is False.
            - A set of (token, label) pairs if `include_label_pair` is True.
    '''

    # create empty set to store the unique tokens
    labeled_tokens = set()
    
    for sentence in dataset:
        # iterate over each token and its corresponding tag ID
        for token, label in zip(sentence["tokens"], sentence["ner_tags"]):
            if label != exclude_label and label in label2id:                      # check if the tag is not the excluded one
                if include_label_pair:
                    labeled_tokens.add((token, label))      # add (token, label) pair if the flag is True
                else:
                    labeled_tokens.add(token)               # add just the token if the flag is False
    
    return labeled_tokens

In [293]:
test_data = read_tsv_file(path_test)
dev_data = read_tsv_file(path_dev)

In [294]:
# extracting all entities from datasets
train_entities = extract_entities(train_data)
dev_entities = extract_entities(dev_data)
test_entities = extract_entities(test_data)

In [None]:
# identifying overlap
overlap_train_dev = train_entities & dev_entities
overlap_train_test = train_entities & test_entities
overlap_dev_test = dev_entities & test_entities

print(f'Overlap between train and dev: {len(overlap_train_dev)} out of {len(train_entities)+len(dev_entities)}')
print(f'Overlap between train and test: {len(overlap_train_test)} out of {len(train_entities)+len(test_entities)}')
print(f'Overlap between dev and test: {len(overlap_dev_test)} out of {len(dev_entities)+len(test_entities)}')

Overlap between train and dev: 256 out of 3105
Overlap between train and test: 219 out of 3148
Overlap between dev and test: 78 out of 983


In [296]:
# Option 1: Remove Entities from the Dev and Test Sets
# Option 2: Remove Entities from the Train Set