In [None]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import modified_readNlu, label_mapping, read_tsv_file, extract_labeled_tokens
from collections import Counter

In [15]:
# relative paths to the train / dev / test TSV files
path_train, path_dev, path_test = (
    "../data/da_news_train.tsv",
    "../data/da_news_dev.tsv",
    "../data/da_news_test.tsv",
)

### Label mapping

In [None]:
# Tag sequences of the training file: one list of tag strings per sentence.
# NOTE(review): the saved outputs in this notebook report 4382 sentences and a
# 'B-ORGpart' tag for this reader, but 4383 sentences and no such tag for
# read_tsv_file / label_mapping on the same file — possibly stale output;
# re-run on a fresh kernel to confirm.
data_labels = modified_readNlu(path_train) # reads in label column

In [17]:
# number of sentences, followed by the tags of the first sentence
print(len(data_labels), data_labels[0], sep="\n")

4382
['O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'B-ORGpart', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [None]:
# both directions of the tag <-> integer id mapping, built from the training file
# NOTE(review): the ids in the saved output are not in sorted tag order — confirm
# that label_mapping assigns them deterministically across kernel restarts
label2id, id2label = label_mapping(path_train)
print(label2id, id2label, sep="\n")

{'I-LOC': 0, 'O': 1, 'B-MISC': 2, 'I-MISC': 3, 'B-PER': 4, 'I-ORG': 5, 'B-ORG': 6, 'I-PER': 7, 'B-LOC': 8}
{0: 'I-LOC', 1: 'O', 2: 'B-MISC', 3: 'I-MISC', 4: 'B-PER', 5: 'I-ORG', 6: 'B-ORG', 7: 'I-PER', 8: 'B-LOC'}


### Reading in data

In [19]:
# load every split with the same label2id mapping (built from the training file)
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id=label2id)
    for split_path in (path_train, path_dev, path_test)
)

In [20]:
print(len(train_data))  # no. of sentences

# first sentence: tokens, string tags and integer tag ids, one per line
print(*(train_data[0][key] for key in ("tokens", "ner_tags", "tag_ids")), sep="\n")

4383
['På', 'fredag', 'har', 'SID', 'inviteret', 'til', 'reception', 'i', 'SID-huset', 'i', 'anledning', 'af', 'at', 'formanden', 'Kjeld', 'Christensen', 'går', 'ind', 'i', 'de', 'glade', 'tressere', '.']
['O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[1, 1, 1, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 7, 1, 1, 1, 1, 1, 1, 1]


### Looking at unique tags

In [21]:
# every distinct NER tag that occurs anywhere in the training sentences
all_tags = {tag for example in train_data for tag in example["ner_tags"]}

print("Unique NER Tags:", sorted(all_tags))

Unique NER Tags: ['B-LOC', 'B-MISC', 'B-ORG', 'B-PER', 'I-LOC', 'I-MISC', 'I-ORG', 'I-PER', 'O']


In [22]:
# checking that all sequences are aligned: tokens, tags and tag ids must have equal length
for i, sent in enumerate(train_data):
    tokens, tags, tag_ids = sent["tokens"], sent["ner_tags"], sent["tag_ids"]

    # aligned sentences are skipped silently; only mismatches are reported
    if len(tokens) == len(tags) == len(tag_ids):
        continue

    print(f"Mismatch found in sentence {i}:")
    print(f"  Tokens ({len(tokens)}): {tokens}")
    print(f"  Tags   ({len(tags)}): {tags}")
    print(f"  IDs    ({len(tag_ids)}): {tag_ids}")

### Looking at overlapping entities

In [23]:
# entity collections for each split (default mode of extract_labeled_tokens)
train_entities, dev_entities, test_entities = (
    extract_labeled_tokens(split) for split in (train_data, dev_data, test_data)
)

In [24]:
# pairwise intersections of the entity collections of the three splits
overlap_train_dev = train_entities & dev_entities
overlap_train_test = train_entities & test_entities
overlap_dev_test = dev_entities & test_entities

# size of each intersection, one line per pair of splits
print(
    f'Overlap between train and dev: {len(overlap_train_dev)}',
    f'Overlap between train and test: {len(overlap_train_test)}',
    f'Overlap between dev and test: {len(overlap_dev_test)}',
    sep="\n",
)

Overlap between train and dev: 256
Overlap between train and test: 219
Overlap between dev and test: 78


In [25]:
# same extraction as before, but WITH labels/tags (include_label_pair=True)
train_entities_labeled, dev_entities_labeled, test_entities_labeled = (
    extract_labeled_tokens(split, include_label_pair=True)
    for split in (train_data, dev_data, test_data)
)

# pairwise intersections of the labeled collections
overlap_train_dev_labeled = train_entities_labeled & dev_entities_labeled
overlap_train_test_labeled = train_entities_labeled & test_entities_labeled
overlap_dev_test_labeled = dev_entities_labeled & test_entities_labeled

In [26]:
# tag distribution in overlap: how many overlapping pairs carry each tag
train_dev_tag_counts = Counter(label for _token, label in overlap_train_dev_labeled)
train_test_tag_counts = Counter(label for _token, label in overlap_train_test_labeled)
dev_test_tag_counts = Counter(label for _token, label in overlap_dev_test_labeled)

print(train_dev_tag_counts, train_test_tag_counts, dev_test_tag_counts, sep="\n")

Counter({'B-PER': 79, 'I-PER': 43, 'B-LOC': 42, 'B-ORG': 31, 'I-ORG': 11, 'B-MISC': 10, 'I-MISC': 9, 'I-LOC': 2})
Counter({'B-PER': 62, 'B-ORG': 37, 'I-PER': 35, 'B-LOC': 30, 'I-ORG': 16, 'B-MISC': 8, 'I-MISC': 8})
Counter({'B-PER': 27, 'I-PER': 16, 'B-ORG': 11, 'B-LOC': 9, 'B-MISC': 4, 'I-ORG': 2})


In [27]:
def count_overlap_entities(data, overlap_data):
    """Report how often overlapping (token, tag) pairs occur in a dataset.

    Args:
        data: sized sequence of examples; each example is indexed with
            "tokens" and "ner_tags", which are iterated in parallel.
        overlap_data: container of (token, tag) pairs to look for
            (membership is tested with ``in``).

    Prints:
        1. A Counter mapping each matching (token, tag) pair to its number
           of occurrences in ``data``.
        2. The number of sentences in ``data`` that contain at least one
           matching pair, out of the total number of sentences.

    Returns:
        None; the results are only printed.
    """
    counts = Counter()
    sentences_with_overlap = 0

    for example in data:
        # all (token, tag) pairs of this sentence that are part of the overlap
        hits = [
            pair
            for pair in zip(example["tokens"], example["ner_tags"])
            if pair in overlap_data
        ]
        counts.update(hits)

        # count the sentence once, no matter how many of its tokens matched;
        # summing the per-token counts would over-count and could exceed len(data)
        if hits:
            sentences_with_overlap += 1

    print(counts)
    print('Number of sentences with overlapping entities:', sentences_with_overlap, 'out of:', len(data))

In [28]:
# occurrences in the train set of the (token, tag) pairs shared by train and dev
count_overlap_entities(train_data, overlap_train_dev_labeled)

Counter({('Danmark', 'B-LOC'): 46, ('København', 'B-LOC'): 39, ('Lars', 'B-PER'): 29, ('Poul', 'B-PER'): 27, ('Nielsen', 'I-PER'): 25, ('Hafnia', 'B-ORG'): 23, ('Europa', 'B-LOC'): 22, ('Peter', 'B-PER'): 19, ('Det', 'B-ORG'): 19, ('Jens', 'B-PER'): 19, ('Den', 'B-ORG'): 18, ('Henrik', 'B-PER'): 18, ('Andersen', 'I-PER'): 17, ('USA', 'B-LOC'): 16, ('Hansen', 'I-PER'): 16, ('B.T.', 'B-ORG'): 15, ('John', 'B-PER'): 15, ('Jan', 'B-PER'): 15, ('Petersen', 'I-PER'): 14, ('Rasmussen', 'I-PER'): 13, ('Christensen', 'I-PER'): 12, ('Jensen', 'I-PER'): 12, ('Bent', 'B-PER'): 12, ('Ole', 'B-PER'): 11, ('SF', 'B-ORG'): 10, ('Jørgensen', 'I-PER'): 10, ('Per', 'B-PER'): 10, ('Lise', 'B-PER'): 10, ('Socialdemokratiet', 'B-ORG'): 9, ('Danmarks', 'B-LOC'): 9, ('Paris', 'B-LOC'): 9, ('Tyskland', 'B-LOC'): 9, ('Danske', 'I-ORG'): 9, ('Søren', 'B-PER'): 9, ('Henning', 'B-PER'): 8, ('Holding', 'I-ORG'): 8, ('Larsen', 'I-PER'): 8, ('Pedersen', 'I-PER'): 8, ('Diana', 'B-PER'): 8, ('Olsen', 'I-PER'): 7, ('Uni

In [29]:
# occurrences in the train set of the (token, tag) pairs shared by train and test
count_overlap_entities(train_data, overlap_train_test_labeled)

Counter({('Danmark', 'B-LOC'): 46, ('København', 'B-LOC'): 39, ('Lars', 'B-PER'): 29, ('Poul', 'B-PER'): 27, ('Nielsen', 'I-PER'): 25, ('Peter', 'B-PER'): 19, ('Det', 'B-ORG'): 19, ('Jens', 'B-PER'): 19, ('Den', 'B-ORG'): 18, ('Henrik', 'B-PER'): 18, ('USA', 'B-LOC'): 16, ('Hansen', 'I-PER'): 16, ('B.T.', 'B-ORG'): 15, ('John', 'B-PER'): 15, ('Christensen', 'I-PER'): 12, ('Jensen', 'I-PER'): 12, ('Steen', 'B-PER'): 11, ('Jesper', 'B-PER'): 11, ('Ole', 'B-PER'): 11, ('Århus', 'B-LOC'): 11, ('SF', 'B-ORG'): 10, ('Jørgensen', 'I-PER'): 10, ('Per', 'B-PER'): 10, ('Michael', 'B-PER'): 10, ('Erik', 'B-PER'): 10, ('Dansk', 'B-ORG'): 10, ('Socialdemokratiet', 'B-ORG'): 9, ('Danmarks', 'B-LOC'): 9, ('Paris', 'B-LOC'): 9, ('Tyskland', 'B-LOC'): 9, ('Danske', 'I-ORG'): 9, ('Palle', 'B-PER'): 9, ('Henning', 'B-PER'): 8, ('Erik', 'I-PER'): 8, ('Larsen', 'I-PER'): 8, ('Pedersen', 'I-PER'): 8, ('Sverige', 'B-LOC'): 8, ('Olsen', 'I-PER'): 7, ('Hans', 'B-PER'): 7, ('Madsen', 'I-PER'): 7, ('Bosnien-Herc

In [30]:
# occurrences in the dev set of the (token, tag) pairs shared by train and dev
count_overlap_entities(dev_data, overlap_train_dev_labeled)

Counter({('Henrik', 'B-PER'): 5, ('Danmark', 'B-LOC'): 5, ('Poul', 'B-PER'): 5, ('Hansen', 'I-PER'): 4, ('Nielsen', 'I-PER'): 4, ('Lars', 'B-PER'): 4, ('USA', 'B-LOC'): 4, ('Andersen', 'I-PER'): 3, ('Søren', 'B-PER'): 3, ('Larsen', 'I-PER'): 3, ('Den', 'B-ORG'): 3, ('Holding', 'I-ORG'): 3, ('Hafnia', 'B-ORG'): 3, ('B.T.', 'B-ORG'): 3, ('Europa', 'B-LOC'): 3, ('H.', 'B-PER'): 2, ('C.', 'I-PER'): 2, ('Rudbjerg', 'B-ORG'): 2, ("TEBA's", 'B-ORG'): 2, ('København', 'B-LOC'): 2, ('Peter', 'B-PER'): 2, ('Kosan', 'B-ORG'): 2, ('Østeuropa', 'B-LOC'): 2, ('Bent', 'B-PER'): 2, ('John', 'B-PER'): 2, ('Jensen', 'I-PER'): 2, ('Flemming', 'B-PER'): 2, ('Tom', 'B-PER'): 2, ('Jens', 'B-PER'): 2, ('Paris', 'B-LOC'): 2, ('Prag', 'B-LOC'): 2, ('Bjarne', 'B-PER'): 2, ('Parken', 'B-LOC'): 2, ('Lise', 'B-PER'): 2, ('Paul', 'B-PER'): 2, ('Horsens', 'B-ORG'): 2, ('NATO', 'B-ORG'): 2, ('Clinton', 'B-PER'): 2, ('SF', 'B-ORG'): 2, ('Mucomyst', 'B-MISC'): 2, ('Knud', 'B-PER'): 2, ('Nakskov', 'B-LOC'): 2, ('Cole', 

In [31]:
# occurrences in the dev set of the (token, tag) pairs shared by dev and test
count_overlap_entities(dev_data, overlap_dev_test_labeled)

Counter({('Henrik', 'B-PER'): 5, ('Danmark', 'B-LOC'): 5, ('Poul', 'B-PER'): 5, ('Hansen', 'I-PER'): 4, ('Nielsen', 'I-PER'): 4, ('Lars', 'B-PER'): 4, ('USA', 'B-LOC'): 4, ('Larsen', 'I-PER'): 3, ('Den', 'B-ORG'): 3, ('B.T.', 'B-ORG'): 3, ('H.', 'B-PER'): 2, ('København', 'B-LOC'): 2, ('Peter', 'B-PER'): 2, ('John', 'B-PER'): 2, ('Jensen', 'I-PER'): 2, ('Flemming', 'B-PER'): 2, ('Jens', 'B-PER'): 2, ('Paris', 'B-LOC'): 2, ('Bjarne', 'B-PER'): 2, ('Paul', 'B-PER'): 2, ('SF', 'B-ORG'): 2, ('Divisionsforeningen', 'B-ORG'): 2, ('De', 'B-MISC'): 1, ('Kim', 'B-PER'): 1, ('Anne', 'B-PER'): 1, ('Linnet', 'I-PER'): 1, ('Danske', 'I-ORG'): 1, ('Bank', 'I-ORG'): 1, ('Nanna', 'B-PER'): 1, ('Ivan', 'B-PER'): 1, ('Per', 'B-PER'): 1, ('Morten', 'B-PER'): 1, ('Mogens', 'B-PER'): 1, ('Lykketoft', 'I-PER'): 1, ('Christensen', 'I-PER'): 1, ('Olsen', 'I-PER'): 1, ('Det', 'B-ORG'): 1, ('Berlingske', 'B-ORG'): 1, ('Tyskland', 'B-LOC'): 1, ('Folketinget', 'B-ORG'): 1, ('Christiansen', 'I-PER'): 1, ('Århus', 

In [32]:
# occurrences in the test set of the (token, tag) pairs shared by train and test
count_overlap_entities(test_data, overlap_train_test_labeled)

Counter({('Danmark', 'B-LOC'): 8, ('Peter', 'B-PER'): 7, ('Nielsen', 'I-PER'): 7, ('Kim', 'B-PER'): 6, ('Bjarne', 'B-PER'): 5, ('Henning', 'B-PER'): 5, ('USA', 'B-LOC'): 4, ('John', 'B-PER'): 4, ('Brian', 'B-PER'): 4, ('Hans', 'B-PER'): 3, ('Socialdemokratiet', 'B-ORG'): 3, ('Anders', 'B-PER'): 3, ('Jensen', 'I-PER'): 3, ('Ove', 'B-PER'): 3, ('Det', 'B-ORG'): 3, ('kommune', 'I-ORG'): 3, ('Jens', 'B-PER'): 3, ('København', 'B-LOC'): 3, ('Birte', 'B-PER'): 2, ('Danmarks', 'B-ORG'): 2, ('Mogens', 'B-PER'): 2, ('Pedersen', 'I-PER'): 2, ('B.T.', 'B-ORG'): 2, ('Københavns', 'B-ORG'): 2, ('Brøndbys', 'B-ORG'): 2, ('Steen', 'B-PER'): 2, ('Palle', 'B-PER'): 2, ('Jane', 'B-PER'): 2, ('Madsen', 'I-PER'): 2, ('Paris', 'B-LOC'): 2, ('London', 'B-LOC'): 2, ('Marilyn', 'B-PER'): 2, ('Teaters', 'I-ORG'): 2, ('Kgl.', 'I-ORG'): 2, ('Anne', 'B-PER'): 2, ('Leif', 'B-PER'): 2, ('Odense', 'B-ORG'): 2, ('Schlüter', 'B-PER'): 2, ('Jesper', 'B-PER'): 2, ('DSB', 'B-ORG'): 2, ('Århus', 'B-LOC'): 2, ('Erik', 'B-P

In [33]:
# occurrences in the test set of the (token, tag) pairs shared by dev and test
count_overlap_entities(test_data, overlap_dev_test_labeled)

Counter({('Danmark', 'B-LOC'): 8, ('Peter', 'B-PER'): 7, ('Nielsen', 'I-PER'): 7, ('Kim', 'B-PER'): 6, ('Bjarne', 'B-PER'): 5, ('Henning', 'B-PER'): 5, ('USA', 'B-LOC'): 4, ('John', 'B-PER'): 4, ('Brian', 'B-PER'): 4, ('Socialdemokratiet', 'B-ORG'): 3, ('Jensen', 'I-PER'): 3, ('Det', 'B-ORG'): 3, ('Jens', 'B-PER'): 3, ('København', 'B-LOC'): 3, ('Mogens', 'B-PER'): 2, ('Pedersen', 'I-PER'): 2, ('B.T.', 'B-ORG'): 2, ('Divisionsforeningen', 'B-ORG'): 2, ('Paris', 'B-LOC'): 2, ('The', 'B-MISC'): 2, ('Anne', 'B-PER'): 2, ('Odense', 'B-ORG'): 2, ('ODENSE', 'B-LOC'): 2, ('Henrik', 'B-PER'): 2, ('Svend', 'B-PER'): 2, ('dollar', 'B-MISC'): 2, ('Folketinget', 'B-ORG'): 1, ('Tyskland', 'B-LOC'): 1, ('Wiese', 'I-PER'): 1, ('Christensen', 'I-PER'): 1, ('Lyngby', 'B-LOC'): 1, ('Hansen', 'I-PER'): 1, ('B.', 'I-PER'): 1, ('Johnny', 'B-PER'): 1, ('Jørgensen', 'I-PER'): 1, ('Linnet', 'I-PER'): 1, ('Odense', 'B-LOC'): 1, ('Lykketoft', 'I-PER'): 1, ('Per', 'B-PER'): 1, ('De', 'B-MISC'): 1, ('Ole', 'B-PER