In [1]:
# imports
from scripts.load_data import mapping, read_tsv_file, extract_labeled_tokens

#### Getting the data

In [2]:
# path to the data files
path_train = "data/da_news_train.tsv"
path_dev = "data/da_news_dev.tsv"
path_test = "data/da_news_test.tsv"

In [3]:
# create mapping
label2id, id2label = mapping(path_train)

In [4]:
# read in the DaN+ data
train_data = read_tsv_file(path_train, label2id)
dev_data = read_tsv_file(path_dev, label2id)
test_data = read_tsv_file(path_test, label2id)

In [None]:
# extract tokens with non-"O" labels from each split
train_tokens = extract_labeled_tokens(train_data)
dev_tokens = extract_labeled_tokens(dev_data)
test_tokens = extract_labeled_tokens(test_data)

In [None]:
# print out the number of unique tokens in each split
print(f"Unique tokes in train : {len(train_tokens)} tokens")
print(f"Unique tokes in dev : {len(dev_tokens)} tokens")
print(f"Unique tokes in test : {len(test_tokens)} tokens")

Unique tokes in train : 2635 tokens
Unique tokes in dev : 470 tokens
Unique tokes in test : 513 tokens


In [7]:
# compute intersections to find overlaps
train_dev_overlap = train_tokens & dev_tokens                   # tokens that appear in both train and dev
train_test_overlap = train_tokens & test_tokens                 # tokens that appear in both train and test
dev_test_overlap = dev_tokens & test_tokens                     # tokens that appear in both dev and test
all_three_overlap = train_tokens & dev_tokens & test_tokens     # tokens common to all three splits


In [8]:
# print out the number of overlapping tokens
print(f"Train-Dev overlap: {len(train_dev_overlap)} tokens")
print(f"Train-Test overlap: {len(train_test_overlap)} tokens")
print(f"Dev-Test overlap: {len(dev_test_overlap)} tokens")
print(f"All three overlap: {len(all_three_overlap)} tokens")


Train-Dev overlap: 256 tokens
Train-Test overlap: 219 tokens
Dev-Test overlap: 78 tokens
All three overlap: 74 tokens


In [9]:
print("Tokens in train-dev overlap with non-zero labels:", train_dev_overlap)
print("Tokens in train-test overlap with non-zero labels:", train_test_overlap)
print("Tokens in dev-test overlap with non-zero labels:", dev_test_overlap)
print("Tokens in all three splits with non-zero labels:", all_three_overlap)

Tokens in train-dev overlap with non-zero labels: {'Jens', 'Parken', 'Gotland', 'Christiansborg', 'Lene', 'Nakskov', 'Folketinget', 'Frederik', 'SF', 'The', 'Dolly', 'Robinsons', 'Halifax', 'New', 'Dannebrog', 'Anna', 'Imelda', 'Århus', 'Natalie', 'Jugoslavien', 'vivaldi', 'Camre', 'Stewart', 'Lotte', 'Nat', 'Winther', 'Danish', 'KTAS', 'Monrovia', 'Schrøder', 'Marcos', 'Vladimir', 'ACO', 'Ungarn', 'Storbritannien', 'Jensen', 'B.T.', 'Vestunionen', 'Stadion', "FN's", 'Unibanks', 'Nils', 'Citroën', 'Johnny', 'No', 'Lars', 'Bosnien', 'Konservative', 'Brian', 'Henrik', 'Gerner', 'Trabzonspor', "TEBA's", 'Justitsministeriet', 'Clintons', 'De', 'Gudme', 'Dahlfelt', 'Danske', 'Hamburg', 'Islands', 'Tyskland', 'Europa', 'Madsen', 'Hafnia', 'Land', 'Mucomyst', 'Nigeria', 'Bjarne', 'Valby', 'Steffensen', 'Jeltsin', 'HOPKINS', 'Grækenland', 'John', 'Fergie', 'Østeuropa', 'Indien', 'Ivan', 'Christiansen', 'Nielsen', 'Faaborg', 'USAs', 'Mogens', 'Mortensen', 'Japan', 'Tom', 'Diemar', 'til', 'J.', 