In [1]:
# imports
from collections import defaultdict
from collections import Counter
import random

import sys
sys.path.append('.')  # add the project root to the path

from scripts.load_data import (
    label_mapping, extract_labeled_tokens,
    read_tsv_file, write_tsv_file,
    write_iob2_file, modified_readNlu,
    read_iob2_file
)

from scripts.preprocess import (
    concatenate_data, enitity_sentence_mapping,
    group_sentences, split_sentence_groups,
    finalize_split_with_o_sentences,
    check_dataset_sizes, check_token_overlap,
    
)

In [2]:
# relative locations of the three DaN+ news split files (train, dev, test)
path_train, path_dev, path_test = (
    "data/da_news/da_news_train.tsv",
    "data/da_news/da_news_dev.tsv",
    "data/da_news/da_news_test.tsv",
)

In [3]:
# Create the label mapping (two lookups: label2id and id2label) from the training file only.
# NOTE(review): presumably a label that occurs only in dev/test would be absent from
# label2id — confirm how read_tsv_file treats labels it does not know.
label2id, id2label = label_mapping(path_train)

In [4]:
# load the three DaN+ splits (train, dev, test — in that order),
# handing the same label2id mapping to the reader for each file
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
)

In [5]:
# collect the tokens carrying a non-"O" label, separately for train, dev and test
train_tokens, dev_tokens, test_tokens = map(
    extract_labeled_tokens, (train_data, dev_data, test_data)
)

In [6]:
# report how many unique non-"O" labeled tokens each split contains (one line per split)
print(
    f"Unique tokes in train : {len(train_tokens)} tokens",
    f"Unique tokes in dev : {len(dev_tokens)} tokens",
    f"Unique tokes in test : {len(test_tokens)} tokens",
    sep="\n",
)

Unique tokes in train : 2635 tokens
Unique tokes in dev : 470 tokens
Unique tokes in test : 513 tokens


In [7]:
# pairwise intersections: labeled tokens that occur in both of the named splits
train_dev_overlap = train_tokens & dev_tokens
train_test_overlap = train_tokens & test_tokens
dev_test_overlap = dev_tokens & test_tokens
# labeled tokens common to all three splits (train/dev intersection narrowed by test)
all_three_overlap = train_dev_overlap & test_tokens

# report the size of every overlap, one line each
print(
    f"Train-Dev overlap: {len(train_dev_overlap)} tokens",
    f"Train-Test overlap: {len(train_test_overlap)} tokens",
    f"Dev-Test overlap: {len(dev_test_overlap)} tokens",
    f"All three overlap: {len(all_three_overlap)} tokens",
    sep="\n",
)


Train-Dev overlap: 256 tokens
Train-Test overlap: 219 tokens
Dev-Test overlap: 78 tokens
All three overlap: 74 tokens


Fix: re-split the data so that no labeled (non-"O") token is shared between train, dev and test

In [8]:
# Concatenate the three original splits into one pool so the data can be re-split.
# NOTE: train_data / dev_data / test_data are rebound further down by
# finalize_split_with_o_sentences, so re-running this cell after that point pools the
# already re-split data rather than the files read from disk. Use Restart & Run All.
total_data = concatenate_data(train_data, dev_data, test_data)

In [9]:
# extract all unique non-"O" entities from the pooled data
# (same helper that was applied to the individual splits above)
total_entities = extract_labeled_tokens(total_data)

In [10]:
# Create the entity <-> sentence lookups from the pooled data and its labeled entities.
# NOTE(review): judging by the names, entity_to_sents maps an entity to the sentences that
# contain it and sent_to_entities is the reverse direction — confirm in scripts/preprocess.py.
# (`enitity_sentence_mapping` is the spelling exported by the helper module, not a typo here.)
entity_to_sents, sent_to_entities = enitity_sentence_mapping(total_data, total_entities)

In [11]:
# Group sentences by shared entities, using the two lookups built from the pooled data.
# NOTE(review): presumably sentences linked through a chain of shared entities end up in the
# same group, so that a group can be assigned to a single split as a unit — confirm.
sentence_groups = group_sentences(entity_to_sents, sent_to_entities)

In [12]:
# Shuffle and split the sentence groups by total sentence count.
# The split is stochastic, so seed the global RNG first; otherwise every run of the
# notebook yields a different train/dev/test partition and the sizes reported below
# cannot be reproduced.
# NOTE(review): assumes split_sentence_groups shuffles through the stdlib `random` module —
# confirm in scripts/preprocess.py. If it takes its own seed argument, pass SPLIT_SEED there.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)
train_group, dev_group, test_group = split_sentence_groups(sentence_groups)

In [13]:
# Get the final splits (with all "O" sentences) from the pooled data and the three groups.
# NOTE(review): presumably the sentences without any labeled entity are not part of a group
# and are distributed here — confirm in scripts/preprocess.py.
# NOTE: this rebinds train_data / dev_data / test_data, replacing the splits read from disk.
train_data, dev_data, test_data = finalize_split_with_o_sentences(total_data, train_group, dev_group, test_group)


In [14]:
# sanity check: print the size of each re-split dataset and the total dataset size
check_dataset_sizes(train_data, dev_data, test_data)

train size: 4181
dev size: 326
test size: 327
total dataset size: 4834


In [15]:
# sanity check: print the pairwise overlap between the re-split datasets
# (every pair is expected to report 0 once the re-split has worked)
check_token_overlap(train_data, dev_data, test_data)

overlap between train and dev: 0
overlap between dev and test: 0
overlap between train and test: 0
