In [191]:
# imports
import sys
sys.path.append("../")

from scripts.load_data import mapping, read_tsv_file
from collections import Counter

### Remember to change extract_labeled_tokens in load_data.py to return a list rather than a set

In [192]:
# extracting tokens to check for overlap in train, dev and test sets
def extract_labeled_tokens(dataset, exclude_label = "O", include_label_pair=False):
    '''
    This function collects every token whose label differs from `exclude_label`.
    Duplicates are kept and tokens come out in corpus order (sentence by sentence,
    token by token), so the result can be passed to `collections.Counter` for
    duplicate-aware comparisons or wrapped in `set()` for unique-token comparisons.

    Parameters:
        dataset (List[dict]): The token-tagged dataset; each sentence is a dict with
            parallel "tokens" and "ner_tags" sequences.
        exclude_label (str): The label to ignore (default is 'O').
        include_label_pair (bool): Whether to return (token, label) pairs instead of
            bare tokens (default is False).

    Returns:
         List[str] or List[Tuple[str, str]]:
            - A list of tokens with a label other than `exclude_label` if
              `include_label_pair` is False.
            - A list of (token, label) pairs if `include_label_pair` is True.
    '''
    return [
        (token, label) if include_label_pair else token
        for sentence in dataset
        # pair each token with its label; zip stops at the shorter of the two sequences
        for token, label in zip(sentence["tokens"], sentence["ner_tags"])
        if label != exclude_label
    ]


In [193]:
# locations of the original DaN+ news splits
path_train = "../data/da_news/da_news_train.tsv"
path_dev = "../data/da_news/da_news_dev.tsv"
path_test = "../data/da_news/da_news_test.tsv"

# the label <-> id mapping is created from the training file
label2id, id2label = mapping(path_train)

# read all three splits with that same mapping
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
)

In [194]:
# number of sentences per split, followed by the overall total as the cell output
split_sizes = {
    "train size: ": len(train_data),
    "dev size: ": len(dev_data),
    "test size: ": len(test_data),
}
for caption, n_sentences in split_sizes.items():
    print(caption, n_sentences)
sum(split_sizes.values())

train size:  4383
dev size:  564
test size:  565


5512

In [195]:
# labeled (non-"O") tokens of each split, duplicates included
train_tokens, dev_tokens, test_tokens = (
    extract_labeled_tokens(split)
    for split in (train_data, dev_data, test_data)
)

In [196]:
# de-duplicated view: one set of labeled tokens per split
train_token_set, dev_token_set, test_token_set = map(
    set, (train_tokens, dev_tokens, test_tokens)
)

In [197]:
# how many labeled tokens (counting repeats) each split contains
for split_name, labeled in (
    ("train", train_tokens),
    ("dev", dev_tokens),
    ("test", test_tokens),
):
    print(f"Tokens in {split_name} : {len(labeled)} tokens")

Tokens in train : 4820 tokens
Tokens in dev : 573 tokens
Tokens in test : 695 tokens


In [198]:
# how many distinct labeled tokens each split contains
for split_name, unique_labeled in (
    ("train", train_token_set),
    ("dev", dev_token_set),
    ("test", test_token_set),
):
    print(f"Unique tokes in {split_name} : {len(unique_labeled)} tokens")

Unique tokes in train : 2635 tokens
Unique tokes in dev : 470 tokens
Unique tokes in test : 513 tokens


In [199]:
# multiset view of each split: labeled token -> number of occurrences
train_counter, dev_counter, test_counter = map(
    Counter, (train_tokens, dev_tokens, test_tokens)
)

# duplicate-aware intersections: `&` keeps the minimum count of every shared token
train_dev_overlap = train_counter & dev_counter
train_test_overlap = train_counter & test_counter
dev_test_overlap = dev_counter & test_counter
all_three_overlap = train_dev_overlap & test_counter

# expand each Counter back into a flat list of tokens (one entry per counted occurrence)
train_dev_overlap_list = [*train_dev_overlap.elements()]
train_test_overlap_list = [*train_test_overlap.elements()]
dev_test_overlap_list = [*dev_test_overlap.elements()]
all_three_overlap_list = [*all_three_overlap.elements()]


In [200]:
# pairwise and three-way overlaps of the unique labeled tokens
train_dev_overlap_set = train_token_set.intersection(dev_token_set)      # shared by train and dev
train_test_overlap_set = train_token_set.intersection(test_token_set)    # shared by train and test
dev_test_overlap_set = dev_token_set.intersection(test_token_set)        # shared by dev and test
all_three_overlap_set = train_token_set.intersection(dev_token_set, test_token_set)  # present in every split


In [201]:
# size of each duplicate-aware overlap
for pair_name, shared in (
    ("Train-Dev", train_dev_overlap_list),
    ("Train-Test", train_test_overlap_list),
    ("Dev-Test", dev_test_overlap_list),
    ("All three", all_three_overlap_list),
):
    print(f"{pair_name} overlap: {len(shared)} tokens")

Train-Dev overlap: 333 tokens
Train-Test overlap: 309 tokens
Dev-Test overlap: 105 tokens
All three overlap: 100 tokens


In [202]:
# size of each overlap when every token is counted once
for pair_name, shared_unique in (
    ("Train-Dev", train_dev_overlap_set),
    ("Train-Test", train_test_overlap_set),
    ("Dev-Test", dev_test_overlap_set),
    ("All three", all_three_overlap_set),
):
    print(f"{pair_name} overlap (unique): {len(shared_unique)} tokens")


Train-Dev overlap (unique): 256 tokens
Train-Test overlap (unique): 219 tokens
Dev-Test overlap (unique): 78 tokens
All three overlap (unique): 74 tokens


In [203]:
# every unique labeled token that occurs in at least two of the three splits
overlapping_tokens = train_dev_overlap_set | train_test_overlap_set | dev_test_overlap_set

len(overlapping_tokens)

405

In [204]:
# create function for removing sentences with overlapping tokens from train, dev, test
# and adding them to a new dataset "overlap_data"

def remove_overlapping_sentences(dataset, overlapping_tokens):
    '''
    Splits a dataset into sentences without and with overlapping labeled tokens.

    Only the tokens returned by `extract_labeled_tokens` (called with its default
    arguments) are compared against `overlapping_tokens`; tokens that function
    filters out never cause a sentence to be moved.

    Parameters:
        dataset (List[dict]): The token-tagged dataset (train/dev/test).
        overlapping_tokens (Set[str]): Set of overlapping tokens to filter out.

    Returns:
        Tuple[List[dict], List[dict]]:
            - cleaned_data: List of sentences with no labeled token in `overlapping_tokens`.
            - overlap_data: List of sentences with at least one labeled token in `overlapping_tokens`.
          Both lists keep the original sentence order.
    '''
    cleaned_data = []
    overlap_data = []

    for sentence in dataset:
        # extract_labeled_tokens expects a dataset, hence the one-sentence list
        labeled_tokens = extract_labeled_tokens([sentence])

        if any(token in overlapping_tokens for token in labeled_tokens):
            overlap_data.append(sentence)  # overlapping sentence
        else:
            cleaned_data.append(sentence)  # clean sentence

    return cleaned_data, overlap_data


In [205]:
# separate every split into its clean part and its overlapping part
(
    (train_data_cleaned, train_overlap),
    (dev_data_cleaned, dev_overlap),
    (test_data_cleaned, test_overlap),
) = (
    remove_overlapping_sentences(split, overlapping_tokens)
    for split in (train_data, dev_data, test_data)
)

# pool the overlapping sentences (train first, then dev, then test) for later re-assignment
overlap_data = [*train_overlap, *dev_overlap, *test_overlap]

for caption, sentences in (
    ("Train overlap", train_overlap),
    ("Dev overlap", dev_overlap),
    ("Test overlap", test_overlap),
    ("Total overlap collected", overlap_data),
):
    print(f"{caption}: {len(sentences)} sentences")



Train overlap: 957 sentences
Dev overlap: 186 sentences
Test overlap: 183 sentences
Total overlap collected: 1326 sentences


In [206]:
# verify the cleaned data: unique labeled (non-"O") tokens of each cleaned split
train_tokens_clean_set, dev_tokens_clean_set, test_tokens_clean_set = (
    set(extract_labeled_tokens(split))
    for split in (train_data_cleaned, dev_data_cleaned, test_data_cleaned)
)

In [207]:
# pairwise and three-way overlaps of the unique labeled tokens in the cleaned splits
train_dev_overlap_clean_set = train_tokens_clean_set.intersection(dev_tokens_clean_set)      # shared by train and dev
train_test_overlap_clean_set = train_tokens_clean_set.intersection(test_tokens_clean_set)    # shared by train and test
dev_test_overlap_clean_set = dev_tokens_clean_set.intersection(test_tokens_clean_set)        # shared by dev and test
all_three_overlap_clean_set = train_tokens_clean_set.intersection(
    dev_tokens_clean_set, test_tokens_clean_set
)  # present in every split


In [208]:
# the cleaned splits are expected to share no labeled tokens
for pair_name, shared_unique in (
    ("Train-Dev", train_dev_overlap_clean_set),
    ("Train-Test", train_test_overlap_clean_set),
    ("Dev-Test", dev_test_overlap_clean_set),
    ("All three", all_three_overlap_clean_set),
):
    print(f"{pair_name} overlap in cleaned data (unique): {len(shared_unique)} tokens")


Train-Dev overlap in cleaned data (unique): 0 tokens
Train-Test overlap in cleaned data (unique): 0 tokens
Dev-Test overlap in cleaned data (unique): 0 tokens
All three overlap in cleaned data (unique): 0 tokens


In [209]:
# number of sentences left in each split after the overlapping ones were removed
cleaned_sizes = {
    "Size of cleaned train set: ": len(train_data_cleaned),
    "Size of cleaned dev set: ": len(dev_data_cleaned),
    "Size of cleaned test set: ": len(test_data_cleaned),
}
for caption, n_sentences in cleaned_sizes.items():
    print(caption, n_sentences)

Size of cleaned train set:  3426
Size of cleaned dev set:  378
Size of cleaned test set:  382


#### Group overlap data by tokens
- So that all sentences containing, e.g., "Denmark" are kept together

In [210]:
import random

def split_overlap_data(overlap_data, split_ratio=(0.2, 0.4, 0.4), seed=42,
                       existing_split_tokens=None):
    '''
    Re-assigns overlap sentences to train/dev/test so that no labeled token is shared
    between splits.

    Sentences are visited in a seeded random order and placed first-fit: a sentence goes
    to the first of train, dev, test (tried in that order) for which none of its labeled
    tokens already occurs in either of the *other* two splits. Both tokens placed by this
    function and tokens listed in `existing_split_tokens` count as occupied. A sentence
    that fits no split is returned as unassigned.

    NOTE(review): `split_ratio` is accepted but never consulted. Because train is always
    tried first, sentences tend to accumulate in train whatever ratio is passed.
    Honouring the ratio needs a different placement strategy and may leave more
    sentences unassigned; decide before relying on this parameter.

    Parameters:
        overlap_data (List[dict]): Sentences to distribute.
        split_ratio (Tuple[float, float, float]): Currently unused (see note above).
        seed (int): Seed passed to `random.seed` before shuffling. This reseeds the
            module-level random generator.
        existing_split_tokens (dict or None): Maps 'train', 'dev', 'test' to the labeled
            tokens already present in that split. Missing keys are treated as empty;
            None means no pre-existing tokens.

    Returns:
        Tuple[List[dict], List[dict], List[dict], List[dict]]:
            The sentences assigned to train, dev and test, followed by the unassigned ones.
    '''
    split_names = ('train', 'dev', 'test')

    # seeded shuffle so the visiting order (and therefore the assignment) is reproducible
    random.seed(seed)
    shuffled = list(overlap_data)
    random.shuffle(shuffled)

    split_data = {name: [] for name in split_names}
    split_tokens = {name: set() for name in split_names}

    if existing_split_tokens is None:
        existing_split_tokens = {}

    def can_add(tokens, split_name):
        # a sentence may join `split_name` only if none of its tokens lives in another split
        for other in split_names:
            if other == split_name:
                continue
            already_there = existing_split_tokens.get(other, ())
            if any(token in split_tokens[other] or token in already_there for token in tokens):
                return False
        return True

    unassigned = []

    for sentence in shuffled:
        tokens = set(extract_labeled_tokens([sentence]))

        # first split, in train/dev/test order, that can take the sentence
        target = next((name for name in split_names if can_add(tokens, name)), None)
        if target is None:
            unassigned.append(sentence)  # none of the splits could take it
        else:
            split_data[target].append(sentence)
            split_tokens[target].update(tokens)

    return split_data['train'], split_data['dev'], split_data['test'], unassigned


In [211]:
# unique labeled tokens already present in each cleaned split
existing_split_tokens = {
    split_name: set(extract_labeled_tokens(split))
    for split_name, split in (
        ('train', train_data_cleaned),
        ('dev', dev_data_cleaned),
        ('test', test_data_cleaned),
    )
}

# distribute the pooled overlap sentences around those existing tokens
overlap_split_result = split_overlap_data(overlap_data, existing_split_tokens=existing_split_tokens)
train_overlap_clean, dev_overlap_clean, test_overlap_clean, unassigned = overlap_split_result


In [212]:
# final splits: the sentences that never overlapped plus the re-assigned overlap sentences
final_train = train_data_cleaned + train_overlap_clean
final_dev = dev_data_cleaned + dev_overlap_clean
final_test = test_data_cleaned + test_overlap_clean

# Sanity check for labeled token overlaps
train_tokens = set(extract_labeled_tokens(final_train))
dev_tokens = set(extract_labeled_tokens(final_dev))
test_tokens = set(extract_labeled_tokens(final_test))

print(f"overlap_train_dev: {len(train_tokens & dev_tokens)}")
print(f"overlap_train_test: {len(train_tokens & test_tokens)}")
print(f"overlap_dev_test: {len(dev_tokens & test_tokens)}")

# fail loudly instead of relying on someone reading the printed zeros
assert train_tokens.isdisjoint(dev_tokens), "labeled tokens shared between final train and dev"
assert train_tokens.isdisjoint(test_tokens), "labeled tokens shared between final train and test"
assert dev_tokens.isdisjoint(test_tokens), "labeled tokens shared between final dev and test"

print(f"Final train size: {len(final_train)}")
print(f"Final dev size: {len(final_dev)}")
print(f"Final test size: {len(final_test)}")
print("total: ", len(final_train) + len(final_dev) + len(final_test))

print(f"Unassigned sentences due to overlap constraints: {len(unassigned)}")

# every sentence must end up in exactly one final split or in `unassigned`
n_inputs = len(train_data_cleaned) + len(dev_data_cleaned) + len(test_data_cleaned) + len(overlap_data)
n_outputs = len(final_train) + len(final_dev) + len(final_test) + len(unassigned)
assert n_outputs == n_inputs, "sentences were lost or duplicated while re-splitting"

# labeled tokens of the sentences that could not be placed (extracted once, printed with their count)
unassigned_tokens = extract_labeled_tokens(unassigned)
print(unassigned_tokens)
print(len(unassigned_tokens))

print(len(train_tokens))
print(len(dev_tokens))
print(len(test_tokens))


overlap_train_dev: 0
overlap_train_test: 0
overlap_dev_test: 0
Final train size: 4708
Final dev size: 380
Final test size: 394
total:  5482
Unassigned sentences due to overlap constraints: 30
['Serbiens', 'Slobodan', 'Milosevic', 'USA', 'Rusland', 'Bosnien-Hercegovina', 'Serbiens', 'DSB', 'Ask', 'Urd', 'Henning', 'Camre', 'Jørgen', 'Leth', 'Det', 'perfekte', 'menneske', 'Det', 'gode', 'og', 'det', 'onde', 'Notater', 'om', 'kærligheden', 'Simon', 'Wiesenthal', 'Poul', 'Schlüter', 'IV', '.', 'Poul', 'Schlüters', 'Folketinget', 'Kamilla', 'Rekefjord', 'Norge', 'Rendsburg', 'Tyskland', 'Kirsten', 'Jacobsen', 'Landsforeningen', 'Ungbo', 'Ungbos', 'Ungbos', 'Torben', 'Lund', 'TV-2-Nyhederne', 'Ungbos', 'Urd', 'Århus', 'DSB', 'Danmarks', 'EFs', 'Peter', 'Duetoft', 'Keld', 'Krogh', 'Brian', 'Lentz', 'CBC', 'Subaru', 'Schweiz', 'Tyskland', 'Poul', 'Schlüter', 'Folketinget', 'Sverige', 'Danmark', 'Tyskland', 'DSBs', 'Urd', 'Århus', 'Havns', 'Hermes', 'Lone', 'Scherfig', 'Æterdrama', 'Hvor', 'er'

## Checking for overlap in new datasets

In [213]:
# NOTE: this import rebinds `extract_labeled_tokens`, replacing the function of the same
# name defined earlier in this notebook for every cell executed afterwards.
# NOTE(review): presumably the load_data version returns a set rather than a list —
# verify before relying on the result type.
from scripts.load_data import extract_labeled_tokens

In [214]:
# locations of the re-split ("new") DaN+ news files
path_train = "../data/da_news_new/new_da_news_train.tsv"
path_dev = "../data/da_news_new/new_da_news_dev.tsv"
path_test = "../data/da_news_new/new_da_news_test.tsv"

# the label <-> id mapping is created from the new training file
label2id, id2label = mapping(path_train)

# read all three new splits with that same mapping
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
)

In [215]:
# number of sentences in each of the new splits
new_split_sizes = {
    "train size:": len(train_data),
    "dev size:": len(dev_data),
    "test size:": len(test_data),
}
for caption, n_sentences in new_split_sizes.items():
    print(caption, n_sentences)

train size: 1745
dev size: 161
test size: 344


In [216]:
# unique labeled tokens of each new split; wrapping in set() keeps the `&` below valid
# whether extract_labeled_tokens returns a list or a set
train_tokens = set(extract_labeled_tokens(train_data))
dev_tokens = set(extract_labeled_tokens(dev_data))
test_tokens = set(extract_labeled_tokens(test_data))

# number of unique labeled tokens shared by each pair of splits
print(f"overlap_train_dev: {len(train_tokens & dev_tokens)}")
print(f"overlap_train_test: {len(train_tokens & test_tokens)}")
print(f"overlap_dev_test: {len(dev_tokens & test_tokens)}")

overlap_train_dev: 141
overlap_train_test: 188
overlap_dev_test: 54


In [217]:
# labeled tokens that occur in all three new splits
shared_by_all_splits = train_tokens & dev_tokens & test_tokens
len(shared_by_all_splits)

45