In [1]:
# imports
import sys
# extend the module search path with the parent directory before importing from
# `scripts` (presumably the package lives one level above this notebook — confirm)
sys.path.append("../")

from scripts.load_data import mapping, extract_labeled_tokens, read_tsv_file, write_tsv_file
from collections import defaultdict
from collections import Counter
import random

random.seed(20) # fixed seed so the random.shuffle calls further down are reproducible

## Load original datasets

In [2]:
# locations of the three original DaN+ news splits
path_news_train = "../data/da_news/da_news_train.tsv"
path_news_dev = "../data/da_news/da_news_dev.tsv"
path_news_test = "../data/da_news/da_news_test.tsv"

# build the label2id / id2label mappings from the training file
label2id, id2label = mapping(path_news_train)

# load every split (train, dev, test — in that order) with the shared label mapping
train_data_news, dev_data_news, test_data_news = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_news_train, path_news_dev, path_news_test)
)

In [3]:
# number of items in each of the original splits, plus their sum
n_train_orig, n_dev_orig, n_test_orig = (
    len(split) for split in (train_data_news, dev_data_news, test_data_news)
)
print("train size:", n_train_orig)
print("dev size:", n_dev_orig)
print("test size:", n_test_orig)
print("total dataset size:", n_train_orig + n_dev_orig + n_test_orig)

train size: 4383
dev size: 564
test size: 565
total dataset size: 5512


In [4]:
# concatenate the three original splits into one pool (train first, then dev, then test);
# the sentence IDs used further down are positions in this combined sequence
total_data = train_data_news + dev_data_news + test_data_news

In [5]:
# extract unique entities from the combined data
# (membership in this collection is tested when the entity <-> sentence maps are built)
total_entities = extract_labeled_tokens(total_data)

## Build mapping from entity to sentence and sentence to entity

In [6]:
# entity token -> set of IDs of the sentences in which it appears with a non-'O' tag
entity_to_sents = defaultdict(set)
# sentence ID -> set of entity tokens found in that sentence (the reverse lookup)
sent_to_entities = defaultdict(set)

for sent_id, sent in enumerate(total_data):
    for position, token in enumerate(sent["tokens"]):
        # ignore tokens that are not part of the extracted entity collection
        if token not in total_entities:
            continue
        # ignore occurrences that are tagged 'O' in this particular sentence
        if sent['ner_tags'][position] == 'O':
            continue
        entity_to_sents[token].add(sent_id)
        sent_to_entities[sent_id].add(token)

## Group sentences by overlapping entities

In [7]:
# Partition the entity-bearing sentences into groups such that two sentences end up
# in the same group whenever they are linked (directly or through a chain of other
# sentences) by a shared entity token.

visited = set()
sentence_groups = []

for start_id in sent_to_entities:
    # this sentence was already absorbed into an earlier group
    if start_id in visited:
        continue

    component = set()
    stack = [start_id]

    # depth-first walk: sentence -> its entities -> every sentence containing them
    while stack:
        node = stack.pop()
        if node not in visited:
            visited.add(node)
            component.add(node)
            for entity in sent_to_entities[node]:
                # already-visited neighbours are pushed too; they are skipped when popped
                stack.extend(entity_to_sents[entity])

    sentence_groups.append(component)

In [8]:
# Shuffle the groups, then hand them out to train / dev / test by cumulative
# sentence count (cut-offs at 70% and 85% of the grouped sentences).

random.shuffle(sentence_groups)

train_group, dev_group, test_group = [], [], []
total = sum(map(len, sentence_groups))
train_cutoff = int(total * 0.7)
dev_cutoff = int(total * 0.85)

count = 0
for group in sentence_groups:
    # The bucket is picked from the running count *before* this group is added,
    # so a group always lands in one split as a whole.
    if count < train_cutoff:
        bucket = train_group
    elif count < dev_cutoff:
        bucket = dev_group
    else:
        bucket = test_group

    bucket.extend(group)
    count += len(group)

## Add sentences with only 'O' tags

In [9]:
# sentence indices that the entity grouping has already assigned to a split
used = {*train_group, *dev_group, *test_group}

# the remaining sentences whose tags are all "O"
o_tagged = [
    idx
    for idx, sent in enumerate(total_data)
    if idx not in used and all(tag == "O" for tag in sent["ner_tags"])
]

random.shuffle(o_tagged)

# distribute them with the same 70% / 85% cut-offs used for the entity groups
n_o_tagged = len(o_tagged)
cut1 = int(n_o_tagged * 0.7)
cut2 = int(n_o_tagged * 0.85)

train_group.extend(o_tagged[:cut1])
dev_group.extend(o_tagged[cut1:cut2])
test_group.extend(o_tagged[cut2:])

In [10]:
# materialise each split from its sentence indices, in ascending index order
train_data, dev_data, test_data = (
    [total_data[i] for i in sorted(indices)]
    for indices in (train_group, dev_group, test_group)
)

## Check sizes and overlap

In [11]:
# number of items in each of the new splits, plus their sum
n_train_new, n_dev_new, n_test_new = (
    len(split) for split in (train_data, dev_data, test_data)
)
print("train size:", n_train_new)
print("dev size:", n_dev_new)
print("test size:", n_test_new)
print("total dataset size:", n_train_new + n_dev_new + n_test_new)

train size: 3896
dev size: 790
test size: 826
total dataset size: 5512


In [12]:
# labeled tokens of every new split
train_tokens, dev_tokens, test_tokens = (
    extract_labeled_tokens(split) for split in (train_data, dev_data, test_data)
)

# pairwise intersections; each should be empty if the regrouping worked
train_dev_overlap = train_tokens & dev_tokens
train_test_overlap = train_tokens & test_tokens
dev_test_overlap = dev_tokens & test_tokens

In [13]:
# report the size of every pairwise overlap between the new splits
for message, shared in (
    ('overlap between train and dev:', train_dev_overlap),
    ('overlap between dev and test:', dev_test_overlap),
    ('overlap between train and test:', train_test_overlap),
):
    print(message, len(shared))

overlap between train and dev: 0
overlap between dev and test: 0
overlap between train and test: 0


## Look at distribution of tokens

In [14]:
# Re-extract the labeled tokens of each split, this time as (token, tag) pairs.
# NOTE: this rebinds train_tokens / dev_tokens / test_tokens — from here on they
# hold pairs rather than plain tokens.
train_tokens, dev_tokens, test_tokens = (
    extract_labeled_tokens(split, include_label_pair=True)
    for split in (train_data, dev_data, test_data)
)

# number of extracted pairs per tag, for each split
train_distr, dev_distr, test_distr = (
    Counter(tag for _, tag in pairs)
    for pairs in (train_tokens, dev_tokens, test_tokens)
)

for distribution in (train_distr, dev_distr, test_distr):
    print(distribution)

Counter({'I-PER': 572, 'B-PER': 455, 'B-ORG': 438, 'B-LOC': 347, 'I-ORG': 323, 'I-MISC': 206, 'B-MISC': 166, 'I-LOC': 61})
Counter({'B-PER': 98, 'B-ORG': 80, 'B-LOC': 78, 'B-MISC': 54, 'I-PER': 37, 'I-MISC': 22, 'I-ORG': 14, 'I-LOC': 5})
Counter({'B-PER': 131, 'B-ORG': 110, 'B-LOC': 75, 'I-PER': 60, 'B-MISC': 56, 'I-ORG': 40, 'I-MISC': 19, 'I-LOC': 10})


In [15]:
def get_percentage_distribution(counter):
    """Convert absolute counts into percentage shares.

    Args:
        counter: mapping from a key (here: a tag) to its count.

    Returns:
        A dict with the same keys, where each value is that key's share of the
        summed counts, expressed in percent and rounded to two decimals.
    """
    total = sum(counter.values())
    percentages = {}
    for tag, count in counter.items():
        share = (count / total) * 100
        percentages[tag] = round(share, 2)
    return percentages

# percentage share of every tag, per split
train_percent, dev_percent, test_percent = map(
    get_percentage_distribution, (train_distr, dev_distr, test_distr)
)

# print each distribution under its own heading
for heading, percentages in (
    ("Train Percentage Distribution:", train_percent),
    ("\nDev Percentage Distribution:", dev_percent),
    ("\nTest Percentage Distribution:", test_percent),
):
    print(heading)
    print(percentages)

Train Percentage Distribution:
{'B-PER': 17.72, 'B-MISC': 6.46, 'I-PER': 22.27, 'B-ORG': 17.06, 'I-ORG': 12.58, 'B-LOC': 13.51, 'I-MISC': 8.02, 'I-LOC': 2.38}

Dev Percentage Distribution:
{'B-PER': 25.26, 'B-ORG': 20.62, 'I-PER': 9.54, 'B-LOC': 20.1, 'B-MISC': 13.92, 'I-MISC': 5.67, 'I-LOC': 1.29, 'I-ORG': 3.61}

Test Percentage Distribution:
{'B-PER': 26.15, 'B-ORG': 21.96, 'I-PER': 11.98, 'I-LOC': 2.0, 'B-LOC': 14.97, 'I-ORG': 7.98, 'B-MISC': 11.18, 'I-MISC': 3.79}


## Check overlap and label distribution in ME data

In [16]:
## checking for ME data ##

# path to the data files
path_me_dev = "../data/me_data/middle_eastern_dev.tsv"
path_me_test = "../data/me_data/middle_eastern_test.tsv"

# read in the data (same label mapping as the news data)
me_dev_data = read_tsv_file(path_me_dev, label2id)
me_test_data = read_tsv_file(path_me_test, label2id)

# extract the labeled tokens (plain tokens, no tag attached)
me_dev_tokens = extract_labeled_tokens(me_dev_data)
me_test_tokens = extract_labeled_tokens(me_test_data)

# Re-extract the plain labeled tokens of the new train split for this comparison.
# `train_tokens` cannot be used here: the tag-distribution cell rebinds it to
# (token, tag) pairs, and a pair never equals a plain token, so intersecting it with
# the ME token sets would report 0 overlap regardless of the data.
train_entity_tokens = extract_labeled_tokens(train_data)

# overlap between datasets
me_train_dev_overlap = train_entity_tokens & me_dev_tokens
me_train_test_overlap = train_entity_tokens & me_test_tokens
me_dev_test_overlap = me_dev_tokens & me_test_tokens

print('overlap between train and ME_dev:', len(me_train_dev_overlap))
print('overlap between train and ME_test:', len(me_train_test_overlap))
print('overlap between ME_dev and ME_test:', len(me_dev_test_overlap))

overlap between train and ME_dev: 0
overlap between train and ME_test: 0
overlap between ME_dev and ME_test: 0


In [17]:
# Re-extract the ME labeled tokens as (token, tag) pairs.
# NOTE: this rebinds me_dev_tokens / me_test_tokens to pairs rather than plain tokens.
me_dev_tokens, me_test_tokens = (
    extract_labeled_tokens(split, include_label_pair=True)
    for split in (me_dev_data, me_test_data)
)

# number of extracted pairs per tag, for each ME split
me_dev_distr, me_test_distr = (
    Counter(tag for _, tag in pairs) for pairs in (me_dev_tokens, me_test_tokens)
)

for distribution in (me_dev_distr, me_test_distr):
    print(distribution)

Counter({'B-PER': 128, 'B-ORG': 99, 'B-LOC': 96, 'B-MISC': 54, 'I-PER': 38, 'I-MISC': 22, 'I-ORG': 15, 'I-LOC': 6})
Counter({'B-PER': 168, 'B-ORG': 121, 'B-LOC': 91, 'I-PER': 72, 'B-MISC': 56, 'I-ORG': 40, 'I-MISC': 19, 'I-LOC': 11})


## Write to tsv files

In [18]:
# persist the new overlap-free splits, one TSV file per split
for split, destination in (
    (train_data, '../data/no_overlap_da_news/da_news_train.tsv'),
    (dev_data, '../data/no_overlap_da_news/da_news_dev.tsv'),
    (test_data, '../data/no_overlap_da_news/da_news_test.tsv'),
):
    write_tsv_file(split, destination)