In [1]:
from load_data import mapping, read_tsv_file, extract_labeled_tokens
from collections import defaultdict
import random

  from .autonotebook import tqdm as notebook_tqdm


# Reading in the data and Looking at overlapping tokens

In [2]:
# getting data 
# path to the data files
path_train = "../data/da_news_train.tsv"
path_dev = "../data/da_news_dev.tsv"
path_test = "../data/da_news_test.tsv"

# create mapping
label2id, id2label = mapping(path_train)

# read in the DaN+ data
train_data = read_tsv_file(path_train, label2id)
dev_data = read_tsv_file(path_dev, label2id)
test_data = read_tsv_file(path_test, label2id)

In [3]:
train_data

[{'tokens': ['På',
   'fredag',
   'har',
   'SID',
   'inviteret',
   'til',
   'reception',
   'i',
   'SID-huset',
   'i',
   'anledning',
   'af',
   'at',
   'formanden',
   'Kjeld',
   'Christensen',
   'går',
   'ind',
   'i',
   'de',
   'glade',
   'tressere',
   '.'],
  'ner_tags': ['O',
   'O',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-PER',
   'I-PER',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  'tag_ids': [8,
   8,
   8,
   2,
   8,
   8,
   8,
   8,
   8,
   8,
   8,
   8,
   8,
   8,
   1,
   3,
   8,
   8,
   8,
   8,
   8,
   8,
   8]},
 {'tokens': ['Eller',
   'slet',
   'og',
   'ret',
   'tykke',
   'og',
   'fede',
   'i',
   'mere',
   'eller',
   'mindre',
   'grad',
   ',',
   'som',
   'en',
   'enkelt',
   'udtrykker',
   'det',
   '.'],
  'ner_tags': ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',

In [4]:
print("train size:", len(train_data))
print("dev size:", len(dev_data))
print("test size:", len(test_data))
print("total dataset size:", len(train_data) + len(dev_data) + len(test_data))

train size: 4383
dev size: 564
test size: 565
total dataset size: 5512


In [5]:
# extract tokens with non-"O" labels from each split
train_tokens = extract_labeled_tokens(train_data)
dev_tokens = extract_labeled_tokens(dev_data)
test_tokens = extract_labeled_tokens(test_data)

In [6]:
# print out the number of tokens in each split
print(f"Unique tokens in train: {len(train_tokens)}")
print(f"Unique tokens in dev: {len(dev_tokens)}")
print(f"Unique tokens in test: {len(test_tokens)}")

Unique tokens in train: 2635
Unique tokens in dev: 470
Unique tokens in test: 513


In [7]:
# overlap between datasets
train_dev_overlap = train_tokens & dev_tokens
dev_test_overlap = dev_tokens & test_tokens
train_test_overlap = train_tokens & test_tokens

# union of all overlapping tokens
all_tokens_overlap = train_dev_overlap | dev_test_overlap | train_test_overlap

print('Number of unique overlapping tokens:', len(all_tokens_overlap))

Number of unique overlapping tokens: 405


# Make one great dataset 

In [8]:
total_data = train_data + dev_data + test_data

len(total_data)

5512

### Step 1: Extract entities from each sentence


In [9]:
def extract_entities(tokens, tags):
    entities = set()
    entity = []
    for token, tag in zip(tokens, tags):
        if tag.startswith("B-"):
            if entity:
                entities.add(" ".join(entity))
                entity = []
            entity = [token]
        elif tag.startswith("I-") and entity:
            entity.append(token)
        else:
            if entity:
                entities.add(" ".join(entity))
                entity = []
    if entity:
        entities.add(" ".join(entity))
    return entities

### Step 2: Build mapping from sentence to entities

In [10]:
entity_to_sents = defaultdict(list)

for idx, sentence in enumerate(total_data):
    entities = extract_entities(sentence["tokens"], sentence["ner_tags"])
    for ent in entities:
        entity_to_sents[ent].append(idx)  # store sentence indices


### Step 3: Shuffle and split entities

In [11]:
random.seed(42)

entities = list(entity_to_sents.keys())
random.shuffle(entities)

train_cutoff = int(0.65 * len(entities))
dev_cutoff = int(0.75 * len(entities))

train_entities = set(entities[:train_cutoff])
dev_entities = set(entities[train_cutoff:dev_cutoff])
test_entities = set(entities[dev_cutoff:])

### Step 4: Assign sentences to splits based on their entities

In [12]:
def get_sent_indices(entity_set):
    sent_ids = set()
    for ent in entity_set:
        sent_ids.update(entity_to_sents[ent])
    return sent_ids

train_ids = get_sent_indices(train_entities)
dev_ids = get_sent_indices(dev_entities) - train_ids
test_ids = get_sent_indices(test_entities) - train_ids - dev_ids

### Step 5: Build final splits


In [13]:
train_data = [total_data[i] for i in train_ids]
dev_data = [total_data[i] for i in dev_ids]
test_data = [total_data[i] for i in test_ids]

In [14]:
print(len(train_data))
print(len(dev_data))
print(len(test_data))

1745
161
344


In [15]:
print(train_data[0]['tokens'])
print(train_data[0]['ner_tags'])
print(train_data[0]['tag_ids'])

['På', 'fredag', 'har', 'SID', 'inviteret', 'til', 'reception', 'i', 'SID-huset', 'i', 'anledning', 'af', 'at', 'formanden', 'Kjeld', 'Christensen', 'går', 'ind', 'i', 'de', 'glade', 'tressere', '.']
['O', 'O', 'O', 'B-ORG', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PER', 'I-PER', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
[8, 8, 8, 2, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 1, 3, 8, 8, 8, 8, 8, 8, 8]


### Writing to tsv file

In [16]:
def write_tsv_file(data, path):
    '''
    Writes a list of sentence dictionaries (with 'tokens' and 'ner_tags') to a TSV file.
    Each token-label pair is written on its own line, separated by a tab.
    Sentences are separated by empty lines.

    Parameters:
        data (List[dict]): List of sentence dictionaries.
        path (str): Path to write the TSV file to.
    '''
    with open(path, 'w', encoding='utf-8') as f:
        for sentence in data:
            tokens = sentence['tokens']
            ner_tags = sentence['ner_tags']
            for token, tag in zip(tokens, ner_tags):
                f.write(f"{token}\t{tag}\n")
            f.write("\n")  # sentence separator


In [17]:
write_tsv_file(train_data, '../new_data/new_da_news_train.tsv')
write_tsv_file(dev_data, '../new_data/new_da_news_dev.tsv')
write_tsv_file(test_data, '../new_data/new_da_news_test.tsv')