In [56]:
# imports
import random
import copy
import sys
sys.path.append("../")

from scripts.load_data import write_tsv_file, extract_labeled_tokens, mapping, read_tsv_file, write_iob2_file
from middle_eastern_ne import extract_first_names, get_last_names,  load_location, load_organisation

random.seed(42)


## Get Middle Eastern (ME) entities

In [57]:
# Replacement pools used by the augmentation below:
#   ME_BPER - candidates substituted for tokens tagged B-PER (read from the first-name PDF)
#   ME_IPER - candidates substituted for tokens tagged I-PER (built from the two last-name files)
#   ME_LOC / ME_ORG - lists of records with a "tokens" list, substituted for whole LOC / ORG spans
ME_BPER = extract_first_names("../data_aug_sources/Ordbog_over_muslimske_fornavne_i_DK.pdf")
ME_IPER = get_last_names("../data_aug_sources/middle_eastern_last_names.txt", "../data_aug_sources/KDBGIVE.tsv")
ME_LOC = load_location("../data_aug_sources/the-middle-east-cities.csv")
ME_ORG = load_organisation("../data_aug_sources/middle_eastern_organisations.csv")

In [58]:
# peek at the first organisation record to see the structure the loader produces
ME_ORG[0]

{'tokens': ['Saudi', 'Aramco'], 'ner_tags': ['B-ORG', 'I-ORG']}

## Read in data sets

In [59]:
# locations of the three DaN+ splits (train / dev / test)
path_train = "../data/no_overlap_da_news/da_news_train.tsv"
path_dev = "../data/no_overlap_da_news/da_news_dev.tsv"
path_test = "../data/no_overlap_da_news/da_news_test.tsv"

# label <-> id lookup tables, derived from the training file
label2id, id2label = mapping(path_train)

# load every split with the shared label mapping (evaluated in order: train, dev, test)
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
)

## Replace entities in dev and test set

In [60]:
# Collect the labeled tokens of the training split. data_aug_replace uses this as an
# exclusion list so that no replacement entity also occurs in the training data.
# NOTE(review): the container type returned by extract_labeled_tokens is not visible here;
# data_aug_replace tests plain strings (PER) and token tuples (LOC/ORG) against it - confirm
# that both kinds of lookup can actually match.
train_tokens = extract_labeled_tokens(train_data)

In [61]:
# Running record of every replacement already handed out. It is passed into and returned
# from data_aug_replace so that dev and test never receive the same replacement.
# Entries are plain strings for PER tokens and tuples of tokens for LOC/ORG spans.
used_entities = set()

In [62]:
def _replace_person_token(tokens, index, candidates, used, train_tokens):
    """Overwrite ``tokens[index]`` with a random, not-yet-used name from ``candidates``.

    A candidate is eligible when it is neither in ``used`` nor in ``train_tokens``.
    When nothing is eligible the original token is left untouched. The chosen name
    is added to ``used`` in place.
    """
    available = [name for name in candidates if name not in used and name not in train_tokens]
    if available:
        replacement = random.choice(available)
        tokens[index] = replacement
        used.add(replacement)


def _replace_entity_span(tokens, ner_tags, start, inside_tag, candidates, used, train_tokens):
    """Replace the entity span beginning at ``start`` and return the index just past it.

    The span is the token at ``start`` plus every directly following token tagged
    ``inside_tag``. Only candidates whose "tokens" list has exactly the span's length
    are considered, so the sentence length and its tag sequence stay aligned.
    Candidates are compared (as tuples of tokens) against ``used`` and ``train_tokens``;
    the chosen tuple is added to ``used`` in place. When nothing is eligible the
    original span is kept.
    """
    end = start + 1
    while end < len(ner_tags) and ner_tags[end] == inside_tag:
        end += 1
    span_len = end - start

    available = [
        cand for cand in candidates
        if len(cand["tokens"]) == span_len
        and tuple(cand["tokens"]) not in used
        and tuple(cand["tokens"]) not in train_tokens
    ]
    if available:
        replacement = random.choice(available)
        tokens[start:end] = replacement["tokens"]
        used.add(tuple(replacement["tokens"]))
    return end


def data_aug_replace(dataset, sentence_amount, ME_LOC = ME_LOC, ME_ORG = ME_ORG,
                     ME_BPER = ME_BPER, ME_IPER = ME_IPER, used_entities = None, train_tokens=train_tokens):
    """
    Replaces named entities in a random subset of the dataset with new MENAPT ones.

    Parameters
    ----------
    dataset : list of sentence dicts, each with parallel "tokens" and "ner_tags" lists.
    sentence_amount : maximum number of sentences to augment; capped at the number of
        eligible sentences (those with at least one tag other than O / B-MISC / I-MISC).
    ME_LOC, ME_ORG : candidate records with a "tokens" list, used for LOC / ORG spans.
    ME_BPER, ME_IPER : candidate strings used for tokens tagged B-PER / I-PER.
    used_entities : iterable of replacements that must not be reused (strings for PER,
        token tuples for LOC/ORG). ``None`` means nothing has been used yet.
    train_tokens : container of training-set tokens that must not be used as replacements.
        NOTE(review): LOC/ORG candidates are looked up as tuples - confirm that
        train_tokens stores entities in a form those tuples can match.

    Returns
    -------
    (modified_dataset, local_used)
        modified_dataset : independent copy of ``dataset`` with the selected sentences
            augmented; the input dataset is not modified.
        local_used : new set holding ``used_entities`` plus every replacement made here.

    Sentences are sampled with the global ``random`` state, so results are deterministic
    for a fixed seed and call order.
    """
    # used_entities defaults to None; set(None) would raise a TypeError
    local_used = set(used_entities) if used_entities is not None else set()

    # positions of the sentences that contain at least one relevant tag
    eligible_indices = [
        idx for idx, sent in enumerate(dataset)
        if any(tag not in ("O", "B-MISC", "I-MISC") for tag in sent["ner_tags"])
    ]
    # Sample positions rather than sentence dicts: identity stays unambiguous even when the
    # dataset contains duplicate sentences, and no per-sentence dict-equality scan is needed.
    selected_indices = random.sample(eligible_indices, min(sentence_amount, len(eligible_indices)))

    # Deep copy: the token lists are edited in place below, and a shallow dict copy would
    # share those lists with (and therefore silently modify) the caller's dataset.
    modified_dataset = [copy.deepcopy(dict(sent)) for sent in dataset]

    # walk the selected sentences in dataset order so replacements are drawn in a stable order
    for idx in sorted(selected_indices):
        sent = modified_dataset[idx]
        tokens, tags = sent["tokens"], sent["ner_tags"]

        i = 0
        while i < len(tokens):
            tag = tags[i]

            if tag == 'B-PER':
                _replace_person_token(tokens, i, ME_BPER, local_used, train_tokens)
                i += 1
            elif tag == 'I-PER':
                _replace_person_token(tokens, i, ME_IPER, local_used, train_tokens)
                i += 1
            elif tag == 'B-LOC':
                i = _replace_entity_span(tokens, tags, i, "I-LOC", ME_LOC, local_used, train_tokens)
            elif tag == 'B-ORG':
                i = _replace_entity_span(tokens, tags, i, "I-ORG", ME_ORG, local_used, train_tokens)
            else:
                i += 1

    return modified_dataset, local_used


In [63]:
# sanity check: expected to still be empty here, before any augmentation has run
used_entities

set()

In [64]:
# Augment dev first, then test. The set returned by the dev call is fed into the test call,
# so a replacement already placed in dev is excluded when augmenting test.
ME_dev, used_entities = data_aug_replace(dev_data, used_entities=used_entities, sentence_amount=1000)
# printed explicitly: a bare expression in the middle of a cell is evaluated but never displayed
print(len(used_entities))

ME_test, used_entities = data_aug_replace(test_data, used_entities=used_entities, sentence_amount=1000)
len(used_entities)

589

In [65]:
# Dump everything consumed so far: plain strings are PER tokens, tuples are LOC/ORG token spans.
print(used_entities)

{'Alairaymiah', ('Bush',), 'Khatib', 'Afaf', ('Ashmun',), 'Shoaib', 'Usama', 'Mahmud', 'Khan', 'Al rifai', 'Naeem', 'Kazem', 'Sania', 'Zain', ('MarsaMaroc',), ('Bishah',), ('Al', 'Nabooda', 'Automobiles'), ('Libyana',), ('Muscat',), 'Alairayki', ('Palæstina',), 'Saja', ('Koutoubia',), ('Rafah',), ('SAMI',), 'Isra', ('Al-Nahar',), 'Maryam', ('Shihan',), 'Adeel', ('Gulf', 'Madhyamam'), ('Assadissa',), 'Nazir', 'Rana', ('Yazd',), 'Omer', ('Konya',), ('Careem',), ('Ceyhan',), ('Investcorp',), 'Randa', ('Izmit',), 'Umar', 'Zuhra', 'Nasser', 'Abdel abas', 'Sultan', 'Najiba', ('Epilert',), ('Sonalgaz',), ('Hawaï',), 'Mahmood.', 'Fahima', 'Lama', ('Apparel', 'Group'), ('Mubadala',), 'Syed', 'Sami', ('Damac',), 'Nahid', 'Hamad', ('SOMED',), ('Isfahan',), ('Oilibya',), ('Edirne',), 'Assia', ('Sultangazi',), ('Najaf',), 'Mazen', ('Osmaniye',), 'Waseem', 'Hamza', ('Dahuk',), 'Yusra', 'Mazhar', 'Al khaili', 'Salam', 'Altinaymi', 'Sohayr', 'Asima', ('Dnata',), 'Muhsin', ('Nablus',), 'Al hosny', 'Ali

In [66]:
# B-PER pool: candidates that were not handed out during augmentation
BPER_left = [name for name in ME_BPER if name not in used_entities]
# total pool size first, then what remains
print(len(ME_BPER))
print(len(BPER_left))

735
517


In [67]:
# I-PER pool: candidates that were not handed out during augmentation
IPER_left = [name for name in ME_IPER if name not in used_entities]
# total pool size first, then what remains
print(len(ME_IPER))
print(len(IPER_left))

1580
1446


In [68]:
# LOC pool: records whose token tuple was not handed out during augmentation
LOC_left = [entry for entry in ME_LOC if tuple(entry['tokens']) not in used_entities]
# total pool size first, then what remains
print(len(ME_LOC))
print(len(LOC_left))

481
352


In [69]:
# ORG pool: records whose token tuple was not handed out during augmentation
ORG_left = [entry for entry in ME_ORG if tuple(entry['tokens']) not in used_entities]
# total pool size first, then what remains
print(len(ME_ORG))
print(len(ORG_left))

427
260


In [70]:
#for sent in ME_dev: 
#    print(sent)

In [71]:
#for sent in ME_test: 
#    print(sent)

In [72]:
# save as tsv files
write_tsv_file(ME_dev, "../data/me_data/middle_eastern_dev.tsv")
write_tsv_file(ME_test, "../data/me_data/middle_eastern_test.tsv")

In [73]:
# also export the augmented test split in .iob2 form
# NOTE(review): gold=True presumably writes the gold labels - confirm in scripts.load_data
write_iob2_file(ME_test, path="../data/me_data/middle_eastern_test.iob2", gold=True)