In [516]:
# imports
import random
import copy
import sys
sys.path.append("../")

from scripts.load_data import write_tsv_file, extract_labeled_tokens, mapping, read_tsv_file, write_iob2_file
from middle_eastern_ne import extract_first_names, get_last_names,  load_location, load_organisation


## Get ME entities

In [517]:
ME_BPER = extract_first_names("../data_aug_sources/Ordbog_over_muslimske_fornavne_i_DK.pdf")
ME_IPER = get_last_names("../data_aug_sources/middle_eastern_last_names.txt", "../data_aug_sources/KDBGIVE.tsv")
ME_LOC = load_location("../data_aug_sources/the-middle-east-cities.csv")
ME_ORG = load_organisation("../data_aug_sources/middle_eastern_organisations.csv")

In [518]:
ME_ORG[0]

{'tokens': ['Saudi', 'Aramco'], 'ner_tags': ['B-ORG', 'I-ORG']}

## Read in data sets

In [519]:
# path to the data files
path_train = "../data/no_overlap_da_news/da_news_train.tsv"
path_dev = "../data/no_overlap_da_news/da_news_dev.tsv"
path_test = "../data/no_overlap_da_news/da_news_test.tsv"

# create mapping
label2id, id2label = mapping(path_train)

# read in the DaN+ data
train_data = read_tsv_file(path_train, label2id)
dev_data = read_tsv_file(path_dev, label2id)
test_data = read_tsv_file(path_test, label2id)

## Replace entities in dev and test set

In [520]:
# extracting all tokens in train data - to ensure no overlap later
train_tokens = extract_labeled_tokens(train_data)

In [521]:
# for saving all used entities
used_entities = set()

In [522]:
def data_aug_replace(dataset, sentence_amount, ME_LOC = ME_LOC, ME_ORG = ME_ORG,
                     ME_BPER = ME_BPER, ME_IPER = ME_IPER, used_entities = used_entities, train_tokens=train_tokens):
    """
    Replaces named entities in a subset of the dataset with new MENAPT ones, ensuring no reuse across datasets.
    """

    random.seed(42)

    # extract sentences with containing relevant tags
    eligible_sentences = [sent for sent in dataset if any(tag not in ["O", "B-MISC", "I-MISC"] for tag in sent["ner_tags"])]
    # select random sentences
    selected_sentences = random.sample(eligible_sentences, min(sentence_amount, len(eligible_sentences)))
    # create copy to not modify original dataset 
    modified_dataset = [dict(sent) for sent in dataset] 

    for sent in modified_dataset:
        if sent not in selected_sentences:
            continue

        i = 0
        while i < len(sent["tokens"]):
            tag = sent["ner_tags"][i]

            if tag == 'B-PER':
                available = [p for p in ME_BPER if p not in used_entities and p not in train_tokens]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    used_entities.add(replace)
                i += 1

            elif tag == 'I-PER':
                available = [p for p in ME_IPER if p not in used_entities and p not in train_tokens]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    used_entities.add(replace)
                i += 1

            elif tag == 'B-LOC':
                span_start = i
                span_len = 1

                i += 1

                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-LOC":
                    span_len += 1
                    i += 1

                available = [
                    loc for loc in ME_LOC
                    if not any(token in train_tokens for token in loc["tokens"])
                    and not any(token in used_entities for token in loc["tokens"])
                    and len(loc["tokens"]) == span_len
                ]
                
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    used_entities.update(replace["tokens"])

            elif tag == 'B-ORG':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-ORG":
                    span_len += 1
                    i += 1

                available = [
                    org for org in ME_ORG
                    if not any(token in train_tokens for token in org["tokens"])
                    and not any(token in used_entities for token in org["tokens"])
                    and len(org["tokens"]) == span_len
                ]

                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    used_entities.update(replace["tokens"])

            else:
                i += 1

    return modified_dataset


In [523]:
def data_aug_replace(dataset, sentence_amount, ME_LOC = ME_LOC, ME_ORG = ME_ORG,
                     ME_BPER = ME_BPER, ME_IPER = ME_IPER, used_entities = None, train_tokens=train_tokens):
    """
    Replaces named entities in a subset of the dataset with new MENAPT ones, ensuring:
    - No reused tokens across datasets
    - No tokens from train set
    - Deterministic behavior
    - Returns updated used_entities (flat set of tokens)
    """
    random.seed(42)
    local_used = set(used_entities)
    modified_dataset = [dict(sent) for sent in dataset]

    eligible_sentences = [
        sent for sent in modified_dataset
        if any(tag not in ["O", "B-MISC", "I-MISC"] for tag in sent["ner_tags"])
    ]
    selected_sentences = random.sample(eligible_sentences, min(sentence_amount, len(eligible_sentences)))

    for sent in modified_dataset:
        if sent not in selected_sentences:
            continue

        i = 0
        while i < len(sent["tokens"]):
            tag = sent["ner_tags"][i]

            if tag == 'B-PER':
                available = [p for p in ME_BPER if p not in local_used and p not in train_tokens]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    local_used.add(replace)
                i += 1

            elif tag == 'I-PER':
                available = [p for p in ME_IPER if p not in local_used and p not in train_tokens]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    local_used.add(replace)
                i += 1

            elif tag == 'B-LOC':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-LOC":
                    span_len += 1
                    i += 1

                available = [
                    loc for loc in ME_LOC
                    if len(loc["tokens"]) == span_len and
                    all(tok not in train_tokens and tok not in local_used for tok in loc["tokens"])
                ]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    local_used.update(replace["tokens"])

            elif tag == 'B-ORG':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-ORG":
                    span_len += 1
                    i += 1

                available = [
                    org for org in ME_ORG
                    if len(org["tokens"]) == span_len and
                    all(tok not in train_tokens and tok not in local_used for tok in org["tokens"])
                ]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    local_used.update(replace["tokens"])

            else:
                i += 1

    return modified_dataset, local_used


In [524]:
def data_aug_replace(dataset, sentence_amount, ME_LOC = ME_LOC, ME_ORG = ME_ORG,
                     ME_BPER = ME_BPER, ME_IPER = ME_IPER, used_entities = None, train_tokens=train_tokens):
    """
    Replaces named entities in a subset of the dataset with new MENAPT ones, ensuring:
    - No reused tokens across datasets
    - No tokens from train set
    - Deterministic behavior
    - Returns updated used_entities (flat set of tokens)
    """
    random.seed(42)
    local_used = set(used_entities)
    modified_dataset = [dict(sent) for sent in dataset]

    eligible_sentences = [
        sent for sent in modified_dataset
        if any(tag not in ["O", "B-MISC", "I-MISC"] for tag in sent["ner_tags"])
    ]
    selected_sentences = random.sample(eligible_sentences, min(sentence_amount, len(eligible_sentences)))

    for sent in modified_dataset:
        if sent not in selected_sentences:
            continue

        i = 0
        while i < len(sent["tokens"]):
            tag = sent["ner_tags"][i]

            if tag == 'B-PER':
                available = [p for p in ME_BPER if p not in local_used and p not in train_tokens]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    local_used.add(replace)
                i += 1

            elif tag == 'I-PER':
                available = [p for p in ME_IPER if p not in local_used and p not in train_tokens]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    local_used.add(replace)
                i += 1

            elif tag == 'B-LOC':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-LOC":
                    span_len += 1
                    i += 1

                available = [
                    loc for loc in ME_LOC
                    if len(loc["tokens"]) == span_len and
                    all(tok not in train_tokens and tok not in local_used for tok in loc["tokens"])
                ]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    local_used.add(tuple(replace["tokens"]))

            elif tag == 'B-ORG':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-ORG":
                    span_len += 1
                    i += 1

                available = [
                    org for org in ME_ORG
                    if len(org["tokens"]) == span_len and
                    all(tok not in train_tokens and tok not in local_used for tok in org["tokens"])
                ]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    local_used.add(tuple(replace["tokens"]))

            else:
                i += 1

    return modified_dataset, local_used


In [525]:
used_entities

set()

In [526]:
ME_dev, used_entities = data_aug_replace(dev_data, used_entities=used_entities, sentence_amount=1000)
len(used_entities)

268

In [527]:
ME_test, used_entities = data_aug_replace(test_data, used_entities=used_entities, sentence_amount=1000)
len(used_entities)

531

In [528]:
print(used_entities)

{('Zeytinburnu',), ('Wallyscar',), ('Aramco',), ('Deir', 'ez-Zor'), ('Al', 'Jubayl'), 'Bin', ('Ashmun',), 'Hamoud', ('Spinneys',), 'Bushra', ('Munuf',), 'Jalal', ('Patnos',), ('Sulaymaniyah',), ('Housh', 'Eissa'), 'Sabri', ('OSN',), 'El kutby', 'Kerem', 'Farhan', 'Saber', 'Sawahah', 'Roya', ('Ödemiş',), 'Mawmad', ('Abadan',), 'Sabiha', ('Aden',), 'Muhamed', 'Jasmin', 'Haydar', 'Alyami', 'Yousif', 'Arzu', ('Al-Thawra',), 'Alaidhayqi', 'Sudany', 'Anas', 'Helmy', ('Somaca',), ('Kars',), 'Nazli', ('Gemlik',), ('Biougnach',), 'Asaad', 'Kazem', ('Yozgat',), ('Ahlibank',), 'Laila', 'Asma', 'Zarifa', 'Issam', 'Fadila', ('Al', 'Sharq'), ('Emaar', 'Properties'), ('Banque', 'Saudi', 'Fransi'), 'Kadir', ('Koutoubia',), 'Naveed', ('Aswan',), 'Samia', ('Ismailia',), 'Yusuf', 'Zaina', ('Izmit',), ('Edirne',), ('Antalya',), ('Dubai', 'Investments'), 'Muhamad', 'Nadiya', ('Ramadi',), ('Solfeh',), ('al-Balad',), ('Argaam',), 'Fuqaha', 'Sheikh', 'Yusra', 'Ayah', 'Meriam', 'Mouna', 'Muhsin', ('Sorgun',), 

In [529]:
print(len(ME_BPER))
BPER_left = [item for item in ME_BPER if item not in used_entities]
print(len(BPER_left))

735
517


In [530]:
print(len(ME_IPER))
IPER_left = [item for item in ME_IPER if item not in used_entities]
print(len(IPER_left))

1580
1449


In [531]:
print(len(ME_LOC))
LOC_left = [d for d in ME_LOC if tuple(d['tokens']) not in used_entities]
print(len(LOC_left))

481
369


In [532]:
print(len(ME_ORG))
ORG_left = [d for d in ME_ORG if tuple(d['tokens']) not in used_entities]
print(len(ORG_left))

427
302


In [533]:
#for sent in ME_dev: 
#    print(sent)

In [534]:
#for sent in ME_test: 
#    print(sent)

In [535]:
# save as tsv files
write_tsv_file(ME_dev, "../data/me_data/middle_eastern_dev.tsv")
write_tsv_file(ME_test, "../data/me_data/middle_eastern_test.tsv")

In [536]:
write_iob2_file(ME_test, path="../data/me_data/middle_eastern_test.iob2", gold=True)