In [None]:
# imports
import random
import sys
sys.path.append("../")

from scripts.load_data import write_tsv_file, extract_labeled_tokens, mapping, read_tsv_file
from middle_eastern_ne import extract_first_names, get_last_names,  load_location, load_organisation

# Seed Python's global RNG so sampling in this notebook is reproducible.
# Note: data_aug_replace (defined below) calls random.seed(42) again on every call.
random.seed(42)

## Get Middle Eastern (ME) entities

In [26]:
# Load the candidate replacement entities from the files under ../data_aug_sources.
# The loaders come from the project module middle_eastern_ne (not shown here).
# NOTE(review): element shapes below are inferred from how data_aug_replace uses
# these pools and from the ME_ORG[0] output further down - confirm against the loaders.
# ME_BPER: candidates substituted for B-PER tokens (first names).
ME_BPER = extract_first_names("../data_aug_sources/Ordbog_over_muslimske_fornavne_i_DK.pdf")
# ME_IPER: candidates substituted for I-PER tokens (last names).
ME_IPER = get_last_names("../data_aug_sources/middle_eastern_last_names.txt", "../data_aug_sources/KDBGIVE.tsv")
# ME_LOC: location candidates; data_aug_replace reads each entry's "tokens" list.
ME_LOC = load_location("../data_aug_sources/the-middle-east-cities.csv")
# ME_ORG: organisation candidates; entries look like {'tokens': [...], 'ner_tags': [...]}.
ME_ORG = load_organisation("../data_aug_sources/middle_eastern_organisations.csv")

In [27]:
# Peek at the first organisation entry to check the structure of the loaded pool.
ME_ORG[0]

{'tokens': ['Saudi', 'Aramco'], 'ner_tags': ['B-ORG', 'I-ORG']}

## Read in data sets

In [28]:
# Locations of the three DaN+ splits (train / dev / test).
path_train, path_dev, path_test = (
    "../data/no_overlap_da_news/da_news_train.tsv",
    "../data/no_overlap_da_news/da_news_dev.tsv",
    "../data/no_overlap_da_news/da_news_test.tsv",
)

# The label <-> id lookups are derived from the training split only.
label2id, id2label = mapping(path_train)

# Parse every split with the shared label mapping (train first, then dev, then test).
train_data, dev_data, test_data = [
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
]

## Replace entities in dev and test sets

In [29]:
# extracting all tokens in train data - to ensure no overlap later
# (data_aug_replace rejects any candidate whose token is found in train_tokens).
# NOTE(review): the helper name suggests only labeled (entity) tokens are collected,
# not every token - confirm against scripts.load_data.extract_labeled_tokens.
train_tokens = extract_labeled_tokens(train_data)

In [30]:
# for saving all used entities
# This set is the default `used_entities` argument of data_aug_replace and is
# mutated in place by it, so with the default argument a replacement token is
# used at most once across all calls.
# Gotcha: the default is bound when the `def` cell runs. Re-running only this cell
# creates a new set that the function does NOT see; re-run the `def` cell as well
# to start from an empty pool.
used_entities = set()

In [31]:
def _copy_sentence(sent):
    """Return a copy of ``sent`` whose list-valued fields are fresh list objects."""
    return {
        key: list(value) if isinstance(value, list) else value
        for key, value in dict(sent).items()
    }


def _replace_single_token(sent, idx, candidates, used_entities, train_tokens):
    """Swap the token at ``idx`` for a random unused candidate, if any is left."""
    available = [p for p in candidates if p not in used_entities and p not in train_tokens]
    if available:
        replace = random.choice(available)
        sent["tokens"][idx] = replace
        used_entities.add(replace)


def _replace_span(sent, start, inside_tag, candidates, used_entities, train_tokens):
    """Swap the entity span beginning at ``start`` for an unused candidate of equal length.

    The span is the token at ``start`` plus every directly following token tagged
    ``inside_tag``. Returns the index of the first token after the span.
    """
    tags = sent["ner_tags"]
    end = start + 1
    while end < len(tags) and tags[end] == inside_tag:
        end += 1
    span_len = end - start

    # Only candidates with the same token count are usable, so the sentence length
    # (and therefore the alignment with ner_tags) never changes.
    available = [
        cand for cand in candidates
        if len(cand["tokens"]) == span_len
        and not any(token in train_tokens for token in cand["tokens"])
        and not any(token in used_entities for token in cand["tokens"])
    ]

    if available:
        replace = random.choice(available)
        sent["tokens"][start:end] = replace["tokens"]
        used_entities.update(replace["tokens"])

    return end


def data_aug_replace(dataset, sentence_amount, ME_LOC = ME_LOC, ME_ORG = ME_ORG,
                     ME_BPER = ME_BPER, ME_IPER = ME_IPER, used_entities = used_entities, train_tokens=train_tokens):
    """
    Replace named entities in a random subset of sentences with candidates from the ME_* pools.

    A sentence is eligible when at least one of its tags is something other than
    "O", "B-MISC" or "I-MISC". Up to ``sentence_amount`` eligible sentences are
    sampled (after reseeding the global RNG with 42) and processed in dataset order:

    * every B-PER token is replaced by a random entry of ``ME_BPER``;
    * every I-PER token is replaced by a random entry of ``ME_IPER``
      (each person token is replaced independently of its neighbours);
    * every LOC / ORG span (B- tag plus following I- tags) is replaced by a random
      ``ME_LOC`` / ``ME_ORG`` entry with the same number of tokens.

    A candidate is only used if none of its tokens is in ``train_tokens`` or in
    ``used_entities``. When no candidate qualifies, the original tokens are kept
    silently. Tags are never changed.

    Args:
        dataset: sequence of sentence mappings with at least "tokens" and
            "ner_tags" lists of equal length.
        sentence_amount: maximum number of eligible sentences to modify.
        ME_LOC, ME_ORG: candidate entries, each exposing a "tokens" list.
        ME_BPER, ME_IPER: candidate replacement tokens for person names.
        used_entities: set of tokens already handed out. It is updated in place;
            the default is shared between calls so that entities are not reused
            across datasets.
        train_tokens: container of tokens that must not be introduced.

    Returns:
        A new list of sentence dicts. The input ``dataset`` and its token lists
        are left untouched.
    """

    # Reseed on every call so the sampled subset is reproducible per dataset.
    random.seed(42)

    # Positions of the sentences that contain at least one relevant tag.
    eligible_indices = [
        idx for idx, sent in enumerate(dataset)
        if any(tag not in ("O", "B-MISC", "I-MISC") for tag in sent["ner_tags"])
    ]
    # Sample positions rather than sentence dicts: membership by dict equality would
    # also pick up every duplicate of a sampled sentence and exceed sentence_amount.
    selected_indices = set(
        random.sample(eligible_indices, min(sentence_amount, len(eligible_indices)))
    )

    # Copy the token/tag lists too; a plain dict(sent) would share them with the
    # input and the in-place replacements below would alter the original dataset.
    modified_dataset = [_copy_sentence(sent) for sent in dataset]

    # Walk the selected sentences in dataset order so the RNG is consumed deterministically.
    for idx in sorted(selected_indices):
        sent = modified_dataset[idx]

        i = 0
        while i < len(sent["tokens"]):
            tag = sent["ner_tags"][i]

            if tag == 'B-PER':
                _replace_single_token(sent, i, ME_BPER, used_entities, train_tokens)
                i += 1
            elif tag == 'I-PER':
                _replace_single_token(sent, i, ME_IPER, used_entities, train_tokens)
                i += 1
            elif tag == 'B-LOC':
                i = _replace_span(sent, i, "I-LOC", ME_LOC, used_entities, train_tokens)
            elif tag == 'B-ORG':
                i = _replace_span(sent, i, "I-ORG", ME_ORG, used_entities, train_tokens)
            else:
                i += 1

    return modified_dataset


In [32]:
# Augment up to 1000 eligible sentences per split.
# Order matters: the dev call runs first and records its replacements in the shared
# used_entities set, so the test call cannot pick the same replacement tokens.
ME_dev = data_aug_replace(dev_data, 1000)
ME_test = data_aug_replace(test_data, 1000)

In [33]:
# Show every replacement token recorded in used_entities so far
# (filled by the dev and test augmentation calls above).
print(used_entities)

{'Saeed', 'Dahlia', 'Talabat', 'Ismail', 'al-Yaoume', 'Kashif', 'Nur', 'Yassine', 'Adhree', 'Alairayri', 'Amna', 'Eco-Médias', 'Yasmine', 'Al-Kalima', 'Salihli', 'Jawad', 'Ajeer', 'Dana', 'Malatya', 'Malaeb', 'Bolu', 'Eskisehir', 'Khomeyn', 'Rustaq', 'Ash', 'Trabzon', 'Harun', 'Shatrah', 'Rize', 'AvidBeam', 'Yazd', 'Gulf', 'Albawaly', 'Nida', 'Altinaymi', 'Aden', 'Al-Thawra', 'Dalal', 'Al akhras', 'Karam', 'Yasmin', 'Rachid', 'PubliTools', 'Al-Hudaydah', 'Najwa', 'Yousif', 'Naveed', 'Mirza', 'Zahir', 'Sadik', 'İnegöl', 'Oilibya', 'Wasla', 'Hayat', 'Kessimou Elfedil', 'Atef', 'al-Watan', 'al-Arabi', 'Party', 'Nazir', 'Naqadeh', 'Socialist', 'Bahram', 'Tunisna', 'ONCF', 'Emir', 'Mallawi', "'Amarah", 'Batman', 'Sami', 'Jamjamal', 'Fatiha', 'Tahta', 'Madaba', 'NBN', 'Fadia', 'Healthcare', 'Khorramshahr', 'Mohsen', 'Sonya', 'News', 'Arif', 'Osmaniye', 'Gaziantep', 'Tariq', 'G42', 'Salam', 'Monoprix', 'Zainab', 'Comarit', 'Bibi', 'Lubna', 'Flynas', 'al-Balad', 'Sakakah', 'Ağrı', 'Abdullah', 

In [34]:
#for sent in ME_dev: 
#    print(sent)

In [35]:
#for sent in ME_test: 
#    print(sent)

In [36]:
# save as tsv files
# NOTE(review): assumes the directory ../data/me_data/ already exists - confirm
# whether write_tsv_file creates missing directories.
write_tsv_file(ME_dev, "../data/me_data/middle_eastern_dev.tsv")
write_tsv_file(ME_test, "../data/me_data/middle_eastern_test.tsv")