In [1]:
# imports
import random
import sys
# Extend the module search path with the parent directory. This has to run
# before the project imports below — presumably that is where the `scripts`
# package lives (confirm against the repository layout).
sys.path.append("../")

from scripts.load_data import write_tsv_file, extract_labeled_tokens, mapping, read_tsv_file, write_iob2_file
from scripts.data_aug import data_aug_replace
from middle_eastern_ne import extract_first_names, get_last_names,  load_location, load_organisation

# Seed Python's global RNG so repeated runs are reproducible.
# NOTE(review): this only helps if the augmentation helpers draw from the
# stdlib `random` module — confirm in scripts/data_aug.py.
random.seed(42)


## Load Middle Eastern (ME) entities

In [2]:
# Load the replacement candidates, one collection per entity type:
#   ME_BPER - first names, ME_IPER - last names,
#   ME_LOC  - locations,   ME_ORG  - organisations.
# NOTE(review): the BPER/IPER names suggest B-PER / I-PER tag positions — confirm.
# Later cells compare name entries to `used_entities` directly, while LOC/ORG
# entries are indexed with ['tokens'], so the two groups have different shapes.
ME_BPER = extract_first_names("../data_aug_sources/Ordbog_over_muslimske_fornavne_i_DK.pdf")
ME_IPER = get_last_names("../data_aug_sources/middle_eastern_last_names.txt", "../data_aug_sources/KDBGIVE.tsv")
ME_LOC = load_location("../data_aug_sources/the-middle-east-cities.csv")
ME_ORG = load_organisation("../data_aug_sources/middle_eastern_organisations.csv")

## Read in data sets

In [3]:
# file locations of the train / dev / test splits
path_train, path_dev, path_test = (
    "../data/no_overlap_da_news/da_news_train.tsv",
    "../data/no_overlap_da_news/da_news_dev.tsv",
    "../data/no_overlap_da_news/da_news_test.tsv",
)

# label2id / id2label are derived from the training file only
label2id, id2label = mapping(path_train)

# load the DaN+ splits (train, then dev, then test), all with the same label2id
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
)

## Replace entities in dev and test sets

In [4]:
# Extract the labeled tokens from the training data. The result is handed to
# data_aug_replace as `train_tokens` further down — presumably so that no
# replacement entity overlaps with the training set (confirm in scripts/data_aug.py).
train_tokens = extract_labeled_tokens(train_data)

In [5]:
# Accumulator for all replacement entities used so far. It is passed into and
# returned from data_aug_replace for the dev split and then the test split, so
# entities consumed by the dev split are known when the test split is processed.
used_entities = set()

In [6]:
# Sanity check: no augmentation has run yet, so this should display an empty set.
used_entities

set()

In [7]:
# Value passed as `sentence_amount` for each split.
SENTENCE_AMOUNT = 1000

# Arguments that are identical for both splits. `used_entities` is deliberately
# left out: it is rebound after every call, so the test split receives whatever
# the dev split already consumed.
replace_kwargs = dict(
    sentence_amount=SENTENCE_AMOUNT,
    ME_LOC=ME_LOC, ME_ORG=ME_ORG, ME_BPER=ME_BPER, ME_IPER=ME_IPER,
    train_tokens=train_tokens,
)

# Dev first, then test. Keep this order: `used_entities` carries over from one
# call to the next (and, presumably, so does the seeded RNG state — confirm).
ME_dev, used_entities = data_aug_replace(dev_data, used_entities=used_entities, **replace_kwargs)
ME_test, used_entities = data_aug_replace(test_data, used_entities=used_entities, **replace_kwargs)

# Number of entities used across both splits. Only a cell's last expression is
# displayed, which is why the former mid-cell `len(used_entities)` (a silent
# no-op) was removed.
len(used_entities)

589

In [8]:
# Dump every entity used for the dev/test replacement. The set mixes plain
# strings (person names) with tuples of tokens (locations / organisations),
# so it cannot be sorted without a key function.
print(used_entities)

{'Asima', ('Al-Massira',), ('Palæstina',), ('Sayhat',), 'Aisha', ('Iranshahr',), ('Bimo',), 'Obaid', ('Gulf', 'Bank'), 'Mansoor', ('Tunisavia',), ('Naseej',), 'Naila', 'Ibtisam', ('Egyptalum',), ('Ereğli',), 'Bakhsh', 'Nidal', ('Oilibya',), 'Khalifa', ('Kharafi', 'Group'), ('Inagrab',), 'Naimat', 'Naziha', 'Sahar', 'Aiman', ('Sakakah',), ('Jawan',), 'Kamal', ('Etisalat',), 'Zainab', 'Yousra', 'Taghrid', ('Ma’aden',), ('SAMI',), 'Mostafa', ('Massaya',), ('Swvl',), ('Elazig',), 'Kamila', ('Mauritel',), 'Mahmud', 'Rabih', 'Al mazroui', 'Abir', ('Ünye',), 'Alaibayli', ('Khanjarah',), 'El kordy', 'Umar', ('Akdital',), 'Mina', ('Sonatrach',), ('Ar', 'Rass'), 'Abdulqadir', 'Abid', 'Mahmoud', 'Bin Shamlan', ('Bukan',), 'Sardar', 'Sawadi', 'Mohammad', ('ADNOC',), ('BulkWhiz',), 'Khalid', ('Irak',), 'Jinan', ('Eskisehir',), ('al-Hayat', 'al-Jadida'), ('Siera',), 'Bukhari', ('Bahrain',), 'Sahyla', 'Samia', 'Mumadi', 'Alisa', 'Sahmoud', 'Albayda', ('AKKASA',), 'Nazim', ('Tarut',), 'Saif', 'Isam', 

In [9]:
# first names: total number of candidates ...
print(len(ME_BPER))
# ... and the ones not yet used by the dev/test replacement
BPER_left = list(filter(lambda first_name: first_name not in used_entities, ME_BPER))
print(len(BPER_left))

735
514


In [10]:
# last names: total number of candidates ...
print(len(ME_IPER))
# ... and the ones not yet used by the dev/test replacement
IPER_left = list(filter(lambda last_name: last_name not in used_entities, ME_IPER))
print(len(IPER_left))

1580
1456


In [11]:
# locations: total number of candidates ...
print(len(ME_LOC))
# ... and the ones not yet used; entries are looked up by their token tuple
LOC_left = list(filter(lambda loc: tuple(loc['tokens']) not in used_entities, ME_LOC))
print(len(LOC_left))

481
352


In [12]:
# organisations: total number of candidates ...
print(len(ME_ORG))
# ... and the ones not yet used; entries are looked up by their token tuple
ORG_left = list(filter(lambda org: tuple(org['tokens']) not in used_entities, ME_ORG))
print(len(ORG_left))

427
260


In [13]:
# Optional inspection (disabled): uncomment to print every augmented dev sentence.
#for sent in ME_dev:
#    print(sent)

In [14]:
# Optional inspection (disabled): uncomment to print every augmented test sentence.
#for sent in ME_test:
#    print(sent)

In [15]:
# Save the augmented dev and test splits as TSV files under ../data/me_data/.
# NOTE(review): whether write_tsv_file creates a missing output directory is
# not visible here — confirm, or create the directory beforehand.
write_tsv_file(ME_dev, "../data/me_data/middle_eastern_dev.tsv")
write_tsv_file(ME_test, "../data/me_data/middle_eastern_test.tsv")

In [16]:
# Additionally export the augmented test split as an .iob2 file. `gold=True`
# is forwarded to write_iob2_file — presumably it writes the gold labels (confirm).
# NOTE(review): no .iob2 file is written for the dev split; confirm that is intended.
write_iob2_file(ME_test, path="../data/me_data/middle_eastern_test.iob2", gold=True)