In [21]:
# imports
import random
import copy
import sys
sys.path.append("../")

from scripts.load_data import write_tsv_file, extract_labeled_tokens, mapping, read_tsv_file, write_iob2_file
from scripts.data_aug import data_aug_replace
from middle_eastern_ne import extract_first_names, get_last_names,  load_location, load_organisation

random.seed(42)


## Get ME entities

In [22]:
ME_BPER = extract_first_names("../data_aug_sources/Ordbog_over_muslimske_fornavne_i_DK.pdf")
ME_IPER = get_last_names("../data_aug_sources/middle_eastern_last_names.txt", "../data_aug_sources/KDBGIVE.tsv")
ME_LOC = load_location("../data_aug_sources/the-middle-east-cities.csv")
ME_ORG = load_organisation("../data_aug_sources/middle_eastern_organisations.csv")

## Read in data sets

In [23]:
# path to the data files
path_train = "../data/no_overlap_da_news/da_news_train.tsv"
path_dev = "../data/no_overlap_da_news/da_news_dev.tsv"
path_test = "../data/no_overlap_da_news/da_news_test.tsv"

# create mapping
label2id, id2label = mapping(path_train)

# read in the DaN+ data
train_data = read_tsv_file(path_train, label2id)
dev_data = read_tsv_file(path_dev, label2id)
test_data = read_tsv_file(path_test, label2id)

## Replace entities in dev and test set

In [24]:
# extracting all tokens in train data - to ensure no overlap later
train_tokens = extract_labeled_tokens(train_data)

In [25]:
# for saving all used entities
used_entities = set()

In [26]:
used_entities

set()

In [27]:
ME_dev, used_entities = data_aug_replace(dev_data, sentence_amount=1000,
                                         ME_LOC = ME_LOC, ME_ORG = ME_ORG, ME_BPER = ME_BPER, ME_IPER = ME_IPER, 
                                         used_entities = used_entities, train_tokens=train_tokens)
len(used_entities)

ME_test, used_entities = data_aug_replace(test_data, sentence_amount=1000,
                                         ME_LOC = ME_LOC, ME_ORG = ME_ORG, ME_BPER = ME_BPER, ME_IPER = ME_IPER, 
                                         used_entities = used_entities, train_tokens=train_tokens)
len(used_entities)

589

In [28]:
print(used_entities)

{('Zabol',), 'Fadi', 'Omar', 'Abid', 'Javid', ('Al', 'Hayy'), ('Nagham',), ('Jidda',), 'Saif', 'Al khatiri', ('As', 'Suwayq'), 'Elias', ('Neyshabur',), ('Bayanat',), 'Sawadi', 'Layla', ('Qatar', 'Fuel'), 'Khaled', 'Elgissaini', 'Anwar', ('Miandoab',), ('Elm',), ('218TV',), ('ADNOC',), ('Al', 'Bab'), 'Sahjan', 'Baqir', ('Sonatrach',), 'Widad', 'Dunia', 'Sahar', 'Shah', 'Suhailah', 'Yassine', ('Hamadan',), ('Eskisehir',), 'Shahida', 'Rabih', ('Bukan',), ('Afriquia',), ('Mukalla',), 'Shaaban', ('Al-Nahar',), 'Najah', 'Kerim', ('Ankara',), ('Ünye',), ('Rustaq',), 'Faruk', ('Amanat', 'Holdings'), ('Saudia',), ('Mersa', 'Matruh'), ('Mauritel',), 'Mahir', 'Mansoor', ('Dubai', 'Islamic', 'Bank'), 'Amal', ('TAQA', 'Group'), 'Malak', ('Kingdom', 'Holding', 'Company'), 'Zamil', ('Bakdash',), ('Careem',), 'Nejla', ('GIB', 'Capital'), ('Zain', 'KSA'), 'Kashif', ('Irbid',), 'Alaizayli', ('Thumbay', 'Group'), 'Daud', ('Dogonbadan',), 'Kamila', ('Ağrı',), ('Gaziantep',), ('Saidal',), ('Qurayyat',), 'Z

In [29]:
print(len(ME_BPER))
BPER_left = [item for item in ME_BPER if item not in used_entities]
print(len(BPER_left))

735
514


In [30]:
print(len(ME_IPER))
IPER_left = [item for item in ME_IPER if item not in used_entities]
print(len(IPER_left))

1580
1456


In [31]:
print(len(ME_LOC))
LOC_left = [d for d in ME_LOC if tuple(d['tokens']) not in used_entities]
print(len(LOC_left))

481
352


In [32]:
print(len(ME_ORG))
ORG_left = [d for d in ME_ORG if tuple(d['tokens']) not in used_entities]
print(len(ORG_left))

427
260


In [33]:
#for sent in ME_dev: 
#    print(sent)

In [34]:
#for sent in ME_test: 
#    print(sent)

In [35]:
# save as tsv files
write_tsv_file(ME_dev, "../data/me_data/middle_eastern_dev.tsv")
write_tsv_file(ME_test, "../data/me_data/middle_eastern_test.tsv")

In [36]:
write_iob2_file(ME_test, path="../data/me_data/middle_eastern_test.iob2", gold=True)