In [None]:
# imports 
import os
import pickle
import random
import sys
sys.path.append("../")

from scripts.load_data import write_tsv_file, extract_labeled_tokens, label_mapping, read_tsv_file, write_iob2_file
from scripts.data_augmentation import data_aug_replace
from scripts.extract_ME_entities import extract_first_names, get_last_names,  load_location, load_organisation

# Seed Python's built-in `random` module so anything sampled through it is repeatable.
# NOTE(review): only the stdlib generator is seeded here; if the helpers imported from
# `scripts` draw from numpy or another RNG, that needs its own seed — confirm.
random.seed(42)

In [87]:
# Load the Middle Eastern (ME) entity lists from their raw source files.
# One list per entity slot used by the augmentation: BPER, IPER, LOC and ORG.
ME_BPER = extract_first_names(
    "data/me_entity_sources/Ordbog_over_muslimske_fornavne_i_DK.pdf"
)
ME_IPER = get_last_names(
    "data/me_entity_sources/middle_eastern_last_names.txt",
    # NOTE(review): the double slash is kept exactly as in the original path.
    "data/me_entity_sources//KDBGIVE.tsv",
)
ME_LOC = load_location(
    "data/me_entity_sources/the-middle-east-cities.csv"
)
ME_ORG = load_organisation(
    "data/me_entity_sources/middle_eastern_organisations.csv"
)

In [88]:
# Locations of the train / dev / test splits (tab-separated files).
path_train, path_dev, path_test = (
    "data/no_overlap_da_news/da_news_train.tsv",
    "data/no_overlap_da_news/da_news_dev.tsv",
    "data/no_overlap_da_news/da_news_test.tsv",
)

In [89]:
# Create the label -> id and id -> label mappings.
# They are derived from the training file only.
# NOTE(review): a label that occurs only in dev/test would presumably be missing from
# the mapping — confirm `read_tsv_file` copes with that.
label2id, id2label = label_mapping(path_train)

In [90]:
# Read the three DaN+ splits, all with the same label mapping.
# The generator is consumed left to right, so the files are read in the
# order train, dev, test.
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
)

In [91]:
# Extract the labeled tokens of the training data. The result is handed to
# `data_aug_replace` as `train_tokens` — per the original note, to ensure no
# overlap with the training data later.
train_tokens = extract_labeled_tokens(train_data)

In [92]:
# Accumulator for all entities used during augmentation. It starts empty, is passed
# into every `data_aug_replace` call and is rebound to that call's second return value.
# Re-run this cell before re-running the augmentation cell to start from a clean set.
used_entities = set()

In [93]:
# Create the augmented dev and test datasets.

# Keyword arguments that are identical for both augmentation calls.
aug_shared_kwargs = dict(
    sentence_amount=1000,
    ME_LOC=ME_LOC,
    ME_ORG=ME_ORG,
    ME_BPER=ME_BPER,
    ME_IPER=ME_IPER,
    train_tokens=train_tokens,
)

# `used_entities` is passed in and rebound after each call, so the test-set call
# receives whatever the dev-set call returned. The order of the two calls matters.
ME_dev, used_entities = data_aug_replace(
    dev_data, used_entities=used_entities, **aug_shared_kwargs
)
ME_test, used_entities = data_aug_replace(
    test_data, used_entities=used_entities, **aug_shared_kwargs
)

In [94]:
# Save both augmented splits as tsv files (dev first, then test).
for augmented_split, out_path in (
    (ME_dev, "data/me_data/middle_eastern_dev.tsv"),
    (ME_test, "data/me_data/middle_eastern_test.tsv"),
):
    write_tsv_file(augmented_split, out_path)

# Save the same splits as iob2 files, written with gold=True.
for augmented_split, out_path in (
    (ME_dev, "data/me_data/middle_eastern_dev.iob2"),
    (ME_test, "data/me_data/middle_eastern_test.iob2"),
):
    write_iob2_file(augmented_split, path=out_path, gold=True)

In [None]:
# Persist the set of entities used by the augmentation as a pickle file.
# open() does not create missing parent directories, so make sure `hpc_jobs/`
# exists first; otherwise the dump fails with FileNotFoundError after all the
# augmentation work has already been done.
os.makedirs('hpc_jobs', exist_ok=True)
with open('hpc_jobs/used_entities.pkl', 'wb') as f:
    pickle.dump(used_entities, f)