### Notebook for preprocessing the data

In [1]:
# imports
import pickle
import random
# Seed Python's global RNG once, right after importing it, so that any
# `random`-based step later in the notebook is repeatable on a fresh
# Restart & Run All.
SEED = 20
random.seed(SEED)

from scripts.load_data import (
    label_mapping, extract_labeled_tokens,
    read_tsv_file, write_tsv_file,
    write_iob2_file  
)

from scripts.preprocess import fix_overlap

from scripts.data_augmentation import data_aug_replace
from scripts.extract_ME_entities import extract_first_names, get_last_names,  load_location, load_organisation

##### Getting the data (DaN+)

In [2]:
# Locations of the original DaN+ news splits (train / dev / test), in that order.
path_train, path_dev, path_test = (
    "data/da_news/da_news_train.tsv",
    "data/da_news/da_news_dev.tsv",
    "data/da_news/da_news_test.tsv",
)

In [3]:
# Create the label mapping from the training file only; the same label2id is
# then passed to read_tsv_file for all three splits.
# NOTE(review): presumably label2id maps tag -> integer id and id2label is the
# inverse -- confirm in scripts.load_data.label_mapping.
label2id, id2label = label_mapping(path_train)

In [4]:
# Load every DaN+ split with the shared label mapping. The generator is
# consumed left to right, so the files are read train -> dev -> test.
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
)

In [None]:
# Export the test split exactly as loaded from the original DaN+ file (i.e.
# before fix_overlap is applied further down) to an .iob2 file, for later
# prediction.
# NOTE(review): gold=True presumably writes the gold labels next to the
# tokens -- confirm in scripts.load_data.write_iob2_file.
write_iob2_file(test_data, path="data/da_news/da_news_test.iob2", gold=True)

In [5]:
# For each split in turn (train, dev, test), pull out the tokens whose label
# is not "O".
# NOTE(review): these three names are not referenced by any later cell in
# view -- presumably kept for inspection.
train_tokens, dev_tokens, test_tokens = map(
    extract_labeled_tokens, (train_data, dev_data, test_data)
)

In [6]:
# Fix overlap between the splits: fix_overlap receives all three splits and its
# three return values are bound to new clean_* names.
# NOTE(review): what counts as "overlap", which split gives up data, and whether
# the inputs are also modified in place is decided inside
# scripts.preprocess.fix_overlap -- not visible here.
clean_train_data, clean_dev_data, clean_test_data = fix_overlap(train_data, dev_data, test_data)

In [7]:
# Write each overlap-free split to its own TSV under data/no_overlap_da_news/
# (train first, then dev, then test).
for split_rows, tsv_path in (
    (clean_train_data, 'data/no_overlap_da_news/da_news_train.tsv'),
    (clean_dev_data, 'data/no_overlap_da_news/da_news_dev.tsv'),
    (clean_test_data, 'data/no_overlap_da_news/da_news_test.tsv'),
):
    write_tsv_file(split_rows, tsv_path)

In [8]:
# Also export the overlap-free dev and test splits as .iob2 files, passing
# gold=True as for the raw test export earlier in the notebook.
for split_rows, iob2_path in (
    (clean_dev_data, "data/no_overlap_da_news/da_news_dev.iob2"),
    (clean_test_data, "data/no_overlap_da_news/da_news_test.iob2"),
):
    write_iob2_file(split_rows, path=iob2_path, gold=True)

# Create augmented dev and test sets

In [9]:
# Repoint the path variables at the overlap-free files.
# Note: this rebinds path_train / path_dev / path_test, which earlier in the
# notebook referred to the original data/da_news/ files.
path_train, path_dev, path_test = (
    "data/no_overlap_da_news/da_news_train.tsv",
    "data/no_overlap_da_news/da_news_dev.tsv",
    "data/no_overlap_da_news/da_news_test.tsv",
)

In [10]:
# Recreate the label mapping, this time from the overlap-free training file
# (path_train was repointed in the previous cell). This rebinds label2id and
# id2label, replacing the mapping built from the original training file.
label2id, id2label = label_mapping(path_train)

In [11]:
# Reload the three splits, now from the overlap-free files that path_* point
# to, using the freshly rebuilt label2id. This rebinds train_data / dev_data /
# test_data, replacing the splits loaded from the original DaN+ files.
train_data, dev_data, test_data = [
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
]

In [13]:
# Accumulator for all used entities. It starts empty, is passed into (and
# rebound to the return value of) each data_aug_replace call in the next cell,
# and is pickled at the end of the notebook. Re-run this cell before re-running
# the augmentation if a fresh, empty set is wanted.
used_entities = set()

In [14]:
# Create the augmented dev and test sets. Each call returns the new dataset
# plus an updated used_entities; the set returned by the dev call is fed into
# the test call, so both share one running collection.
# NOTE(review): sentence_amount=1000 is presumably the number of sentences to
# augment per split, and used_entities presumably prevents reuse across
# splits -- confirm in scripts.data_augmentation.data_aug_replace.
ME_dev, used_entities = data_aug_replace(
    dev_data,
    sentence_amount=1000,
    used_entities=used_entities,
)

ME_test, used_entities = data_aug_replace(
    test_data,
    sentence_amount=1000,
    used_entities=used_entities,
)

In [15]:
# Save the augmented splits as TSV files (dev, then test) ...
for augmented_rows, tsv_path in (
    (ME_dev, "data/me_data/middle_eastern_dev.tsv"),
    (ME_test, "data/me_data/middle_eastern_test.tsv"),
):
    write_tsv_file(augmented_rows, tsv_path)

# ... and then as .iob2 files (dev, then test), with gold=True.
for augmented_rows, iob2_path in (
    (ME_dev, "data/me_data/middle_eastern_dev.iob2"),
    (ME_test, "data/me_data/middle_eastern_test.iob2"),
):
    write_iob2_file(augmented_rows, path=iob2_path, gold=True)

In [16]:
# Persist the accumulated set of used entities as a pickle under hpc_jobs/.
# open() does not create the directory, so hpc_jobs/ must already exist.
with open('hpc_jobs/used_entities.pkl', 'wb') as pkl_out:
    pickle.dump(used_entities, pkl_out)