In [None]:
# imports
import random
import sys
sys.path.append("../")

from scripts.load_data import write_tsv_file, extract_labeled_tokens, label_mapping, read_tsv_file, write_iob2_file
from scripts.data_aug import data_aug_replace
from middle_eastern_ne import extract_first_names, get_last_names,  load_location, load_organisation

# Seed Python's global `random` generator so that anything drawing from it
# is repeatable across runs of this notebook.
SEED = 42
random.seed(SEED)

In [2]:
# Source files for the replacement entities.
first_names_pdf = "../data_aug_sources/Ordbog_over_muslimske_fornavne_i_DK.pdf"
last_names_txt = "../data_aug_sources/middle_eastern_last_names.txt"
kdbgive_tsv = "../data_aug_sources/KDBGIVE.tsv"
cities_csv = "../data_aug_sources/the-middle-east-cities.csv"
organisations_csv = "../data_aug_sources/middle_eastern_organisations.csv"

# One pool per keyword argument that data_aug_replace takes further down
# (ME_BPER, ME_IPER, ME_LOC, ME_ORG).
ME_BPER = extract_first_names(first_names_pdf)
ME_IPER = get_last_names(last_names_txt, kdbgive_tsv)
ME_LOC = load_location(cities_csv)
ME_ORG = load_organisation(organisations_csv)

In [None]:
# Locations of the three data splits
path_train = "../data/no_overlap_da_news/da_news_train.tsv"
path_dev = "../data/no_overlap_da_news/da_news_dev.tsv"
path_test = "../data/no_overlap_da_news/da_news_test.tsv"

# The label mapping is built from the training file only ...
label2id, id2label = label_mapping(path_train)

# ... and that single mapping is reused when reading every split of the DaN+ data
train_data, dev_data, test_data = (
    read_tsv_file(split_path, label2id)
    for split_path in (path_train, path_dev, path_test)
)

In [4]:
# extracting all tokens in train data - to ensure no overlap later
# The result is passed as `train_tokens` to every data_aug_replace call below.
# NOTE(review): the helper is named extract_*labeled*_tokens, so this is
# presumably only the entity-labelled tokens rather than every token - confirm.
train_tokens = extract_labeled_tokens(train_data)

In [5]:
# for saving all used entities
# Starts out empty; it is handed to data_aug_replace for the dev split, and the
# value that call returns is then handed on to the test-split call.
used_entities = set()

In [6]:
# Arguments that are identical for both splits, collected once so the two
# calls cannot drift apart.
shared_aug_kwargs = dict(
    ME_LOC=ME_LOC,
    ME_ORG=ME_ORG,
    ME_BPER=ME_BPER,
    ME_IPER=ME_IPER,
    train_tokens=train_tokens,
)

# Augment dev first; the `used_entities` it returns is fed into the test call.
ME_dev, used_entities = data_aug_replace(
    dev_data,
    sentence_amount=1000,
    used_entities=used_entities,
    **shared_aug_kwargs,
)
# A bare expression in the middle of a cell is never displayed (only the last
# expression of a cell is), so the intermediate count has to be printed.
print(f"used entities after dev: {len(used_entities)}")

ME_test, used_entities = data_aug_replace(
    test_data,
    sentence_amount=1000,
    used_entities=used_entities,
    **shared_aug_kwargs,
)
# Last expression of the cell: total number of used entities after dev + test.
len(used_entities)

589

In [7]:
# save as tsv files
write_tsv_file(ME_dev, "../data/me_data/middle_eastern_dev.tsv")
write_tsv_file(ME_test, "../data/me_data/middle_eastern_test.tsv")

In [8]:
# Additionally export the augmented test split as an .iob2 file.
# NOTE(review): `gold=True` presumably means the gold NER tags are written
# alongside the tokens - confirm against write_iob2_file.
# NOTE(review): only the test split is exported here; ME_dev is not.
write_iob2_file(ME_test, path="../data/me_data/middle_eastern_test.iob2", gold=True)

# Checking eligible sentences in train

In [17]:
# Tags that do not make a sentence eligible on their own
excluded_tags = ("O", "B-MISC", "I-MISC")

# A training sentence is eligible when at least one of its NER tags falls
# outside `excluded_tags`.
eligible_sentences = [
    sentence
    for sentence in train_data
    if not all(ner_tag in excluded_tags for ner_tag in sentence["ner_tags"])
]
len(eligible_sentences)

1729

# Train Data Augmentation

In [19]:
# Entities already used for the dev/test augmentation; passed as
# `used_entities` to every train-augmentation call below.
# NOTE(review): this binds a second name to the same object as `used_entities`,
# it does not copy it. If data_aug_replace mutates the set it is given, the
# train runs would change `used_entities` too and would see each other's
# additions - confirm, and use set(used_entities) if a frozen snapshot is wanted.
final_used = used_entities

In [20]:
# Number of sentences to augment for each training variant.
# NOTE(review): 1729 equals the eligible-sentence count displayed above and 864
# looks like roughly half of it - consider deriving both from
# len(eligible_sentences) instead of hard-coding them.
sentence_values = [100, 250, 500, 864, 1000, 1500, 1729]

# Keyword arguments that stay the same for every run
train_aug_kwargs = {
    "ME_LOC": ME_LOC,
    "ME_ORG": ME_ORG,
    "ME_BPER": ME_BPER,
    "ME_IPER": ME_IPER,
    "used_entities": final_used,
    "train_tokens": train_tokens,
}

# One augmented copy of the training data per amount, in the order of
# `sentence_values`. The second return value of data_aug_replace is discarded.
augmented_datasets = []
for amount in sentence_values:
    aug_set, _ = data_aug_replace(train_data, sentence_amount=amount, **train_aug_kwargs)
    augmented_datasets.append(aug_set)

In [22]:
# Spot-check the dataset at index 3 (built with sentence_amount=864).
# Only the first two sentences are shown: displaying the whole dataset floods
# the notebook output and bloats the saved file.
augmented_datasets[3][:2]

[{'tokens': ['På',
   'fredag',
   'har',
   'RasGas',
   'inviteret',
   'til',
   'reception',
   'i',
   'SID-huset',
   'i',
   'anledning',
   'af',
   'at',
   'formanden',
   'Ola',
   'Sahai',
   'går',
   'ind',
   'i',
   'de',
   'glade',
   'tressere',
   '.'],
  'ner_tags': ['O',
   'O',
   'O',
   'B-ORG',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'B-PER',
   'I-PER',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O'],
  'tag_ids': [3,
   3,
   3,
   2,
   3,
   3,
   3,
   3,
   3,
   3,
   3,
   3,
   3,
   3,
   8,
   5,
   3,
   3,
   3,
   3,
   3,
   3,
   3]},
 {'tokens': ['Eller',
   'slet',
   'og',
   'ret',
   'tykke',
   'og',
   'fede',
   'i',
   'mere',
   'eller',
   'mindre',
   'grad',
   ',',
   'som',
   'en',
   'enkelt',
   'udtrykker',
   'det',
   '.'],
  'ner_tags': ['O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   'O',
   '