In [1]:
import random
import pandas as pd

from data_aug_code.middle_eastern_ne import extract_first_names, extract_last_names, get_last_names, add_location, load_location, add_organisation, load_organisation


In [2]:
# reading label data from a given column
# this is the readNlu function from the provided span_f1 file
# minor modifications were made to make it usable with our data. 
def readNlu(path, target_column = 1): # default to index 1 (thats where DaN+ labels are)
    '''
    This function reads labeled annotations from a CoNLL-like file.

    It parses a file where each line typically represents a single token and its annotations,
    separated by tabs. Empty lines denote sentence boundaries. It extracts labels from a specified column
    (by default, column index 1), collecting them as a list of label sequences, one per sentence.

    Parameters:
        path (str): Path to the input file.
        target_column (int, optional): Index of the column to extract labels from. Defaults to 1.

    Returns:
        List[List[str]]: A list where each element is a list of labels (strings) corresponding
                         to tokens in a sentence.
    '''

    annotations = []    # list for storing all the label sequences (one per sentence)
    cur_annotation = [] # temp list for labels of the current sentence

    # reading through the file line by line
    for line in open(path, encoding='utf-8'):
        line = line.strip()                     # remove leading/trailing whitespaces

        # empty lines denotes end of sentence
        if line == '':
            annotations.append(cur_annotation)  # add current annotations to annotations list
            cur_annotation = []                 # reset for the next sentence
        
        # skipping comments (start with "#" and no tokens columns)
        elif line[0] == '#' and len(line.split('\t')) == 1:
            continue
        
        else:
            # extract the label from the specified column and add to current sentence
            cur_annotation.append(line.split('\t')[target_column])

    return annotations


# mapping funciton 
def mapping(path):
    '''
    This function generates mappings between labels and their corresponding integer IDs from a labeled dataset.

    It reads annotations from a CoNLL-like file using the `readNlu` function,
    filters out labels containing substrings like "part" or "deriv" (case-insensitive),
    and creates a bidirectional mapping between the remaining unique labels and integer IDs.

    Parameters:
        path (str): Path to the labeled data file.

    Returns:
        Tuple[Dict[str, int], Dict[int, str]]:
            - label2id: A dictionary mapping each label to a unique integer ID.
            - id2label: A reverse dictionary mapping each integer ID back to its label.
    '''

    # get the data labels
    data_labels = readNlu(path) 

    # create empty set to store unique labels
    label_set = set()

    for labels in data_labels:
        #  filter out any labels that contain 'part' or 'deriv' (case-insensitive)
        filtered = [label for label in labels if 'part' not in label.lower() and 'deriv' not in label.lower()]
        label_set.update(filtered)

    # count of unique filtered labels
    num_labels = len(label_set)

    # create a dictionary mapping each label to a unique integer ID
    label2id = {label: id for id, label in enumerate(label_set)}

    # create a dictionary mapping each unique integer ID to a label
    id2label = {id: label for label, id in label2id.items()}

    return label2id, id2label


# load data function
# heavily inspired by the solution from assignment 5
def read_tsv_file(path, label2id):
    '''
    This function reads a TSV file containing tokens and NER labels and converts it into structured data.
    It collects the tokens, their original labels, and their corresponding integer IDs (based on the provided `label2id` mapping) for each sentence.
    Sentences are separated by empty lines. 

    Each non-empty line in the file is expected to have at least two tab-separated columns:
    - The first column is the token.
    - The second column is the corresponding NER label.

    Parameters:
        path (str): Path to the TSV file to read.
        label2id (dict): A dictionary mapping NER label strings to their corresponding integer IDs.

    Returns:
        List[dict]: A list of dictionaries, one per sentence, with keys:
            - 'tokens': list of tokens.
            - 'ner_tags': list of original NER label strings.
            - 'tag_ids': list of integer tag IDs corresponding to the NER labels.
    '''

    data = []               # final list to hold all sentences as dictionaries
    current_words = []      # tokens for the current sentence
    current_tags = []       # NER tags for the current sentence
    current_tag_ids = []    # corresponding tag IDs for the current sentence

    for line in open(path, encoding='utf-8'):
        line = line.strip() # removes any leading and trailing whitespaces from the line

        if line:
            if line[0] == '#': 
                continue # skip comments

            # splitting at 'tab', as the data is tab separated 
            tok = line.split('\t')
            
            # extract the token (first column)
            token = tok[0]

            # check if the label is in the provided label2id dictionary
            # if it's not, replace the label with 'O'
            label = tok[1] if tok[1] in label2id else 'O'

            current_words.append(token)
            current_tags.append(label)
            current_tag_ids.append(label2id[label])
        
        else: # skip empty lines
            if current_words: # if current_words is not empty

                # add entry to dict where tokens and ner_tags are keys and the values are lists
                data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})

            # start over  
            current_words = []
            current_tags = []
            current_tag_ids = []

    # check for last one
    if current_tags != []:
        data.append({"tokens": current_words, "ner_tags": current_tags, "tag_ids": current_tag_ids})
  
    return data

# extracting tokens to check for overlap in train, dev and test sets
def extract_labeled_tokens(dataset, exclude_label = "O", include_label_pair=False):
    '''
    This function extracts tokens from a dataset that have a string label different from `exclude_label`.
    Optionally, it can return the (token, label) pairs instead of just tokens.

    Parameters:
        dataset (List[dict]): The token-tagged dataset.
        exclude_label (str): The label to ignore (default is 'O').
        include_label_pair (bool): Whether to include the (token, label) pairs in the result (default is False).
        
    Returns:
         Set[str] or Set[Tuple[str, str]]: 
            - A set of tokens with meaningful (non-O) labels if `include_label_pair` is False.
            - A set of (token, label) pairs if `include_label_pair` is True.
    '''

    # create empty set to store the unique tokens
    labeled_tokens = set()
    
    for sentence in dataset:
        # iterate over each token and its corresponding tag ID
        for token, label in zip(sentence["tokens"], sentence["ner_tags"]):
            if label != exclude_label:                      # check if the tag is not the excluded one
                if include_label_pair:
                    labeled_tokens.add((token, label))      # add (token, label) pair if the flag is True
                else:
                    labeled_tokens.add(token)               # add just the token if the flag is False
    
    return labeled_tokens


In [3]:
ME_BPER = extract_first_names("../data_aug_sources/Ordbog_over_muslimske_fornavne_i_DK.pdf")
ME_IPER = get_last_names("../data_aug_sources/middle_eastern_last_names.txt", "../data_aug_sources/KDBGIVE.tsv")
ME_LOC = load_location("../data_aug_sources/the-middle-east-cities.csv")
ME_ORG = load_organisation("../data_aug_sources/middle_eastern_organisations.csv")

In [4]:
def entities_by_label(data, target_label):
    """
    Collects full entity name of locations and organisation from labeled the dataset. 
    If an entity is made up of multiple words, it joins them into a single string.
    
    Args:
        data (List[Dict]): Dataset containing 'tokens' and 'tags' for each sentence.
        target_label (str): Label prefix to filter for (e.g., 'B-LOC', 'B-ORG').
        
    Returns:
        Set[str]: A set of labeled token strings (e.g., {'Beirut', 'Al Mawsil al Jadidah'})
    """
    grouped_strings = set()

    for sent in data:
        tokens = sent['tokens']
        tags = sent['ner_tags']

        i = 0
        while i < len(tokens):
            tag = tags[i]

            if tag.startswith(target_label):
                span_tokens = [tokens[i]]
                i += 1
                while i < len(tokens) and tags[i].startswith('I'):
                    span_tokens.append(tokens[i])
                    i += 1

                # Join tokens into a single string and add to the set
                entity_string = ' '.join(span_tokens)
                grouped_strings.add(entity_string)
            else:
                i += 1

    return grouped_strings


In [5]:
# Joining entities of multiple tokens into single strings to check for overlap between train, dev, test, and MENAPT NEs
ME_LOC_tokens = entities_by_label(ME_LOC, target_label = "B-LOC")

ME_ORG_tokens = entities_by_label(ME_ORG, target_label = "B-ORG")

print(ME_LOC_tokens)
print(len(ME_LOC_tokens))
print(len(ME_LOC))

print("\n")

print(ME_ORG_tokens)
print(len(ME_ORG_tokens))
print(len(ME_ORG))

ME_BPER_tokens = set(ME_BPER)
ME_IPER_tokens = set(ME_IPER)

{'Sultanah', 'Ordu', 'Bishah', 'Khalis', 'Al Farwaniyah', 'Karabağlar', 'Erzurum', 'Elbistan', 'Al Khankah', 'Erbil', 'Balikesir', 'Hebron', 'Shiraz', 'Aksaray', 'Medina', 'Şanlıurfa', 'Abu Tij', 'Habbouch', 'Talkha', 'Zahedan', 'Isparta', 'Bismil', 'Rosetta', 'Sari', 'Al Basrah al Qadimah', 'Uşak', 'Al Fahahil', 'Samarra', 'Mardin', 'Nasiriyah', 'Al Qatif', 'Kahriz', 'Rafah', 'Bilbeis', 'Antalya', 'Izmit', 'Ismailia', 'Langarud', 'Adapazari', 'Qina', 'Hawalli', 'Zabol', 'Kufa', 'Birjand', 'Bushehr', 'Niğde', 'Akhisar', 'Gebze', 'Ardabil', 'Damietta', 'Hurghada', 'Doha', 'Jidda', 'Nazarabad', 'Bawshar', 'Gemlik', 'Dubai', 'Taiz', 'Al Mahallah al Kubra', 'Al Minya', 'Azadshahr', 'As Samawah', 'Sidon', 'Zagazig', 'Kozan', 'Gaziantep', 'Tyre', 'Nahavand', 'Üsküdarr', 'Ilam', 'Naqadeh', 'Bukan', 'Kilis', 'Tabuk', 'Al-Arish', 'Nevşehir', 'As Salamiyah', 'Al-Hasakah', 'Sur', 'Jounieh', 'Ras Beirut', 'Oman', 'Asyut', 'Port Said', 'Amol', 'Çankaya', 'Belek', 'Ahlat', 'Giresun', 'Aligudarz', 'I

In [6]:
# Reading in the data set

# path to the data files
path_train = "../data/no_overlap_da_news/da_news_train.tsv"
path_dev = "../data/no_overlap_da_news/da_news_dev.tsv"
path_test = "../data/no_overlap_da_news/da_news_test.tsv"

# create mapping
label2id, id2label = mapping(path_train)

# read in the DaN+ data
train_data = read_tsv_file(path_train, label2id)
dev_data = read_tsv_file(path_dev, label2id)
test_data = read_tsv_file(path_test, label2id)

In [7]:
def get_all_entities(data, exclude_label="O"):
    """
    Collects full entity name of locations and organisation from labeled the dataset. 
    If an entity is made up of multiple words, it joins them into a single string.

    Args:
        data (List[Dict]): Dataset with 'tokens' and 'tags' per sentence.
        exclude_label (str): Label to ignore (default is 'O').

    Returns:
        Set[str]: Set of labeled entity strings (e.g., {'Beirut', 'Al Mawsil al Jadidah'})
    """
    grouped_strings = set()

    for sent in data:
        tokens = sent['tokens']
        tags = sent['ner_tags']
        i = 0
        while i < len(tokens):
            tag = tags[i]

            if tag != exclude_label and tag.startswith('B-'):
                span_tokens = [tokens[i]]
                i += 1
                # Collect I- continuation tags
                while i < len(tokens) and tags[i].startswith('I-'):
                    span_tokens.append(tokens[i])
                    i += 1

                entity_string = ' '.join(span_tokens)
                grouped_strings.add(entity_string)
            else:
                i += 1

    return grouped_strings


In [8]:
train_tokens = get_all_entities(train_data)
dev_tokens = get_all_entities(dev_data)
test_tokens = get_all_entities(test_data)

In [9]:
print("overlap LOC train: ", train_tokens & ME_LOC_tokens)
print("overlap LOC dev: ", dev_tokens & ME_LOC_tokens)
print("overlap LOC test: ", test_tokens & ME_LOC_tokens)

print("overlap ORG train: ", train_tokens & ME_ORG_tokens)
print("overlap ORG dev: ", dev_tokens & ME_ORG_tokens)
print("overlap ORG test: ", test_tokens & ME_ORG_tokens)

print("overlap BPER train: ", train_tokens & ME_BPER_tokens)
print("overlap BPER dev: ", dev_tokens & ME_BPER_tokens)
print("overlap BPER test: ", test_tokens & ME_BPER_tokens)

print("overlap IPER train: ", train_tokens & ME_IPER_tokens)
print("overlap IPER dev: ", dev_tokens & ME_IPER_tokens)
print("overlap IPER test: ", test_tokens & ME_IPER_tokens)

overlap LOC train:  {'Bahrain', 'Syrien', 'Oman', 'Bagdad', 'Ankara', 'Irak', 'Abu Dhabi', 'Erzincan', 'Kuwait', 'Luxor', 'Bush'}
overlap LOC dev:  set()
overlap LOC test:  set()
overlap ORG train:  {'CBC'}
overlap ORG dev:  set()
overlap ORG test:  set()
overlap BPER train:  {'Elias', 'Bassam'}
overlap BPER dev:  set()
overlap BPER test:  set()
overlap IPER train:  {'Katie', 'John', 'Allan', 'Masood', 'Elias', 'Kim'}
overlap IPER dev:  set()
overlap IPER test:  set()


In [22]:
print("before ME_BPER_tokens:", len(ME_BPER_tokens))
updated_ME_BPER = list(ME_BPER_tokens - (train_tokens & ME_BPER_tokens) - (dev_tokens & ME_BPER_tokens) - (test_tokens & ME_BPER_tokens))
print("Updated ME_BPER_tokens:", len(updated_ME_BPER))

print("before ME_IPER_tokens:", len(ME_IPER_tokens))
updated_ME_IPER = list(ME_IPER_tokens - (train_tokens & ME_IPER_tokens) - (dev_tokens & ME_IPER_tokens) - (test_tokens & ME_IPER_tokens))
print("Updated ME_IPER_tokens:", len(updated_ME_IPER))

print("before ME_LOC_tokens:", len(ME_LOC_tokens))
updated_ME_LOC = list(ME_LOC_tokens - (train_tokens & ME_LOC_tokens) - (dev_tokens & ME_LOC_tokens) - (test_tokens & ME_LOC_tokens))
print("Updated ME_LOC_tokens:", len(updated_ME_LOC))

print("before ME_ORG_tokens:", len(ME_ORG_tokens))
updated_ME_ORG = list(ME_ORG_tokens - (train_tokens & ME_ORG_tokens) - (dev_tokens & ME_ORG_tokens) - (test_tokens & ME_ORG_tokens))
print("Updated ME_ORG_tokens:", len(updated_ME_ORG))

before ME_BPER_tokens: 735
Updated ME_BPER_tokens: 733
before ME_IPER_tokens: 1580
Updated ME_IPER_tokens: 1574
before ME_LOC_tokens: 480
Updated ME_LOC_tokens: 469
before ME_ORG_tokens: 427
Updated ME_ORG_tokens: 426


In [11]:
# Define the overlap set (tokens from train, dev, and test)
dataset_spans = train_tokens | dev_tokens | test_tokens  # Combine all three sets (train, dev, test)

# Step 3: Remove overlapping entries from ME_LOC
filtered_ME_LOC = []
for item in ME_LOC:
    entity_string = ' '.join(item['tokens'])  # Join tokens into span string
    if entity_string not in dataset_spans:    # Keep only if it's NOT overlapping
        filtered_ME_LOC.append(item)

# Step 4: Optional check
print("Before ME_LOC:", len(ME_LOC))
print("After ME_LOC:", len(filtered_ME_LOC))

Before ME_LOC: 481
After ME_LOC: 470


In [12]:

# Step 3: Remove overlapping entries from ME_LOC
filtered_ME_ORG = []
for item in ME_ORG:
    entity_string = ' '.join(item['tokens'])  # Join tokens into span string
    if entity_string not in dataset_spans:    # Keep only if it's NOT overlapping
        filtered_ME_ORG.append(item)

# Step 4: Optional check
print("Before ME_ORG:", len(ME_ORG))
print("After ME_ORG:", len(filtered_ME_ORG))
print(filtered_ME_ORG)

Before ME_ORG: 427
After ME_ORG: 426
[{'tokens': ['Saudi', 'Aramco'], 'ner_tags': ['B-ORG', 'I-ORG']}, {'tokens': ['Saudi', 'National', 'Bank'], 'ner_tags': ['B-ORG', 'I-ORG', 'I-ORG']}, {'tokens': ['International', 'Holding', 'Company'], 'ner_tags': ['B-ORG', 'I-ORG', 'I-ORG']}, {'tokens': ['QNB', 'Group'], 'ner_tags': ['B-ORG', 'I-ORG']}, {'tokens': ['First', 'Abu', 'Dhabi', 'Bank'], 'ner_tags': ['B-ORG', 'I-ORG', 'I-ORG', 'I-ORG']}, {'tokens': ['Emirates', 'NBD'], 'ner_tags': ['B-ORG', 'I-ORG']}, {'tokens': ['alrajhi', 'bank'], 'ner_tags': ['B-ORG', 'I-ORG']}, {'tokens': ['TAQA', 'Group'], 'ner_tags': ['B-ORG', 'I-ORG']}, {'tokens': ['Saudi', 'Electricity', 'Company'], 'ner_tags': ['B-ORG', 'I-ORG', 'I-ORG']}, {'tokens': ['Kuwait', 'Finance', 'House'], 'ner_tags': ['B-ORG', 'I-ORG', 'I-ORG']}, {'tokens': ['stc', 'Group'], 'ner_tags': ['B-ORG', 'I-ORG']}, {'tokens': ['ADNOC', 'Gas'], 'ner_tags': ['B-ORG', 'I-ORG']}, {'tokens': ['e&'], 'ner_tags': ['B-ORG']}, {'tokens': ['ADCB', 'Grou

In [13]:
#filtered_ME_LOC = collect_entity_strings(filtered_ME_LOC, target_label_prefix = "B-LOC")

#filtered_ME_ORG = collect_entity_strings(filtered_ME_ORG, target_label_prefix = "B-ORG")


In [34]:
#print("overlap loc train: ", train_tokens & filtered_ME_LOC)
#print("overlap loc dev: ", dev_tokens & filtered_ME_LOC)
#print("overlap loc test: ", test_tokens & filtered_ME_LOC)

#print("overlap org train: ", train_tokens & filtered_ME_ORG)
#print("overlap org dev: ", dev_tokens & filtered_ME_ORG)
#print("overlap org test: ", test_tokens & filtered_ME_ORG)

#print("overlap BPER train: ", train_tokens & set(updated_ME_BPER))
#print("overlap BPER dev: ", dev_tokens & set(updated_ME_BPER))
#print("overlap BPER test: ", test_tokens & set(updated_ME_BPER))

#print("overlap IPER train: ", train_tokens & set(updated_ME_IPER))
#print("overlap IPER dev: ", dev_tokens & set(updated_ME_IPER))
#print("overlap IPER test: ", test_tokens & set(updated_ME_IPER))


In [14]:
used_entities = {
    "LOC": set(),
    "ORG": set(),
    "BPER": set(),
    "IPER": set()
}

In [19]:
def data_aug_replace(dataset, sentence_amount, filtered_ME_LOC = filtered_ME_LOC, filtered_ME_ORG = filtered_ME_ORG,
                     updated_ME_BPER = updated_ME_BPER, updated_ME_IPER = updated_ME_IPER, used_entities = used_entities):
    """
    Replaces named entities in a subset of the dataset with new MENAPT ones, ensuring no reuse across datasets.
    """
    eligible_sentences = [sent for sent in dataset if any(tag not in ["O", "B-MISC", "I-MISC"] for tag in sent["ner_tags"])]
    selected_sentences = random.sample(eligible_sentences, min(sentence_amount, len(eligible_sentences)))
    modified_dataset = [dict(sent) for sent in dataset] # Creating copy to not modify original dataset 

    for sent in modified_dataset:
        if sent not in selected_sentences:
            continue

        i = 0
        while i < len(sent["tokens"]):
            tag = sent["ner_tags"][i]

            if tag == 'B-PER':
                available = [set(updated_ME_BPER) - used_entities["B-PER"]]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    used_entities["BPER"].add(replace)
                i += 1

            elif tag == 'I-PER':
                available = [set(updated_ME_BPER) - used_entities["I-PER"]]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    used_entities["IPER"].add(replace)
                i += 1

            elif tag == 'B-LOC':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-LOC":
                    span_len += 1
                    i += 1

                available = [loc for loc in filtered_ME_LOC if tuple(loc["tokens"]) not in used_entities["LOC"] and len(loc["tokens"]) == span_len]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    used_entities["LOC"].add(tuple(replace["tokens"]))

            elif tag == 'B-ORG':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-ORG":
                    span_len += 1
                    i += 1

                available = [org for org in filtered_ME_ORG if tuple(org["tokens"]) not in used_entities["ORG"] and len(org["tokens"]) == span_len]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    used_entities["ORG"].add(tuple(replace["tokens"]))

            else:
                i += 1

    return modified_dataset


In [15]:
# Use above function IMO
def data_aug_replace(dataset, sentence_amount, filtered_ME_LOC = filtered_ME_LOC, filtered_ME_ORG = filtered_ME_ORG,
                     updated_ME_BPER = updated_ME_BPER, updated_ME_IPER = updated_ME_IPER, used_entities = used_entities):
    """
    Replaces named entities in a subset of the dataset with new MENAPT ones, ensuring no reuse across datasets.
    """
    eligible_sentences = [sent for sent in dataset if any(tag not in ["O", "B-MISC", "I-MISC"] for tag in sent["ner_tags"])]
    selected_sentences = random.sample(eligible_sentences, min(sentence_amount, len(eligible_sentences)))
    modified_dataset = [dict(sent) for sent in dataset] # Creating copy to not modify original dataset 

    for sent in modified_dataset:
        if sent not in selected_sentences:
            continue

        i = 0
        while i < len(sent["tokens"]):
            tag = sent["ner_tags"][i]

            if tag == 'B-PER':
                available = [p for p in updated_ME_BPER if p not in used_entities["BPER"]]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    used_entities["BPER"].add(replace)
                i += 1

            elif tag == 'I-PER':
                available = [p for p in updated_ME_IPER if p not in used_entities["IPER"]]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][i] = replace
                    used_entities["IPER"].add(replace)
                i += 1

            elif tag == 'B-LOC':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-LOC":
                    span_len += 1
                    i += 1

                available = [loc for loc in filtered_ME_LOC if tuple(loc["tokens"]) not in used_entities["LOC"] and len(loc["tokens"]) == span_len]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    used_entities["LOC"].add(tuple(replace["tokens"]))

            elif tag == 'B-ORG':
                span_start = i
                span_len = 1
                i += 1
                while i < len(sent["ner_tags"]) and sent["ner_tags"][i] == "I-ORG":
                    span_len += 1
                    i += 1

                available = [org for org in filtered_ME_ORG if tuple(org["tokens"]) not in used_entities["ORG"] and len(org["tokens"]) == span_len]
                if available:
                    replace = random.choice(available)
                    sent["tokens"][span_start:span_start + span_len] = replace["tokens"]
                    used_entities["ORG"].add(tuple(replace["tokens"]))

            else:
                i += 1

    return modified_dataset


In [20]:
for sent in dev_data[:40]:
    print(sent)

{'tokens': ['Når', 'forløbet', 'i', 'kommunalt', 'regi', 'slutter', 'til', 'december', ',', 'håber', 'hun', ',', 'at', 'hendes', 'og', 'mandens', 'firma', 'er', 'vokset', 'stort', 'nok', 'til', ',', 'at', 'hun', 'kan', 'blive', 'ansat', 'på', 'fuld', 'tid', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}
{'tokens': ['-', 'Få', 'den', 'UD', 'AF', 'MIN', 'KANO', '!', 'siger', 'han', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}
{'tokens': ['Navnlig', ',', 'hvis', 'forhandleren', 'giver', 'et', 'godt', 'bud', 'på', 'den', 'brugte', 'bil', '-', 'og', 'lokker', 'med', 'en', 'fiks', 'finansierings-model', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [17]:
ME_dev = data_aug_replace(dev_data[:40], 40)
for sent in ME_dev: 
    print(sent)

{'tokens': ['Når', 'forløbet', 'i', 'kommunalt', 'regi', 'slutter', 'til', 'december', ',', 'håber', 'hun', ',', 'at', 'hendes', 'og', 'mandens', 'firma', 'er', 'vokset', 'stort', 'nok', 'til', ',', 'at', 'hun', 'kan', 'blive', 'ansat', 'på', 'fuld', 'tid', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}
{'tokens': ['-', 'Få', 'den', 'UD', 'AF', 'MIN', 'KANO', '!', 'siger', 'han', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}
{'tokens': ['Navnlig', ',', 'hvis', 'forhandleren', 'giver', 'et', 'godt', 'bud', 'på', 'den', 'brugte', 'bil', '-', 'og', 'lokker', 'med', 'en', 'fiks', 'finansierings-model', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'

In [21]:
ME_dev = data_aug_replace(dev_data, 40) 
for sent in ME_dev: 
    print(sent)

{'tokens': ['Når', 'forløbet', 'i', 'kommunalt', 'regi', 'slutter', 'til', 'december', ',', 'håber', 'hun', ',', 'at', 'hendes', 'og', 'mandens', 'firma', 'er', 'vokset', 'stort', 'nok', 'til', ',', 'at', 'hun', 'kan', 'blive', 'ansat', 'på', 'fuld', 'tid', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}
{'tokens': ['-', 'Få', 'den', 'UD', 'AF', 'MIN', 'KANO', '!', 'siger', 'han', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'], 'tag_ids': [2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]}
{'tokens': ['Navnlig', ',', 'hvis', 'forhandleren', 'giver', 'et', 'godt', 'bud', 'på', 'den', 'brugte', 'bil', '-', 'og', 'lokker', 'med', 'en', 'fiks', 'finansierings-model', '.'], 'ner_tags': ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'