In [None]:
import os
import re
import spacy
import pandas as pd

# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from unicodedata import normalize
import regex
import random

# Load and process data from Coda

In [None]:
FILE_PATH = "../assets/NER_assets/Ancient_Words.csv"

# Read the keyword table and rename its columns to the names used by the rest of the notebook.
df = pd.read_csv(FILE_PATH)
df = df.rename(columns={'Word': 'Keyword', 'Category Types': 'Label'})

# If a cell is empty (NaN), fill it with the value in its parallel "Early" column.
# The result is assigned back to the column: calling fillna(..., inplace=True) on a
# column selection silently does nothing when pandas hands back a copy.
EARLY_FALLBACKS = {
    'Quote': 'Early Quote',
    'Word Before': 'Early Word Before',
    'Word After': 'Early Word After',
    'Label': 'Early Category Type',
}
for target_column, early_column in EARLY_FALLBACKS.items():
    df[target_column] = df[target_column].fillna(df[early_column])

# Remove rows with no Keyword.
df = df.dropna(subset=['Keyword'])

# Drop every row whose Keyword contains Arabic letters, i.e. keep only the Greek rows.
pat = '[ء-ي]+'
df = df[~df.Keyword.str.contains(pat, na=False)].copy()

# Remove line breaks from the keywords.
df['Keyword'] = df['Keyword'].replace('\n', '', regex=True)
# Remove digits from every column.
df = df.replace(r'\d+', '', regex=True)
# Remove hyphens from every column.
df = df.replace('-', '', regex=True)
# Remove commas, periods and interpuncts from the keywords.
df['Keyword'] = df['Keyword'].replace(',', '', regex=True)
df['Keyword'] = df['Keyword'].replace(r'\.', '', regex=True)
df['Keyword'] = df['Keyword'].replace('·', '', regex=True)
# Collapse runs of spaces in every column.
df = df.replace(' +', ' ', regex=True)
# Strip trailing whitespace from the keywords.
df['Keyword'] = df['Keyword'].replace(r'\s+$', '', regex=True)

# Remaining NaN cells are left as NaN on purpose: the original `df.fillna(0)` discarded
# its result, and the normalisation cell that follows skips float (NaN) cells explicitly.
df = df.reset_index(drop=True)


In [None]:
# Display the cleaned keyword table.
df

In [None]:
# Unicode-normalise the text columns of the table.
# Float cells (i.e. NaN) are passed through untouched; everything else is cast to str first.
FORMAT = 'NFKD'
for col in ['Keyword', 'Quote', 'Word Before', 'Word After']:
    df[col] = df[col].apply(
        lambda cell: cell if isinstance(cell, float) else normalize(FORMAT, str(cell))
    )

## Fix similar sentences

In [None]:
# Group rows that share exactly the same Quote. For each quote, collect one
# [label, keyword, word before, word after] entry per row of the group.
entities_list = []
for name, group in df.groupby('Quote'):
    entities = [
        [label, keyword, word_before, word_after]
        for label, keyword, word_before, word_after in zip(
            group['Label'], group['Keyword'], group['Word Before'], group['Word After']
        )
    ]
    entities_list.append({'Quote': name, 'entities': entities})

# Build the grouped frame once; naming the columns keeps the frame well-formed
# (it still has 'Quote' and 'entities') even when no group was produced.
df_grouped = pd.DataFrame(entities_list, columns=['Quote', 'entities'])

# Display the grouped frame.
df_grouped

In [None]:
# After grouping identical sentences, cluster quotes that are *nearly* identical.
# A quote joins a cluster when the cluster's first quote can be found inside it with
# at most MAX_QUOTE_ERRORS edits (fuzzy matching of the `regex` module).
# NOTE(review): with a budget of 10 edits, a quote of 10 characters or fewer fuzzy-matches
# any text; confirm that no such short quotes exist in the data.
MAX_QUOTE_ERRORS = 10

checked = set()        # indices already assigned to a cluster
similar_indices = []   # one list of df_grouped indices per cluster

for i, row in df_grouped.iterrows():
    # skip sentences that already belong to a cluster
    if i in checked:
        continue
    checked.add(i)
    similar_indices.append([i])

    # The quote is escaped so that it is matched literally, then wrapped in a fuzzy group.
    # Previously this pattern was built but the raw, unescaped quote was searched instead.
    pattern = regex.compile(fr"(?:{re.escape(row['Quote'])}){{e<={MAX_QUOTE_ERRORS}}}")

    for j, row2 in df_grouped.iterrows():
        if j in checked:
            continue
        # if the sentences are similar, put them in the same cluster
        if pattern.search(row2['Quote']):
            similar_indices[-1].append(j)
            checked.add(j)

In [None]:
print (similar_indices)
# Choose a random cluster that has more than one member and print its sentences
# together with their entities. random.choice raises on an empty list, so the
# case "no cluster has more than one member" is handled explicitly.
multi_member_groups = [candidate for candidate in similar_indices if len(candidate) > 1]
if multi_member_groups:
    group = random.choice(multi_member_groups)
    print (group)
    for index in group:
        print (df_grouped.loc[index, 'Quote'])
        # print entities of each sentence in the group
        for entity in df_grouped.loc[index, 'entities']:
            print (entity)
else:
    print ("No group of similar sentences has more than one member")

In [None]:
# Collapse every cluster of similar sentences into one row: the longest quote of the
# cluster is kept as the text, and the entities of all members are concatenated.
# A cluster with a single member therefore yields that member's quote and entities.
# Rows are collected in a list and turned into a frame once, instead of growing a
# DataFrame with pd.concat inside the loop.
merged_rows = []
for indices in similar_indices:
    longest = max(indices, key=lambda idx: len(df_grouped.loc[idx, 'Quote']))
    entities = []
    for index in indices:
        entities.extend(df_grouped.loc[index, 'entities'])
    merged_rows.append({'Quote': df_grouped.loc[longest, 'Quote'], 'entities': entities})

df_grouped_regexed = pd.DataFrame(merged_rows, columns=['Quote', 'entities'])

In [None]:
# Show up to 10 random rows of df_grouped_regexed (sample(10) raises on a smaller frame).
df_grouped_regexed.sample(min(10, len(df_grouped_regexed)))


In [None]:
# Number of rows in df_grouped at this point of the notebook.
len(df_grouped)

In [None]:
# From here on df_grouped refers to the merged frame (one row per group of similar sentences).
df_grouped=df_grouped_regexed

In [None]:
# Print the number of rows and show up to 10 random rows of df_grouped.
# The sample is the last expression of the cell so that it is actually displayed.
print(len(df_grouped))
df_grouped.sample(min(10, len(df_grouped)))

## Fix similar words

In [None]:
# Maps the code points of three combining marks (acute accent, comma above,
# reversed comma above) to None.
# NOTE(review): `d` is not referenced anywhere else in the visible part of this notebook;
# presumably it was meant as a str.translate table for stripping these marks — confirm or remove.
d = {ord('\N{COMBINING ACUTE ACCENT}'):None, ord('\N{COMBINING COMMA ABOVE}'):None, ord('\N{COMBINING REVERSED COMMA ABOVE}'):None}

def find_word_index(sentence, word, word_before, word_after, clean_chars=True):
    """Locate ``word`` inside ``sentence``, using its neighbouring text as context.

    The word may occur several times in the sentence; the text before and after it
    selects the intended occurrence. Three fuzzy searches are tried in order and the
    first one that matches wins:

    1. ``word_before + word + word_after`` with up to 3 errors,
    2. ``word_before + word`` with up to 2 errors,
    3. ``word + word_after`` with up to 2 errors.

    :param sentence: text to search in
    :param word: the keyword to locate (matched literally, apart from the fuzzy errors)
    :param word_before: text expected directly before the word; non-string values
        (e.g. NaN from an empty table cell) are treated as "no context"
    :param word_after: text expected directly after the word; same handling as ``word_before``
    :param clean_chars: if True, cut the context at the punctuation marks ``)``, ``.``,
        ``·`` and ``,`` so that only the part adjacent to the word is kept
    :returns: ``(start, end)`` offsets of the word in ``sentence``, or ``(None, None)``
        when none of the three searches matches
    """
    # Empty table cells arrive as float NaN; treat anything that is not a string as no context.
    if not isinstance(word_before, str):
        word_before = ""
    if not isinstance(word_after, str):
        word_after = ""

    if clean_chars:
        for char in [")", ".", "·", ',']:
            # Keep only what follows the LAST occurrence in word_before, so the kept
            # context no longer contains the mark at all.
            cut = word_before.rfind(char)
            if cut != -1:
                word_before = word_before[cut + 1:]
                print ("word before: ", word_before)
            # Keep only what precedes the first occurrence in word_after.
            cut = word_after.find(char)
            if cut != -1:
                word_after = word_after[:cut]
                print ("word after: ", word_after)

    # Every piece is escaped so that regex metacharacters in the data are matched literally.
    before = re.escape(word_before)
    target = re.escape(word)
    after = re.escape(word_after)

    # (log prefix, pattern with the word as group 1, maximum number of errors)
    attempts = [
        ("match 3: ", fr"{before}({target}){after}", 3),
        ("match with before: ", fr"{before}({target})", 2),
        ("match with after: ", fr"({target}){after}", 2),
    ]
    for log_prefix, pattern, max_errors in attempts:
        match = regex.search(fr"(?:{pattern}){{e<={max_errors}}}", sentence)
        if match:
            print (log_prefix, match.span(1))
            return match.span(1)

    # no search matched
    print ("\nNo match found for", word, "in", sentence, "\nwith word_before", word_before, "and word_after", word_after)
    return None, None

#test new functions

def remove_unwanted_chars(text):
    """Return ``text`` without the characters ``)``, ``.``, ``·``, ``,`` and ``’``.

    A message is printed for every one of these characters that occurs in the text.
    """
    cleaned = text
    for unwanted in (")", ".", "·", ",", "’"):
        if unwanted not in cleaned:
            continue
        print(f"Removing character '{unwanted}' from text")
        cleaned = cleaned.replace(unwanted, "")
    return cleaned

def remove_line_breaks(text):
    """Replace line breaks with spaces and collapse all whitespace runs to single spaces.

    A message is printed for each kind of line break (``\\n``, ``\\r``) found in the text.
    Leading and trailing whitespace is removed as a side effect of the collapsing.
    """
    flattened = text
    for line_break in ("\n", "\r"):
        if line_break not in flattened:
            continue
        print(f"Removing line break '{line_break}' from text")
        flattened = flattened.replace(line_break, " ")
    words = flattened.split()
    return ' '.join(words)


In [None]:
def find_word_index_test(sentence, word, word_before, word_after):
    """Locate ``word`` inside ``sentence``, using its neighbouring text as context.

    Experimental variant of the lookup: the context strings are first passed through
    ``remove_unwanted_chars`` and ``remove_line_breaks``, then three fuzzy searches are
    tried in order (before+word+after with 3 errors, before+word with 2 errors,
    word+after with 2 errors).

    :param sentence: text to search in
    :param word: the keyword to locate (matched literally, apart from the fuzzy errors)
    :param word_before: text expected before the word; non-string values (e.g. NaN) count as empty
    :param word_after: text expected after the word; non-string values (e.g. NaN) count as empty
    :returns: ``(start, end)`` offsets of the word in ``sentence``, or ``(None, None)``
        when none of the three searches matches
    """
    # Empty table cells arrive as float NaN; the cleaning helpers only accept strings.
    if not isinstance(word_before, str):
        word_before = ""
    if not isinstance(word_after, str):
        word_after = ""

    # Remove unwanted characters and line breaks from the context
    word_before = remove_line_breaks(remove_unwanted_chars(word_before))
    word_after = remove_line_breaks(remove_unwanted_chars(word_after))

    # Every piece is escaped so that regex metacharacters in the data are matched literally.
    before = re.escape(word_before)
    target = re.escape(word)
    after = re.escape(word_after)

    # (pattern with the word as group 1, maximum number of errors)
    attempts = [
        (fr"{before}({target}){after}", 3),
        (fr"{before}({target})", 2),
        (fr"({target}){after}", 2),
    ]
    for pattern, max_mistakes in attempts:
        match = fuzzy_search(pattern, sentence, max_mistakes=max_mistakes)
        if match:
            return match.span(1)

    # no search matched
    print(f"No match found for '{word}' in '{sentence}' with word_before '{word_before}' and word_after '{word_after}'")
    return None, None

def remove_unwanted_chars(text):
    """Delete the punctuation marks ``)``, ``.``, ``·``, ``,`` and ``’`` from ``text``.

    Prints one message per mark that is present. Returns the cleaned string.
    NOTE(review): a function with this name is also defined in an earlier cell of this notebook.
    """
    result = text
    for mark in [")", ".", "·", ",", "’"]:
        if mark in result:
            print(f"Removing character '{mark}' from text")
            result = result.replace(mark, "")
    return result

def remove_line_breaks(text):
    """Turn line breaks into spaces and squeeze every whitespace run into one space.

    Prints one message per kind of line break (``\\n``, ``\\r``) present in the text.
    NOTE(review): a function with this name is also defined in an earlier cell of this notebook.
    """
    result = text
    for newline_char in ["\n", "\r"]:
        if newline_char in result:
            print(f"Removing line break '{newline_char}' from text")
            result = result.replace(newline_char, " ")
    pieces = result.split()
    return ' '.join(pieces)

def fuzzy_search(pattern, text, max_mistakes):
    """Search ``text`` for ``pattern``, allowing up to ``max_mistakes`` errors.

    The pattern is wrapped in a non-capturing group carrying the fuzzy constraint
    ``{e<=max_mistakes}`` of the ``regex`` module. Returns the result of ``regex.search``.
    """
    fuzzy_pattern = "(?:{0}){{e<={1}}}".format(pattern, max_mistakes)
    return regex.search(fuzzy_pattern, text)

In [None]:
nlp = spacy.load("grc_proiel_trf") # loads the transformer (trf) pipeline; the earlier note about using a small model no longer matched this line

In [None]:
# Build the training data from the grouped table.
# Each row of df_grouped holds a quote and its entities as [label, word, word before, word after];
# here every entity is located in the quote and stored as [start, end, label].
TRAIN_DATA = []       # list of (quote, {'entities': [[start, end, label], ...]}) tuples
not_found_list = []   # [word, quote] pairs whose word could not be located in the quote
not_found = 0

for _, row in df_grouped.iterrows():
    quote = row['Quote']
    new_entities = []
    for label, word, word_before, word_after in row['entities']:
        # find the character span of the word in the sentence
        start, end = find_word_index(quote, word, word_before, word_after, clean_chars=True)
        if start is None:
            not_found += 1
            not_found_list.append([word, quote])
        else:
            new_entities.append([start, end, label])
    TRAIN_DATA.append((quote, {'entities': new_entities}))

# Summary, printed once after the loop (it used to be printed, together with the whole
# not-found list, on every iteration).
print(len(TRAIN_DATA), sum(len(annotations['entities']) for _, annotations in TRAIN_DATA))
print("not found: ", not_found)
    



In [None]:
# Display the [word, quote] pairs that could not be located.
not_found_list

In [None]:
# Show up to 10 random items of TRAIN_DATA (random.sample raises when asked for more
# items than the list holds).
random.sample(TRAIN_DATA, min(10, len(TRAIN_DATA)))

# Add annotations from INCEpTION

In [None]:
# extract all files in inception folder to temp folder
from cassis import *
import zipfile
import tempfile
import os
from spacy.training import Example

In [None]:
INCEPTION_DIR = "../assets/NER_assets/INCEpTION_files/"
SENTENCE_TYPE = "de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"
CATEGORY_TYPE = 'webanno.custom.CategoryType'


def _clean_inception_text(raw_text):
    """Replace the Greek psili (᾿) with ’ and collapse every whitespace run to one space."""
    return ' '.join(raw_text.replace("᾿", "’").split())


def _cleaned_offset(raw_sentence, raw_offset, keep_separator):
    """Translate an offset in the raw sentence into an offset in the cleaned, normalised sentence.

    The prefix ``raw_sentence[:raw_offset]`` is cleaned and normalised exactly like the
    sentence itself, and its length is the new offset. With ``keep_separator=True`` (used
    for a token's begin) the single space that separates the prefix from the token is
    counted, but only if the raw prefix really ends with whitespace and is not
    whitespace-only. Adding that space unconditionally shifted the begin offset by one
    for tokens directly attached to the preceding character.
    """
    raw_prefix = raw_sentence[:raw_offset]
    cleaned_prefix = _clean_inception_text(raw_prefix)
    if keep_separator and cleaned_prefix and raw_prefix[-1].isspace():
        cleaned_prefix += ' '
    return len(normalize(FORMAT, cleaned_prefix))


# list of (sentence text, {'entities': [(start, end, label), ...]}) tuples
INCEPTION_TRAIN_DATA = []
with tempfile.TemporaryDirectory() as tempdir:
    # Extract every exported archive into one temporary folder.
    # NOTE(review): files with the same name in different archives (e.g. TypeSystem.xml)
    # overwrite each other here — confirm that all archives share one type system.
    for archive_name in os.listdir(INCEPTION_DIR):
        if archive_name.endswith(".zip"):
            with zipfile.ZipFile(os.path.join(INCEPTION_DIR, archive_name), 'r') as zip_ref:
                zip_ref.extractall(tempdir)

    # load the type system shared by the xmi files
    with open(os.path.join(tempdir, "TypeSystem.xml"), 'rb') as typesystem_file:
        typesystem = load_typesystem(typesystem_file)

    # iterate over all xmi files in the temporary folder
    for file_name in os.listdir(tempdir):
        if not file_name.endswith(".xmi"):
            continue
        with open(os.path.join(tempdir, file_name), 'rb') as xmi_file:
            print(xmi_file.name)
            cas = load_cas_from_xmi(xmi_file, typesystem=typesystem)

            for sentence in cas.select(SENTENCE_TYPE):
                raw_sentence = sentence.get_covered_text()
                # cleaned + normalised sentence text; entity offsets below refer to this string
                sentence_text = normalize(FORMAT, _clean_inception_text(raw_sentence))

                ents = []
                for token in cas.select_covered(CATEGORY_TYPE, sentence):
                    # token offsets are document-based; make them sentence-based first
                    token_begin = _cleaned_offset(raw_sentence, token.begin - sentence.begin, keep_separator=True)
                    token_end = _cleaned_offset(raw_sentence, token.end - sentence.begin, keep_separator=False)
                    ents.append((token_begin, token_end, token.value("Value")))

                INCEPTION_TRAIN_DATA.append((sentence_text, {'entities': ents}))

In [None]:
# Wrap INCEPTION_TRAIN_DATA in a DataFrame and display it (nothing is written to disk here).
df_INCEPTION_TRAIN_DATA = pd.DataFrame(INCEPTION_TRAIN_DATA)
df_INCEPTION_TRAIN_DATA

In [None]:
# Display the raw list of (text, annotations) tuples.
INCEPTION_TRAIN_DATA

In [None]:
# For every sentence print the text, then each entity tuple next to the text it covers.
for sentence_text, annotations in INCEPTION_TRAIN_DATA:
    print (sentence_text + "\n")
    for entity in annotations['entities']:
        begin, end = entity[0], entity[1]
        print (entity, sentence_text[begin:end])

In [None]:
# For every sentence print the text, then each covered entity text with its label.
for sentence_text, annotations in INCEPTION_TRAIN_DATA:
    print (sentence_text + "\n")
    for entity in annotations['entities']:
        begin, end, label = entity[0], entity[1], entity[2]
        print (sentence_text[begin:end], label)

In [None]:
# Pick one random item of INCEPTION_TRAIN_DATA and print, for each of its entities,
# the covered text (text[start:end]) and the label.
t = random.choice(INCEPTION_TRAIN_DATA)
sample_text = t[0]
sample_entities = t[1]['entities']
print (sample_text + "\n")
for entity in sample_entities:
    begin, end, label = entity[0], entity[1], entity[2]
    print (sample_text[begin:end], label)

In [None]:
# Concatenate the table-based examples with the INCEpTION examples (table examples first).
MERGED_TRAIN_DATA = TRAIN_DATA + INCEPTION_TRAIN_DATA

In [None]:
# Turn every (text, annotations) pair into a spaCy Doc whose ents are the annotated spans.
docs = []
losses = {}  # NOTE(review): not used in the visible part of this notebook
for text, annot in MERGED_TRAIN_DATA:
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # Skip entities without a usable label: None, or float NaN
        # (NaN is the only value that is not equal to itself).
        if label is None or label != label:
            continue
        span = doc.char_span(start, end, label=label, alignment_mode="expand")
        # char_span returns None when the offsets cannot be turned into a span;
        # a None entry would make filter_spans fail, so such entities are skipped.
        if span is None:
            print ("Skipping entity that could not be aligned:", repr(text[start:end]), label)
            continue
        ents.append(span)
    # remove overlapping entities (each spaCy token may only be part of one entity)
    doc.ents = spacy.util.filter_spans(ents)
    docs.append(doc)

# number of docs and total number of entities kept
print(len(docs), sum(len(doc.ents) for doc in docs))



In [None]:
# Spot-check one random document: compare the entities spaCy kept with the source annotations.
# docs[i] was built from MERGED_TRAIN_DATA[i], so the position identifies the source row
# directly. Looking the row up by text could pick the wrong row when two rows share
# the same text.
line = random.randrange(len(docs))
doc = docs[line]
source_text, source_annotations = MERGED_TRAIN_DATA[line]

print ("Spacy entities, for line:")
print (doc.text)
print([(ent.text, ent.label_) for ent in doc.ents])

print ("\nTable entities, for same line:")
for entity in source_annotations['entities']:
    print (source_text[entity[0]:entity[1]], entity[2])

# report annotation labels that do not occur on any entity of the spaCy doc
doc_labels = {ent.label_ for ent in doc.ents}
for entity in source_annotations['entities']:
    if entity[2] not in doc_labels:
        print ("Entity not found in spacy doc: ", source_text[entity[0]:entity[1]], entity[2])

In [None]:
from spacy.tokens import DocBin

# Serialise all docs into one DocBin file whose name carries the normalisation form.
merged_docbin_path = '../corpus/Merged_NER_dataset_{0}.spacy'.format(FORMAT)
NER_DocBin = DocBin(docs=docs)
NER_DocBin.to_disk(merged_docbin_path)

In [None]:
from sklearn.model_selection import train_test_split
from pathlib import Path
from spacy.tokens import DocBin


def split_docs(docs, format, random_state=None, corpus_dir="../corpus"):
    """Randomly split ``docs`` into train, dev and test sets and save each as a DocBin.

    20% of the docs go to the test set; 20% of the remainder go to the dev set.
    The files are written to ``<corpus_dir>/<split>/<split>_ner_<format>.spacy``.

    :param docs: sequence of spaCy Doc objects
    :param format: tag used in the file names (the notebook passes the normalisation form)
    :param random_state: seed handed to both ``train_test_split`` calls; ``None`` (default)
        keeps the previous behaviour of a different split on every call
    :param corpus_dir: root folder for the output; defaults to the previous hard-coded path
    """
    # Split the docs into train, test, and dev sets
    train_docs, test_docs = train_test_split(docs, test_size=0.2, random_state=random_state)
    train_docs, dev_docs = train_test_split(train_docs, test_size=0.2, random_state=random_state)

    # Create one folder per set and save the set into it
    corpus_root = Path(corpus_dir)
    for split_name, split in (("train", train_docs), ("test", test_docs), ("dev", dev_docs)):
        split_dir = corpus_root / split_name
        split_dir.mkdir(parents=True, exist_ok=True)
        DocBin(docs=split).to_disk(split_dir / "{0}_ner_{1}.spacy".format(split_name, format))

    # Print the number of docs in each set
    print("Train: ", len(train_docs))
    print("Test: ", len(test_docs))
    print("Dev: ", len(dev_docs))

## Equally distribute examples between train, dev and test data objects.
We need to properly represent all named entities in each set. We use a stratification algorithm twice: first we split the data into train and test sets, and then we split that test set into dev and test sets.

In [None]:
import spacy
from spacy.tokens import Doc, DocBin, Span
# Load the pipeline again; the same model name is already loaded in an earlier cell.
# NOTE(review): presumably repeated so this section can be run on its own — confirm.
nlp = spacy.load("grc_proiel_trf")

In [None]:
# Normalisation form used in the corpus file names of the cells that follow.
# NOTE(review): the earlier cells set FORMAT = 'NFKD' and save Merged_NER_dataset_NFKD.spacy;
# with 'NFKC' here the next cell loads a file that this run of the notebook did not write —
# confirm that the NFKC file exists and that the switch is intended.
FORMAT = 'NFKC'

In [None]:
# Read the merged DocBin for the current FORMAT and materialise its docs as a list.
merged_docbin_path = "../corpus/Merged_NER_dataset_{0}.spacy".format(FORMAT)
merged_docbin = DocBin().from_disk(merged_docbin_path)
merged_docbin_docs = [doc for doc in merged_docbin.get_docs(nlp.vocab)]

In [None]:
# Print every entity of every document: the document, the entity text and its label.
for doc in merged_docbin_docs:
    for ent in doc.ents:
        print (doc, ent.text, ent.label_)

In [None]:
# A method to split the multilabel data into two sets

from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit
from sklearn.utils import indexable, _safe_indexing
from sklearn.utils.validation import _num_samples
from sklearn.model_selection._split import _validate_shuffle_split
from itertools import chain

def multilabel_train_test_split(*arrays,
                                test_size=None,
                                train_size=None,
                                random_state=None,
                                shuffle=True,
                                stratify=None):
    """
    Train test split for multilabel classification. Uses the algorithm from:
    'Sechidis K., Tsoumakas G., Vlahavas I. (2011) On the Stratification of Multi-Label Data'.

    :param arrays: indexable sequences of equal length to split
    :param test_size: size of the test part, as accepted by sklearn's train_test_split
    :param train_size: size of the train part, as accepted by sklearn's train_test_split
    :param random_state: seed of the split. In the stratified case ``None`` falls back to the
        seed 123 that used to be hard-coded, so existing calls give the same split as before;
        an explicit value is now honoured (it used to be ignored).
    :param shuffle: must be True when ``stratify`` is given
    :param stratify: label matrix with one row per sample; if None, this is a plain
        ``train_test_split``
    :returns: list ``[a_train, a_test, b_train, b_test, ...]`` for the given arrays
    """
    if stratify is None:
        return train_test_split(*arrays, test_size=test_size,train_size=train_size,
                                random_state=random_state, stratify=None, shuffle=shuffle)

    assert shuffle, "Stratified train/test split is not implemented for shuffle=False"

    arrays = indexable(*arrays)
    n_samples = _num_samples(arrays[0])
    # turn the requested sizes into absolute sample counts
    n_train, n_test = _validate_shuffle_split(
        n_samples, test_size, train_size, default_test_size=0.25
    )
    seed = 123 if random_state is None else random_state
    cv = MultilabelStratifiedShuffleSplit(test_size=n_test, train_size=n_train, random_state=seed)
    train, test = next(cv.split(X=arrays[0], y=stratify))

    # interleave the train and test part of every input array
    return list(
        chain.from_iterable(
            (_safe_indexing(a, train), _safe_indexing(a, test)) for a in arrays
        )
    )

### 1st Split: train and test

In [None]:
import numpy as np
# Build the label matrix used to stratify the split: one row per document, one column
# per entity label, 1 where the document contains at least one entity with that label.
# It is derived from doc.ents directly. Reinterpreting a flat list of per-token attribute
# values as a (documents x labels) buffer does not give one row per document.
label_names = sorted({ent.label_ for doc in merged_docbin_docs for ent in doc.ents})
label_columns = {label_name: column for column, label_name in enumerate(label_names)}

labels_nd_array = np.zeros((len(merged_docbin_docs), len(label_names)), dtype=int)
for doc_index, doc in enumerate(merged_docbin_docs):
    for ent in doc.ents:
        labels_nd_array[doc_index, label_columns[ent.label_]] = 1
labels_nd_array.shape

In [None]:
# First split: 80% train, 20% test, stratified by the label matrix built in the previous cell.
X_train, X_test = multilabel_train_test_split(merged_docbin_docs ,stratify=labels_nd_array, test_size=0.20)

In [None]:
# Wrap both parts of the first split in DocBin objects (kept in memory; nothing is saved here).
X_train_docbin = DocBin(docs=X_train)
X_test_docbin = DocBin(docs=X_test)

In [None]:
# Display the repr of the train DocBin.
X_train_docbin

### 2nd split: dev and test

In [None]:
# Build the label matrix for the second split: one row per document of X_test, one column
# per entity label, 1 where the document contains at least one entity with that label.
# It is derived from doc.ents directly. Reinterpreting a flat list of per-token attribute
# values as a (documents x labels) buffer does not give one row per document.
X_label_names = sorted({ent.label_ for doc in X_test for ent in doc.ents})
X_label_columns = {label_name: column for column, label_name in enumerate(X_label_names)}

X_labels_nd_array = np.zeros((len(X_test), len(X_label_names)), dtype=int)
for doc_index, doc in enumerate(X_test):
    for ent in doc.ents:
        X_labels_nd_array[doc_index, X_label_columns[ent.label_]] = 1
X_labels_nd_array.shape

In [None]:
# Second split: halve the test part into dev and test, stratified by X_labels_nd_array.
# NOTE(review): X_test is rebound here, so re-running this cell splits the already
# halved test set again; re-run the first split before repeating this cell.
X_dev, X_test = multilabel_train_test_split(X_test ,stratify=X_labels_nd_array, test_size=0.50)

In [None]:
# Wrap the dev docs in a DocBin (kept in memory; nothing is saved here).
X_dev_docbin = DocBin(docs=X_dev)

### Save new sets and validate the distribution

In [None]:
# Save the train, dev and test sets to disk, one DocBin file per set.
train_path = "../corpus/train/ner_train/ner_train_{0}.spacy".format(FORMAT)
dev_path = "../corpus/dev/ner_dev/ner_dev_{0}.spacy".format(FORMAT)
test_path = "../corpus/test/ner_test/ner_test_{0}.spacy".format(FORMAT)

# Create the target folders first: nothing else in this notebook creates the
# ner_train / ner_dev / ner_test folders, so writing fails if they are missing.
for split_path in (train_path, dev_path, test_path):
    os.makedirs(os.path.dirname(split_path), exist_ok=True)

train_docbin = DocBin(docs=X_train)
train_docbin.to_disk(train_path)
dev_docbin = DocBin(docs=X_dev)
dev_docbin.to_disk(dev_path)
test_docbin = DocBin(docs=X_test)
test_docbin.to_disk(test_path)


In [None]:
# Flat list of entity labels per set (one item per entity, duplicates kept).
train_data_labels = [ent.label_ for doc in X_train for ent in doc.ents]
test_data_labels = [ent.label_ for doc in X_test for ent in doc.ents]
dev_data_labels = [ent.label_ for doc in X_dev for ent in doc.ents]


In [None]:
# visualize the distribution of the labels in the train, test and dev data
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
sns.set(style="darkgrid")

# one single-column frame of entity labels per set
train_label_list = [ent.label_ for doc in X_train for ent in doc.ents]
test_label_list = [ent.label_ for doc in X_test for ent in doc.ents]
dev_label_list = [ent.label_ for doc in X_dev for ent in doc.ents]
train_data_labels = pd.DataFrame(train_label_list, columns=['labels'])
test_data_labels = pd.DataFrame(test_label_list, columns=['labels'])
dev_data_labels = pd.DataFrame(dev_label_list, columns=['labels'])

# one count plot per set, each in its own figure
plt.figure(figsize=(20,10))
sns.countplot(data=train_data_labels, x='labels')
plt.title("Distribution of labels in the train data")

plt.figure(figsize=(20,10))
sns.countplot(data=test_data_labels, x='labels')
plt.title("Distribution of labels in the test data")

plt.figure(figsize=(20,10))
sns.countplot(data=dev_data_labels, x='labels')
plt.title("Distribution of labels in the dev data")

In [None]:
# check distribution of classes in train and test set with respect to each other
import pandas as pd
# One frame per set with the entity labels and the name of the set they come from.
train_data_labels = pd.DataFrame(
    [ent.label_ for doc in X_train for ent in doc.ents], columns=['labels']
).assign(dataset='train')
test_data_labels = pd.DataFrame(
    [ent.label_ for doc in X_test for ent in doc.ents], columns=['labels']
).assign(dataset='test')
dev_data_labels = pd.DataFrame(
    [ent.label_ for doc in X_dev for ent in doc.ents], columns=['labels']
).assign(dataset='dev')

# Stack the three frames (train, test, dev) and plot the label counts side by side per set.
all_data_labels = pd.concat([train_data_labels, test_data_labels, dev_data_labels], ignore_index=True)
plt.figure(figsize=(20,10))
sns.countplot(data=all_data_labels, x='labels', hue='dataset')



In [None]:
# Visualize the frequency of annotation labels in the X_train set.
# The column names are set explicitly with rename_axis / reset_index(name=...): the names
# produced by value_counts().reset_index() differ between pandas versions, so renaming
# 'index' and 'labels' afterwards only works on some of them.
train_data_labels = (
    pd.Series([ent.label_ for doc in X_train for ent in doc.ents], name='labels')
    .value_counts()
    .rename_axis('label')
    .reset_index(name='frequency')
)
plt.figure(figsize=(20,10))
sns.barplot(data=train_data_labels, x='label', y='frequency')
plt.title("Frequency of labels in the train data")
plt.xticks(rotation=90)

In [None]:
# Number of documents in each set.
print ("test:", len(X_test), "dev:", len(X_dev), "train:", len(X_train))


In [None]:
# Show the distribution in numbers: count of entities per (label, dataset) pair.
grouped_labels = all_data_labels.groupby(['labels', 'dataset']).size()
print(grouped_labels)

In [None]:
# Load the saved train, dev and test sets for the NFKC normalisation form
# and count the documents in each.
FORMAT = 'NFKC'

train_path = "../corpus/train/ner_train/ner_train_{0}.spacy".format(FORMAT)
train_docbin = DocBin().from_disk(train_path)
train_docs = [doc for doc in train_docbin.get_docs(nlp.vocab)]

dev_path = "../corpus/dev/ner_dev/ner_dev_{0}.spacy".format(FORMAT)
dev_docbin = DocBin().from_disk(dev_path)
dev_docs = [doc for doc in dev_docbin.get_docs(nlp.vocab)]

test_path = "../corpus/test/ner_test/ner_test_{0}.spacy".format(FORMAT)
test_docbin = DocBin().from_disk(test_path)
test_docs = [doc for doc in test_docbin.get_docs(nlp.vocab)]

print ("train:", len(train_docs), "dev:", len(dev_docs), "test:", len(test_docs))


In [None]:
# Load the saved train, dev and test sets for the NFKD normalisation form
# and count the documents in each.
FORMAT = 'NFKD'

train_path = "../corpus/train/ner_train/ner_train_{0}.spacy".format(FORMAT)
train_docbin = DocBin().from_disk(train_path)
train_docs = [doc for doc in train_docbin.get_docs(nlp.vocab)]

dev_path = "../corpus/dev/ner_dev/ner_dev_{0}.spacy".format(FORMAT)
dev_docbin = DocBin().from_disk(dev_path)
dev_docs = [doc for doc in dev_docbin.get_docs(nlp.vocab)]

test_path = "../corpus/test/ner_test/ner_test_{0}.spacy".format(FORMAT)
test_docbin = DocBin().from_disk(test_path)
test_docs = [doc for doc in test_docbin.get_docs(nlp.vocab)]

print ("train:", len(train_docs), "dev:", len(dev_docs), "test:", len(test_docs))
