In [1]:
import os
import re
import spacy
import pandas as pd

# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from unicodedata import normalize
import regex
import random


# Load and process data from Coda

In [2]:
FILE_PATH = "../assets/NER_assets/Ancient_Words.csv"
# read csv file
df = pd.read_csv(FILE_PATH)
# rename columns to fit code
df.rename(columns = {'Word':'Keyword', 'Category Types':'Label'}, inplace = True)
# If a cell is empty (NaN), Fill it with the value in its parallel "Early" column
for row in df:
    df['Quote'].fillna(df['Early Quote'], inplace=True)
    df['Word Before'].fillna(df['Early Word Before'], inplace=True)
    df['Word After'].fillna(df['Early Word After'], inplace=True)
    df['Label'].fillna(df['Early Category Type'], inplace=True)
# remove rows with no Keyword
df = df.dropna(subset=['Keyword'])
# Remove any row that isn't Greek
pat = '[ء-ي]+'
#df.Keyword.str.contains(pat)
df = df[~df.Keyword.str.contains(pat, na=False)]
#replace new line in df column
df['Keyword'].replace('\n', '', regex=True, inplace=True)
#replace numbers in df
df.replace('\d+', '', regex=True, inplace=True)
#replace hyphens in df column
df.replace('-', '', regex=True, inplace=True)
# replace comma in df column
df['Keyword'].replace(',', '', regex=True, inplace=True)
#replace period in df column
df['Keyword'].replace('\.', '', regex=True, inplace=True)
#replace interpunkt in df column
df['Keyword'].replace('\·', '', regex=True, inplace=True)
# replace multiple spaces in df column
df.replace(' +', ' ', regex=True, inplace=True)
# replace end punctuation in df column
df['Keyword'].replace('\s+$', '', regex=True, inplace=True)

df.fillna(0)
df.reset_index(drop=True, inplace=True)


In [3]:
# normalize table
FORMAT = 'NFD'
for col in ['Keyword', 'Quote', 'Word Before', 'Word After']:
    df[col] = df[col].apply(lambda x: normalize(FORMAT, x))



## Fix similar sentences

In [4]:
df_grouped = pd.DataFrame(columns=['Quote', 'entities'])

# group all similar sentences together, and for each one append to entities: [label, word, word before, word after]
for name, group in df.groupby('Quote'):
    # create list of lists for entities
    entities = []
    # loop over all rows in group
    for i, row in group.iterrows():
        # append to entities list [label, word before, word after]
        entities.append([row['Label'], row["Keyword"], row['Word Before'], row['Word After']])
    # append to df_grouped
    df_grouped = df_grouped.append({'Quote': name, 'entities': entities}, ignore_index=True)

# sample random from df_grouped
df_grouped.sample(10)

Unnamed: 0,Quote,entities
280,πλὴν οὐκ εἰς τὸ κοῖλον ἀλλ’ εἰς τὸ σῶ...,"[[Topography, καταναλίσκονται, τὸ σῶμα, τῶ..."
165,Τείνει δ’ ἀπ’ αὐτῆς μία μὲν διὰ τοῦ ἡ...,"[[Topography, ἀπ’, Τείνει δ’, αὐτῆς μία],..."
107,"Καὶ ἡ μὲν φλὲψ διὰ τῆς καρδίας, εἰς δε...","[[Topography, διὰ, μὲν φλὲψ, τῆς καρδίας]..."
65,Ἐκ μέσου δὲ τῶν νεφρῶν ἑκατέρου φλὲψ κ...,"[[Body Part, φλεβῶν, διὰ τῶν, · εἶτα εἰς]]"
21,Ἅμα δ’ ἡ ἀνάπνευσις καὶ ἔκπνευσις γίν...,"[[Physiology, ἀναπνεῦσαι, τοῖς μυκτῆρσιν, ..."
51,Ἔστι δ’ ἡ μὲν ἀρτηρία χονδρώδης τὴν φυ...,"[[Physiology, ἀνασπάσωσί, ὅταν πίνοντες ..."
190,Τῶν δ’ ἄλλων σπλάγχνων ἡ καρδία μόνον ε...,"[[Body Part, αἷμα, μόνον ἔχει, . Καὶ ὁ]..."
268,"ὃς ἐὰν ἐξυγρανθεὶς φλεγμήνῃ, σταφυλὴ ...","[[Technical Appellation, σταφυλὴ, ἐξυγρανθει..."
220,ἔστι γὰρ ὁ μυκτὴρ διχότομος,"[[Body Part, μυκτὴρ, γὰρ ὁ, διχότομος. Τοι..."
120,Λέγουσι δέ τινες ὡς καὶ φρονέομεν τῇ κα...,"[[Body Part, καρδίῃ, τῇ , καὶ ]]"


In [5]:
# after naively grouping similar sentences, we need to check if they are actually similar
# we do this by using regex with difference up to 10 chars
checked = []
similar_indices = []

for i, row in df_grouped.iterrows():
    # if we haven't checked this sentence yet
    if i not in checked:
        checked.append(i)
        similar_indices.append([i])
        # loop over all other sentences
        pattern = fr"{re.escape(row['Quote'])}"
        # add fuzzy matching with up to 10 mistakes
        pattern = fr"(?:{pattern}){{e<=10}}"

        for j, row2 in df_grouped.iterrows():
            # if we haven't checked this sentence yet
            if j not in checked:
                # if the sentences are similar
                if regex.search(row['Quote'], row2['Quote']):
                    # append to similar_indices
                    similar_indices[-1].append(j)
                    # append to checked
                    checked.append(j)




In [6]:
print (similar_indices)
# choose a random group from similar_indices where the length of the group is more than 1
group = random.choice([group for group in similar_indices if len(group) > 1])
print (group)
# print the sentences in the group
for index in group:
    print (df_grouped.loc[index, 'Quote'])
    # print entities of each sentence in the group
    for entity in df_grouped.loc[index, 'entities']:
        print (entity)


[[0], [1], [2], [3], [4], [5], [6], [7], [8], [9], [10], [11], [12, 13], [14, 15], [16, 17], [18], [19], [20], [21], [22], [23], [24], [25], [26], [27, 28], [29], [30], [31], [32, 33], [34, 35], [36], [37], [38], [39], [40], [41], [42, 43], [44], [45], [46, 47], [48], [49, 50, 51], [52], [53], [54], [55], [56], [57], [58, 59], [60], [61, 62], [63], [64, 65], [66], [67, 68, 69], [70, 71], [72], [73], [74], [75, 76, 77, 78], [79], [80, 81], [82], [83], [84], [85, 86], [87], [88], [89], [90], [91], [92, 93], [94], [95], [96, 97], [98, 99], [100], [101, 102], [103], [104], [105], [106], [107, 108], [109, 110, 111], [112], [113], [114], [115], [116], [117], [118], [119], [120], [121], [122, 123], [124], [125], [126], [127], [128], [129, 130], [131], [132], [133], [134], [135], [136], [137, 138], [139], [140], [141], [142, 143], [144], [145, 146], [147, 148], [149, 150], [151], [152, 153], [154], [155, 156], [157], [158, 159], [160, 161], [162], [163, 164], [165, 166], [167], [168], [169, 17

In [7]:
# find the longest sentence in each group of similar sentences
df_grouped_regexed = pd.DataFrame(columns=['Quote', 'entities'])
for indices in similar_indices:
    # if there is only one sentence in the group add it to df_grouped_regexed
    if len(indices) == 1:
        df_grouped_regexed = df_grouped_regexed.append(df_grouped.loc[indices[0]])
        continue
    # find the longest sentence in the group
    longest = max(indices, key=lambda x: len(df_grouped.loc[x, 'Quote']))
    entities = []
    # concatenate all entities from the other sentences in the group
    for index in indices:
        entities.extend(df_grouped.loc[index, 'entities'])
    # add the longest sentence and the concatenated entities to df_grouped_regexed
    df_grouped_regexed = df_grouped_regexed.append({'Quote': df_grouped.loc[longest, 'Quote'], 'entities': entities}, ignore_index=True)



In [8]:
# sample random from df_grouped_regexed
df_grouped_regexed.sample(10)


Unnamed: 0,Quote,entities
212,πλὴν ἐκείνη μὲν ἡ διὰ τοῦ ἥπατός ἐσ...,"[[Body Part, αὕτη, ἥπατός ἐστιν,, δ’ ἑτ..."
82,"Καὶ ἡ μὲν φλὲψ διὰ τῆς καρδίας, εἰς δε...","[[Topography, διὰ, μὲν φλὲψ, τῆς καρδίας]..."
45,Ἔτι δ’ ἄλ λαι ἀπὸ τῆς μεγάλης φλεβὸς ...,"[[Topography, ἀπὸ, ἄλ λαι, τῆς μεγάλης]]"
2,Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸςκαὶ ...,"[[Body Part, ἐντέρων, τὴν τῶν, θέσιν, ἄ..."
0,"Ἔχει δὲ διαφορὰς πολλάς, καθάπερ ἡ κοι...","[[Medical, διαφορὰς, Ἔχει δὲ, πολλάς, καθ..."
140,Τοῖσι δὲ πρεσβυτάτοισιν ὅταν ἐπιγένηται...,"[[Body Part, φλέβες, αἱ, κεκένωνται]]"
92,Λέγουσι δέ τινες ὡς καὶ φρονέομεν τῇ κα...,"[[Body Part, καρδίῃ, τῇ , καὶ ]]"
68,Ἢν δὲ ὀλίγον ᾖ καὶ ἐς ἀμφοτέρας τα...,"[[Adjectives/Qualities, ἀμφοτέρας, ἐς, τὰς..."
207,οἱ δ' ἄρρενες ἔχουσι πρὸ τοῦ τυφλοῦ κα...,"[[Body Part, κάτω κοιλίας, τυφλοῦ καὶ, .],..."
229,"τούτων τὸ πρόσθιον γένειον , τὸ δ’ ὀπίσ...","[[Body Part, γένειον, τὸ πρόσθιον, , τὸ δ’..."


In [9]:
len(df_grouped)

306

In [10]:
df_grouped=df_grouped_regexed

In [11]:
# sample random from df_grouped
df_grouped.sample(10)
len(df_grouped)

237

## Fix similar words

In [12]:
d = {ord('\N{COMBINING ACUTE ACCENT}'):None, ord('\N{COMBINING COMMA ABOVE}'):None, ord('\N{COMBINING REVERSED COMMA ABOVE}'):None}

def find_word_index(sentence, word, word_before, word_after):
    """Find the index of a word in a sentence, it can appear multiple times but we return by word_before and word_after
    :returns start and end index of word in sentence"""


    # find the index of the word in the sentence (return index of beginning of word)
    word_index = sentence.find(word)

    # return all chars of word before: [),.,·] (meaning none of these chars will be included) so we get only data contained within the sentence
    for char in [")", ".", "·", ',']:
        if word_before.find(char) != -1:
            word_before = word_before[word_before.find(char)+1:]
        if word_after.find(char) != -1:
            word_after = word_after[:word_after.find(char)]



    pattern = fr"{re.escape(word_before)}({(word)}){re.escape(word_after)}"
    # add fuzzy matching with up to 3 mistakes
    pattern = fr"(?:{pattern}){{e<=3}}"



    match = regex.search(pattern, sentence)

    if match:
        return match.span(1)
    else:
        # try matching only with word_before
        pattern = fr"{re.escape(word_before)}({(word)})"
        pattern = fr"(?:{pattern}){{e<=2}}"
        match = regex.search(pattern, sentence)
        if match:
            return match.span(1)

        # try matching only with word_after
        pattern = fr"({(word)}){re.escape(word_after)}"
        pattern = fr"(?:{pattern}){{e<=2}}"
        match = regex.search(pattern, sentence)
        if match:
            return match.span(1)

        else:
            print ("\nNo match found for", word, "in", sentence, "\nwith word_before", word_before, "and word_after", word_after)
            return None, None


In [13]:
nlp = spacy.load("grc_proiel_sm") # I use small model for speed but you should use trf (transformer) model for better results

In [14]:
TRAIN_DATA = [] # list of tuples connecting sentences and all entities in that sentence


# create train data: loop over all quotes, and all entities in each quote, and find the index of the word in the sentence so now it entities was [[label, word before, word after], [label, word before, word after]] and now it is [[start, end, label], [start, end, label]]

not_found = 0
for i, row in df_grouped.iterrows():
    # loop over all entities
    print (row['Quote'])
    new_entities = []
    for entity in row['entities']:
        # find the index of the word in the sentence
        print (entity)
        start, end = find_word_index(row['Quote'], entity[1], entity[2], entity[3])
        # append to TRAIN_DATA
        if start is None:
            not_found += 1
        else:
            new_entities.append([start, end, entity[0]])
    TRAIN_DATA.append((row['Quote'], {'entities': new_entities}))



 Ἔχει δὲ διαφορὰς πολλάς, καθάπερ ἡ κοιλία, καὶ τοῦτο τὸ μόριον
['Medical', 'διαφορὰς', 'Ἔχει δὲ', 'πολλάς, καθάπερ']
['Body Part', 'κοιλία', 'καθάπερ ἡ', ', καὶ τοῦτο']
 Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸς καὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω
['Topography', 'ἄνωθεν', 'ἐντέρων θέσιν,', 'ἀρξάμεναι μέχρι']
['Topography', 'κατατείνουσαι', 'καὶ πυκναί,', 'πρὸς τὴν']
 Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸςκαὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω
['Body Part', 'ἐντέρων', 'τὴν τῶν', 'θέσιν, ἄνωθεν']
['Medical', 'θέσιν', 'τῶν ἐντέρων', ', ἄνωθεν ἀρξάμεναι']
['Topography', 'πρὸς', 'πυκναί, κατατείνουσαι', 'τὴν τῶν']
['Topography', 'ἀρξάμεναι', 'θέσιν, ἄνωθεν', 'μέχρι κάτω.']
['Topography', 'κάτω', 'ἀρξάμε

In [15]:
not_found

10

In [16]:
# sample random from TRAIN_DATA
random.sample(TRAIN_DATA, 10)

[('Οὕτω δὲ τείνουσαι, καὶ μεταξὺ λαμβάνουσαι. τὴν ἀρτηρίαν, φέρουσι μέχρι τῶν ὤτων, ᾗ συμβάλλουσιν αἱ γένυες τῇ κεφαλῇ',
  {'entities': [[35, 48, 'Topography'],
    [27, 35, 'Topography'],
    [10, 21, 'Topography'],
    [54, 65, 'Body Part'],
    [94, 100, 'Topography'],
    [75, 82, 'Topography'],
    [100, 114, 'Topography'],
    [65, 75, 'Topography'],
    [87, 94, 'Body Part'],
    [118, 126, 'Body Part'],
    [131, 140, 'Body Part']]}),
 ('καὶ γὰρ ἀναπνεῖ καὶ ἐκπνεῖ ταύτῃ, καὶ ὁ πταρμὸς διὰ ταύτης γίνεται , πνεύματος ἀθρόου ἔξοδος , σημεῖον οἰωνιστικὸν καὶ ἱερὸν μόνον τῶν πνευμάτων',
  {'entities': [[94, 103, 'Adjectives/Qualities'],
    [72, 81, 'Topography'],
    [103, 112, 'Physiology']]}),
 ('ᾗ δὲ συνήρτηται κοῖλόν ἐστιν.',
  {'entities': [[0, 4, 'Topography'],
    [8, 20, 'Topography'],
    [0, 4, 'Topography'],
    [20, 28, 'Body Part'],
    [8, 20, 'Topography'],
    [20, 29, 'Body Part']]}),
 ('ἢ γὰρ στόμα 

# Add annotations from INCEpTION

In [17]:
# extract all files in inception folder to temp folder
from cassis import *
import zipfile
import tempfile
import os
from spacy.training import Example



In [18]:
def normalize_inception_tags(sentence, begin, end, label):
    """Problem: INCEpTION texts are in different formats, we want a uniform format defined by the constant FORMAT"""
    """When we normalize a sentence we lose the correct offsets of the entities, so we need to fix them"""
    """We use the above function find_word_index to find the correct offsets"""

    word = sentence[begin:end]
    # word before is the word before the entity (take 5 chars before if exists, else take maximum available)
    word_before = sentence[max(0, begin-5):begin]
    # word after is the word after the entity (take 5 chars after if exists, else take maximum available)
    word_after = sentence[end:min(len(sentence), end+5)]

    # normalize all variables
    sentence = normalize(FORMAT, sentence)
    word = normalize(FORMAT, word)
    word_before = normalize(FORMAT, word_before)
    word_after = normalize(FORMAT, word_after)

    return find_word_index(sentence, word, word_before, word_after)

In [19]:
INCEPTION_TRAIN_DATA = []
with tempfile.TemporaryDirectory() as tempdir:
    for f in os.listdir("../assets/NER_assets/INCEpTION_files/"):
        if f.endswith(".zip"):
            with zipfile.ZipFile(os.path.join("../assets/NER_assets/INCEpTION_files/", f), 'r') as zip_ref:
                zip_ref.extractall(tempdir)
    # open typesystem and print content
    with open('{0}/{1}'.format(tempdir, "TypeSystem.xml"), 'rb') as f:
        typesystem = load_typesystem(f)

    # iterate over all files in temp folder
    for f in os.listdir(tempdir):
        # if file is a xmi file
        if f.endswith(".xmi"):
            # load xmi file
            with open(os.path.join(tempdir, f), 'rb') as f:
                # load typesystem from temp folder
                cas = load_cas_from_xmi(f, typesystem=typesystem)
                # get all entities from cas
                for sentence in cas.select("custom.SentenceLabel"):
                    ents = []
                    for token in cas.select_covered('webanno.custom.CategoryType', sentence):
                        print(token.get_covered_text(), token.value("Value"))
                        # create a span with the token start and end and the label
                        # find begin and end position of the token relative to the sentence
                        start = token.begin - sentence.begin
                        end = token.end - sentence.begin

                        # normalize the sentence, word, word_before and word_after
                        begin, end = normalize_inception_tags(sentence.get_covered_text(), start, end, token.value("Value"))
                        ents.append((begin, end, token.value("Value")))
                    sentence = normalize(FORMAT, sentence.get_covered_text())
                    INCEPTION_TRAIN_DATA.append((sentence, {'entities': ents}))




Τελευταία Adjectives/Qualities
κύστις Body Part
κεῖται Topography
ἐξάρτησιν Topography
τοῖς None
ἀπὸ Topography
νεφρῶν Body Part
τεταμένοις Adjectives/Qualities
πόροις Body Part
παρὰ Topography
καυλὸν Body Part
ἐπὶ Topography
οὐρήθραν Body Part
πάντῃ Topography
κύκλῳ Topography
λεπτοῖς Adjectives/Qualities
ἰνώδεσιν Adjectives/Qualities
ὑμενίοις Body Part
προσειλημμένη Topography
διαζώματι Body Part
θώρακος Body Part
κύστις Body Part
μέγεθος Adjectives/Qualities
Πρὸς Topography
καυλὸν Body Part
κύστεως Body Part
συνήρτηται Topography
αἰδοῖον Body Part
ἐξωτάτω Topography
τρῆμα Body Part
συνερρωγὸς Topography
εἰς Topography
ὑποκάτω Topography
εἰς Topography
ὄρχεις Body Part
τρημάτων Body Part
εἰς Topography
κύστιν Body Part
νευρῶδες Adjectives/Qualities
χονδρῶδες Adjectives/Qualities
ἐξήρτηνται Topography
ὄρχεις Body Part
ἔσω Topography
ὑστέραις Body Part
ἐπὶ Topography
ἐντέροις Body Part
ἐπὶ Topography
ὑστέρᾳ Body Part
κύστις Body Part
ὑστερῶν Body Part
ὅμοιαι Adjectives/Qualities
ὁμοίως

In [20]:
# sample random from INCEPTION_TRAIN_DATA and print for each entity text[start:end] and label
t = random.choice(INCEPTION_TRAIN_DATA)
print (t[0]+ "\n")
for entity in t[1]['entities']:
    print (t[0][entity[0]:entity[1]], entity[2])

Ἀρτηρίη ἐξ ἑκατέρου φαρυγγέθρου τὴν ἔκφυσιν ποιευμένη ἐς 
ἄκρον πνεύμονος τελευτᾷ, κρίκοις ξυγκειμένη ὁμορυσμοῖς, τῶν περιηγέων ἁπτομένη κατ' ἐπίπεδον ἀλλήλων.

Ἀρτηρίη Body Part
ἑκατέρου Symmetry/Opposition
φαρυγγέθρου Body Part
ἔκφυσιν Topography
ποιευμένη Topography
ἄκρον Topography
πνεύμονος Body Part
τελευτᾷ Topography
κρίκοις Technical Appellation
ξυγκειμένη Adjectives/Qualities
ὁμορυσμοῖς Adjectives/Qualities
περιηγέων Topography
ἁπτομένη Adjectives/Qualities
κατ Topography
ἐπίπεδον Topography
ἀλλήλων Symmetry/Opposition


In [21]:
TRAIN_DATA = TRAIN_DATA + INCEPTION_TRAIN_DATA

In [23]:
docs = []
losses = {}
for text, annot in TRAIN_DATA:
    # train model
    doc = nlp.make_doc(text)
    ents = []
    for start, end, label in annot["entities"]:
        # first check char span by text[start:end] and compare with annot
        print (text[start:end], label)
        # if label isnt float NaN
        if label == label and label != None:
            span = doc.char_span(start, end, label=label, alignment_mode="expand")
            print (span)
            ents.append(span)
    # check overlapping entities (each spacy token should only be part of one entity)
    ents = spacy.util.filter_spans(ents)

    doc.ents = ents
    docs.append(doc)



 διαφορὰς Medical
διαφορὰς
 κοιλία Body Part
κοιλία
, ἄνωθεν Topography
, ἄνωθεν
, κατατείνουσαι Topography
, κατατείνουσαι
 ἐντέρων Body Part
ἐντέρων
 θέσιν Medical
θέσιν
 πρὸς Topography
πρὸς
 ἀρξάμεναι Topography
ἀρξάμεναι
 κάτω Topography
κάτω
 μέχρι Topography
μέχρι
 κάτω κοιλία Body Part
κάτω κοιλία
 κάτω κοιλία Technical Appellation
κάτω κοιλία
 ὁμοία Adjectives/Qualities
ὁμοία
 ὑείᾳ Medical
ὑείᾳ
 στόμαχον Body Part
στόμαχον
 ἀρτηρίαν Body Part
ἀρτηρίαν
 κοιλίαν Body Part
κοιλίαν
 ἀπὸ Topography
ἀπὸ
 ὅπου Topography
ὅπου
 ἐντέρων Body Part
ἐντέρων
 Ὑπὲρ Topography
 Ὑπὲρ
 μεσεντέριόν Body Part
μεσεντέριόν
 ἁπλοῦν Adjectives/Qualities
ἁπλοῦν
ν, εἱλιγμένον Adjectives/Qualities
ἁπλοῦν, εἱλιγμένον
 εἶτα Topography
 εἶτα
 ἔντερον Body Part
ἔντερον
 πλατύ Adjectives/Qualities
πλατύ
 ἐντέρου Body Part
ἐντέρου
 ἐντέρῳ Body Part
ἐντέρῳ
 ἐοικυῖα Adjectives/Qualities
ἐοικυῖα
 μείζων Ad

In [93]:
# check if charspan correctly found the entities, and all of them.
doc = random.choice(docs)

print ("Spacy entities, for line:")
print (doc.text)
print([(ent.text, ent.label_) for ent in doc.ents])

# locate doc in TRAIN_DATA
line = None
for i, row in enumerate(TRAIN_DATA):
    if row[0] == doc.text:
        line = i
        #print (row[1])
        break

print ("\nTable entities, for same line:")
for entity in TRAIN_DATA[line][1]['entities']:
    print (TRAIN_DATA[line][0][entity[0]:entity[1]], entity[2])

# check if there are entities in TRAIN_DATA that are not in spacy doc
for entity in TRAIN_DATA[line][1]['entities']:
    if entity[2] not in [ent.label_ for ent in doc.ents]:
        print ("Entity not found in spacy doc: ", TRAIN_DATA[line][0][entity[0]:entity[1]], entity[2])

Spacy entities, for line:
Ἔτι προσώπου μέρος τὸ μὲν ὂν τῷ πνεύματι πόρος ῥίς 
[('προσώπου', 'Body Part'), ('μέρος', 'Adjectives/Qualities'), ('πνεύματι', 'Physiology'), ('πόρος', 'Topography'), ('ῥίς', 'Body Part')]

Table entities, for same line:
 μέρος Adjectives/Qualities
 πνεύματι Physiology
 πόρος Topography
 προσώπου Body Part
 ῥίς Body Part


In [None]:
from pathlib import Path
from spacy.tokens import DocBin

docbin = DocBin(docs=docs)
docbin.to_disk("../corpus/train/merged_ner__{0}.spacy".format(FORMAT))

In [None]:
# split docs into train test and dev
from sklearn.model_selection import train_test_split


train_docs, test_docs = train_test_split(docs, test_size=0.2)
train_docs, dev_docs = train_test_split(train_docs, test_size=0.2)

Path("../corpus/train").mkdir(parents=True, exist_ok=True)
Path("../corpus/dev").mkdir(parents=True, exist_ok=True)
Path("../corpus/test").mkdir(parents=True, exist_ok=True)

train = DocBin(docs=train_docs)
train.to_disk("../corpus/train/train_ner__{0}.spacy".format(FORMAT))
test = DocBin(docs=test_docs)
test.to_disk("../corpus/test/test_ner__{0}.spacy".format(FORMAT))
dev = DocBin(docs=dev_docs)
dev.to_disk("../corpus/dev/dev_ner__{0}.spacy".format(FORMAT))

print ("Train: ", len(train_docs))
print ("Test: ", len(test_docs))
print ("Dev: ", len(dev_docs))




