In [1]:
import os
import re
import spacy
import pandas as pd

# Preprocess Text

In [2]:
FILE_PATH = "../assets/NER_assets/Ancient_Words.csv"
# read csv file
df = pd.read_csv(FILE_PATH)

In [3]:
# rename columns to fit code
df.rename(columns = {'Word':'Keyword', 'Category Types':'Label'}, inplace = True)
# If a cell is empty (NaN), Fill it with the value in its parallel "Early" column
for row in df:
    df['Quote'].fillna(df['Early Quote'], inplace=True)
    df['Word Before'].fillna(df['Early Word Before'], inplace=True)
    df['Word After'].fillna(df['Early Word After'], inplace=True)
    df['Label'].fillna(df['Early Category Type'], inplace=True)
# remove rows with no Keyword
df = df.dropna(subset=['Keyword'])
# Remove any row that isn't Greek
pat = '[ء-ي]+'
#df.Keyword.str.contains(pat)
df = df[~df.Keyword.str.contains(pat, na=False)]
#replace new line in df column
df['Keyword'].replace('\n', '', regex=True, inplace=True)
#replace numbers in df
df.replace('\d+', '', regex=True, inplace=True)
#replace hyphens in df column
df.replace('-', '', regex=True, inplace=True)
# replace comma in df column
df['Keyword'].replace(',', '', regex=True, inplace=True)
#replace period in df column
df['Keyword'].replace('\.', '', regex=True, inplace=True)
#replace interpunkt in df column
df['Keyword'].replace('\·', '', regex=True, inplace=True)
# replace multiple spaces in df column
df.replace(' +', ' ', regex=True, inplace=True)
# replace end punctuation in df column
df['Keyword'].replace('\s+$', '', regex=True, inplace=True)

df.fillna(0)
df.reset_index(drop=True, inplace=True)


In [4]:
df.head(10)

Unnamed: 0,Keyword,Word Before,Word After,Quote,Label,Lemma,Early Category Type,Early Word,Early Word Before,Early Word After,Early Quote,Lemma arabic
0,ἐγκέφαλος,ὁ,ἐστὶ τοῦ,ὁ ἐγκέφαλος ἐστὶ τοῦ ἀνθρώπου διπλόος ὥσπερ κα...,Body Part,ἐγκέφαλος,,,ο,του,,
1,διπλόος,ἀνθρώπου,ὥσπερ,ὁ ἐγκέφαλος ἐστὶ τοῦ ἀνθρώπου διπλόος ὥσπερ κα...,Adjectives/Qualities,διπλόος,,,εστι,ωσπερ,,
2,μέσον,δὲ,αὐτοῦ,τὸ δὲ μέσον αὐτοῦ διείργει μῆνιγξ λεπτή,Topography,μέσος,,,,,,
3,διείργει,αὐτοῦ,μῆνιγξ,τὸ δὲ μέσον αὐτοῦ διείργει μῆνιγξ λεπτή,Topography,διείργω,,,αυτου,μηνιγξ,,
4,μῆνιγξ,διείργει,λεπτή,τὸ δὲ μέσον αὐτοῦ διείργει μῆνιγξ λεπτή,Body Part,μῆνιγξ,,,διείργει,λεπτή,,
5,λεπτή,μῆνιγξ,.,τὸ δὲ μέσον αὐτοῦ διείργει μῆνιγξ λεπτή,Adjectives/Qualities,λεπτός,,,μῆνιγξ,.,,
6,κεφαλὴς,τῆς,ἀλγεῖ,"διότι οὐκ αἰεὶ κατὰ τωὐτὸ τῆς κεφαλὴς ἀλγεῖ, ἀ...",Body Part,κεφαλή,,,,,,
7,ἀλγεῖ,κεφαλὴς,ἀλλ',"διότι οὐκ αἰεὶ κατὰ τωὐτὸ τῆς κεφαλὴς ἀλγεῖ, ἀ...",Pathology,ἀλγέω,,,,,,
8,μέρει,ἐν,ἑκάτερον,"διότι οὐκ αἰεὶ κατὰ τωὐτὸ τῆς κεφαλὴς ἀλγεῖ, ἀ...",Topography,μέρος,,,,,,
9,ἅπασαν,δὲ,. καὶ,"διότι οὐκ αἰεὶ κατὰ τωὐτὸ τῆς κεφαλὴς ἀλγεῖ, ἀ...",Topography,ἅπας,,,,,,


In [5]:
# if any of the fields "KeyWord", "Quote", "Word Before", "Word After" are "0", drop the row
for w in ['Keyword', 'Quote', 'Word Before', 'Word After']:
    df = df[df[w] != 0]


In [6]:
# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from typing import List
from spacy.language import Language
from spacy.tokens import Doc, DocBin, Span
from spacy.util import filter_spans
from unicodedata import normalize
import regex
import random


In [7]:
d = {ord('\N{COMBINING ACUTE ACCENT}'):None, ord('\N{COMBINING COMMA ABOVE}'):None, ord('\N{COMBINING REVERSED COMMA ABOVE}'):None}

def find_word_index(sentence, word, word_before, word_after):
    """Find the index of a word in a sentence, it can appear multiple times but we return by word_before and word_after"""


    # find the index of the word in the sentence (return index of beginning of word)
    word_index = sentence.find(word)

    # return all chars of word before: [),.,·] (meaning none of these chars will be included) so we get only data contained within the sentence
    for char in [")", ".", "·"]:
        if word_before.find(char) != -1:
            word_before = word_before[word_before.find(char)+1:]
        if word_after.find(char) != -1:
            word_after = word_after[:word_after.find(char)]



    pattern = fr"{re.escape(word_before)}({(word)}){re.escape(word_after)}"
    # add fuzzy matching with up to 3 mistakes
    pattern = fr"(?:{pattern}){{e<=3}}"



    match = regex.search(pattern, sentence)
    try:
        print ("match:", match.span(1))
        return match.span(1)
    except:

        return None



In [8]:
# normalize table
FORMAT = 'NFKD'
for col in ['Keyword', 'Quote', 'Word Before', 'Word After']:
    df[col] = df[col].apply(lambda x: normalize(FORMAT, x))

In [9]:
# using word before and word after, we want to bring our data into the following format:
# TRAIN_DATA = [ (TEXT AS A STRING, {“entities”: [(START, END, LABEL)]}) ]

TRAIN_DATA = []
train_data_df = pd.DataFrame(columns=['text', 'keyword', 'index', 'label', 'lemma'])
no_found = 0

for i,row in df.iterrows():
    # fill train_data_df with index from find_word_index
    index = find_word_index(row['Quote'], row['Keyword'], row['Word Before'], row['Word After'])
    if index:
        row_to_add = [row['Quote'], row['Keyword'], index, row['Label'], row['Lemma']]
        train_data_df = pd.concat([train_data_df, pd.DataFrame([row_to_add], columns=['text', 'keyword', 'index', 'label', 'lemma'])], ignore_index=True)
    else:
        print ("No index found for", row['Keyword'], "in", row['Quote'])
        no_found += 1

print ("No index found for", no_found, "words")

match: (3, 14)
match: (38, 46)
match: (7, 14)
match: (22, 32)
match: (33, 40)
match: (40, 47)
match: (38, 46)
match: (47, 54)
match: (65, 72)
match: (95, 103)
match: (4, 12)
match: (27, 38)
match: (66, 74)
match: (79, 87)
match: (87, 93)
match: (97, 106)
match: (115, 121)
match: (126, 135)
match: (143, 149)
No index found for σπληνός in Καὶ φλέβες δ' ἐς αὐτὸν τείνουσιν ἐξ ἅπαντος τοῦ σώματος πολλαὶ καὶ λεπταί, δύο δὲ παχεῖαι, ἡ μὲν ἀπὸ τοῦ ἥπατος, ἡ δὲ ἀπὸ τοῦ σπληνός
match: (12, 18)
match: (23, 32)
match: (38, 45)
match: (16, 24)
match: (24, 30)
match: (30, 38)
match: (38, 43)
match: (48, 54)
match: (54, 61)
match: (61, 66)
match: (67, 74)
match: (79, 87)
match: (98, 103)
match: (104, 107)
match: (112, 119)
match: (125, 131)
match: (137, 145)
match: (146, 149)
match: (155, 160)
match: (176, 182)
match: (183, 188)
match: (14, 19)
match: (20, 27)
match: (28, 32)
match: (38, 45)
match: (56, 66)
match: (67, 71)
match: (71, 79)
match: (79, 93)
match: (103, 10

In [10]:
# compare text[index] with keyword
test_data_df = pd.DataFrame(columns=['text', 'real_keyword', 'function_found'])
# count nulls in train_data_df
nulls = train_data_df.isnull().sum()
# drop nulls in train_data_df
train_data_df = train_data_df.dropna()
# iterate over all rows in train_data_df
for index, row in train_data_df.iterrows():
    test_data_df = test_data_df.append({'text': row['text'], 'real_keyword': row['keyword'], 'function_found': row['text'][row['index'][0]:row['index'][1]]}, ignore_index=True)


  test_data_df = test_data_df.append({'text': row['text'], 'real_keyword': row['keyword'], 'function_found': row['text'][row['index'][0]:row['index'][1]]}, ignore_index=True)
  test_data_df = test_data_df.append({'text': row['text'], 'real_keyword': row['keyword'], 'function_found': row['text'][row['index'][0]:row['index'][1]]}, ignore_index=True)
  test_data_df = test_data_df.append({'text': row['text'], 'real_keyword': row['keyword'], 'function_found': row['text'][row['index'][0]:row['index'][1]]}, ignore_index=True)
  test_data_df = test_data_df.append({'text': row['text'], 'real_keyword': row['keyword'], 'function_found': row['text'][row['index'][0]:row['index'][1]]}, ignore_index=True)
  test_data_df = test_data_df.append({'text': row['text'], 'real_keyword': row['keyword'], 'function_found': row['text'][row['index'][0]:row['index'][1]]}, ignore_index=True)
  test_data_df = test_data_df.append({'text': row['text'], 'real_keyword': row['keyword'], 'function_found': row['text'][row[

In [11]:
# print nulls in train_data_df
print ("Nulls", nulls)
# print shape of train_data_df
print ("\nShape", train_data_df.shape)


Nulls text       0
keyword    0
index      0
label      2
lemma      0
dtype: int64

Shape (1492, 5)


In [12]:
# sample test_data_df where real_keyword != function_found
test_data_df[test_data_df['real_keyword'] != test_data_df['function_found']].sample(10)

Unnamed: 0,text,real_keyword,function_found
846,Τὸ δ’ ἐντὸς στόμα σιαγόνων καὶ χειλῶν,στόμα,στόμα
563,Τείνουσι δ’ ἀπὸ τε τῆς ἀορτῆς καὶ τῆς ...,ἄλλαι,ἄλλαι
904,Αἰσθάνεται δὲ καὶ ὧν ἡ ἄλλη σὰρξ πά...,μέρος,μέρος
788,Ἔτι προσώπου μέρος τὸ μὲν ὂν τῷ πνευ...,μέρος,μέρος
1483,"τοῦτο γὰρ μεταξὺ τῆς τ' ἄνω, ἐν ᾗ το...",ἐν,ἐν
992,Ταύτῃ δὲ τὸ πέρας συνήρτηται τῆς γλώττης,συνήρτηται,συνήρτηται
634,Μέχρι μὲν οὖν τῶν νεφρῶν μία οὖσα ἑκ...,νεφρῶν,νεφρῶν
595,Ἡ δὲ μεγάλη φλὲψ ἐν πᾶσι μάλιστα διάδη...,διάδηλος,διάδηλος
1085,Δύο φλέβες εἰσὶν ἐν τῷ θώρακι κατὰ τη...,ἐν,ἐν
640,ἐνταῦθα δὲ πρός τε τὴν ῥάχιν μᾶλλον πρ...,φλὲψ,φλὲψ


# Create dictionaries from dendrosearch and conllu files (supplied by Jackobo)

In [13]:
PUNCTUATION = ['.', ")", ".", "·", "(", "[", "]", ":", ";", ",", "?", "!", "،", "_"]
# extract from df a dictionary {word: lemma}
lemma_dict = {}
for index, row in df.iterrows():
    lemma_dict[row['Keyword']] = row['Lemma']

# load dendrosearch lemma dictionary
dendrosearch_lemma_dict = {}
with open('../assets/dendrosearch_lemma_dict.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.split()
        # check if not punctuation
        if len(line) > 1 and line[0] not in PUNCTUATION:
            dendrosearch_lemma_dict[line[0]] = line[1]

# create dictionary from all conllu files
PATH = "../assets/Lemmatization_training_files/"
conllu_lemma_dict = {}

# iterate over all files in directory
#for f in os.listdir(PATH):
#    if f.endswith(".conllu"):
#        # if file is a conllu file
#        with open(os.path.join(PATH, f), 'r', encoding='utf-8') as f:
#            for line in f:
#                # conll line is: id | keyword | lemma | pos | _
#                # we want only keyword and lemma
#                line = line.split()
#                if len(line) > 2 and line[1] not in PUNCTUATION:
#                    conllu_lemma_dict[line[1]] = line[2]



1. Error fix: I added above the line:\
<code>if f.endswith(".conllu")</code>\
Without it it gets an error because it tries to read folders and other files as well.\ :)
2. Why do you add the conllu to dictionary? ANSWER: I thought they might be useful for lemmatization, anyways they can't hurt, and if you don't want them you can just ignore them.

## Create dictionary from INCEpTION files

In [14]:
from cassis import *
import zipfile
import tempfile

inception_dict = {}
inception_sentences = []

In [15]:
# extract all files in inception folder to temp folder
with tempfile.TemporaryDirectory() as tempdir:
    for f in os.listdir("../assets/NER_assets/INCEpTION_files/"):
        if f.endswith(".zip"):
            with zipfile.ZipFile(os.path.join("../assets/NER_assets/INCEpTION_files/", f), 'r') as zip_ref:
                zip_ref.extractall(tempdir)
    print (tempdir)
    # open typesystem and print content
    with open('{0}/{1}'.format(tempdir, "TypeSystem.xml"), 'rb') as f:
        typesystem = load_typesystem(f)

    # iterate over all files in temp folder
    for f in os.listdir(tempdir):
        # if file is a xmi file
        print (typesystem)
        if f.endswith(".xmi"):
            # load xmi file
            with open(os.path.join(tempdir, f), 'rb') as f:
                # load typesystem from temp folder
                cas = load_cas_from_xmi(f, typesystem=typesystem)
                for token in cas.select('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma'):
                    inception_dict[token.get_covered_text()] = token.value
                for sentence in cas.select("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"):
                    inception_sentences.append(sentence.get_covered_text())


/tmp/tmpleujz7ve
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem object at 0x7fade6442210>
<cassis.typesystem.TypeSystem o

In [16]:
# add to big dictionary and sample
big_dict = {**lemma_dict, **dendrosearch_lemma_dict, **conllu_lemma_dict, **inception_dict}

random.sample(list(big_dict.items()), 10)

[('προτέρη', 'πρότερος'),
 ('πολιορκοῦσι', 'πολιορκέω'),
 ('λα-ῃ', 'λα-ος'),
 ('γράσωνα', 'γράσων'),
 ('κρινῶ', 'κρίνω'),
 ('ὅττευ', 'ὅστις'),
 ('ἐνέβαλον', 'ἐμβάλλω'),
 ('ἐξαφίησι', 'ἐξαφίημι'),
 ('κατατήκομαι', 'κατατήκω'),
 ('ἐγγενέσθαι', 'ἐγγίγνομαι')]

In [17]:
# create two copies of dictionary, one in NFKD, other in NFKC
FIRST_FORMAT = 'NFKD'
SECOND_FORMAT = 'NFKC'

big_dict_nfkd = {}
big_dict_nfkc = {}

# remove all nan values from big_dict
big_dict = {k: v for k, v in big_dict.items() if pd.notnull(v)}

for key, value in big_dict.items():
    big_dict_nfkd[normalize(FIRST_FORMAT, key)] = normalize(FIRST_FORMAT, value)
    big_dict_nfkc[normalize(SECOND_FORMAT, key)] = normalize(SECOND_FORMAT, value)

In [18]:
# check delta between two dictionaries
delta = {k: big_dict_nfkd[k] for k in set(big_dict_nfkd) - set(big_dict_nfkc)}
# delta size estimation
print ("delta size:", len(delta))

delta size: 141592


# Run NLP pipeline on INCEpTION and Coda files

In [19]:
# merge sentences from inception and coda

sentences = df['Quote'].tolist() + inception_sentences
sentences = list(set(sentences))
random.sample(sentences, 10)

['καὶ διὰ τοῦτο διὰ πλείονος ἀκούει ἢ ὀσφραίνεται·',
 "Καὶ ἡ μὲν ἀπὸ τοῦ ἥπατος ὧδ' ἔχει",
 'Ἔστι δ’ ἡ τοῦ ἀνθρώπου κύστις ἐπιεικῶς \r\nἔχουσα μέγεθος.',
 'τὰ γὰρ φλέβια λεπτὰ ἐόντα οὐ δύναται παραδέχεσθαι τὸ φλέγμα ὑπὸ πάχεος καὶ πλήθεος, ἀλλὰ ἀποψύχεται καὶ πήγνυται τὸ αἷμα, καὶ οὕτως ἀποθνήσκει',
 'ἰσχύουσι δὲ μάλιστα ἐν ἐκείνοισι τοῦ σώματος καὶ αἰεὶ παχύτατά \r\nἐστιν, ἐν οἷσι τοῦ σώματος αἱ σάρκες ἐλάχισταί εἰσι.',
 'Ἐκ δὲ πλευρῆς νόθης, λέγω δὲ ἀριστερῆς, σπλὴν ἀρξάμενος ἐκτέταται ὁμοιορυσμὸς ἴχνει ποδός.',
 'αἱ μὲν \r\nτέσσερας έχουσαι, κατὰ τὰ ὦτα ἑκατέρωθεν ραφή, άλλη έμπροσθεν, \r\nἄλλη ἐξόπισθεν τῆς κεφαλῆς, οὕτω μὲν ἡ τὰς τέσσερας ἔχουσα·',
 'Ὀστέα χειρὸς εἰκοσιεπτὰ καὶ ποδὸς εἰκοσιτέσσαρα·',
 'ἐς δὲ τὰ δεξιὰ μᾶλλον καταρρεῖ ἢ ἐς τὰ ἀριστερά, ὅτι αἱ φλέβες ἐπικοιλότεραί εἰσὶ καὶ πλέονες ἢ ἐν τοῖσιν ἀριστεροῖσιν',
 'αἱ κεφαλαὶ ῥαφὰς ἔχουσιν, αἱ μὲν τρεῖς, αἱ δὲ τέσσερας·']

In [20]:
# run nlp pipeline on sentences
# install spacy grc model if not already installed
nlp = spacy.load("grc_proiel_trf") # I use small model for speed but you should use trf (transformer) model for better accuracy

# create list of Doc objects
docs: List[Doc] = []

# create df to record corrections
corrections_df = pd.DataFrame(columns=['sentence', 'token', 'lemma', 'lemma_corrected', 'dictionary'])
corrected_sentences = 0
for sentence in sentences:
    doc = nlp(sentence)
    # we use our dictionary to replace lemmas which the model didn't recognize correctly
    for token in doc:
        # if token text is a lexical value in our dictionary
        if token.text in big_dict_nfkd:
            if token.lemma_ != big_dict_nfkd[token.text]:
                corrections_df = pd.concat([corrections_df, pd.DataFrame({'sentence': sentence,  'token': token.text, 'lemma': token.lemma_, 'lemma_corrected': big_dict_nfkd[token.text], 'dictionary': 'big_dict_nfkd'}, index=[0])], ignore_index=True)
                token.lemma_ = big_dict_nfkd[token.text]

        elif token.text in big_dict_nfkc:
            if token.lemma_ != big_dict_nfkc[token.text]:
                corrections_df = pd.concat([corrections_df, pd.DataFrame({'sentence': sentence,  'token': token.text, 'lemma': token.lemma_, 'lemma_corrected': big_dict_nfkc[token.text], 'dictionary': 'big_dict_nfkc'}, index=[0])], ignore_index=True)
                token.lemma_ = big_dict_nfkc[token.text]
    docs.append(doc)

In [21]:
print ("corrections_df size:", corrections_df.shape)

corrections_df size: (3312, 5)


In [22]:
corrections_df.sample(10)


Unnamed: 0,sentence,token,lemma,lemma_corrected,dictionary
1145,Ὑπὸ δὲ τὸν πνεύμονά ἐστι τὸ διάζωμα τ...,τὸ,τὸ,ὁ,big_dict_nfkd
1778,προσάγεταί τε γὰρ καὶ λαμβάνει τούτῳ κα...,καὶ,καὶ,καί,big_dict_nfkd
660,πλὴν οὐκ εἰς τὸ κοῖλον ἀλλ’ εἰς τὸ σῶ...,τῶν,τῶν,ὁ,big_dict_nfkd
963,καὶ ἀποτεινομένη πόρρω πρός τε τὴν κεφαλ...,πρὸς,πρὸς,πρός,big_dict_nfkd
950,τὸ μέν τι τῆς φλεβὸς κάτω τείνει διὰ τω...,καθήκει,καθήκομαι,καθήκω,big_dict_nfkd
1007,καὶ τὸ μὲν παχύτατον καὶ μέγιστον καὶ κ...,ἐς,ἐς,εἰς,big_dict_nfkd
2971,ἐς δὲ τὰ δεξιὰ μᾶλλον καταρρεῖ ἢ ἐς τ...,καὶ,καὶ,καί,big_dict_nfkd
222,Ὁ δὲ στόμαχος ἤρτηται μὲν ἄνωθεν ἀπὸ...,συνεχὴς,συνεχὴς,συνεχής,big_dict_nfkd
3117,"Εἶτ' ἐντεῦθεν πάλιν, ὥσπερ ἀπὸ τῆς α...",κοιλίας,οιλίας,κοιλία,big_dict_nfkd
2304,Ποτὸν \r\nδιὰ φάρυγγος καὶ διὰ στομάχου (λάρυγ...,διὰ,διά,διὰ,big_dict_nfkc


In [23]:
# find how many corrected by each dictionary
corrections_df.groupby('dictionary').count()


Unnamed: 0_level_0,sentence,token,lemma,lemma_corrected
dictionary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
big_dict_nfkc,212,212,212,212
big_dict_nfkd,3100,3100,3100,3100


In [24]:
corrections_df[corrections_df['dictionary'] == 'big_dict_nfkc'].sample(10)

Unnamed: 0,sentence,token,lemma,lemma_corrected,dictionary
920,"ἕκαθεν δὲ κύστιος μετοχὴ, εἰς ὃ πέφυκε.",ὃ,ὁ,ὅς,big_dict_nfkc
2518,καὶ ἀπὸ τῶν φλεβὼν ὅ τι ἂν νόσημα γένηται ῥήϊό...,ἢ,ἤ,ἢ,big_dict_nfkc
2304,Ποτὸν \r\nδιὰ φάρυγγος καὶ διὰ στομάχου (λάρυγ...,διὰ,διά,διὰ,big_dict_nfkc
378,Ὅσα μὲν οὖν ἐστι ζῳοτόκα καὶ δίποδα ἢ τετράποδ...,δίποδα,διπόδης,δίποδος,big_dict_nfkc
274,"παρὰ δὲ τὸ ὀστέον περόναι δύο παρήκουσιν, ἡ μὲ...",ἔνδοθεν,ἔνδον,ἔνδοθεν,big_dict_nfkc
1256,Τὰ μὲν\r\nγὰρ κερατοφόρα καὶ μὴ ἀμφώδοντα ἔχει...,ἀμφωδόντων,ἀμφωδόντων,ἀμφώδων,big_dict_nfkc
2695,"αὖτις δ᾿ ὅθεν φρένες ἐξεπεφύκεσαν, ἀπὸ τούτου ...",κατὰ,κατά,κατὰ,big_dict_nfkc
873,Ἔτι \r\nδὲ διαφορὰ καὶ ἥδε πρὸς ἀλλήλας ἐστὶ ...,ὑστερῶν,ὑστερέω,ὑστέρα,big_dict_nfkc
1480,Φέρουσι δὲ εἰς αὐτοὺς πόροι ἔκ τε τῆς μεγάλης\...,κοῖλον,κοῖλος,κοῖλον,big_dict_nfkc
2818,Καλεῖται δὲ τούτων τὰ μὲν ὑστέρα καὶ δελφύς (ὅ...,Καλεῖται,Καλέω,καλέω,big_dict_nfkc


In [25]:
corrections_df[corrections_df['dictionary'] == 'big_dict_nfkd'].sample(10)

Unnamed: 0,sentence,token,lemma,lemma_corrected,dictionary
462,ἐπὶ μὲν τοῦ ἐντὸς κεκλιμένου τῶν δικραίων ἐπὶ ...,δ,δ,δέ,big_dict_nfkd
2269,Πάλιν δ’ ἐντεῦθεν εἰς τέτταρας σχίζονται...,φλεβὸς,φλεβὸς,φλέψ,big_dict_nfkd
1584,Αἱ μὲν γὰρ φέρουσιν εἰς τὰ πλάγια καὶ ...,δ’,δ’,δέ,big_dict_nfkd
1062,τὸ γὰρ ἄλλο πλῆθος τῶν φλεβῶν οὐχ ὡσα...,μέρη,μέρη,μέρος,big_dict_nfkd
2489,τῶν μὲν γὰρ οἱ πόροι συγκεχυμένοι καθάπ...,τινες,τίς,τις,big_dict_nfkd
543,Προϊόντι γὰρ καὶ καταβαίνοντι τῷ περιττ...,τὸ,τὸ,ὁ,big_dict_nfkd
151,Ἡ μὲν οὖν μεγάλη φλὲψ ἐκ τῆς μεγίστης...,τῆς,τῆς,ὁ,big_dict_nfkd
2645,ἐπεὶ αἰσθάνονταί γε οὐδενὸς πρότερον τ...,τὴν,τὴν,ὁ,big_dict_nfkd
1747,Φυσωμένης δὲ τῆς ἀρτηρίας διαδίδωσιν εἰ...,πλεύμονος,πλεύμονος,πνεύμων,big_dict_nfkd
2406,καὶ πάλιν ἕτε ραι διὰ τῶν βουβώνων καὶ...,βουβώνων,βουβώνω,βουβών,big_dict_nfkd


In [26]:
# print lines in corrections_df where lemma_corrected is "NA"
corrections_df[corrections_df['lemma_corrected'] == 'NA']

Unnamed: 0,sentence,token,lemma,lemma_corrected,dictionary


In [27]:
# split docs to train, dev, test randomly
from sklearn.model_selection import train_test_split
from pathlib import Path


train_docs, test_docs = train_test_split(docs, test_size=0.2, random_state=42)
train_docs, dev_docs = train_test_split(train_docs, test_size=0.4, random_state=42)

print ("train_docs size:", len(train_docs))
print ("dev_docs size:", len(dev_docs))
print ("test_docs size:", len(test_docs))

# save each one to DocBin

Path("../corpus/train/lemma_train").mkdir(parents=True, exist_ok=True)
Path("../corpus/dev/lemma_dev").mkdir(parents=True, exist_ok=True)
Path("../corpus/test/lemma_test").mkdir(parents=True, exist_ok=True)


train_bin = DocBin(docs=train_docs)
train_bin.to_disk('../corpus/train/lemma_train/lemma_train.spacy')
test_bin = DocBin(docs=test_docs)
test_bin.to_disk('../corpus/test/lemma_test/lemma_test.spacy')
dev_bin = DocBin(docs=dev_docs)
dev_bin.to_disk('../corpus/dev/lemma_dev/lemma_dev.spacy')




train_docs size: 189
dev_docs size: 127
test_docs size: 79


The spacy dataset should be exported to '../corpus/' folder.\
More specifically:\
train to '..corpus/train/lemma_train/'\
dev to '../corpus/dev/lemma_dev/'\
test to '../corpus/test/lemma_test/

In [28]:
import os
from spacy.tokens import DocBin, Doc
# Merge all dataset files in the train folder

Merged_lemma_train_dataset = DocBin().from_disk("../corpus/train/lemma_train/lemma_train.spacy")
print(len(Merged_lemma_train_dataset))
# merge all .spacy files that are in the folder (except self) with base_db.spacy
for file in os.listdir("../corpus/train/lemma_train/"):
    if file.endswith(".spacy") and file != 'lemma_train.spacy':
        print(file)
        Merged_lemma_train_dataset.merge(
            DocBin().from_disk(f"../corpus/train/lemma_train/{file}")
        )
print(len(Merged_lemma_train_dataset))
# write the merged file to disk
Merged_lemma_train_dataset.to_disk("../corpus/train/lemma_train/Merged_lemma_train_dataset.spacy")


189
euripides_i_train_NFKD.spacy
train_lemma_NFKD.spacy
galen_train_NFKD.spacy
galen_train_NFKC.spacy
train_lemma_NFKC.spacy
grc_perseus-ud-train_NFKD.spacy
grc_proiel-ud-train_NFKD.spacy
aeschylus_ii_train_NFKC.spacy
aeschylus_ii_train_NFKD.spacy
euripides_i_train_NFKC.spacy
grc_proiel-ud-train_NKFC.spacy
grc_perseus-ud-train_NFKC.spacy
8189


In [29]:
import os
from spacy.tokens import DocBin, Doc
# Merge all dataset files in the dev folder

Merged_lemma_dev_dataset = DocBin().from_disk("../corpus/dev/lemma_dev/lemma_dev.spacy")
print(len(Merged_lemma_dev_dataset))
# merge all .spacy files that are in the folder (except self) with base_db.spacy
for file in os.listdir("../corpus/dev/lemma_dev/"):
    if file.endswith(".spacy") and file != 'lemma_dev.spacy':
        print(file)
        Merged_lemma_dev_dataset.merge(
            DocBin().from_disk(f"../corpus/dev/lemma_dev/{file}")
        )
print(len(Merged_lemma_dev_dataset))
# write the merged file to disk
Merged_lemma_dev_dataset.to_disk("../corpus/dev/lemma_dev/Merged_lemma_dev_dataset.spacy")


127
galen_dev_NFKC.spacy
grc_proiel-ud-dev_NKFC.spacy
grc_perseus-ud-dev_NFKC.spacy
dev_lemma_NFKD.spacy
dev_lemma_NFKC.spacy
aeschylus_ii_dev_NFKC.spacy
galen_dev_NFKD.spacy
grc_perseus-ud-dev_NFKD.spacy
grc_proiel-ud-dev_NFKD.spacy
aeschylus_ii_dev_NFKD.spacy
euripides_i_dev_NFKD.spacy
euripides_i_dev_NFKC.spacy
1237


In [30]:
import os
from spacy.tokens import DocBin, Doc
# Merge all dataset files in the dev folder

Merged_lemma_dev_dataset = DocBin().from_disk("../corpus/test/lemma_test/lemma_test.spacy")
print(len(Merged_lemma_dev_dataset))
# merge all .spacy files that are in the folder (except self) with base_db.spacy
for file in os.listdir("../corpus/test/lemma_test/"):
    if file.endswith(".spacy") and file != 'lemma_test.spacy':
        print(file)
        Merged_lemma_dev_dataset.merge(
            DocBin().from_disk(f"../corpus/test/lemma_test/{file}")
        )
print(len(Merged_lemma_dev_dataset))
# write the merged file to disk
Merged_lemma_dev_dataset.to_disk("../corpus/test/lemma_test/Merged_lemma_test_dataset.spacy")

79
grc_proiel-ud-test_NFKD.spacy
grc_perseus-ud-test_NFKC.spacy
test_lemma_NFKD.spacy
grc_proiel-ud-test_NKFC.spacy
grc_perseus-ud-test_NFKD.spacy
test_lemma_NFKC.spacy
1233


In [31]:
!python -m spacy debug data ../configs/transformer-full.cfg  --paths.train ../corpus/train/lemma_train/Merged_lemma_train_dataset.spacy --paths.dev ../corpus/dev/lemma_dev/Merged_lemma_dev_dataset.spacy --nlp.lang=grc

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[1m
Some weights of the model checkpoint at Jacobo/aristoBERTo were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly i

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"

In [None]:
!python -m spacy train ../configs/transformer-full.cfg  --output ../training/transformer/assembled --paths.train ../corpus/train/lemma_train/Merged_lemma_train_dataset.spacy --paths.dev ../corpus/dev/lemma_dev/Merged_lemma_dev_dataset.spacy --gpu-id 0 --nlp.lang=grc

In [None]:
!python -m spacy train ../configs/transformer-full.cfg  --output ../training/transformer/assembled --paths.train ../corpus/train/lemma_train/Merged_lemma_train_dataset.spacy --paths.dev ../corpus/dev/lemma_dev/Merged_lemma_dev_dataset.spacy --gpu-id 0 --nlp.lang=grc

## Evaluations

In [None]:
!python -m spacy benchmark accuracy ../training/transformer/assembled/model-best ../corpus/test/lemma_test/lemma_test.spacy --gpu-id 0 

In [None]:
# NER eval
!python -m spacy benchmark accuracy ../training/transformer/assembled/model-best ../corpus/test/grc_proiel-ud-test_NFKD.spacy --gpu-id 0 

In [None]:
# NER eval
!python -m spacy benchmark accuracy ../training/transformer/assembled/model-best ../corpus/test/ner_test/On_Anatomy_test.spacy --gpu-id 0 

Evaluation

In [None]:
# constant evaluation sentencs
#Gal PHP 1.7
text = "Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμιον, τὸ δὲ πολυφυὲς οὖλον"
#text = "ὧν πρώτη μέν ἐστι συζυγία τῶν ἐπὶ τὰς ὠμοπλάτας ἀναφερομένων, ἀφ' ὧν οὐκ ὀλίγη μοῖρα καὶ πρὸς τὸν νωτιαῖον εἴσω καταδύεται καὶ τοῖς ἐν τραχήλῳ μυσὶ διασπείρεται, δευτέρα δὲ τῶν εἰς τὰς χεῖρας φερομένων ἀρτηριῶν."
text2 = "ἐκφύεται μὲν γὰρ ἐκ τῆς ἀριστερᾶς  κοιλίας τῆς καρδίας ἀρτηρία μεγίστη, καθάπερ τι πρέμνον ἁπασῶν τῶν κατὰ τὸ ζῷον ἀρτηριῶν. ἀποβλαστάνει δὲ ἀπ' αὐτῆς πρῶτον μὲν ἡ τὴν καρδίαν περιστέφουσα· καὶ δὴ καὶ ὀνομάζουσιν οὕτως αὐτὴν οἷς μέλει τῶν ἀνατομῶν."
text3 = "καὶ φαίνεταί γε καθ' ὅλον τὸν θώρακα καὶ συναποτεινόμενά τε καὶ συγκατασχιζόμενα ταυτὶ τὰ δύο γένη τῶν ἀγγείων, εἴς τε τὸν ὑπεζωκότα τὰς πλευρὰς χιτῶνα καὶ τοὺς μεσοπλευρίους μῦς τρίτου τινὸς αὐτοῖς συναποφυομένου τε καὶ συναποτεινομένου καὶ συγκατασχιζομένου φλεβώδους ἀγγείου."
text4 = "Ταῦτ’ οὖν εἴς τε τὸν παρόντα καὶ τὸν ἑξῆς ἅπαντα λόγον οἷον ὑποθέσεις τινὰς τῶν ἀποδείξεων λαμβάνοντες ἐν ἑκάστῳ τῶν ὀργάνων τὴν ἐξ αὐτῶν ὠφέλειαν ἐροῦμεν ἀπὸ τῶν δακτύλων αὖθις ἀρξάμενοι."
text5 = 'Ἔχει δὲ δι’αὑτοῦ καὶ φλέβας τεταμένας'


In [None]:
import spacy
spacy.require_gpu()
import unicodedata
import pandas as pd


In [None]:
nlp1 = spacy.load("grc_proiel_trf")
nlp2 = spacy.load("../training/transformer/lemmatizer/model-best")

In [None]:
# Infer word lemma in sentence without normalization
text = text
doc = nlp1(text)
for token in doc:
    print(token.text, token.lemma_)
print('............')

# Infer word lemma in sentence with NFKD normalization
text_nfkd = unicodedata.normalize("NFKD", text)
doc = nlp1(text_nfkd)
for token in doc:
    print(token.text, token.lemma_)
print('............')
# Infer word lemma in sentence with NFKC normalization
text_nfkc = unicodedata.normalize("NFKC", text)
doc = nlp1(text_nfkc)
for token in doc:
    print(token.text, token.lemma_)


Text examples:
text4 = "Ταῦτ’ οὖν εἴς τε τὸν παρόντα καὶ τὸν ἑξῆς ἅπαντα λόγον οἷον ὑποθέσεις τινὰς τῶν ἀποδείξεων λαμβάνοντες ἐν ἑκάστῳ τῶν ὀργάνων τὴν ἐξ αὐτῶν ὠφέλειαν ἐροῦμεν ἀπὸ τῶν δακτύλων αὖθις ἀρξάμενοι."
text5 = 'Ἔχει δὲ δι’αὑτοῦ καὶ φλέβας τεταμένας'

In [None]:
# Infer word lemma in sentence without normalization
import pandas as pd

# check if the normalization is NFKD or NFKC
def line_norm(intext):
    if intext == unicodedata.normalize("NFKD", intext):
        return unicodedata.normalize("NFKD", intext)
    elif intext == unicodedata.normalize("NFKC", intext):
        return unicodedata.normalize("NFKC", intext)
    else:
        print('unknown normalization')
        return 'other'
    
# create a dataframe with the results
df = pd.DataFrame(columns=['text', 'M1 lemma', 'M2 lemma', 'M1 lemma_nfkd', 'M2 lemma_nfkd', 'M1 lemma_nfkc', 'M2 lemma_nfkc'])
# Infer word lemma in sentence without normalization
doc = nlp1(text3) 
# check if the normalization is NFKD or NFKC
if unicodedata.is_normalized('NFKD', doc[0].text):
    doc_norm = 'NFKD'
elif unicodedata.is_normalized('NFKC', doc[0].text):
    doc_norm = 'NFKC'
print(doc_norm)
# Infer word lemma in sentence without normalization
for token in doc:
    df = df.append({'text': token.text, 'M1 lemma': token.lemma_}, ignore_index=True)
doc = nlp2(text3)
for token in doc:
        df.loc[df['text'] == line_norm(token.text), 'M2 lemma'] = token.lemma_

# Infer word lemma in sentence with NFKD normalization
text_nfkd = unicodedata.normalize("NFKD", text3)
doc = nlp1(text_nfkd)
for token in doc:
    df.loc[df['text'] == unicodedata.normalize(doc_norm, token.text), 'M1 lemma_nfkd'] = token.lemma_
doc = nlp2(text_nfkd)
for token in doc:
    df.loc[df['text'] == unicodedata.normalize(doc_norm, token.text), 'M2 lemma_nfkd'] = token.lemma_

# Infer word lemma in sentence with NFKC normalization
text_nfkc = unicodedata.normalize("NFKC", text3)
doc = nlp1(text_nfkc)
for token in doc:
    df.loc[df['text'] == unicodedata.normalize(doc_norm, token.text), 'M1 lemma_nfkc'] = token.lemma_
doc = nlp2(text_nfkc)
for token in doc:
    df.loc[df['text'] == unicodedata.normalize(doc_norm, token.text), 'M2 lemma_nfkc'] = token.lemma_
df



In [None]:

doc_norm

In [None]:
doc[0]

In [None]:
doc = nlp1(text)
if doc[0].text == unicodedata.normalize("NFKD", doc[0].text):
    print('NFKD')

doc = nlp2(text)
if doc[0].text == unicodedata.normalize("NFKD", doc[0].text):
    print('NFKD')


In [None]:
doc = nlp2(text)
for token in doc:
        df.loc[df['text'] == unicodedata.normalize('NFKC', token.text), 'M2 lemma'] = token.lemma_

In [None]:
df

In [None]:
# Loading a spacy test corpus and using that for xplicit evaluations
# load corpus
from spacy.tokens import DocBin, Doc
docs = []
corpus = DocBin().from_disk("../Archive/corpus_latest_no_accents/test/lemma_test/lemma_test.spacy")
# doc = the first document in the corpus
for doc in corpus.get_docs(nlp1.vocab):
    #print(doc.text)
    docs.append(doc)