In [1]:
import os
import re
import spacy
import pandas as pd

# Preprocess Text

In [2]:
FILE_PATH = "../assets/NER_assets/Ancient_Words.csv"
# read csv file
df = pd.read_csv(FILE_PATH)

In [89]:
df.columns

Index(['Keyword', 'Word Before', 'Word After', 'Quote', 'Label', 'Lemma',
       'Early Category Type', 'Early Word', 'Early Word Before',
       'Early Word After', 'Early Quote', 'Lemma arabic'],
      dtype='object')

In [3]:
# rename columns to fit code
df.rename(columns = {'Word':'Keyword', 'Category Types':'Label'}, inplace = True)
# If a cell is empty (NaN), Fill it with the value in its parallel "Early" column
for row in df:
    df['Quote'].fillna(df['Early Quote'], inplace=True)
    df['Word Before'].fillna(df['Early Word Before'], inplace=True)
    df['Word After'].fillna(df['Early Word After'], inplace=True)
    df['Label'].fillna(df['Early Category Type'], inplace=True)
# remove rows with no Keyword
df = df.dropna(subset=['Keyword'])
# Remove any row that isn't Greek
pat = '[ء-ي]+'
#df.Keyword.str.contains(pat)
df = df[~df.Keyword.str.contains(pat, na=False)]
#replace new line in df column
df['Keyword'].replace('\n', '', regex=True, inplace=True)
#replace numbers in df
df.replace('\d+', '', regex=True, inplace=True)
#replace hyphens in df column
df.replace('-', '', regex=True, inplace=True)
# replace comma in df column
df['Keyword'].replace(',', '', regex=True, inplace=True)
#replace period in df column
df['Keyword'].replace('\.', '', regex=True, inplace=True)
#replace interpunkt in df column
df['Keyword'].replace('\·', '', regex=True, inplace=True)
# replace multiple spaces in df column
df.replace(' +', ' ', regex=True, inplace=True)
# replace end punctuation in df column
df['Keyword'].replace('\s+$', '', regex=True, inplace=True)

df.fillna(0)
df.reset_index(drop=True, inplace=True)


In [4]:
df.head(10)

Unnamed: 0,Keyword,Word Before,Word After,Quote,Label,Lemma,Early Category Type,Early Word,Early Word Before,Early Word After,Early Quote,Lemma arabic
0,οὖλον,δὲ πολυφυὲς,· σάρκινα δὲ,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,Body Part,οὖλον,Body Part,οὖλον,δὲ πολυφυὲς,· σάρκινα δὲ,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,
1,παρίσθμιον,τοῦ στόματος,", τὸ δὲ",Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,Body Part,παρίσθμιον,Body Part,παρίσθμιον,τοῦ στόματος,", τὸ δὲ",Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,
2,πολυφυὲς,τὸ δὲ,οὖλον· σάρκινα,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,Adjectives/Qualities,πολυφυής,Adjectives/Qualities,πολυφυὲς,τὸ δὲ,οὖλον· σάρκινα,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,
3,μόριον,δ’ ἄλλο,"σταφυλοφόρον, κίων","Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",Body Part,μόριον,Body Part,μόριον,δ’ ἄλλο,"σταφυλοφόρον, κίων","Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",
4,ὀδόντες,Ἐντὸς δ’,ὀστέινοι. Εἴσω,Ἐντὸς δ’ ὀδόντες ὀστέινοι,Body Part,ὀδούς,Body Part,ὀδόντες,Ἐντὸς δ’,ὀστέινοι. Εἴσω,Ἐντὸς δ’ ὀδόντες ὀστέινοι,
5,ὀστέινοι,δ’ ὀδόντες,. Εἴσω δ’,Ἐντὸς δ’ ὀδόντες ὀστέινοι,Adjectives/Qualities,ὀστέινος,Adjectives/Qualities,ὀστέινοι,δ’ ὀδόντες,. Εἴσω δ’,Ἐντὸς δ’ ὀδόντες ὀστέινοι,
6,σταφυλοφόρον,ἄλλο μόριον,", κίων ἐπίφλεβος·","Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",Adjectives/Qualities,σταφυλοφόρος,Adjectives/Qualities,σταφυλοφόρον,ἄλλο μόριον,", κίων ἐπίφλεβος·","Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",
7,ἐπίφλεβος,"σταφυλοφόρον, κίων",· ὃς ἐὰν,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",Adjectives/Qualities,ἐπίφλεβος,Adjectives/Qualities,ἐπίφλεβος,"σταφυλοφόρον, κίων",· ὃς ἐὰν,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",
8,κίων,"μόριον σταφυλοφόρον,",ἐπίφλεβος· ὃς,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",Adjectives/Qualities,κίων,Adjectives/Qualities,κίων,"μόριον σταφυλοφόρον,",ἐπίφλεβος· ὃς,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",
9,μέρος,Ἔτι προσώπου,τὸ μὲν,Ἔτι προσώπου μέρος τὸ μὲν ὂν τῷ πνευ...,Adjectives/Qualities,μέρος,Adjectives/Qualities,μέρος,Ἔτι προσώπου,τὸ μὲν,Ἔτι προσώπου μέρος τὸ μὲν ὂν τῷ πνευ...,


In [5]:
# if any of the fields "KeyWord", "Quote", "Word Before", "Word After" are "0", drop the row
for w in ['Keyword', 'Quote', 'Word Before', 'Word After']:
    df = df[df[w] != 0]


In [6]:
# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from typing import List
from spacy.tokens import Doc, DocBin
from unicodedata import normalize
import random


Please notice the above warning about using a deprecated method, and change accordingly.\
<code>FutureWarning: The frame.append method is deprecated and will be removed from pandas in a future version. Use pandas.concat instead.</code>


# Create dictionaries from dendrosearch and conllu files (supplied by Jackobo)

In [75]:
PUNCTUATION = ['.', ")", ".", "·", "(", "[", "]", ":", ";", ",", "?", "!", "،", "_"]
# extract from df a dictionary {word: lemma}
lemma_dict = {}
for index, row in df.iterrows():
    lemma_dict[row['Keyword']] = row['Lemma']

# load dendrosearch lemma dictionary
dendrosearch_lemma_dict = {}
with open('../assets/dendrosearch_lemma_dict.txt', 'r', encoding='utf-8') as f:
    for line in f:
        line = line.split()
        # check if not punctuation
        if len(line) > 1 and line[0] not in PUNCTUATION:
            dendrosearch_lemma_dict[line[0]] = line[1]

# create dictionary from all conllu files
PATH = "../assets/Lemmatization_training_files/"
conllu_lemma_dict = {}

# iterate over all files in directory
for f in os.listdir(PATH):
    if f.endswith(".conllu"):
        # if file is a conllu file
        with open(os.path.join(PATH, f), 'r', encoding='utf-8') as f:
            for line in f:
                # conll line is: id | keyword | lemma | pos | _
                # we want only keyword and lemma
                line = line.split()
                if len(line) > 2 and line[1] not in PUNCTUATION:
                    conllu_lemma_dict[line[1]] = line[2]



1. Error fix: I added above the line:\
<code>if f.endswith(".conllu")</code>\
Without it it gets an error because it tries to read folders and other files as well.\ :)
2. Why do you add the conllu to dictionary? ANSWER: I thought they might be useful for lemmatization, anyways they can't hurt, and if you don't want them you can just ignore them.

## Create dictionary from INCEpTION files

In [92]:
from cassis import *
import zipfile
import tempfile
import os
from tqdm import tqdm


inception_dict = {}
inception_sentences = [] # list of tuples (sentence, source_file)

In [93]:
# extract all files in inception folder to temp folder
with tempfile.TemporaryDirectory() as tempdir:
    for f in os.listdir("../assets/NER_assets/INCEpTION_files/"):
        if f.endswith(".zip"):
            with zipfile.ZipFile(os.path.join("../assets/NER_assets/INCEpTION_files/", f), 'r') as zip_ref:
                zip_ref.extractall(tempdir)
    print (tempdir)
    # open typesystem and print content
    with open('{0}/{1}'.format(tempdir, "TypeSystem.xml"), 'rb') as f:
        typesystem = load_typesystem(f)

    # iterate over all files in temp folder
    for f in os.listdir(tempdir):
        # if file is a xmi file
        print (typesystem)
        if f.endswith(".xmi"):
            # load xmi file
            with open(os.path.join(tempdir, f), 'rb') as f:
                # load typesystem from temp folder
                cas = load_cas_from_xmi(f, typesystem=typesystem)
                for token in cas.select('de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Lemma'):
                    inception_dict[token.get_covered_text()] = token.value
                for sentence in cas.select("de.tudarmstadt.ukp.dkpro.core.api.segmentation.type.Sentence"):
                    # use os to get only file name
                    inception_sentences.append((sentence.get_covered_text(), os.path.basename(f.name)))


C:\Users\roeyz\AppData\Local\Temp\tmps779gh8g
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7CE50>
<cassis.typesystem.TypeSystem object at 0x0000017D7FF7

In [94]:
# print random sentence from inception_sentences
print (random.choice(inception_sentences))

('ἄρθρα 30 \r\nτῶν πλευρέων, τὰ μὲν ὄπισθεν τοῦ σώματος πρὸς τοὺς σπονδύλους, τὰ\r\nδ᾽ ἔμπροσθεν ἐν τῷ στέρνῳ πρὸς ἑωυτάς.', 'hippocrates places in man 6.1-2.xmi')


In [95]:
# create big dict df with word, lemma, source
big_dict = {
    'Conllu': conllu_lemma_dict,  # files from Jackobo
    'Inception': inception_dict,
    'Coda': lemma_dict,
    'Dendrosearch': dendrosearch_lemma_dict
}



In [96]:
# create two copies of dictionary, one in NFD, other in NFKC

# remove all nan values from big_dict
# big_dict is a dictionary of dictionaries

big_dict = {k: big_dict[k] for k in big_dict if big_dict[k]}
big_dict_nfkd = {}
big_dict_nfkc = {}

for source in big_dict:
    big_dict_nfkd[source] = {}
    big_dict_nfkc[source] = {}
    for word in big_dict[source]:
        # if word and lemma are not nan: if
        if word and not pd.isnull(word) and big_dict[source][word] and not pd.isnull(big_dict[source][word]):
            big_dict_nfkd[source][normalize('NFKD', word)] = normalize('NFD', big_dict[source][word])
            big_dict_nfkc[source][normalize('NFKC', word)] = normalize('NFKC', big_dict[source][word])


1. Notice the error above: TypeError: normalize() argument 2 must be str, not float
This is probably due to havving numbers as values in dictionary. NOTE: this in due to nan values in dictionary, anyways there are only 6 of them so I just removed them.
2. Why NFD and not NFKD? No reason , you can use NFKD if you want.

# Run NLP pipeline on INCEpTION and Coda files

In [None]:
# install spacy grc model if not already installed
nlp = spacy.load("grc_proiel_sm") # I use small model for speed but you should use trf (transformer) model for better accuracy


In [87]:
# merge sentences from inception and coda, keep record of source
sentences = []
for sentence in inception_sentences:
    sentences.append((sentence[0], sentence[1]))

# add coda sentences(from original df) to sentences list
for sentence in df['Quote'].tolist():
    sentences.append((sentence, 'Coda'))


In [111]:

# create list of Doc objects
docs_nfkd: List[Doc] = []

# create df to record corrections
corrections_df_nfkd = pd.DataFrame(columns=['sentence', 'source', 'token', 'lemma', 'lemma_corrected', 'correction_source'])
corrected_sentences = 0

for sentence in tqdm(sentences):
    sentence = (normalize('NFKD', sentence[0]), sentence[1])
    doc = nlp(sentence[0])


    for token in doc:
        for source in big_dict_nfkd: # each source is a dictionary
            if token.text in big_dict_nfkd[source]: # if token is in dictionary
                if big_dict_nfkd[source][token.text] != token.lemma_: # if lemma is not the same as the one in the dictionary
                    corrections_df_nfkd = corrections_df_nfkd.append({
                        'sentence': sentence[0],
                        'source': sentence[1],
                        'token': token.text,
                        'lemma': token.lemma_,
                        'lemma_corrected': big_dict_nfkd[source][token.text],
                        'correction_source': source
                    }, ignore_index=True)
                    corrected_sentences += 1
                    token.lemma_ = big_dict_nfkd[source][token.text]

                    break

    docs_nfkd.append(doc)

100%|██████████| 100/100 [00:06<00:00, 14.97it/s]


In [117]:
# filter by correction source: Coda Annotation
corrections_df_nfkd.sample(10)

Unnamed: 0,sentence,source,token,lemma,lemma_corrected,correction_source
1226,ἡ δ’ ἑτέρη ἄνω τείνει διὰ τῶν φρενῶν ...,Coda,καὶ,καὶ,καί,Conllu
1515,Ἕτερον δὲ μέρος ἀπὸ τῶν ἀριστερῶν τῆ...,Coda,ἀποσχισθὲν,ἀποσχισθὲν,ἀποσχίζω,Conllu
863,Ἔστι δ’ ἡ μὲν ἀρτηρία χονδρώδης τὴν φυ...,Coda,ᾗ,ᾗ,ὅς,Conllu
961,πλὴν ἐκείνη μὲν ἡ διὰ τοῦ ἥπατός ἐσ...,Coda,τεινούσης,τεινούσης,εἰς,Coda
882,Αἷμα δὲ πλεῖστον μὲν ὁ πλεύμων ἔχει τ...,Coda,πλεῖστον,πλεῖστος,πολύς,Conllu
1504,Καὶ περὶ ταῦτα τὰ μόρια πολλαὶ ἀπ’ αὐτ...,Coda,πολλαὶ,πολλαὶ,πολύς,Conllu
1448,"Ἀπὸ δὲ κώλου πέφυκεν ἀρχὸς λοίσθιος, σ...",On Anatomy (2).xmi,δακτυλίου,δακτυλίου,δακτύλιος,Conllu
1800,Καὶ φλέβες δ' ἐς αὐτὸν τείνουσιν ἐξ ἅ...,Coda,δ',δ',δέ,Conllu
5,ἐξ ἅπαντος γὰρ τοῦ σώματος φλέβες ἐς α...,Coda,φλέβες,φλέβες,φλέψ,Conllu
1616,Εἰς δὲ τὸ ἧπαρ καὶ τὸν σπλῆνα οὐδεμί...,Coda,ἀπὸ,ἀπὸ,ἀπό,Conllu


Again, the deprecation warning; I don't get these deprecation warnings.

In [122]:
# create list of Doc objects
docs_nfkc: List[Doc] = []

# create df to record corrections
corrections_df_nfkc = pd.DataFrame(columns=['sentence', 'source', 'token', 'lemma', 'lemma_corrected', 'correction_source'])
corrected_sentences = 0


for sentence in tqdm(sentences):
    sentence = (normalize('NFKC', sentence[0]), sentence[1])
    doc = nlp(sentence[0])

    # we search train_df for the sentence and add keywords to the doc


    for token in doc:
        for source in big_dict_nfkc:
            if token.text in big_dict_nfkc[source]:
                if big_dict_nfkc[source][token.text] != token.lemma_:
                    corrections_df_nfkc = corrections_df_nfkc.append({
                        'sentence': sentence[0],
                        'source': sentence[1],
                        'token': token.text,
                        'lemma': token.lemma_,
                        'lemma_corrected': big_dict_nfkc[source][token.text],
                        'correction_source': source
                    }, ignore_index=True)
                    corrected_sentences += 1
                    token.lemma_ = big_dict_nfkc[source][token.text]

                    break

    docs_nfkc.append(doc)




100%|██████████| 100/100 [00:05<00:00, 17.99it/s]


In [123]:
corrections_df_nfkc.sample(10)

Unnamed: 0,sentence,source,token,lemma,lemma_corrected,correction_source
574,Ἐπὶ δὲ θάτερα καθήκει εἰς τὸ μεταξὺ τοῦ πλεύμο...,Coda,δὲ,δέ,δὲ,Dendrosearch
226,"ἐνίοις μὲν γὰρ εὐρύτερον τὸ πρὸς τῇ κοιλίᾳ, τὸ...",Coda,δὲ,δέ,δὲ,Dendrosearch
384,Τούτου δ' αἴτιον ὅτι ἐν μὲν τοῖς ἐξ ἀνάγκης ἔχ...,Coda,αἴτιον,τις,αἴτιος,Conllu
531,Ἔστι δ’ ἡ μὲν ἀρτηρία χονδρώδης τὴν φύσιν καὶ ...,Coda,διὰ,διά,διὰ,Inception
317,"καὶ ἀπὸ μιᾶς δύο ἐστὶ μόρια τῆς ἀρτηρίας, εἰς ...",Coda,δύο,γε,δύο,Conllu
314,Ἔστι δ’ ἡ ἀορτὴ ἀπὸ μὲν τῆς καρδίας ἀγομένη εὖ...,Coda,ἐπιστενοτέρα,ἐπιστενοτέρα,ἐπίστενος,Coda
327,ἡ δ’ ἑτέρη ἄνω τείνει διὰ τῶν φρενῶν καὶ τοῦ π...,Coda,φρενῶν,φρενή,φρήν,Conllu
443,"Εἶτ' ἐντεῦθεν πάλιν, ὥσπερ ἀπὸ τῆς ἄνω κοιλίας...",Coda,ἕλικα,ἕλιξ,ἕλιξ,Coda
602,Ἔστι δ’ ἡ μὲν ἀρτηρία χονδρώδης τὴν φύσιν καὶ ...,Coda,κατὰ,κατά,κατὰ,Inception
260,"διότι οὐκ αἰεὶ κατὰ τωὐτὸ τῆς κεφαλὴς ἀλγεῖ, ἀ...",Coda,δὲ,δέ,δὲ,Dendrosearch


In [124]:
# find how many corrected by each dictionary
corrections_df_nfkd.groupby('correction_source').count()

Unnamed: 0_level_0,sentence,source,token,lemma,lemma_corrected
correction_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Coda,307,307,307,307,307
Conllu,1562,1562,1562,1562,1562
Dendrosearch,34,34,34,34,34
Inception,5,5,5,5,5


In [125]:
corrections_df_nfkc.groupby('correction_source').count()

Unnamed: 0_level_0,sentence,source,token,lemma,lemma_corrected
correction_source,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Coda,291,291,291,291,291
Conllu,155,155,155,155,155
Dendrosearch,110,110,110,110,110
Inception,69,69,69,69,69


In [351]:
# split docs to train, dev, test randomly
from sklearn.model_selection import train_test_split
from pathlib import Path

# split docs to train, dev, test randomly, for each normalization

train_docs_nfkd, test_docs_nfkd = train_test_split(docs_nfkd, test_size=0.2, random_state=42)
train_docs_nfkd, dev_docs_nfkd = train_test_split(train_docs_nfkd, test_size=0.2, random_state=42)

train_docs_nfkc, test_docs_nfkc = train_test_split(docs_nfkc, test_size=0.2, random_state=42)
train_docs_nfkc, dev_docs_nfkc = train_test_split(train_docs_nfkc, test_size=0.2, random_state=42)

print (f"train: {len(train_docs_nfkd)}\ndev: {len(dev_docs_nfkd)}\ntest: {len(test_docs_nfkd)} for nfkd")
print (f"train: {len(train_docs_nfkc)}\ndev: {len(dev_docs_nfkc)}\ntest: {len(test_docs_nfkc)} for nfkc")
# save each one to DocBin

Path("../corpus/train").mkdir(parents=True, exist_ok=True)
Path("../corpus/dev").mkdir(parents=True, exist_ok=True)
Path("../corpus/test").mkdir(parents=True, exist_ok=True)


train_bin_nfkd = DocBin(docs=train_docs_nfkd)
train_bin_nfkd.to_disk("../corpus/train/train_lemma__nfkd.spacy")
test_bin_nfkd = DocBin(docs=test_docs_nfkd)
test_bin_nfkd.to_disk("../corpus/test/test_lemma__nfkd.spacy")
dev_bin_nfkd = DocBin(docs=dev_docs_nfkd)
dev_bin_nfkd.to_disk("../corpus/dev/dev_lemma__nfkd.spacy")

train_bin_nfkc = DocBin(docs=train_docs_nfkc)
train_bin_nfkc.to_disk("../corpus/train/train_lemma__nfkc.spacy")
test_bin_nfkc = DocBin(docs=test_docs_nfkc)
test_bin_nfkc.to_disk("../corpus/test/test_lemma__nfkc.spacy")
dev_bin_nfkc = DocBin(docs=dev_docs_nfkc)
dev_bin_nfkc.to_disk("../corpus/dev/dev_lemma__nfkc.spacy")



train_docs size: 739
dev_docs size: 493
test_docs size: 308


The spacy dataset should be exported to '../corpus/' folder.\
More specifically:\
train to '..corpus/train/lemma_train/'\
dev to '../corpus/dev/lemma_dev/'\
test to '../corpus/test/lemma_test/