In [1]:
import os
import re
import spacy
import pandas as pd

In [3]:
# install spacy grc model if not already installed
nlp = spacy.load("grc_proiel_sm") # I use small model for speed but you should use trf (transformer) model for better accuracy




# Preprocess Text

In [4]:
FILE_PATH = "../assets/NER_assets/Ancient_Words.csv"
# read csv file
df = pd.read_csv(FILE_PATH)
# Preprocess Text

In [5]:
# rename columns to fit code
df.rename(columns = {'Word':'Keyword', 'Category Types':'Label'}, inplace = True)
# If a cell is empty (NaN), Fill it with the value in its parallel "Early" column
for row in df:
    df['Quote'].fillna(df['Early Quote'], inplace=True)
    df['Word Before'].fillna(df['Early Word Before'], inplace=True)
    df['Word After'].fillna(df['Early Word After'], inplace=True)
    df['Label'].fillna(df['Early Category Type'], inplace=True)
# remove rows with no Keyword
df = df.dropna(subset=['Keyword'])
# Remove any row that isn't Greek
pat = '[ء-ي]+'
#df.Keyword.str.contains(pat)
df = df[~df.Keyword.str.contains(pat, na=False)]
#replace new line in df column
df['Keyword'].replace('\n', '', regex=True, inplace=True)
#replace numbers in df
df.replace('\d+', '', regex=True, inplace=True)
#replace hyphens in df column
df.replace('-', '', regex=True, inplace=True)
# replace comma in df column
df['Keyword'].replace(',', '', regex=True, inplace=True)
#replace period in df column
df['Keyword'].replace('\.', '', regex=True, inplace=True)
#replace interpunkt in df column
df['Keyword'].replace('\·', '', regex=True, inplace=True)
# replace multiple spaces in df column
df.replace(' +', ' ', regex=True, inplace=True)
# replace end punctuation in df column
df['Keyword'].replace('\s+$', '', regex=True, inplace=True)

df.fillna(0)
df.reset_index(drop=True, inplace=True)


In [6]:
df.head(10)

Unnamed: 0,Keyword,Word Before,Word After,Quote,Label,Lemma,Early Category Type,Early Word,Early Word Before,Early Word After,Early Quote,Lemma arabic
0,οὖλον,δὲ πολυφυὲς,· σάρκινα δὲ,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,Body Part,οὖλον,Body Part,οὖλον,δὲ πολυφυὲς,· σάρκινα δὲ,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,
1,παρίσθμιον,τοῦ στόματος,", τὸ δὲ",Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,Body Part,παρίσθμιον,Body Part,παρίσθμιον,τοῦ στόματος,", τὸ δὲ",Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,
2,πολυφυὲς,τὸ δὲ,οὖλον· σάρκινα,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,Adjectives/Qualities,πολυφυής,Adjectives/Qualities,πολυφυὲς,τὸ δὲ,οὖλον· σάρκινα,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,
3,μόριον,δ’ ἄλλο,"σταφυλοφόρον, κίων","Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",Body Part,μόριον,Body Part,μόριον,δ’ ἄλλο,"σταφυλοφόρον, κίων","Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",
4,ὀδόντες,Ἐντὸς δ’,ὀστέινοι. Εἴσω,Ἐντὸς δ’ ὀδόντες ὀστέινοι,Body Part,ὀδούς,Body Part,ὀδόντες,Ἐντὸς δ’,ὀστέινοι. Εἴσω,Ἐντὸς δ’ ὀδόντες ὀστέινοι,
5,ὀστέινοι,δ’ ὀδόντες,. Εἴσω δ’,Ἐντὸς δ’ ὀδόντες ὀστέινοι,Adjectives/Qualities,ὀστέινος,Adjectives/Qualities,ὀστέινοι,δ’ ὀδόντες,. Εἴσω δ’,Ἐντὸς δ’ ὀδόντες ὀστέινοι,
6,σταφυλοφόρον,ἄλλο μόριον,", κίων ἐπίφλεβος·","Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",Adjectives/Qualities,σταφυλοφόρος,Adjectives/Qualities,σταφυλοφόρον,ἄλλο μόριον,", κίων ἐπίφλεβος·","Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",
7,ἐπίφλεβος,"σταφυλοφόρον, κίων",· ὃς ἐὰν,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",Adjectives/Qualities,ἐπίφλεβος,Adjectives/Qualities,ἐπίφλεβος,"σταφυλοφόρον, κίων",· ὃς ἐὰν,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",
8,κίων,"μόριον σταφυλοφόρον,",ἐπίφλεβος· ὃς,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",Adjectives/Qualities,κίων,Adjectives/Qualities,κίων,"μόριον σταφυλοφόρον,",ἐπίφλεβος· ὃς,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",
9,μέρος,Ἔτι προσώπου,τὸ μὲν,Ἔτι προσώπου μέρος τὸ μὲν ὂν τῷ πνευ...,Adjectives/Qualities,μέρος,Adjectives/Qualities,μέρος,Ἔτι προσώπου,τὸ μὲν,Ἔτι προσώπου μέρος τὸ μὲν ὂν τῷ πνευ...,


In [7]:
# if any of the fields "KeyWord", "Quote", "Word Before", "Word After" are "0", drop the row
for w in ['Keyword', 'Quote', 'Word Before', 'Word After']:
    df = df[df[w] != 0]


In [127]:
# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from typing import List
from spacy.language import Language
from spacy.tokens import Doc, DocBin, Span
from spacy.util import filter_spans
from unicodedata import normalize
import regex


# Convert the dataframe to Spacy Docs

In [159]:
d = {ord('\N{COMBINING ACUTE ACCENT}'):None, ord('\N{COMBINING COMMA ABOVE}'):None, ord('\N{COMBINING REVERSED COMMA ABOVE}'):None}

def find_word_index(sentence, word, word_before, word_after):
    """Find the index of a word in a sentence, it can appear multiple times but we return by word_before and word_after"""


    # find the index of the word in the sentence (return index of beginning of word)
    word_index = sentence.find(word)

    # eliminate all spaces
    sentence_blank, word_before_blank, word_after_blank, word_blank = sentence.replace(" ", ""), word_before.replace(" ", ""), word_after.replace(" ", ""), word.replace(" ", "")


    # return all chars of word before: [),.,·] (meaning none of these chars will be included) so we get only data contained within the sentence
    for char in [")", ".", "·"]:
        if word_before_blank.find(char) != -1:
            word_before_blank = word_before_blank[word_before_blank.find(char)+1:]
        if word_after_blank.find(char) != -1:
            word_after_blank = word_after_blank[:word_after_blank.find(char)]


    print ("cut_word_before_blank:", word_before_blank)
    print ("cut_word_after_blank:", word_after_blank)
    print ("sentence_blank:", sentence_blank)
    print ("word_blank:", word_blank)


    pattern = fr"{re.escape(word_before_blank)}({(word_blank)}){re.escape(word_after_blank)}"
    # add fuzzy matching with up to 3 mistakes
    pattern = fr"(?:{pattern}){{e<=3}}"
    # normalize (NFD) both strings and remove diacritics
    pattern = normalize('NFD', pattern).translate(d)

    sentence_blank = normalize('NFD', sentence_blank).translate(d)


    print ("pattern:", pattern)
    print ("sentence_blank:", sentence_blank)

    match = regex.search(pattern, sentence_blank)
    try:
        print ("match:", match.span(1))
        return match.span(1)
    except:

        return None







In [160]:
# get first row of df
row = df.iloc[0]
# check find_word_index function
find_word_index(row['Quote'], row['Keyword'], row['Word Before'], row['Word After'])

cut_word_before_blank: δὲπολυφυὲς
cut_word_after_blank: 
sentence_blank: Καὶτὸμὲνδιφυὲςτοῦστόματοςπαρίσθμιον,τὸδὲπολυφυὲςοὖλον
word_blank: οὖλον
pattern: (?:δὲπολυφυὲς(οῦλον)){e<=3}
sentence_blank: Καὶτὸμὲνδιφυὲςτοῦστοματοςπαρισθμιον,τὸδὲπολυφυὲςοῦλον
match: (56, 62)


(56, 62)

In [161]:
# using word before and word after, we want to bring our data into the following format:
# TRAIN_DATA = [ (TEXT AS A STRING, {“entities”: [(START, END, LABEL)]}) ]

TRAIN_DATA = []
# create new train data df and for each line in original df, create line: (text, keyword, index(using find_word_index), label)

train_data_df = pd.DataFrame(columns=['text', 'keyword', 'index', 'label'])
for index, row in df.iterrows():

    # check if index found is not None
    train_data_df = train_data_df.append({'text': row['Quote'], 'keyword': row['Keyword'], 'index': find_word_index(row['Quote'], row['Keyword'], row['Word Before'], row['Word After']), 'label': row['Label']}, ignore_index=True)
    print ("******")

cut_word_before_blank: δὲπολυφυὲς
cut_word_after_blank: 
sentence_blank: Καὶτὸμὲνδιφυὲςτοῦστόματοςπαρίσθμιον,τὸδὲπολυφυὲςοὖλον
word_blank: οὖλον
pattern: (?:δὲπολυφυὲς(οῦλον)){e<=3}
sentence_blank: Καὶτὸμὲνδιφυὲςτοῦστοματοςπαρισθμιον,τὸδὲπολυφυὲςοῦλον
match: (56, 62)
******
cut_word_before_blank: τοῦστόματος
cut_word_after_blank: ,τὸδὲ
sentence_blank: Καὶτὸμὲνδιφυὲςτοῦστόματοςπαρίσθμιον,τὸδὲπολυφυὲςοὖλον
word_blank: παρίσθμιον
pattern: (?:τοῦστοματος(παρισθμιον),τὸδὲ){e<=3}
sentence_blank: Καὶτὸμὲνδιφυὲςτοῦστοματοςπαρισθμιον,τὸδὲπολυφυὲςοῦλον
match: (30, 40)
******
cut_word_before_blank: τὸδὲ
cut_word_after_blank: οὖλον
sentence_blank: Καὶτὸμὲνδιφυὲςτοῦστόματοςπαρίσθμιον,τὸδὲπολυφυὲςοὖλον
word_blank: πολυφυὲς
pattern: (?:τὸδὲ(πολυφυὲς)οῦλον){e<=3}
sentence_blank: Καὶτὸμὲνδιφυὲςτοῦστοματοςπαρισθμιον,τὸδὲπολυφυὲςοῦλον
match: (47, 56)
******
cut_word_before_blank: δ’ἄλλο
cut_word_after_blank: σταφυλοφόρο

In [41]:
train_data_df.head(10)

Unnamed: 0,text,keyword,index,label
0,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,οὖλον,,Body Part
1,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,παρίσθμιον,,Body Part
2,Καὶ τὸ μὲν διφυὲς τοῦ στόματος παρίσθμι...,πολυφυὲς,,Adjectives/Qualities
3,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",μόριον,-1.0,Body Part
4,Ἐντὸς δ’ ὀδόντες ὀστέινοι,ὀδόντες,,Body Part
5,Ἐντὸς δ’ ὀδόντες ὀστέινοι,ὀστέινοι,,Adjectives/Qualities
6,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον , κίων...",σταφυλοφόρον,,Adjectives/Qualities
7,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",ἐπίφλεβος,,Adjectives/Qualities
8,"Εἴσω δ’ ἄλλο μόριον σταφυλοφόρον, κίων ...",κίων,,Adjectives/Qualities
9,Ἔτι προσώπου μέρος τὸ μὲν ὂν τῷ πνευ...,μέρος,,Adjectives/Qualities
