In [None]:
import json
import math
import os
import re
import unicodedata
import random
from operator import itemgetter
from pathlib import Path
from pprint import pprint
import pandas as pd
#!pip install --upgrade spacy
import spacy
from spacy.util import compounding, minibatch
from spacy import displacy
# Requires a GPU for spaCy (comment the next line out to run on CPU). Note - this will use transformer architecture
spacy.require_gpu()

In [None]:
# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from typing import List
from spacy.language import Language
from spacy.tokens import Doc, DocBin, Span
from spacy.util import filter_spans

# Load models for evaluations

In [None]:
import pandas as pd
from tqdm import tqdm
import unicodedata as ud
import warnings


class LemmaEvaluator:
    """Compare the lemma and NER predictions of two or three pipelines.

    Each ``nlp*`` argument is called with a string and must return an indexable,
    iterable sequence of tokens exposing ``lemma_`` and ``ent_type_`` (the
    notebook passes loaded spaCy pipelines). ``nlp3`` is optional; when it is
    ``None`` every report leaves the third model out.

    ``norm_method`` is the Unicode normalization form used by
    :meth:`clean_text` ('NFD', 'NFC', 'NFKD' or 'NFKC'). With ``None`` the text
    is passed through untouched and a ``UserWarning`` is emitted on construction.
    """

    # Normalization forms accepted by clean_text().
    _NORM_METHODS = ('NFD', 'NFC', 'NFKD', 'NFKC')

    def __init__(self, nlp1, nlp2, nlp3=None, norm_method=None):
        self.nlp1 = nlp1
        self.nlp2 = nlp2
        self.nlp3 = nlp3
        self.norm_method = norm_method

        # check if normalization method is specified
        if self.norm_method is None:
            warnings.warn("Normalization method not specified. Text may not be normalized correctly.", UserWarning)

    def _models(self):
        """Return the configured pipelines in order, skipping a missing third one."""
        models = [self.nlp1, self.nlp2]
        if self.nlp3 is not None:
            models.append(self.nlp3)
        return models

    @staticmethod
    def _lemma_at(parsed, index):
        """Return the lemma at ``index`` or ``None`` when the pipeline produced fewer tokens."""
        try:
            return parsed[index].lemma_
        except IndexError:
            return None

    @staticmethod
    def _first_ent_type(nlp, text):
        """Return the entity type of the first token ``nlp`` yields for ``text`` (``None`` if it yields none)."""
        parsed = nlp(text)
        return parsed[0].ent_type_ if len(parsed) else None

    @staticmethod
    def _ratio(part, whole):
        """Format ``part / whole`` as a percentage, or 'n/a' when there is nothing to divide by."""
        return f"{part / whole:.2%}" if whole else "n/a"

    @staticmethod
    def _breakdown(values, hits):
        """Build the ', Model k correct|incorrect (value)' suffix for one comparison row."""
        return "".join(
            f", Model {k} {'correct' if hit else 'incorrect'} ({value})"
            for k, (value, hit) in enumerate(zip(values, hits), start=1)
        )

    def evaluate_lemmas(self, docs):
        """Compare every model's lemma with the gold lemma of each token in ``docs``.

        Args:
            docs: sized iterable of gold documents; each needs ``.text`` and
                yields tokens with ``.i``, ``.text`` and ``.lemma_``.

        Returns:
            DataFrame with one row per gold token: Text, Token, Gold Lemma,
            one "Model k Lemma" column per configured model, and Result.

        Predictions are aligned to gold tokens purely by index (``token.i``).
        When a pipeline yields fewer tokens than the gold document the missing
        lemma is recorded as ``None`` (counted as incorrect).
        NOTE(review): index alignment silently compares the wrong tokens if a
        pipeline tokenizes differently but still yields enough tokens.
        """
        models = self._models()
        correct = [0] * len(models)
        same_lemmas = 0
        diff_lemmas = 0
        total_lemmas = 0

        data = []

        for doc in tqdm(docs, desc="Evaluating models", total=len(docs)):
            # Run each pipeline once per document instead of once per token.
            cleaned_text = self.clean_text(doc.text)
            parsed = [nlp(cleaned_text) for nlp in models]

            for token in doc:
                gold_lemma = self.clean_text(token.lemma_)
                lemmas = [self._lemma_at(pred, token.i) for pred in parsed]
                hits = [lemma == gold_lemma for lemma in lemmas]
                for k, hit in enumerate(hits):
                    correct[k] += hit

                # "same" requires *every* configured model to match the gold lemma.
                if all(hits):
                    result = "All lemmas are the same"
                    same_lemmas += 1
                else:
                    result = self._breakdown(lemmas, hits)
                    diff_lemmas += 1

                data.append([doc.text, token.text, gold_lemma, *lemmas, result])
                total_lemmas += 1

        model_ids = range(1, len(models) + 1)
        columns = ["Text", "Token", "Gold Lemma"] + [f"Model {k} Lemma" for k in model_ids] + ["Result"]
        df_evaluate = pd.DataFrame(data, columns=columns)
        print(df_evaluate)
        print(f"Total Lemmas: {total_lemmas}")
        print(f"Total same lemmas: {same_lemmas}")
        print(f"Total different lemmas: {diff_lemmas}")
        for k in model_ids:
            print(f"Total correct lemmas for Model {k}: {correct[k - 1]}")

        # calculate and print accuracy
        for k in model_ids:
            print(f"Model {k} accuracy: {self._ratio(correct[k - 1], total_lemmas)}")

        return df_evaluate

    def evaluate_line(self, line, model_name="Model"):
        """Run every model on one raw string and tabulate lemmas and NER tags per token.

        Args:
            line: text handed to the pipelines as-is (``clean_text`` is not applied).
            model_name: prefix used in the column names.

        Returns:
            DataFrame with "Gold Lemma", then "<model_name> k Lemma" and
            "<model_name> k NER" columns for each configured model.

        The "Gold Lemma" column holds the output of ``nlp1``; no annotated
        reference is involved here. The DataFrame constructor raises
        ``ValueError`` if the pipelines yield different numbers of tokens.
        """
        # One pass per pipeline; lemmas and NER tags are read from the same parse.
        parsed = [nlp(line) for nlp in self._models()]

        data = {"Gold Lemma": [token.lemma_ for token in parsed[0]]}
        for k, pred in enumerate(parsed, start=1):
            data[f"{model_name} {k} Lemma"] = [token.lemma_ for token in pred]
        # add NER labels to dataframe
        for k, pred in enumerate(parsed, start=1):
            data[f"{model_name} {k} NER"] = [token.ent_type_ for token in pred]

        df_evaluate_line = pd.DataFrame(data)
        print(df_evaluate_line)
        return df_evaluate_line

    def evaluate_ner(self, docs):
        """Compare every model's entity label with the gold label of each entity in ``docs``.

        Args:
            docs: sized iterable of gold documents; each needs ``.text``,
                ``.ents`` (items with ``.text``, ``.label_``, ``.start_char``)
                and yields tokens with ``.idx`` and ``.text``.

        Returns:
            DataFrame with one row per gold entity: Text, Entity, Gold Label,
            one "Model k Label" column per configured model, and Result.

        Each model is run on the cleaned text of the entity's first token only,
        and the entity type of the first predicted token is taken as its label.
        NOTE(review): the models therefore see no sentence context and
        multi-token entities are judged by their first token - confirm this is
        the intended protocol.
        """
        models = self._models()
        correct = [0] * len(models)
        same_ner = 0
        diff_ner = 0

        data = []

        for doc in tqdm(docs, desc="Evaluating models", total=len(docs)):
            for ent in doc.ents:
                gold_label = self.clean_text(ent.label_)

                # Token on which the gold entity starts; labels stay None if none matches.
                start_token = next((token for token in doc if token.idx == ent.start_char), None)
                if start_token is None:
                    labels = [None] * len(models)
                else:
                    token_text = self.clean_text(start_token.text)
                    labels = [self._first_ent_type(nlp, token_text) for nlp in models]

                hits = [label == gold_label for label in labels]
                for k, hit in enumerate(hits):
                    correct[k] += hit

                # Each entity lands in exactly one bucket, so same + diff == entity count.
                if all(hits):
                    result = "All NER labels are the same"
                    same_ner += 1
                else:
                    result = "NER labels are different" + self._breakdown(labels, hits)
                    diff_ner += 1

                data.append([doc.text, ent.text, gold_label, *labels, result])

        model_ids = range(1, len(models) + 1)
        columns = ["Text", "Entity", "Gold Label"] + [f"Model {k} Label" for k in model_ids] + ["Result"]
        df_evaluate_ner = pd.DataFrame(data, columns=columns)
        print(df_evaluate_ner)

        total = same_ner + diff_ner
        print(f"Total same NER labels: {same_ner} ({self._ratio(same_ner, total)})")
        print(f"Total different NER labels: {diff_ner} ({self._ratio(diff_ner, total)})")
        for k in model_ids:
            print(f"Model {k} accuracy: {self._ratio(correct[k - 1], total)}")

        return df_evaluate_ner

    def clean_text(self, text):
        """Strip bracketed spans and apply the configured Unicode normalization.

        Returns ``text`` unchanged when no normalization method is configured.
        Otherwise removes every ``(...)`` / ``[...]`` span (non-greedy, either
        closing bracket ends a span) and normalizes the remainder.

        Raises:
            ValueError: if ``norm_method`` is set to an unsupported form.
        """
        if self.norm_method is None:
            return text
        # Check if the normalization method is valid
        if self.norm_method not in self._NORM_METHODS:
            raise ValueError("Normalization method is not valid. Must be one of ['NFD', 'NFC', 'NFKD', 'NFKC'].")
        cleaned = re.sub(r"[\(\[].*?[\)\]]", "", text)
        return ud.normalize(self.norm_method, cleaned)

## Evaluate Line

In [None]:
# Compare three pipelines on one hand-picked sentence via LemmaEvaluator.evaluate_line.
# NOTE: this first assignment is overwritten by the last `line = ...` below; only
# that final sentence is evaluated. The commented alternatives are manual toggles.
line="Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸς καὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω"
#line = "Τείνει δὲ πρῶτον μὲν ἄνω ἀπὸ τῆς καρδίας τῆς μεγάλης φλεβὸς μόριον πρὸς τὸν πλεύμονα καὶ τὴν σύναψιν τῆς ἀορτῆς, ἄσχιστος καὶ μεγάλη οὖσα φλέψ"
#line = "'ᾗ δὲ συνήρτηται κοῖλόν ἐστιν."
line = " Ἔχει δὲ διαφορὰς πολλάς, καθάπερ ἡ κοιλία, καὶ τοῦτο τὸ μόριον"

# spacy / Doc / DocBin / Span are already imported in the first cells; repeated here
# so the cell can be run on its own.
import spacy
from spacy.tokens import Doc, DocBin, Span
# prefer_gpu (unlike the require_gpu call in the first cell) is a preference, not a requirement.
spacy.prefer_gpu()

#Lemma Evaluations:
# Two installed packages plus one locally trained pipeline loaded from a relative path.
nlp1 = spacy.load("grc_proiel_trf")
nlp2 = spacy.load("grc_odycy_joint_trf")
nlp3 = spacy.load('../training/grc_ud_proiel_trf_Lem_NER/model-best') #this is an old model from march 23

# No norm_method is passed, so the constructor emits its "Normalization method not
# specified" UserWarning; evaluate_line hands `line` to the pipelines unmodified.
evaluator = LemmaEvaluator(nlp1, nlp2, nlp3)

# evaluate_line prints the comparison table and returns it as a DataFrame.
evaluate_quote = evaluator.evaluate_line(line)


In [None]:
# Print, for every token of `line`, its text, entity IOB tag, entity type and lemma
# as predicted by a single pipeline.
line="Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸς καὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω"
#line = " Ἔχει δὲ διαφορὰς πολλάς, καθάπερ ἡ κοιλία, καὶ τοῦτο τὸ μόριον"

# NOTE: rebinds the global `nlp1`; an `evaluator` built in an earlier cell keeps the
# pipeline it was constructed with.
nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm/model-best')
linenlp = nlp1(line)
for token in linenlp:
    print(token.text, token.ent_iob_, token.ent_type_, "| ", token.lemma_)
# Alternative view: one line per predicted entity span instead of per token.
#for ent in linenlp.ents:
#    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
# Look up the gold entities of the document whose text is exactly `line`.
# NOTE: the first assignment (leading space) is immediately overwritten by the second.
# NOTE(review): the second string has no space between "φλεβὸς" and "καὶ" - presumably
# to match the corpus text byte-for-byte; confirm against the corpus.
line = " Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸς καὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω"
line = "Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸςκαὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω"
# NOTE(review): `docs` is not assigned in any cell above this one in the visible
# notebook (it is first assigned in the "Evaluate Lemmas" cell further down), so on a
# fresh Restart & Run All this cell raises NameError. Which corpus it is meant to
# search is not visible here.
# Print the text and the gold entities (text, character offsets, label) of the first
# matching document, then stop; prints nothing if no document matches.
for doc in docs:
    if doc.text == line:
        print(doc.text)
        for ent in doc.ents:
            print(ent.text, ent.start_char, ent.end_char, ent.label_)
        break

## Evaluate Lemmas

In [None]:
# Lemma evaluation of three locally trained pipelines against the NFKD lemma test set.
import spacy
from spacy.tokens import Doc, DocBin, Span
spacy.prefer_gpu()

#Lemma Evaluations:
#nlp1 = spacy.load("../training/ATLOMY_G_NER_pipeline/sm/model-best")
nlp2 = spacy.load("../training/ATLOMY_G_NER_pipeline/sm_3_sep/model-best") # this is a model from Sep 3
nlp3 = spacy.load('../training/grc_ud_proiel_trf_Lem_NER/model-best') #this is an old model from march 23
nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_29_sep/model-best') # NOTE(review): directory is sm_29_sep, but this comment previously said "Sep 15" - confirm which model is intended


# norm_method matches the normalization form named in the test file (NFKD).
evaluator = LemmaEvaluator(nlp1, nlp2, nlp3, norm_method='NFKD')

test_docs = DocBin().from_disk('../corpus/test/lemma_test/test_lemma_NFKD.spacy')
# Only the first 10 documents are evaluated.
docs = list(test_docs.get_docs(nlp1.vocab))[:10]

# Prints the per-token table and summary counts, and returns the table.
df_evaluate_lemmas = evaluator.evaluate_lemmas(docs)

In [None]:
# Load the NER dev set (NFKC) into `docs`, using the vocab of the sm_15_sep pipeline.
# NOTE: this rebinds the globals `nlp1` and `docs`; an `evaluator` created in an
# earlier cell still holds the pipelines it was constructed with.
nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_15_sep/model-best') # this is a model from Sep 15
test_docs = DocBin().from_disk('../corpus/dev/ner_dev/ner_dev_NFKC.spacy')
docs = list(test_docs.get_docs(nlp1.vocab))

In [None]:
# Show the current contents of `docs` as the cell output (the whole list is displayed).
docs
# Sample sentence kept for reference:
#ἀποκαμφθεῖσα δὲ κάτω ἐπὶ σπονδύλους καταβαίνει ἔστ ̓ ἂν ἀφίκηται

## Evaluate NER

In [None]:
# NER evaluation of three locally trained pipelines against the NFKC NER test set.
import spacy
from spacy.tokens import Doc, DocBin, Span
spacy.prefer_gpu()

#NER Evaluations:
nlp1 = spacy.load("../training/ATLOMY_G_NER_pipeline/sm_29_sep/model-best")
nlp2 = spacy.load("../training/ATLOMY_G_NER_pipeline/sm_3_sep/model-best")
# Alternative third models; exactly one `nlp3 = ...` line should be active.
#nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/A_GreekBert_sm/model-best')
#nlp3 = spacy.load('../training/grc_ud_proiel_trf_Lem_NER/model-best')
#nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_NFKD_only/model-best')
nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_15_sep/model-best')

# norm_method matches the normalization form named in the test file (NFKC).
evaluator = LemmaEvaluator(nlp1, nlp2, nlp3, norm_method='NFKC')

test_docs = DocBin().from_disk('../corpus/test/ner_test/ner_test_NFKC.spacy')
# Rebinds the global `docs` to the full NER test set.
docs = list(test_docs.get_docs(nlp1.vocab))

# Prints the per-entity table and summary counts, and returns the table.
df_evaluate_ner = evaluator.evaluate_ner(docs)