In [1]:
import json
import math
import os
import re
import unicodedata
import random
from operator import itemgetter
from pathlib import Path
from pprint import pprint
import pandas as pd
#!pip install --upgrade spacy
import spacy
from spacy.util import compounding, minibatch
from spacy import displacy
# Uncomment if you want Spacy to use GPU for training. Note - this will use transformer architecture
spacy.require_gpu()

True

In [2]:
# import requirements for converting the dataframe to Spacy Docs
from collections import defaultdict
from typing import List
from spacy.language import Language
from spacy.tokens import Doc, DocBin, Span
from spacy.util import filter_spans

# Load models for evaluations

In [3]:
#NEW?

import re
import unicodedata as ud
import warnings
from typing import List, Optional
from collections import Counter
from sklearn.metrics import precision_score, recall_score, f1_score

class LemmaEvaluator:
    def __init__(self, nlp_models: List, norm_method: Optional[str] = None):
        self.nlp_models = nlp_models
        self.norm_method = norm_method

        if self.norm_method is not None and self.norm_method not in ['NFD', 'NFC', 'NFKD', 'NFKC']:
            raise ValueError("Normalization method is not valid. Must be one of ['NFD', 'NFC', 'NFKD', 'NFKC'].")

    def evaluate_lemmas(self, docs):
        """
        Evaluate the lemmatization performance of the provided NLP models on the given documents.

        Args:
            docs (List[spacy.tokens.Doc]): A list of spaCy Doc objects representing the documents.

        Returns:
            None
        """
        cleaned_texts = [self.clean_text(doc.text) for doc in docs]
        gold_lemmas = [[token.lemma_ for token in doc] for doc in docs]

        all_lemmas = gold_lemmas + [self.get_predicted_lemmas(model, cleaned_texts) for model in self.nlp_models]
        mlb = self.create_multilabel_binarizer(all_lemmas)

        gold_lemmas_binary = mlb.transform([lemma for sublist in gold_lemmas for lemma in sublist])
        predicted_lemmas_binary = [mlb.transform([lemma for sublist in predicted_lemmas for lemma in sublist])
                                    for predicted_lemmas in all_lemmas[len(self.nlp_models):]]

        self.print_evaluation_metrics(gold_lemmas_binary, predicted_lemmas_binary)

    def get_predicted_lemmas(self, nlp_model, texts):
        """
        Get the predicted lemmas for the given texts using the provided NLP model.

        Args:
            nlp_model (callable): The NLP model to use for lemmatization.
            texts (List[str]): A list of text strings.

        Returns:
            List[List[str]]: A list of lists, where each inner list contains the predicted lemmas for the corresponding text.
        """
        return [[token.lemma_ for token in nlp_model(text)] for text in texts]

    def create_multilabel_binarizer(self, lemma_lists):
        """
        Create a MultiLabelBinarizer object and fit it on the provided lists of lemmas.

        Args:
            lemma_lists (List[List[str]]): A list of lists containing lemmas.

        Returns:
            sklearn.preprocessing.MultiLabelBinarizer: A fitted MultiLabelBinarizer object.
        """
        from sklearn.preprocessing import MultiLabelBinarizer

        mlb = MultiLabelBinarizer()
        mlb.fit([lemma for sublist in lemma_lists for lemma in sublist])
        return mlb

    def print_evaluation_metrics(self, gold_lemmas, predicted_lemmas):
        """
        Print the evaluation metrics (precision, recall, and F1-score) for the predicted lemmas.

        Args:
            gold_lemmas (List[List[int]]): A list of lists containing the binary representation of the gold lemmas.
            predicted_lemmas (List[List[List[int]]]): A list of lists containing the binary representation of the predicted lemmas for each model.
        """
        for i, predicted in enumerate(predicted_lemmas):
            print(f"Model {i + 1} precision: {precision_score(gold_lemmas, predicted, average='micro', zero_division=0):.2%}")
            print(f"Model {i + 1} recall: {recall_score(gold_lemmas, predicted, average='micro', zero_division=0):.2%}")
            print(f"Model {i + 1} F1-score: {f1_score(gold_lemmas, predicted, average='micro'):.2%}")

    def clean_text(self, text):
        """
        Clean the given text by removing text between parentheses and applying Unicode normalization if specified.

        Args:
            text (str): The input text to be cleaned.

        Returns:
            str: The cleaned text.
        """
        cleaned = re.sub(r"[\(\[].*?[\)\]]", "", text)
        if self.norm_method is not None:
            cleaned = ud.normalize(self.norm_method, cleaned)
        return cleaned

In [4]:
import pandas as pd
from tqdm import tqdm
import unicodedata as ud
import warnings
from sklearn.metrics import precision_score, recall_score, f1_score

class LemmaEvaluator:
    def __init__(self, nlp1, nlp2, nlp3=None, norm_method=None):
        self.nlp1 = nlp1
        self.nlp2 = nlp2
        self.nlp3 = nlp3
        self.norm_method = norm_method
        
        # check if normalization method is specified
        if self.norm_method is None:
            warnings.warn("Normalization method not specified. Text may not be normalized correctly.", UserWarning)


    def evaluate_lemmas(self, docs):
        texts = [self.clean_text(doc.text) for doc in tqdm(docs, desc="Cleaning texts")]
        doc_lengths = [len(doc) for doc in docs]
        
        # Vectorized processing for model 1
        correct_lemmas1 = [[token.lemma_ for token in self.nlp1(" ".join(texts))] for _ in tqdm(range(max(doc_lengths)), desc="Processing with model 1")]
        correct_lemmas1 = [correct_lemmas1[:length] for length in doc_lengths]
        
        # Vectorized processing for model 2
        correct_lemmas2 = [[token.lemma_ for token in self.nlp1(" ".join(texts))] for _ in tqdm(range(max(doc_lengths)), desc="Processing with model 2")]
        correct_lemmas2 = [correct_lemmas2[:length] for length in doc_lengths]
        
        if self.nlp3 is not None:
            # Vectorized processing for model 3
            correct_lemmas3 = [[token.lemma_ for token in self.nlp1(" ".join(texts))] for _ in tqdm(range(max(doc_lengths)), desc="Processing with model 3")]
            correct_lemmas3 = [correct_lemmas3[:length] for length in doc_lengths]
        
        gold_lemmas = [[token.lemma_ for token in doc] for doc in tqdm(docs, desc="Getting gold lemmas")]
        

        from sklearn.preprocessing import MultiLabelBinarizer

        mlb = MultiLabelBinarizer()

        all_lemmas = gold_lemmas + correct_lemmas1 + correct_lemmas2 + (correct_lemmas3 if self.nlp3 is not None else [])

        mlb.fit([lemma for sublist in all_lemmas for lemma in sublist])

        gold_lemmas = mlb.transform([lemma for sublist in gold_lemmas for lemma in sublist])
        correct_lemmas1 = mlb.transform([lemma for sublist in correct_lemmas1 for lemma in sublist])
        correct_lemmas2 = mlb.transform([lemma for sublist in correct_lemmas2 for lemma in sublist])
        correct_lemmas3 = mlb.transform([lemma for sublist in correct_lemmas3 for lemma in sublist]) if self.nlp3 is not None else []
        
        print(f"Model 1 precision: {precision_score(gold_lemmas, correct_lemmas1, average='micro', zero_division=1):.2%}")
        print(f"Model 2 precision: {precision_score(gold_lemmas, correct_lemmas2, average='micro', zero_division=1):.2%}")
        print(f"Model 3 precision: {precision_score(gold_lemmas, correct_lemmas3, average='micro', zero_division=1):.2%}")
        print(f"Model 1 recall: {recall_score(gold_lemmas, correct_lemmas1, average='micro', zero_division=1):.2%}")
        print(f"Model 2 recall: {recall_score(gold_lemmas, correct_lemmas2, average='micro', zero_division=1):.2%}")
        print(f"Model 3 recall: {recall_score(gold_lemmas, correct_lemmas3, average='micro', zero_division=1):.2%}")
        print(f"Model 1 F1-score: {f1_score(gold_lemmas, correct_lemmas1, average='micro', ):.2%}")
        print(f"Model 2 F1-score: {f1_score(gold_lemmas, correct_lemmas2, average='micro', ):.2%}")
        print(f"Model 3 F1-score: {f1_score(gold_lemmas, correct_lemmas3, average='micro', ):.2%}")
    
        return None
    
    def clean_text(self, text):
        # Check if the normalization method is valid
        if self.norm_method is not None and self.norm_method not in ['NFD', 'NFC', 'NFKD', 'NFKC']:
            raise ValueError("Normalization method is not valid. Must be one of ['NFD', 'NFC', 'NFKD', 'NFKC'].")
        elif self.norm_method is not None:
            cleaned = re.sub(r"[\(\[].*?[\)\]]", "", text)
            cleaned = ud.normalize(self.norm_method, cleaned)
        else:
            cleaned = text
        return cleaned

In [5]:
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

class NEREvaluator:
    def __init__(self, nlp1, nlp2, nlp3=None, norm_method=None):
        self.nlp1 = nlp1
        self.nlp2 = nlp2
        self.nlp3 = nlp3
        self.norm_method = norm_method
        
        # check if normalization method is specified
        if self.norm_method is None:
            warnings.warn("Normalization method not specified. Text may not be normalized correctly.", UserWarning)

    def evaluate_ner(self, docs):
        same_ner = 0
        diff_ner = 0
        correct_ner1 = []
        correct_ner2 = []
        correct_ner3 = []
        gold_labels = []
        predicted_labels1 = []
        predicted_labels2 = []
        predicted_labels3 = []
        data = []

        for doc in tqdm(docs, desc="Evaluating models", total=len(docs)):
            for ent in doc.ents:
                gold_label = self.clean_text(ent.label_)
                label1 = None
                label2 = None
                label3 = None
                result = ""
                for token in doc:
                    if token.idx == ent.start_char:
                        label1 = self.nlp1(self.clean_text(token.text)).ents[0].label_ if self.nlp1(self.clean_text(token.text)).ents else None
                        label2 = self.nlp2(self.clean_text(token.text)).ents[0].label_ if self.nlp2(self.clean_text(token.text)).ents else None
                        if self.nlp3 is not None:
                            label3 = self.nlp3(self.clean_text(token.text)).ents[0].label_ if self.nlp3(self.clean_text(token.text)).ents else None
                        break
                predicted_labels1.append(label1)
                predicted_labels2.append(label2)
                if self.nlp3 is not None:
                    predicted_labels3.append(label3)
                if label3 is None and label1 == label2 == gold_label:
                    result = "All NER labels are the same, correct label is {}".format(gold_label)
                    same_ner += 1
                    correct_ner1.append(1)
                    correct_ner2.append(1)
                elif label1 == label2 == label3 == gold_label:
                    result = "All NER labels are the same, correct label is {}".format(gold_label)
                    same_ner += 1
                    correct_ner1.append(1)
                    correct_ner2.append(1)
                    correct_ner3.append(1)
                else:
                    result = "NER labels are different, correct label is {}".format(gold_label)
                    diff_ner += 1
                    correct_ner1.append(int(label1 == gold_label))
                    correct_ner2.append(int(label2 == gold_label))
                    if self.nlp3 is not None:
                        correct_ner3.append(int(label3 == gold_label))
                gold_labels.append(gold_label)
                if self.nlp3 is not None:
                    data.append([doc.text, ent.text, gold_label, label1, label2, label3, result])
                else:
                    data.append([doc.text, ent.text, gold_label, label1, label2, result])

        if self.nlp3 is not None:
            df_evaluate_ner = pd.DataFrame(data, columns=["Text", "Entity", "Gold Label", "Model 1 Label", "Model 2 Label", "Model 3 Label", "Result"])
        else:
            df_evaluate_ner = pd.DataFrame(data, columns=["Text", "Entity", "Gold Label", "Model 1 Label", "Model 2 Label", "Result"])

        print(df_evaluate_ner)
        
        predicted_labels1 = ['None' if label is None else label for label in predicted_labels1]
        predicted_labels2 = ['None' if label is None else label for label in predicted_labels2]
        predicted_labels3 = ['None' if label is None else label for label in predicted_labels3] if self.nlp3 is not None else []
        
        total = same_ner + diff_ner
        print(f"Total same NER labels: {same_ner} ({same_ner/total:.2%})")
        print(f"Total different NER labels: {diff_ner} ({diff_ner/total:.2%})")
        print(f"Model 1 accuracy: {sum(correct_ner1)/len(correct_ner1):.2%}")
        print(f"Model 2 accuracy: {sum(correct_ner2)/len(correct_ner2):.2%}")
        if self.nlp3 is not None:
            print(f"Model 3 accuracy: {sum(correct_ner3)/len(correct_ner3):.2%}")
        print(f"Model 1 precision: {precision_score(gold_labels, predicted_labels1, average='weighted', zero_division=1):.2%}")
        print(f"Model 2 precision: {precision_score(gold_labels, predicted_labels2, average='weighted', zero_division=1):.2%}")
        if self.nlp3 is not None:
            print(f"Model 3 precision: {precision_score(gold_labels, predicted_labels3, average='weighted', zero_division=1):.2%}")
        print(f"Model 1 recall: {recall_score(gold_labels, predicted_labels1, average='weighted', zero_division=1):.2%}")
        print(f"Model 2 recall: {recall_score(gold_labels, predicted_labels2, average='weighted', zero_division=1):.2%}")
        if self.nlp3 is not None:
            print(f"Model 3 recall: {recall_score(gold_labels, predicted_labels3, average='weighted', zero_division=1):.2%}")
        print(f"Model 1 F1-score: {f1_score(gold_labels, predicted_labels1, average='weighted', zero_division=1):.2%}")
        print(f"Model 2 F1-score: {f1_score(gold_labels, predicted_labels2, average='weighted', zero_division=1):.2%}")
        if self.nlp3 is not None:
            print(f"Model 3 F1-score: {f1_score(gold_labels, predicted_labels3, average='weighted', zero_division=1):.2%}")

        return df_evaluate_ner

    def clean_text(self, text):
        # Check if the normalization method is valid
        if self.norm_method is not None and self.norm_method not in ['NFD', 'NFC', 'NFKD', 'NFKC']:
            raise ValueError("Normalization method is not valid. Must be one of ['NFD', 'NFC', 'NFKD', 'NFKC'].")
        elif self.norm_method is not None:
            cleaned = re.sub(r"[\(\[].*?[\)\]]", "", text)
            cleaned = ud.normalize(self.norm_method, cleaned)
        else:
            cleaned = text
        return cleaned

## Evaluate Line

In [None]:
line="Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸς καὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω"
#line = "Τείνει δὲ πρῶτον μὲν ἄνω ἀπὸ τῆς καρδίας τῆς μεγάλης φλεβὸς μόριον πρὸς τὸν πλεύμονα καὶ τὴν σύναψιν τῆς ἀορτῆς, ἄσχιστος καὶ μεγάλη οὖσα φλέψ"
#line = "'ᾗ δὲ συνήρτηται κοῖλόν ἐστιν."
line = " Ἔχει δὲ διαφορὰς πολλάς, καθάπερ ἡ κοιλία, καὶ τοῦτο τὸ μόριον"

import spacy
from spacy.tokens import Doc, DocBin, Span
spacy.prefer_gpu()

#Lemma Evaluations:
nlp1 = spacy.load("grc_proiel_trf")
nlp2 = spacy.load("grc_odycy_joint_trf")
nlp3 = spacy.load('../training/grc_ud_proiel_trf_Lem_NER/model-best') #this is an old model from march 23

evaluator = LemmaEvaluator(nlp1, nlp2, nlp3)

evaluate_quote = evaluator.evaluate_line(line)


In [None]:
line="Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸς καὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω"
#line = " Ἔχει δὲ διαφορὰς πολλάς, καθάπερ ἡ κοιλία, καὶ τοῦτο τὸ μόριον"

nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm/model-best')
linenlp = nlp1(line)
for token in linenlp:
    print(token.text, token.ent_iob_, token.ent_type_, "| ", token.lemma_)
#for ent in linenlp.ents:
#    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
line = " Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸς καὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω"
line = "Ἐξήρτηται δ' ἐκ τῆς μεγάλης φλεβὸςκαὶ τῆς ἀορτῆς, καὶ δι' αὐτοῦ φλέβες πολλαὶ καὶ πυκναί, κατατείνουσαι πρὸς τὴν τῶν ἐντέρων θέσιν, ἄνωθεν ἀρξάμεναι μέχρι κάτω"
# get gold entities for docs[1]
# if doc.text == line:
for doc in docs:
    if doc.text == line:
        print(doc.text)
        for ent in doc.ents:
            print(ent.text, ent.start_char, ent.end_char, ent.label_)
        break

## Evaluate Lemmas

In [None]:
def clean_text(text: str) -> str:
    """
    Cleans the given text by removing diacritics (accents), except for specific characters,
    and converting it to lowercase.
    """
    allowed_characters = [' ̓', "᾿", "᾽", "'", "’", "‘", 'ʼ']  # Including the Greek apostrophe
    if not isinstance(text, str):
        raise ValueError("Input must be a string.")
    try:
        non_accent_chars = [c for c in unicodedata.normalize('NFKD', text) 
        if unicodedata.category(c) != 'Mn' or c in allowed_characters]
    # Use str.lower() for converting to lowercase, which works for Unicode characters
        return ''.join(non_accent_chars).lower()
    
    except Exception as e:
        # A more generic exception handling if unexpected errors occur
        print(f"An error occurred: {e}")
        return text
    
def normalize_and_clean_text(text, normalization_forms, apply_cleaning=False):
    """
    Normalize and optionally clean the given text using specified Unicode normalization form.
    """
    normalized_text = {NORM: unicodedata.normalize(NORM, text) for NORM in normalization_forms}
    
    if apply_cleaning:
        for NORM in normalization_forms:
            normalized_text[NORM] = clean_text(normalized_text[NORM])
    
    return normalized_text



In [None]:
import spacy
from spacy.tokens import Doc, DocBin, Span
spacy.prefer_gpu()

#Lemma Evaluations:
#nlp1 = spacy.load("../training/ATLOMY_G_NER_pipeline/sm/model-best")
#nlp2 = spacy.load("../training/ATLOMY_G_NER_pipeline/sm_3_sep/model-best") # this is a model from Sep 3
#nlp3 = spacy.load('../training/grc_ud_proiel_trf_Lem_NER/model-best') #this is an old model from march 23
#nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_10_dec/model-best') # this is a model from Sep 15

nlp1 = spacy.load('../training/SageMaker/NER/ner-210324/model-best')
nlp2 = spacy.load('../training/SageMaker/NER/ner-130524/model-best')
nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_3_sep/model-best')

#nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_21_mar_2024/model-best') # this is a model with mar 24 dataset, but with pos model sm_lemmatizer_3_sep
#nlp2 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_21_mar_2024_new_pos_8_mar/model-best') # this is a model with mar 24 dataset and mar 8 pos model
#nlp3 = spacy.load('../training/SageMaker/NER/ner_21_mar_trf/model-best') # this is a model traine on sagemaker with mar 21 dataset


#evaluator = LemmaEvaluator(nlp1, nlp2, nlp3, norm_method='NFKD')
evaluator = LemmaEvaluator(nlp_models=[nlp1, nlp2, nlp3], norm_method='NFKD')
test_docs = DocBin().from_disk('../corpus/test/lemma_test/test_lemma_NFKD.spacy')
docs = list(test_docs.get_docs(nlp1.vocab))[:10]

df_evaluate_lemmas = evaluator.evaluate_lemmas(docs)

In [None]:
nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_15_sep/model-best') # this is a model from Sep 15
test_docs = DocBin().from_disk('../corpus/dev/ner_dev/ner_dev_NFKC.spacy')
docs = list(test_docs.get_docs(nlp1.vocab))

In [None]:
docs
#ἀποκαμφθεῖσα δὲ κάτω ἐπὶ σπονδύλους καταβαίνει ἔστ ̓ ἂν ἀφίκηται

## Evaluate NER

In [None]:
import spacy
from spacy.tokens import Doc, DocBin, Span
spacy.prefer_gpu()

#NER Evaluations:
#nlp1 = spacy.load("../training/ATLOMY_G_NER_pipeline/sm_10_dec/model-best")
#nlp2 = spacy.load("../training/ATLOMY_G_NER_pipeline/sm_3_sep/model-best")
#nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/A_GreekBert_sm/model-best')
#nlp3 = spacy.load('../training/grc_ud_proiel_trf_Lem_NER/model-best')
#nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_NFKD_only/model-best')
#nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_15_sep/model-best')

#nlp1 = spacy.load('../training/SageMaker/NER/ner_21_mar_trf/model-best')
#nlp2 = spacy.load('../training/SageMaker/NER/ner_11_feb_2024_trf/model-best')
#nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_15_sep/model-best')

nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_21_mar_2024/model-best') # this is a model with mar 24 dataset, but with pos model  sm_lemmatizer_3_sep
nlp2 = spacy.load('../training/ATLOMY_G_NER_pipeline/sm_21_mar_2024_new_pos_8_mar/model-best') # this is a model with mar 24 dataset and mar 8 pos model
nlp3 = spacy.load('../training/SageMaker/NER/ner-2103/model-best') # this is a model traine on sagemaker with mar 21 dataset

#ner_21_mar_trf

evaluator = NEREvaluator(nlp1, nlp2, nlp3, norm_method='NFKD')

test_docs = DocBin().from_disk('../corpus/test/ner_test/ner_test_NFKD.spacy')
docs = list(test_docs.get_docs(nlp1.vocab))

df_evaluate_ner = evaluator.evaluate_ner(docs)

In [44]:

#spanca evaluator#

class NEREvaluator:
    def __init__(self, nlp1, nlp2, nlp3=None, norm_method=None):
        self.nlp1 = nlp1
        self.nlp2 = nlp2
        self.nlp3 = nlp3
        self.norm_method = norm_method
        
        # check if normalization method is specified
        if self.norm_method is None:
            warnings.warn("Normalization method not specified. Text may not be normalized correctly.", UserWarning)

    def evaluate_SpanCat(self, docs):

        data = []
        same_ner = 0
        diff_ner = 0
        correct_ner1 = []
        correct_ner2 = []
        correct_ner3 = []
        gold_labels = []
        predicted_labels1 = []
        predicted_labels2 = []
        predicted_labels3 = []

        for doc in tqdm(docs, desc="Evaluating models", total=len(docs)):
            doc_text = doc.text
            doc1 = self.nlp1(doc_text)
            doc2 = self.nlp2(doc_text)
            doc3 = self.nlp3(doc_text) if self.nlp3 is not None else None

            # Get the gold spans, their start, end, and label
            gold_spans = []
            for span in doc.spans["sc"]:
                gold_spans.append({
                    "start": span.start_char,
                    "end": span.end_char,
                    "label": span.label_,
                    "token": doc.text[span.start_char:span.end_char]
                })

            for gold_span in gold_spans:
                gold_label = gold_span["label"]
                token = gold_span["token"]
                predicted_label1 = None
                predicted_label2 = None
                predicted_label3 = None

                span_group = doc1.spans["sc"]
                print(f"SpanGroup '{span_group.name}' contains {len(span_group)} spans:")
                for span in span_group:
                    print(f"- Span: '{span.text}' [{span.start_char}, {span.end_char}], Labels: {span.label_}")

                for span in doc1.spans["sc"]:
                    # print gold span details and predicted span details
                    print(f"Gold span: {token}, {gold_span['start']} - {gold_span['end']}, {gold_label}")
                    print(f"Predicted span: {span.text}, {span.start_char} - {span.end_char}, {span.label_}")
                    
                    if span.start_char == gold_span["start"] and span.end_char == gold_span["end"]:
                        print("Exact match")
                        predicted_label1 = span.label_
                        break
                    elif abs(span.start_char - gold_span["start"]) <= 5 and abs(span.end_char - gold_span["end"]) <= 5:
                        print("Approximate match")
                        predicted_label1 = span.label_
                        break

                for span in doc2.spans["sc"]:
                    if span.start_char == gold_span["start"] and span.end_char == gold_span["end"]:
                        predicted_label2 = span.label_
                        break
                    elif abs(span.start_char - gold_span["start"]) <= 5 and abs(span.end_char - gold_span["end"]) <= 5:
                        predicted_label2 = span.label_
                        break

                if doc3 is not None:
                    for span in doc3.spans["sc"]:
                        if span.start_char == gold_span["start"] and span.end_char == gold_span["end"]:
                            predicted_label3 = span.label_
                            break
                        elif abs(span.start_char - gold_span["start"]) <= 5 and abs(span.end_char - gold_span["end"]) <= 5:
                            predicted_label3 = span.label_
                            break

                # Analyze and compare labels
                if predicted_label1 == predicted_label2 == predicted_label3 == gold_label:
                    label_comparison = "All span labels are the same, correct label is {}".format(gold_label)
                    same_ner += 1
                else:
                    label_comparison = "Span labels are different, correct label is {}".format(gold_label)
                    diff_ner += 1
                    if predicted_label1 != gold_label:
                        label_comparison += ", Model 1 predicted: {}".format(predicted_label1)
                    if predicted_label2 != gold_label:
                        label_comparison += ", Model 2 predicted: {}".format(predicted_label2)
                    if predicted_label3 != gold_label and doc3 is not None:
                        label_comparison += ", Model 3 predicted: {}".format(predicted_label3)

                correct_ner1.append(predicted_label1 == gold_label)
                correct_ner2.append(predicted_label2 == gold_label)
                if doc3 is not None:
                    correct_ner3.append(predicted_label3 == gold_label)

                gold_labels.append(gold_label)
                predicted_labels1.append(predicted_label1)
                predicted_labels2.append(predicted_label2)
                if doc3 is not None:
                    predicted_labels3.append(predicted_label3)

                data.append({
                    "Text": doc.text,
                    "Token": token,
                    "Gold Label": gold_label,
                    "Model 1 Label": predicted_label1,
                    "Model 2 Label": predicted_label2,
                    "Model 3 Label": predicted_label3,
                    "Result": label_comparison
                })

        # Create a DataFrame from the data
        df_evaluate_spans = pd.DataFrame(data, columns=["Text", "Token", "Gold Label", "Model 1 Label", "Model 2 Label", "Model 3 Label", "Result"])
        
        predicted_labels1 = ['None' if label is None else label for label in predicted_labels1]
        predicted_labels2 = ['None' if label is None else label for label in predicted_labels2]
        predicted_labels3 = ['None' if label is None else label for label in predicted_labels3] if self.nlp3 is not None else []
        
        # Evaluation metrics
        total = same_ner + diff_ner
        print(f"Total same NER labels: {same_ner} ({same_ner/total:.2%})")
        print(f"Total different NER labels: {diff_ner} ({diff_ner/total:.2%})")
        print(f"Model 1 accuracy: {sum(correct_ner1)/len(correct_ner1):.2%}")
        print(f"Model 2 accuracy: {sum(correct_ner2)/len(correct_ner2):.2%}")
        if self.nlp3 is not None:
            print(f"Model 3 accuracy: {sum(correct_ner3)/len(correct_ner3):.2%}")
        print(f"Model 1 precision: {precision_score(gold_labels, predicted_labels1, average='weighted', zero_division=1):.2%}")
        print(f"Model 2 precision: {precision_score(gold_labels, predicted_labels2, average='weighted', zero_division=1):.2%}")
        if self.nlp3 is not None:
            print(f"Model 3 precision: {precision_score(gold_labels, predicted_labels3, average='weighted', zero_division=1):.2%}")
        print(f"Model 1 recall: {recall_score(gold_labels, predicted_labels1, average='weighted', zero_division=1):.2%}")
        print(f"Model 2 recall: {recall_score(gold_labels, predicted_labels2, average='weighted', zero_division=1):.2%}")
        if self.nlp3 is not None:
            print(f"Model 3 recall: {recall_score(gold_labels, predicted_labels3, average='weighted', zero_division=1):.2%}")
        print(f"Model 1 F1-score: {f1_score(gold_labels, predicted_labels1, average='weighted', zero_division=1):.2%}")
        print(f"Model 2 F1-score: {f1_score(gold_labels, predicted_labels2, average='weighted', zero_division=1):.2%}")
        if self.nlp3 is not None:
            print(f"Model 3 F1-score: {f1_score(gold_labels, predicted_labels3, average='weighted', zero_division=1):.2%}")

        return df_evaluate_spans
    

    def clean_text(self, text):
        # Check if the normalization method is valid
        if self.norm_method is not None and self.norm_method not in ['NFD', 'NFC', 'NFKD', 'NFKC']:
            raise ValueError("Normalization method is not valid. Must be one of ['NFD', 'NFC', 'NFKD', 'NFKC'].")
        elif self.norm_method is not None:
            cleaned = re.sub(r"[\(\[].*?[\)\]]", "", text)
            cleaned = ud.normalize(self.norm_method, cleaned)
        else:
            cleaned = text
        return cleaned

In [46]:
import spacy
from spacy.tokens import Doc, DocBin, Span
spacy.prefer_gpu()

#NER Evaluations:
nlp1 = spacy.load('../training/ATLOMY_G_NER_pipeline/spancat_210524/model-best')
nlp2 = spacy.load('../training/ATLOMY_G_NER_pipeline/spancat_210524_new/model-best')
nlp3 = spacy.load('../training/ATLOMY_G_NER_pipeline/spancat_210524_NFKD_batcher/model-best') # this is a model traine on sagemaker with mar 21 dataset

for nlp in [nlp1, nlp2, nlp3]:
    nlp.get_pipe("spancat").cfg["threshold"] = 0.25
    print(nlp.get_pipe("spancat").cfg)


#spancat = nlp1.add_pipe("spancat")
#spancat.cfg["spans_key"] = "sc"

evaluator = NEREvaluator(nlp1, nlp2, nlp3, norm_method='NFKD')

test_docs = DocBin().from_disk('../corpus/test/spancat_test/spancat_test_NFKD.spacy')
docs = list(test_docs.get_docs(nlp1.vocab))

df_evaluate_spans = evaluator.evaluate_SpanCat(docs)

{'labels': ['Body Part', 'Adjectives/Qualities', 'Topography', 'Medical', 'Pathology', 'Physiology', 'Technical Appellation', 'Division', 'Action Verbs', 'Symmetry/Opposition'], 'spans_key': 'sc', 'threshold': 0.25, 'max_positive': None, 'negative_weight': None, 'allow_overlap': True}
{'labels': ['Body Part', 'Adjectives/Qualities', 'Topography', 'Medical', 'Pathology', 'Physiology', 'Technical Appellation', 'Division', 'Action Verbs', 'Symmetry/Opposition'], 'spans_key': 'sc', 'threshold': 0.25, 'max_positive': None, 'negative_weight': None, 'allow_overlap': True}
{'labels': ['Body Part', 'Adjectives/Qualities', 'Topography', 'Medical', 'Pathology', 'Physiology', 'Technical Appellation', 'Division', 'Action Verbs', 'Symmetry/Opposition'], 'spans_key': 'sc', 'threshold': 0.25, 'max_positive': None, 'negative_weight': None, 'allow_overlap': True}


Evaluating models:   0%|          | 0/52 [00:00<?, ?it/s]

SpanGroup 'sc' contains 12 spans:
- Span: 'ἀνάπνευσις' [12, 24], Labels: Body Part
- Span: 'ἔκπνευσις' [30, 41], Labels: Body Part
- Span: 'εἰς' [51, 55], Labels: Topography
- Span: 'στῆθος' [60, 67], Labels: Body Part
- Span: 'χωρὶς' [85, 91], Labels: Topography
- Span: 'μυκτῆρσιν' [98, 108], Labels: Body Part
- Span: 'ἐκ' [148, 151], Labels: Topography
- Span: 'στήθους' [157, 165], Labels: Body Part
- Span: 'κατὰ' [205, 210], Labels: Topography
- Span: 'γαργαρεῶνα' [216, 227], Labels: Body Part
- Span: 'ἐκ' [238, 241], Labels: Topography
- Span: 'κεφαλῆς' [247, 255], Labels: Body Part
Gold span: στῆθος, 60 - 67, Body Part
Predicted span: ἀνάπνευσις, 12 - 24, Body Part
Gold span: στῆθος, 60 - 67, Body Part
Predicted span: ἔκπνευσις, 30 - 41, Body Part
Gold span: στῆθος, 60 - 67, Body Part
Predicted span: εἰς, 51 - 55, Topography
Gold span: στῆθος, 60 - 67, Body Part
Predicted span: στῆθος, 60 - 67, Body Part
Exact match
SpanGroup 'sc' contains 12 spans:
- Span:

Evaluating models:   8%|▊         | 4/52 [00:00<00:03, 15.91it/s]

SpanGroup 'sc' contains 5 spans:
- Span: 'εὐκίνητος' [0, 11], Labels: Adjectives/Qualities
- Span: 'μυκτήρ' [18, 25], Labels: Body Part
- Span: 'οὖς' [49, 54], Labels: Body Part
- Span: 'ἀκίνητον' [55, 65], Labels: Adjectives/Qualities
- Span: 'ἰδίαν' [71, 78], Labels: Adjectives/Qualities
Gold span: μυκτήρ, 18 - 25, Body Part
Predicted span: εὐκίνητος, 0 - 11, Adjectives/Qualities
Gold span: μυκτήρ, 18 - 25, Body Part
Predicted span: μυκτήρ, 18 - 25, Body Part
Exact match
SpanGroup 'sc' contains 5 spans:
- Span: 'εὐκίνητος' [0, 11], Labels: Adjectives/Qualities
- Span: 'μυκτήρ' [18, 25], Labels: Body Part
- Span: 'οὖς' [49, 54], Labels: Body Part
- Span: 'ἀκίνητον' [55, 65], Labels: Adjectives/Qualities
- Span: 'ἰδίαν' [71, 78], Labels: Adjectives/Qualities
Gold span: οὖς, 49 - 54, Body Part
Predicted span: εὐκίνητος, 0 - 11, Adjectives/Qualities
Gold span: οὖς, 49 - 54, Body Part
Predicted span: μυκτήρ, 18 - 25, Body Part
Gold span: οὖς, 49 - 54, Body 

Evaluating models:  15%|█▌        | 8/52 [00:00<00:02, 15.73it/s]

SpanGroup 'sc' contains 4 spans:
- Span: 'ταύτην' [11, 18], Labels: Body Part
- Span: 'πάντα' [19, 25], Labels: Adjectives/Qualities
- Span: 'πνεύμονα' [35, 44], Labels: Body Part
- Span: 'ἔχει' [45, 51], Labels: Body Part
Gold span: πνεύμονα, 35 - 44, Body Part
Predicted span: ταύτην, 11 - 18, Body Part
Gold span: πνεύμονα, 35 - 44, Body Part
Predicted span: πάντα, 19 - 25, Adjectives/Qualities
Gold span: πνεύμονα, 35 - 44, Body Part
Predicted span: πνεύμονα, 35 - 44, Body Part
Exact match
SpanGroup 'sc' contains 4 spans:
- Span: 'πολλῷ' [4, 11], Labels: Adjectives/Qualities
- Span: 'ἐντέρου' [22, 31], Labels: Body Part
- Span: 'μείζων' [32, 39], Labels: Adjectives/Qualities
- Span: 'ἐοικυῖα' [47, 56], Labels: Topography
Gold span: ἐντέρου, 22 - 31, Body Part
Predicted span: πολλῷ, 4 - 11, Adjectives/Qualities
Gold span: ἐντέρου, 22 - 31, Body Part
Predicted span: ἐντέρου, 22 - 31, Body Part
Exact match
SpanGroup 'sc' contains 4 spans:
- Span: 'πολλῷ' [4, 1

Evaluating models:  23%|██▎       | 12/52 [00:00<00:02, 15.20it/s]

SpanGroup 'sc' contains 2 spans:
- Span: 'ἧπαρ' [14, 20], Labels: Body Part
- Span: 'σπλῆνα' [31, 38], Labels: Body Part
Gold span: ἧπαρ, 14 - 20, Body Part
Predicted span: ἧπαρ, 14 - 20, Body Part
Exact match
SpanGroup 'sc' contains 2 spans:
- Span: 'ἧπαρ' [14, 20], Labels: Body Part
- Span: 'σπλῆνα' [31, 38], Labels: Body Part
Gold span: σπλῆνα, 31 - 38, Body Part
Predicted span: ἧπαρ, 14 - 20, Body Part
Gold span: σπλῆνα, 31 - 38, Body Part
Predicted span: σπλῆνα, 31 - 38, Body Part
Exact match
SpanGroup 'sc' contains 6 spans:
- Span: 'πρόσφυσίς' [7, 18], Labels: Body Part
- Span: 'ἐστι' [19, 24], Labels: Topography
- Span: 'φλεβίοις' [25, 34], Labels: Body Part
- Span: 'φλεβίοις' [25, 34], Labels: Adjectives/Qualities
- Span: 'νευρώδεσι' [35, 45], Labels: Adjectives/Qualities
- Span: 'μικροῖς' [51, 59], Labels: Adjectives/Qualities
Gold span: πρόσφυσίς, 7 - 18, Body Part
Predicted span: πρόσφυσίς, 7 - 18, Body Part
Exact match
SpanGroup 'sc' contains 6 spa

Evaluating models:  27%|██▋       | 14/52 [00:00<00:02, 15.10it/s]

SpanGroup 'sc' contains 7 spans:
- Span: 'μετὰ' [8, 13], Labels: Topography
- Span: 'εὐθὺ' [21, 27], Labels: Topography
- Span: 'πρὸς' [28, 33], Labels: Topography
- Span: 'διατείνει' [48, 58], Labels: Topography
- Span: 'ἀρχός' [130, 137], Labels: Body Part
- Span: 'κνισσώδης' [139, 149], Labels: Adjectives/Qualities
- Span: 'ἀπίμελος' [167, 177], Labels: Adjectives/Qualities
Gold span: τὸ δὲ μετὰ τοῦτο, 0 - 20, Body Part
Predicted span: μετὰ, 8 - 13, Topography
Gold span: τὸ δὲ μετὰ τοῦτο, 0 - 20, Body Part
Predicted span: εὐθὺ, 21 - 27, Topography
Gold span: τὸ δὲ μετὰ τοῦτο, 0 - 20, Body Part
Predicted span: πρὸς, 28 - 33, Topography
Gold span: τὸ δὲ μετὰ τοῦτο, 0 - 20, Body Part
Predicted span: διατείνει, 48 - 58, Topography
Gold span: τὸ δὲ μετὰ τοῦτο, 0 - 20, Body Part
Predicted span: ἀρχός, 130 - 137, Body Part
Gold span: τὸ δὲ μετὰ τοῦτο, 0 - 20, Body Part
Predicted span: κνισσώδης, 139 - 149, Adjectives/Qualities
Gold span: τὸ δὲ με

Evaluating models:  35%|███▍      | 18/52 [00:01<00:02, 14.22it/s]

SpanGroup 'sc' contains 4 spans:
- Span: 'κοιλίην' [5, 13], Labels: Body Part
- Span: 'ἐς' [29, 32], Labels: Topography
- Span: 'ἀγαθὸν' [61, 69], Labels: Adjectives/Qualities
- Span: 'τεθορύβηνται' [124, 137], Labels: Topography
Gold span: κοιλίην, 5 - 13, Body Part
Predicted span: κοιλίην, 5 - 13, Body Part
Exact match
SpanGroup 'sc' contains 4 spans:
- Span: 'κοιλίην' [5, 13], Labels: Body Part
- Span: 'ἐς' [29, 32], Labels: Topography
- Span: 'ἀγαθὸν' [61, 69], Labels: Adjectives/Qualities
- Span: 'τεθορύβηνται' [124, 137], Labels: Topography
Gold span: ἔχουσιν, 19 - 28, Topography
Predicted span: κοιλίην, 5 - 13, Body Part
Gold span: ἔχουσιν, 19 - 28, Topography
Predicted span: ἐς, 29 - 32, Topography
Gold span: ἔχουσιν, 19 - 28, Topography
Predicted span: ἀγαθὸν, 61 - 69, Adjectives/Qualities
Gold span: ἔχουσιν, 19 - 28, Topography
Predicted span: τεθορύβηνται, 124 - 137, Topography
SpanGroup 'sc' contains 10 spans:
- Span: 'κάτωθεν' [14, 22], Labels: To

Evaluating models:  42%|████▏     | 22/52 [00:01<00:02, 14.59it/s]

SpanGroup 'sc' contains 5 spans:
- Span: 'ἀποτομαὶ' [15, 25], Labels: Body Part
- Span: 'φλεβίων' [31, 39], Labels: Body Part
- Span: 'εἰς' [40, 44], Labels: Topography
- Span: 'κύστιν' [50, 57], Labels: Body Part
- Span: 'καθήκουσιν' [58, 69], Labels: Topography
Gold span: ἀποτομαὶ, 15 - 25, Body Part
Predicted span: ἀποτομαὶ, 15 - 25, Body Part
Exact match
SpanGroup 'sc' contains 5 spans:
- Span: 'ἀποτομαὶ' [15, 25], Labels: Body Part
- Span: 'φλεβίων' [31, 39], Labels: Body Part
- Span: 'εἰς' [40, 44], Labels: Topography
- Span: 'κύστιν' [50, 57], Labels: Body Part
- Span: 'καθήκουσιν' [58, 69], Labels: Topography
Gold span: φλεβίων, 31 - 39, Body Part
Predicted span: ἀποτομαὶ, 15 - 25, Body Part
Gold span: φλεβίων, 31 - 39, Body Part
Predicted span: φλεβίων, 31 - 39, Body Part
Exact match
SpanGroup 'sc' contains 5 spans:
- Span: 'ἀποτομαὶ' [15, 25], Labels: Body Part
- Span: 'φλεβίων' [31, 39], Labels: Body Part
- Span: 'εἰς' [40, 44], Labels: Topography
- 

Evaluating models:  46%|████▌     | 24/52 [00:01<00:01, 14.11it/s]

SpanGroup 'sc' contains 2 spans:
- Span: 'κνήμην' [26, 33], Labels: Body Part
- Span: 'τοὐκτός' [55, 64], Labels: Body Part
Gold span: κνήμην, 26 - 33, Body Part
Predicted span: κνήμην, 26 - 33, Body Part
Exact match
SpanGroup 'sc' contains 2 spans:
- Span: 'κνήμην' [26, 33], Labels: Body Part
- Span: 'τοὐκτός' [55, 64], Labels: Body Part
Gold span: πρὸς τοὐκτός, 49 - 64, Topography
Predicted span: κνήμην, 26 - 33, Body Part
Gold span: πρὸς τοὐκτός, 49 - 64, Topography
Predicted span: τοὐκτός, 55 - 64, Body Part
SpanGroup 'sc' contains 5 spans:
- Span: 'δικραίῳ' [9, 18], Labels: Body Part
- Span: 'ὀστέον' [31, 39], Labels: Body Part
- Span: 'κνήμη' [43, 49], Labels: Body Part
- Span: 'γιγγλύμῳ' [72, 82], Labels: Body Part
- Span: 'ἀνήρμοσται' [83, 95], Labels: Topography
Gold span: δικραίῳ, 9 - 18, Division
Predicted span: δικραίῳ, 9 - 18, Body Part
Exact match
SpanGroup 'sc' contains 5 spans:
- Span: 'δικραίῳ' [9, 18], Labels: Body Part
- Span: 'ὀστέο

Evaluating models:  54%|█████▍    | 28/52 [00:01<00:01, 14.60it/s]

SpanGroup 'sc' contains 11 spans:
- Span: 'ἀποδεῖραι' [82, 93], Labels: Action Verbs
- Span: 'ἐπιγάστριον' [108, 121], Labels: Adjectives/Qualities
- Span: 'δέρμα' [122, 128], Labels: Body Part
- Span: 'διασπασθαί' [228, 239], Labels: Topography
- Span: 'ὑποκειμένων' [248, 261], Labels: Topography
- Span: 'κεχώρισται' [270, 281], Labels: Topography
- Span: 'ὑποβεβλημένων' [317, 332], Labels: Topography
- Span: 'μυῶν' [333, 338], Labels: Body Part
- Span: 'συνεχής' [342, 350], Labels: Adjectives/Qualities
- Span: 'ὑμὴν' [351, 357], Labels: Body Part
- Span: 'δέρματι' [363, 371], Labels: Body Part
Gold span: χειρουργοίη, 49 - 61, Action Verbs
Predicted span: ἀποδεῖραι, 82 - 93, Action Verbs
Gold span: χειρουργοίη, 49 - 61, Action Verbs
Predicted span: ἐπιγάστριον, 108 - 121, Adjectives/Qualities
Gold span: χειρουργοίη, 49 - 61, Action Verbs
Predicted span: δέρμα, 122 - 128, Body Part
Gold span: χειρουργοίη, 49 - 61, Action Verbs
Predicted span: διασπασθαί, 228 -

Evaluating models:  62%|██████▏   | 32/52 [00:02<00:01, 15.29it/s]

SpanGroup 'sc' contains 16 spans:
- Span: 'τείνουσιν' [0, 10], Labels: Topography
- Span: 'ἐκ' [11, 14], Labels: Topography
- Span: 'ἀορτῆς' [20, 28], Labels: Body Part
- Span: 'πόροι' [29, 35], Labels: Body Part
- Span: 'φλεβικοὶ' [36, 45], Labels: Adjectives/Qualities
- Span: 'μέχρι' [46, 52], Labels: Topography
- Span: 'κεφαλῆς' [58, 66], Labels: Body Part
- Span: 'ὄρχεως' [83, 91], Labels: Body Part
- Span: 'ἄλλοι' [98, 105], Labels: Adjectives/Qualities
- Span: 'ἀπὸ' [106, 111], Labels: Topography
- Span: 'νεφρῶν' [117, 124], Labels: Body Part
- Span: 'δύο' [125, 129], Labels: Adjectives/Qualities
- Span: 'αἱματώδεις' [154, 166], Labels: Adjectives/Qualities
- Span: 'ἐκ' [175, 178], Labels: Topography
- Span: 'ἀορτῆς' [184, 192], Labels: Body Part
- Span: 'ἄναιμοι' [193, 202], Labels: Adjectives/Qualities
Gold span: τείνουσιν, 0 - 10, Topography
Predicted span: τείνουσιν, 0 - 10, Topography
Exact match
SpanGroup 'sc' contains 16 spans:
- Span: 'τείνουσιν'

Evaluating models:  69%|██████▉   | 36/52 [00:02<00:01, 15.16it/s]

SpanGroup 'sc' contains 6 spans:
- Span: 'εὐρήσεις' [0, 10], Labels: Action Verbs
- Span: 'ἀφηρημένων' [22, 34], Labels: Action Verbs
- Span: 'μυῶν' [48, 53], Labels: Body Part
- Span: 'εἰς' [59, 63], Labels: Topography
- Span: 'γαστροκνημίαν' [69, 83], Labels: Body Part
- Span: 'ἠκόντων' [84, 93], Labels: Topography
Gold span: εὐρήσεις, 0 - 10, Action Verbs
Predicted span: εὐρήσεις, 0 - 10, Action Verbs
Exact match
SpanGroup 'sc' contains 6 spans:
- Span: 'εὐρήσεις' [0, 10], Labels: Action Verbs
- Span: 'ἀφηρημένων' [22, 34], Labels: Action Verbs
- Span: 'μυῶν' [48, 53], Labels: Body Part
- Span: 'εἰς' [59, 63], Labels: Topography
- Span: 'γαστροκνημίαν' [69, 83], Labels: Body Part
- Span: 'ἠκόντων' [84, 93], Labels: Topography
Gold span: ἀφηρημένων, 22 - 34, Action Verbs
Predicted span: εὐρήσεις, 0 - 10, Action Verbs
Gold span: ἀφηρημένων, 22 - 34, Action Verbs
Predicted span: ἀφηρημένων, 22 - 34, Action Verbs
Exact match
SpanGroup 'sc' contains 6 spans:

Evaluating models:  77%|███████▋  | 40/52 [00:02<00:00, 15.83it/s]

SpanGroup 'sc' contains 15 spans:
- Span: 'παρὰ' [0, 5], Labels: Topography
- Span: 'μηρούς' [16, 23], Labels: Body Part
- Span: 'περόναι' [24, 32], Labels: Body Part
- Span: 'παρήκουσιν' [38, 49], Labels: Topography
- Span: 'ἐντός' [59, 66], Labels: Topography
- Span: 'ἐκτός' [74, 81], Labels: Topography
- Span: 'ἐς' [88, 91], Labels: Topography
- Span: 'ἄρθρον' [96, 104], Labels: Body Part
- Span: 'ἐξήκει' [115, 123], Labels: Topography
- Span: 'ἑτέρωθεν' [130, 140], Labels: Topography
- Span: 'πρὸς' [149, 154], Labels: Topography
- Span: 'ὀστέῳ' [160, 168], Labels: Body Part
- Span: 'προσπεφύκασι' [169, 182], Labels: Topography
- Span: 'προς' [183, 187], Labels: Topography
- Span: 'μηρῷ.' [193, 200], Labels: Body Part
Gold span: παρὰ, 0 - 5, Topography
Predicted span: παρὰ, 0 - 5, Topography
Exact match
SpanGroup 'sc' contains 15 spans:
- Span: 'παρὰ' [0, 5], Labels: Topography
- Span: 'μηρούς' [16, 23], Labels: Body Part
- Span: 'περόναι' [24, 32], Labels

Evaluating models:  85%|████████▍ | 44/52 [00:02<00:00, 16.44it/s]

SpanGroup 'sc' contains 2 spans:
- Span: 'φέρουσι' [28, 36], Labels: Topography
- Span: 'ὥνθρωπος' [85, 95], Labels: Body Part
Gold span: φέρουσι, 28 - 36, Physiology
Predicted span: φέρουσι, 28 - 36, Topography
Exact match
SpanGroup 'sc' contains 2 spans:
- Span: 'φέρουσι' [28, 36], Labels: Topography
- Span: 'ὥνθρωπος' [85, 95], Labels: Body Part
Gold span: αὐανθέωσιν, 60 - 72, Physiology
Predicted span: φέρουσι, 28 - 36, Topography
Gold span: αὐανθέωσιν, 60 - 72, Physiology
Predicted span: ὥνθρωπος, 85 - 95, Body Part
SpanGroup 'sc' contains 4 spans:
- Span: 'κύστις' [19, 26], Labels: Body Part
- Span: 'ὑμενοειδὴς' [27, 39], Labels: Adjectives/Qualities
- Span: 'ἄλλο' [46, 52], Labels: Adjectives/Qualities
- Span: 'ὑμένος' [64, 72], Labels: Body Part
Gold span: κύστις, 19 - 26, Body Part
Predicted span: κύστις, 19 - 26, Body Part
Exact match
SpanGroup 'sc' contains 4 spans:
- Span: 'κύστις' [19, 26], Labels: Body Part
- Span: 'ὑμενοειδὴς' [27, 39], Labels:

Evaluating models:  92%|█████████▏| 48/52 [00:03<00:00, 16.80it/s]

SpanGroup 'sc' contains 19 spans:
- Span: 'πρὸς' [0, 5], Labels: Topography
- Span: 'καυλὸν' [15, 22], Labels: Body Part
- Span: 'κύστεως' [33, 41], Labels: Body Part
- Span: 'συνήρτηται' [42, 53], Labels: Topography
- Span: 'αἰδοῖον' [58, 67], Labels: Body Part
- Span: 'ἐξωτάτω' [78, 87], Labels: Topography
- Span: 'τρῆμα' [88, 94], Labels: Body Part
- Span: 'συνερρωγὸς' [95, 106], Labels: Topography
- Span: 'εἰς' [107, 111], Labels: Topography
- Span: 'μικρὸν' [124, 131], Labels: Adjectives/Qualities
- Span: 'ὑποκάτω' [135, 144], Labels: Topography
- Span: 'εἰς' [154, 158], Labels: Topography
- Span: 'ὄρχεις' [165, 173], Labels: Body Part
- Span: 'φέρει' [174, 180], Labels: Topography
- Span: 'τρημάτων' [186, 195], Labels: Body Part
- Span: 'εἰς' [203, 207], Labels: Topography
- Span: 'κύστιν' [213, 220], Labels: Body Part
- Span: 'νευρῶδες' [222, 231], Labels: Adjectives/Qualities
- Span: 'χονδρῶδες' [237, 247], Labels: Adjectives/Qualities
Gold span: πρὸς, 0

Evaluating models: 100%|██████████| 52/52 [00:03<00:00, 15.43it/s]

SpanGroup 'sc' contains 3 spans:
- Span: 'ἠέρα' [16, 22], Labels: Body Part
- Span: 'ὁδὸν' [73, 79], Labels: Body Part
- Span: 'ἐκβάλλειν' [80, 91], Labels: Action Verbs
Gold span: ὀπίσω, 52 - 59, Topography
Predicted span: ἠέρα, 16 - 22, Body Part
Gold span: ὀπίσω, 52 - 59, Topography
Predicted span: ὁδὸν, 73 - 79, Body Part
Gold span: ὀπίσω, 52 - 59, Topography
Predicted span: ἐκβάλλειν, 80 - 91, Action Verbs
SpanGroup 'sc' contains 3 spans:
- Span: 'ἠέρα' [16, 22], Labels: Body Part
- Span: 'ὁδὸν' [73, 79], Labels: Body Part
- Span: 'ἐκβάλλειν' [80, 91], Labels: Action Verbs
Gold span: τὴν, 60 - 64, Topography
Predicted span: ἠέρα, 16 - 22, Body Part
Gold span: τὴν, 60 - 64, Topography
Predicted span: ὁδὸν, 73 - 79, Body Part
Gold span: τὴν, 60 - 64, Topography
Predicted span: ἐκβάλλειν, 80 - 91, Action Verbs
SpanGroup 'sc' contains 3 spans:
- Span: 'ἠέρα' [16, 22], Labels: Body Part
- Span: 'ὁδὸν' [73, 79], Labels: Body Part
- Span: 'ἐκβάλλειν




In [None]:
# print results for the first 50 docs and the column of only the first 4 columns
df_evaluate_spans.iloc[:50]


In [35]:
import spacy
from spacy.tokens import DocBin
FORMAT = 'NFKD'

nlp = spacy.load('../training/ATLOMY_G_NER_pipeline/spancat_210524/model-best')
nlp.get_pipe("spancat").cfg["threshold"] = 0.2
nlp.get_pipe("spancat").cfg

train_docbin = DocBin().from_disk("../corpus/train/spancat_train/spancat_train_{0}.spacy".format(FORMAT))
# get docs from new_docbin
train_docs = list(train_docbin.get_docs(nlp.vocab))
dev_docbin = DocBin().from_disk("../corpus/dev/spancat_dev/spancat_dev_{0}.spacy".format(FORMAT))
# get docs from new_docbin
dev_docs = list(dev_docbin.get_docs(nlp.vocab))
test_docbin = DocBin().from_disk("../corpus/test/spancat_test/spancat_test_{0}.spacy".format(FORMAT))
# get docs from new_docbin
test_docs = list(test_docbin.get_docs(nlp.vocab))
# count sentences in train, test and dev data
print ("train:", len(train_docs), "dev:", len(dev_docs), "test:", len(test_docs))
# find sentence and print spans
for doc in test_docs:
    # checking a sentenc that had a word with two labels
    #if doc.text == "ἅμα δʼ ἡ ἀνάπνευσις καὶ ἔκπνευσις γίνεται εἰς τὸ στῆθος, καὶ ἀδύνατον χωρὶς τοῖς μυκτῆρσιν ἀναπνεῦσαι ἢ ἐκπνεῦσαι, διὰ τὸ ἐκ τοῦ στήθους εἶναι τὴν ἀναπνοὴν καὶ ἐκπνοὴν κατὰ τὸν γαργαρεῶνα, καὶ μὴ ἐκ τῆς κεφαλῆς τινι μέρει":
    if doc.text == "εἶτα εἰς ἑκάτερον τὸ ἰσχίον ἀφανίζεται ἑκατέρα πρῶτον, ἔπειτα δῆλαι γίγνονται πάλιν διατεταμέναι πρὸς τὸ ἰσχίον":
        print("doc user data: ", doc.user_data)
        # find spans that have the same token and more than one label
        
        print(doc.text)
        #for span in doc.spans["sc"]:
            #print(span.text, span.start_char, span.end_char, span.label_)
            # print character text in location of indices of span
            #print(doc.text[span.start_char:span.end_char])      
        # print spansgroup assuming you have a SpanGroup named 'sc' in doc.spans
        span_group = doc.spans["sc"]
        
        print(f"SpanGroup '{span_group.name}' contains {len(span_group)} spans:")
        for span in span_group:
            print(f"- Span: '{span.text}' [{span.start_char}, {span.end_char}], Labels: {span.label_}")
            
        perdicted = nlp(doc.text)
        # print spansgroup assuming you have a SpanGroup named 'sc' in doc.spans
        span_group = perdicted.spans["sc"]
        print(f"SpanGroup '{span_group.name}' contains {len(span_group)} perdicted spans:")
        print(f" Scores: {span_group.attrs['scores']}")

        for i, span in enumerate(span_group):
            score = span_group.attrs["scores"][i]
            print(f"- Span: '{span.text}' [{span.start_char}, {span.end_char}], Labels: {span.label_}, Score: {score:.4f}")



train: 412 dev: 53 test: 52
doc user data:  {'source_info': 'Coda, Aristotle, History of Animals, B(3), C(4), P(138), L(3)'}
εἶτα εἰς ἑκάτερον τὸ ἰσχίον ἀφανίζεται ἑκατέρα πρῶτον, ἔπειτα δῆλαι γίγνονται πάλιν διατεταμέναι πρὸς τὸ ἰσχίον
SpanGroup 'sc' contains 9 spans:
- Span: 'ἰσχίον' [27, 35], Labels: Body Part
- Span: 'ἰσχίον' [126, 134], Labels: Body Part
- Span: 'ἑκάτερον' [12, 22], Labels: Adjectives/Qualities
- Span: 'ἑκατέρα' [49, 58], Labels: Adjectives/Qualities
- Span: 'δῆλαι' [77, 83], Labels: Adjectives/Qualities
- Span: 'εἰς' [7, 11], Labels: Topography
- Span: 'ἀφανίζεται' [36, 48], Labels: Topography
- Span: 'διατεταμέναι' [102, 115], Labels: Topography
- Span: 'πρὸς' [116, 121], Labels: Topography
SpanGroup 'sc' contains 10 perdicted spans:
 Scores: [0.2159974  0.7930039  0.27780434 0.9726317  0.9033647  0.35614106
 0.39842072 0.83058286 0.98641694 0.270216  ]
- Span: 'εἶτα' [0, 6], Labels: Topography, Score: 0.2160
- Span: 'εἰς'