In [14]:
!pip install cltk

import torch
from transformers import pipeline
from sklearn.metrics import f1_score
from cltk import NLP
import json

def compare_features(features1, features2):
    """
    Check whether two morphological feature mappings are equivalent.

    Keys and values are compared through their string form and the order of
    the values stored under a key is ignored. The comparison is symmetric:
    both mappings must expose exactly the same keys with the same values.

    Args:
    - features1: mapping-like object (anything exposing `.items()`) from a
      feature name to a value or an iterable of values
    - features2: mapping-like object of the same shape

    Returns:
    - True when both inputs are non-empty and describe the same features,
      False otherwise (including when either input is empty or None)
    """
    if not features1 or not features2:
        return False  # an empty or missing side never counts as a match

    def _normalize(features):
        """Return {str(key): set of str(values)} for one feature mapping."""
        normalized = {}
        for key, value in features.items():
            # A bare string (or any other scalar) is a single value, not a
            # sequence whose characters should be compared as a set.
            if isinstance(value, (str, bytes)) or not hasattr(value, "__iter__"):
                value = [value]
            normalized[str(key)] = {str(item) for item in value}
        return normalized

    # Dict equality checks both directions, so a feature that is present on
    # only one side makes the comparison fail regardless of argument order.
    return _normalize(features1) == _normalize(features2)
    
def analyze_morph(token: str, nlp_pipeline) -> dict:
    """
    Run `nlp_pipeline` on `token` and describe the first word it produces.

    The caller in this file passes a CLTK pipeline built for Latin; this
    function only requires a callable whose result exposes a `words`
    sequence of objects with `lemma`, `upos` and `features` attributes.

    Returns:
    - a dict with the keys 'lemma', 'upos' and 'features' taken from the
      first word, or an empty dict when the pipeline result is falsy or
      contains no words
    """
    analysis = nlp_pipeline(token)

    # Guard clause: nothing usable came back from the pipeline.
    if not analysis or len(analysis.words) == 0:
        return {}

    # Only the first word is reported, even if the pipeline split the token.
    first_word = analysis.words[0]
    return dict(
        lemma=first_word.lemma,
        upos=first_word.upos,
        features=first_word.features,
    )

def calculate_metrics(results):
    """
    Compute accuracy metrics from per-example prediction results.

    Args:
    - results: list of dicts with the keys:
        - `exact_match`: True/False
        - `top_k_match`: True/False
        - `gold_morph`: morphological info of the gold token (a dict with
          'lemma', 'upos', 'features', or an empty dict when unavailable)
        - `pred_morphs`: list of morphological info dicts, one per
          predicted token

    Returns:
    - metrics: dict with the keys `tag_acc`, `top_k_acc`, `morph_acc`,
      `lemma_acc` and `pos_acc`, each a fraction of `len(results)`
      (0.0 for every key when `results` is empty)
    """
    total = len(results)

    def _rate(count):
        """Turn a hit count into a fraction, avoiding division by zero."""
        return count / total if total > 0 else 0.0

    def _count_gold_hits(is_match):
        """Count results whose gold analysis matches at least one prediction."""
        # Counting with an explicit filter keeps the sum purely numeric.
        # Summing `gold_morph and any(...)` directly would hand the empty
        # dict itself to sum() whenever the gold token was not analysed,
        # which raises TypeError.
        return sum(
            1
            for result in results
            if result["gold_morph"]
            and any(
                is_match(result["gold_morph"], pred_morph)
                for pred_morph in result["pred_morphs"]
            )
        )

    exact_matches = sum(bool(result["exact_match"]) for result in results)
    top_k_matches = sum(bool(result["top_k_match"]) for result in results)

    # Morphology: the gold feature set equals that of any prediction.
    morph_matches = _count_gold_hits(
        lambda gold, pred: compare_features(gold.get("features"), pred.get("features"))
    )

    # Lemmatisation: the gold lemma equals the lemma of any prediction.
    lemma_matches = _count_gold_hits(
        lambda gold, pred: gold.get("lemma") == pred.get("lemma")
    )

    # POS tagging: the gold UPOS equals the UPOS of any prediction.
    pos_matches = _count_gold_hits(
        lambda gold, pred: gold.get("upos") == pred.get("upos")
    )

    return {
        "tag_acc": _rate(exact_matches),
        "top_k_acc": _rate(top_k_matches),
        "morph_acc": _rate(morph_matches),
        "lemma_acc": _rate(lemma_matches),
        "pos_acc": _rate(pos_matches),
    }


def evaluate_fill_mask_with_full_metrics(models, test_data, top_k=5):
    """
    Evaluate fill-mask models on a dataset and print accuracy metrics.

    Args:
    - models: dict mapping a display name to a fill-mask pipeline; each
      pipeline is called as `pipeline(sentence, top_k=top_k)` and must
      return a list of dicts carrying a `token_str` entry
    - test_data: iterable of dicts with the keys:
        - `masked_sentence`: sentence containing the mask token
        - `correct_token`: expected token
    - top_k: number of predictions requested per sentence

    Returns:
    - None; the metrics of every model are printed
    """
    lat_nlp = NLP("lat")

    # Materialise once so a one-shot iterable can be replayed for each model.
    test_data = list(test_data)

    # The analysis is looked up by token only, so every distinct token is
    # analysed once and reused across entries and models instead of being
    # re-run for each model.
    # NOTE(review): assumes the pipeline output for a given token does not
    # vary between calls - confirm.
    morph_cache = {}

    def _morph(token):
        """Return the cached morphological analysis of `token`."""
        if token not in morph_cache:
            morph_cache[token] = analyze_morph(token, lat_nlp)
        return morph_cache[token]

    for model_name, fillmask_pipeline in models.items():
        results = []

        print(f"\n=== Valutazione Modello: {model_name} ===")

        for entry in test_data:
            masked_sent = entry["masked_sentence"]
            gold_token = entry["correct_token"]

            # Top-k prediction; whitespace is stripped from every candidate.
            predictions = fillmask_pipeline(masked_sent, top_k=top_k)
            pred_tokens = [pred["token_str"].strip().replace(" ", "") for pred in predictions]

            # Exact match: the gold token is the first prediction. An empty
            # prediction list is a miss rather than an IndexError.
            exact_match = bool(pred_tokens) and gold_token == pred_tokens[0]

            # Top-k match: the gold token is among the returned predictions.
            top_k_match = gold_token in pred_tokens

            # Store everything calculate_metrics needs for this entry.
            results.append({
                "exact_match": exact_match,
                "top_k_match": top_k_match,
                "gold_token": gold_token,
                "predicted_tokens": pred_tokens,
                "gold_morph": _morph(gold_token),
                "pred_morphs": [_morph(token) for token in pred_tokens],
            })

        # Aggregate and print the metrics for this model.
        metrics = calculate_metrics(results)
        print(f"\n=== Risultati per {model_name} ===")
        for metric_name, value in metrics.items():
            print(f"  {metric_name}: {value:.2%}")
        print("-" * 60)



# 1) Identifiers of the two checkpoints under comparison
#    (an XLM-RoBERTa-base and a RoBERTa-base variant, going by their names).
model_xlmr, model_roberta = (
    "Cicciokr/XLM-Roberta-Base-Latin-Uncased-V2",
    "Cicciokr/Roberta-Base-Latin-Uncased-V2",
)

# 2) One fill-mask pipeline per checkpoint, each with the checkpoint's own tokenizer.
fillmask_xlmr, fillmask_roberta = [
    pipeline("fill-mask", model=checkpoint, tokenizer=checkpoint)
    for checkpoint in (model_xlmr, model_roberta)
]

# 3) Test dataset: every entry pairs a sentence containing one <mask> with
#    the token expected at that position.
# Alternative: load the dataset from a JSON file instead of the inline list.
#with open("/kaggle/input/the-latin-library/test_data_latinlibrary.json", "r", encoding="utf-8") as f:
#    test_data = json.load(f)

test_data = [
    {"masked_sentence": sentence, "correct_token": token}
    for sentence, token in (
        ("Gallia est omnis <mask> in partes tres, quarum unam incolunt Belgae.", "divisa"),
        ("Cum Caesar in Galliam venit, altero proelio vicit et hostes <mask>.", "repulit"),
        ("Romani multas provincias subegerunt, et eorum imperium magnam partem orbis terrarum <mask>.", "tenebat"),
        ("Hannibal Alpes transgressus magnum <mask> exercitum in Italiam duxit.", "Carthaginis"),
        ("Postquam Romani pacem cum Carthaginiensibus fecerunt, bellum contra <mask> gerere coeperunt.", "Macedoniam"),
        ("In foro Romano saepe disputationes de legibus et rebus <mask> fiebant.", "publicis"),
        ("Cum Cicero consul esset, Catilina coniurationem ad rem publicam <mask> paravit.", "oppugnandam"),
        ("Vir sapiens semper sibi <mask>, numquam fortunae, confidit.", "ipsi"),
        ("Milites Romanos hostes circumvenire conati sunt, sed legio prima <mask> restitit.", "fortiter"),
        ("Caesar pontem fluminis <mask> fabricari iussit ut copias transduceret.", "celeriter"),
        ("Philosophi Graeci multa de natura et principiis rerum <mask> scripserunt.", "disputabant"),
        ("Servi in villa laborantes agrum <mask> ac fruges curabant.", "arabant"),
        ("Litterae Romanae magnum <mask> ad humanitatem et virtutem dederunt.", "exemplum"),
        ("Senatores de bello Punico secundo diu <mask>, sed tandem bellum decreverunt.", "disputaverunt"),
        ("Rex Pyrrhus magnum elephantorum <mask> in proelium contra Romanos duxit.", "numerum"),
        ("Graecia artes litterasque multas <mask> Romanis tradidit.", "nobilissimas"),
        ("Dux fortis copias trans flumen <mask>, ut urbem oppugnaret.", "duxit"),
        ("Cum Brutus et Cassius coniuravissent, Caesar tamen <mask> non timuit.", "mortem"),
        ("Post victoriam, Romani templum Iovis <mask> in Capitolio dedicaverunt.", "magnificum"),
        ("Cum Graeci Troiam obsiderent, Ulixes consilium de equo ligneo <mask> proposuit.", "callide"),
    )
]

# 4) Compare the two models: register each pipeline under its display name,
#    then run the evaluation over the test dataset with the top 5 predictions.
models_dict = {}
models_dict["XLM-RoBERTa-base"] = fillmask_xlmr
models_dict["RoBERTa-base"] = fillmask_roberta

evaluate_fill_mask_with_full_metrics(models_dict, test_data, top_k=5)



Device set to use cpu
Device set to use cpu


‎𐤀 CLTK version '1.4.0'. When using the CLTK in research, please cite: https://aclanthology.org/2021.acl-demo.3/

Pipeline for language 'Latin' (ISO: 'lat'): `LatinNormalizeProcess`, `LatinStanzaProcess`, `LatinEmbeddingsProcess`, `StopsProcess`, `LatinLexiconProcess`.

⸖ ``LatinStanzaProcess`` using Stanza model from the Stanford NLP Group: https://stanfordnlp.github.io/stanza/ . Please cite: https://arxiv.org/abs/2003.07082
⸖ ``LatinEmbeddingsProcess`` using word2vec model by University of Oslo from http://vectors.nlpl.eu/ . Please cite: https://aclanthology.org/W17-0237/
⸖ ``LatinLexiconProcess`` using Lewis's *An Elementary Latin Dictionary* (1890).

⸎ To suppress these messages, instantiate ``NLP()`` with ``suppress_banner=True``.

=== Valutazione Modello: XLM-RoBERTa-base ===

=== Risultati per XLM-RoBERTa-base ===
  tag_acc: 5.00%
  top_k_acc: 5.00%
  morph_acc: 15.00%
  lemma_acc: 5.00%
  pos_acc: 60.00%
------------------------------------------------------------

=== Valutazi