<a href="https://colab.research.google.com/github/Cicciokr/latin-ai-model/blob/main/Calculate_NPMI_TD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import LatentDirichletAllocation
from transformers import RobertaTokenizer, RobertaModel
import torch

# Compute topic coherence (NPMI)
def calculate_npmi(topics, documents, vocab):
    """Return the mean NPMI coherence of ``topics`` measured on ``documents``.

    Probabilities are estimated at document level: ``p(w)`` is the fraction
    of documents that contain ``w`` and ``p(w_i, w_j)`` is the fraction that
    contain both words. For every pair of words in a topic the score is
    ``log(p_ij / (p_i * p_j)) / -log(p_ij)``; pair scores are averaged per
    topic and topic scores are averaged overall.

    Args:
        topics: Sequence of topics, each a sequence of words.
        documents: Sequence of raw text documents.
        vocab: Vocabulary handed to ``CountVectorizer``; every topic word
            must be part of it.

    Returns:
        The mean over topics of the mean pairwise NPMI. A pair that never
        co-occurs scores 0 and a pair that co-occurs in every document
        scores 1. A topic with fewer than two words has no pairs, so its
        mean (and therefore the overall result) is ``nan``; the same holds
        for an empty ``topics``.

    Raises:
        ValueError: If a topic word is not part of ``vocab``.
    """
    # binary=True records presence/absence per document. Using raw counts
    # for the marginals (while the joint uses presence) would let a word
    # repeated inside one document inflate p(w), even above 1.
    vectorizer = CountVectorizer(vocabulary=vocab, binary=True)
    presence = vectorizer.fit_transform(documents)
    n_docs = presence.shape[0]

    # Entry (a, b) is the number of documents containing both word a and
    # word b; the diagonal therefore holds the document frequencies.
    cooccurrence = (presence.T @ presence).toarray()
    column_of = vectorizer.vocabulary_

    def npmi_score(topic):
        try:
            columns = [column_of[word] for word in topic]
        except KeyError as err:
            # Keep the exception type that list.index() used to raise.
            raise ValueError(f"{err.args[0]!r} is not in vocab") from None

        scores = []
        for position, col_i in enumerate(columns):
            for col_j in columns[position + 1:]:
                p_i = cooccurrence[col_i, col_i] / n_docs
                p_j = cooccurrence[col_j, col_j] / n_docs
                p_ij = cooccurrence[col_i, col_j] / n_docs
                if p_ij >= 1:
                    # Both words are in every document: -log(p_ij) is 0,
                    # so use the limiting value of NPMI instead of 0/0.
                    score = 1.0
                elif p_ij > 0:
                    score = np.log(p_ij / (p_i * p_j)) / -np.log(p_ij)
                else:
                    score = 0
                scores.append(score)
        return np.mean(scores)

    return np.mean([npmi_score(topic) for topic in topics])

# Compute topic diversity (TD)
def calculate_td(topics):
    """Return the share of unique words among all words listed by the topics.

    Args:
        topics: Iterable of topics, each a sized collection of hashable
            words.

    Returns:
        ``number of distinct words / total number of words`` across all
        topics, or ``0.0`` when the topics contain no words at all (empty
        ``topics`` or only empty topics), instead of dividing by zero.
    """
    unique_words = set()
    total_words = 0
    for topic in topics:
        unique_words.update(topic)
        total_words += len(topic)

    # Nothing to measure: avoid ZeroDivisionError.
    if total_words == 0:
        return 0.0
    return len(unique_words) / total_words

# Generate document embeddings with RoBERTa
def generate_embeddings(documents):
    """Embed each document as the mean of RoBERTa's last hidden states.

    The ``roberta-base`` tokenizer and model are loaded on every call. Each
    document is tokenized on its own (truncated to at most 512 tokens), run
    through the model, and its token vectors are averaged along the
    sequence axis.

    Args:
        documents: Iterable of text strings.

    Returns:
        A NumPy array with one row per document, built with ``np.vstack``.

    Raises:
        ValueError: If ``documents`` is empty (``np.vstack`` needs at least
            one array).
    """
    tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    model = RobertaModel.from_pretrained('roberta-base')

    embeddings = []
    # Inference only: disabling autograd avoids building a graph for every
    # document, and the outputs can be converted to NumPy directly.
    with torch.no_grad():
        for doc in documents:
            inputs = tokenizer(doc, return_tensors='pt', truncation=True, padding=True, max_length=512)
            outputs = model(**inputs)
            embeddings.append(outputs.last_hidden_state.mean(1).numpy())
    return np.vstack(embeddings)

# Extract topics with LDA
def extract_topics(documents, n_topics=5, n_words=10):
    """Fit an LDA model on ``documents`` and return the top words per topic.

    Args:
        documents: Iterable of raw text documents, vectorized with a default
            ``CountVectorizer``.
        n_topics: Number of LDA components to fit.
        n_words: Number of highest-weighted words to keep for each topic.

    Returns:
        A list with one entry per topic; each entry is a list of up to
        ``n_words`` feature names ordered from highest to lowest weight in
        that topic's row of ``lda.components_``.
    """
    vectorizer = CountVectorizer()
    doc_term_matrix = vectorizer.fit_transform(documents)
    lda = LatentDirichletAllocation(n_components=n_topics, random_state=42)
    lda.fit(doc_term_matrix)

    # Fetch the feature names once instead of once per selected word.
    feature_names = vectorizer.get_feature_names_out()
    return [
        # argsort is ascending, so the reversed tail slice yields the
        # n_words largest weights in descending order.
        [feature_names[i] for i in weights.argsort()[:-n_words - 1:-1]]
        for weights in lda.components_
    ]

# Main entry point: compute all evaluation metrics
def evaluate_topics(documents, topics):
    """Evaluate ``topics`` against ``documents``.

    Args:
        documents: The text documents the topics were extracted from.
        topics: Sequence of topics, each a sequence of words.

    Returns:
        A dict with three entries: ``'NPMI'`` (result of
        ``calculate_npmi``), ``'TD'`` (result of ``calculate_td``) and
        ``'Embedding Distances'`` (pairwise cosine distances between the
        outputs of ``generate_embeddings`` for the documents).
    """
    # The vocabulary is every distinct word used by any topic.
    distinct_words = {word for topic in topics for word in topic}
    vocab = list(distinct_words)

    # Topic coherence
    coherence = calculate_npmi(topics, documents, vocab)

    # Topic diversity
    diversity = calculate_td(topics)

    # Cosine distances between the RoBERTa document embeddings
    doc_vectors = generate_embeddings(documents)
    cosine_distances = pairwise_distances(doc_vectors, metric='cosine')

    return {
        'NPMI': coherence,
        'TD': diversity,
        'Embedding Distances': cosine_distances,
    }

# Example usage: a tiny sample corpus
documents = [
    "Roma è la capitale dell'Impero Romano.",
    "Cesare fu un grande condottiero romano.",
    "La letteratura latina è ricca di poesia epica e drammatica.",
]

# Extract three topics of five words each and show them
topics = extract_topics(documents, n_words=5, n_topics=3)
print("Topics estratti:", topics)

# Compute the evaluation metrics for those topics and show them
evaluation = evaluate_topics(documents=documents, topics=topics)
print(evaluation)