In [1]:
import pandas as pd
from sentence_transformers import SentenceTransformer, util
import torch
from sklearn.metrics.pairwise import distance
import json

# Load the sentence-embedding model from its local directory.
# NOTE(review): the recorded run of this cell ended with
# "SafetensorError: Error while deserializing header: MetadataIncompleteBuffer";
# presumably it was raised by this load, so confirm the weight files under
# Embedding-model/gtebase are complete before re-running.
model = SentenceTransformer("Embedding-model/gtebase")

# Both frames come from the same ';'-separated CSV. Column 0 is kept in each
# (the code below addresses it as 'ArticleID'); column 1 is read further down
# as 'Set1' and column 2 as 'Set2'.
topics_df = pd.read_csv(
    'meta-evaluation/MetaEvaluation-Topics.csv',
    sep=';',
    usecols=[0, 1],
)
ground_truth_df = pd.read_csv(
    'meta-evaluation/MetaEvaluation-Topics.csv',
    sep=';',
    usecols=[0, 2],
)


# Parse the GroundTruth field into a dictionary
def parse_ground_truth(ground_truth_str):
    """Parse a ``"topic: weight, topic: weight"`` string into ``{topic: weight}``.

    Items are separated by commas; within an item the text after the LAST
    colon is the integer weight, so a topic name may itself contain a colon.
    Blank items (trailing or doubled commas, an empty string) are skipped.

    Args:
        ground_truth_str: The raw ground-truth string.

    Returns:
        dict mapping each stripped topic name to its ``int`` weight.

    Raises:
        ValueError: If a non-blank item has no colon or its weight is not an
            integer.
    """
    parsed = {}
    for item in ground_truth_str.split(','):
        if not item.strip():
            # Tolerate "a: 1, b: 2," and "a: 1,, b: 2" instead of crashing.
            continue
        if ':' not in item:
            raise ValueError(f"Expected 'topic: weight' but got {item.strip()!r}")
        # Split on the last colon only, so "US: China relations: 5" keeps its name.
        topic, _, weight = item.rpartition(':')
        parsed[topic.strip()] = int(weight.strip())
    return parsed

# Replace each raw 'Set2' string with the {topic: weight} dict produced by
# parse_ground_truth, so later code can look weights up directly.
ground_truth_df['Set2'] = ground_truth_df['Set2'].apply(parse_ground_truth)


# Function to calculate cosine similarity
def calculate_similarity(embeddings1, embeddings2):
    """Return the cosine-similarity matrix between two batches of embeddings.

    Thin wrapper around ``util.pytorch_cos_sim``: both arguments are passed
    through unchanged and the resulting tensor is returned as-is.
    """
    similarity_matrix = util.pytorch_cos_sim(embeddings1, embeddings2)
    return similarity_matrix


# Calculate diversity based on embeddings
def calculate_diversity(embeddings):
    """Return the mean pairwise cosine distance of ``embeddings`` as a float.

    For every unordered pair of rows the cosine distance ``1 - cos`` (range
    0..2) is halved so that it lies in 0..1, and the average over all pairs is
    returned: 0.0 means all vectors point the same way, 1.0 means they are
    opposite.

    The similarity matrix is computed once for all rows instead of once per
    pair, and the result is always a plain Python ``float`` (previously the
    normal path returned a torch tensor while the short path returned ``0``).

    Args:
        embeddings: A 2-D array-like or tensor with one embedding per row.

    Returns:
        float: The average halved cosine distance, or ``0.0`` when there are
        fewer than two embeddings (no pair to compare).
    """
    n = len(embeddings)
    if n < 2:
        return 0.0  # No diversity score if less than 2 topics

    vectors = torch.as_tensor(embeddings)
    if not torch.is_floating_point(vectors):
        vectors = vectors.float()

    # Cosine similarity of every row with every other row: L2-normalise, then dot.
    unit_vectors = torch.nn.functional.normalize(vectors, p=2, dim=1)
    similarity = unit_vectors @ unit_vectors.transpose(0, 1)

    # Strict upper triangle == each unordered pair (i < j) exactly once.
    rows, cols = torch.triu_indices(n, n, offset=1, device=similarity.device)
    pair_distances = (1 - similarity[rows, cols]) / 2  # Adjusting the range to 0 to 1
    return pair_distances.mean().item()


def _f1_from(precision, recall):
    """Return the harmonic mean of precision and recall, or 0 when their sum is 0."""
    denominator = precision + recall
    if denominator == 0:
        return 0
    return 2 * (precision * recall) / denominator


def compute_metrics(article_id, topics, ground_truth, top_n=8):
    """Score one article's identified topics against its weighted ground truth.

    Only the ``top_n`` highest-weighted ground-truth topics are used. Two
    families of metrics are produced: exact string matching, and an
    embedding-based variant where each topic is credited with its best cosine
    similarity to the other side. A diversity score of the identified topics is
    returned as well.

    Args:
        article_id: Key used to look the article up in ``ground_truth``.
        topics: List of identified topic strings for the article.
        ground_truth: Mapping (e.g. a pandas Series) from article id to a
            ``{topic: weight}`` dict.
        top_n: How many of the highest-weighted ground-truth topics to score
            against. Defaults to 8, the value that used to be hard-coded.

    Returns:
        A 7-tuple ``(exact_precision, exact_weighted_recall, exact_f1,
        embedding_precision, embedding_weighted_recall, embedding_f1,
        diversity)``. Any metric that cannot be computed (no topics, no
        ground-truth topics, or a zero total weight) is reported as 0 instead
        of raising.
    """
    ground_truth_topics = ground_truth[article_id]
    ground_truth_sorted = sorted(ground_truth_topics.items(), key=lambda item: item[1], reverse=True)

    # Slicing already copes with fewer than top_n entries.
    top_n_ground_truth = dict(ground_truth_sorted[:top_n])
    gt_topics = list(top_n_ground_truth.keys())
    gt_weights = list(top_n_ground_truth.values())
    total_weight_top_n = sum(gt_weights)
    correct_topics = set(topics).intersection(top_n_ground_truth.keys())

    ## EXACT-MATCHING

    # Exact-Match Precision
    ExactMatch_precision = len(correct_topics) / len(topics) if topics else 0

    # Exact-Match Weighted Recall
    correct_weights = sum(top_n_ground_truth[topic] for topic in correct_topics)
    ExactMatch_weighted_recall = correct_weights / total_weight_top_n if total_weight_top_n else 0

    # Exact-Match F1 Score
    ExactMatch_f1_score = _f1_from(ExactMatch_precision, ExactMatch_weighted_recall)

    ## EMBEDDING-BASED

    topic_list = list(topics)
    topic_embeddings = model.encode(topic_list) if topic_list else None

    if topic_list and gt_topics:
        gt_topic_embeddings = model.encode(gt_topics)

        # Rows: identified topics, columns: ground-truth topics.
        similarity_matrix = calculate_similarity(topic_embeddings, gt_topic_embeddings)

        # Precision: how close each identified topic is to its best ground-truth match.
        max_similarity_per_identified = similarity_matrix.max(dim=1).values
        Embeddings_precision = max_similarity_per_identified.mean().item()

        # Weighted recall: best match for each ground-truth topic, weighted by its importance.
        max_similarity_per_gt = similarity_matrix.max(dim=0).values
        weights = torch.tensor(
            gt_weights,
            dtype=max_similarity_per_gt.dtype,
            device=max_similarity_per_gt.device,
        )
        weighted_similarity_scores = max_similarity_per_gt * weights
        # Same zero-weight guard as the exact-match recall above.
        Embeddings_weighted_recall = (
            weighted_similarity_scores.sum().item() / total_weight_top_n if total_weight_top_n else 0
        )
    else:
        # One side is empty: taking max() over an empty similarity axis would raise.
        Embeddings_precision = 0
        Embeddings_weighted_recall = 0

    # Compute F1 Score
    Embeddings_f1_score = _f1_from(Embeddings_precision, Embeddings_weighted_recall)

    ## TOPIC DIVERSITY

    diversity_score = calculate_diversity(topic_embeddings) if topic_list else 0

    return ExactMatch_precision, ExactMatch_weighted_recall, ExactMatch_f1_score, Embeddings_precision, Embeddings_weighted_recall, Embeddings_f1_score, diversity_score

# Apply metrics to all articles
ExactMatch_results = []
Embeddings_results = []
Diversity_results = []

# Build the ArticleID -> parsed ground-truth lookup once; it is the same for every row.
ground_truth_by_article = ground_truth_df.set_index('ArticleID')['Set2']

for _, row in topics_df.iterrows():
    article_id = row['ArticleID']
    # Split on commas and strip each piece (dropping blanks), the same way
    # parse_ground_truth tokenises 'Set2'; otherwise "a,b" or stray spaces
    # would yield topics that can never match a ground-truth key exactly.
    topics = [topic.strip() for topic in row['Set1'].split(',') if topic.strip()]

    (
        ExactMatch_precision,
        ExactMatch_weighted_recall,
        ExactMatch_f1_score,
        Embeddings_precision,
        Embeddings_weighted_recall,
        Embeddings_f1_score,
        diversity,
    ) = compute_metrics(article_id, topics, ground_truth_by_article)

    ExactMatch_results.append({'ArticleID': article_id, 'ExactMatch_precision': ExactMatch_precision, 'ExactMatch_weighted_recall': ExactMatch_weighted_recall, 'ExactMatch_f1_score': ExactMatch_f1_score})
    Embeddings_results.append({'ArticleID': article_id, 'Embeddings_precision': Embeddings_precision, 'Embeddings_weighted_recall': Embeddings_weighted_recall, 'Embeddings_f1_score': Embeddings_f1_score})
    # float() keeps the column numeric even if the score arrives as a one-element tensor.
    Diversity_results.append({'ArticleID': article_id, 'Diversity': float(diversity)})


ExactMatch_results_df = pd.DataFrame(ExactMatch_results)
Embeddings_results_df = pd.DataFrame(Embeddings_results)
Diversity_results_df = pd.DataFrame(Diversity_results)

# Macro-averages: every article contributes equally to the final scores.
ExactMatch_final_precision = ExactMatch_results_df['ExactMatch_precision'].mean()
ExactMatch_final_weighted_recall = ExactMatch_results_df['ExactMatch_weighted_recall'].mean()
ExactMatch_final_f1_score = ExactMatch_results_df['ExactMatch_f1_score'].mean()

Embeddings_final_precision = Embeddings_results_df['Embeddings_precision'].mean()
Embeddings_final_weighted_recall = Embeddings_results_df['Embeddings_weighted_recall'].mean()
Embeddings_final_f1_score = Embeddings_results_df['Embeddings_f1_score'].mean()

final_diversity = Diversity_results_df['Diversity'].mean()

ExactMatch_final_scores = {'Final Precision': ExactMatch_final_precision, 'Final Weighted Recall': ExactMatch_final_weighted_recall, 'Final F1 Score': ExactMatch_final_f1_score}
Embeddings_scores = {'Final Precision': Embeddings_final_precision, 'Final Weighted Recall': Embeddings_final_weighted_recall, 'Final F1 Score': Embeddings_final_f1_score}

# Load the CSV file into a DataFrame (all columns this time)
csv_df = pd.read_csv('meta-evaluation/MetaEvaluation-Topics.csv', sep=';')

# Attach the per-article F1 scores to the CSV rows, matching on 'ArticleID'.
# how='left' keeps every CSV row even if no score was computed for it.
merged_df = pd.merge(csv_df, ExactMatch_results_df[['ArticleID', 'ExactMatch_f1_score']], on='ArticleID', how='left')
merged_df2 = pd.merge(merged_df, Embeddings_results_df[['ArticleID', 'Embeddings_f1_score']], on='ArticleID', how='left')

print(merged_df2)
# Save the updated DataFrame back to CSV
# NOTE(review): left disabled. As written it would overwrite the input CSV,
# and it writes merged_df, which lacks the Embeddings_f1_score column that
# merged_df2 has.
#merged_df.to_csv('meta-evaluation/MetaEvaluation-Topics.csv', sep=';')

print("Finished")


  from tqdm.autonotebook import tqdm, trange


SafetensorError: Error while deserializing header: MetadataIncompleteBuffer

In [None]:
import pandas as pd
from scipy.stats import pearsonr, spearmanr, kendalltau

# Function to calculate correlations and p-values
def calculate_correlations(df, col1, col2):
    """Print Pearson, Spearman and Kendall correlations between two columns.

    For each method, in that order, the coefficient (3 decimals) and its
    p-value (3 significant digits) are printed on one line. Nothing is
    returned.

    Args:
        df: DataFrame holding both columns.
        col1: Name of the first column.
        col2: Name of the second column.
    """
    methods = (
        ("Pearson", pearsonr),
        ("Spearman", spearmanr),
        ("Kendall", kendalltau),
    )
    for label, correlate in methods:
        coefficient, p_value = correlate(df[col1], df[col2])
        print(f"{label} correlation between {col1} and {col2}: {coefficient:.3f}, p-value: {p_value:.3g}")
    
# Correlate the 'Score' column of the merged frame with each automatic F1
# metric in turn: exact-match first, then embedding-based.
for f1_column in ('ExactMatch_f1_score', 'Embeddings_f1_score'):
    calculate_correlations(merged_df2, 'Score', f1_column)
