In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
from collections import Counter

In [None]:
def compute_f1_score(pred_tokens, true_tokens):
    """Compute the token-level F1 score between a prediction and a reference.

    Tokens are compared as multisets: a token that occurs k times in both
    inputs contributes k matches, and extra repetitions on either side are
    not counted as matches.

    Args:
        pred_tokens: Iterable of hashable tokens produced by the model.
        true_tokens: Iterable of hashable tokens from the reference answer.

    Returns:
        float: Harmonic mean of precision and recall, in [0.0, 1.0].
        Returns 0.0 when the inputs share no tokens, which includes the
        case where either input is empty.
    """
    pred_counts = Counter(pred_tokens)
    true_counts = Counter(true_tokens)

    # Multiset intersection keeps, per token, the smaller of the two counts.
    num_same = sum((pred_counts & true_counts).values())

    # No overlap also covers empty inputs, so the divisions below are safe.
    if num_same == 0:
        return 0.0

    # Totals are taken from the Counters rather than len(), so one-shot
    # iterables (e.g. generators) are supported as well as lists.
    pred_total = sum(pred_counts.values())
    true_total = sum(true_counts.values())

    # Precision: proportion of predicted tokens that are correct.
    precision = num_same / pred_total
    # Recall: proportion of reference tokens that were predicted.
    recall = num_same / true_total

    # F1 score: harmonic mean of precision and recall.
    return 2 * (precision * recall) / (precision + recall)

In [None]:
# Read Data/q_and_a.csv into a DataFrame (path is relative to the working directory)
data = pd.read_csv(filepath_or_buffer="Data/q_and_a.csv")

In [None]:
# Function to apply F1 score calculation to each row
def apply_f1_scores(row):
    """Score each model's answer in one row against the reference answer.

    Args:
        row: Mapping-like row (e.g. a pandas Series) with the keys
            'reference_answer', 'chat_gpt_answer', 'google_gemini_answer'
            and 'fine_tuned_model_answer'.

    Returns:
        pd.Series: One entry per model, keyed '<model column>_f1', holding
        the token-level F1 score of that answer against the reference.
        A missing answer or a missing reference yields a score of 0.
    """
    def _as_text(value):
        # Empty CSV cells are loaded by pandas as NaN (a float), which
        # word_tokenize cannot process. Treat missing values as empty text
        # and coerce any other non-string value (e.g. a numeric answer).
        return "" if pd.isna(value) else str(value)

    model_columns = (
        'chat_gpt_answer',
        'google_gemini_answer',
        'fine_tuned_model_answer',
    )

    # Tokenize the reference once; it is reused for every model.
    true_tokens = word_tokenize(_as_text(row['reference_answer']))

    # Calculate the F1 score for each model's answer.
    return pd.Series({
        f'{model}_f1': compute_f1_score(
            word_tokenize(_as_text(row[model])), true_tokens
        )
        for model in model_columns
    })

In [None]:
# Score every row: axis="columns" hands each row of the dataframe to apply_f1_scores
f1_results = data.apply(func=apply_f1_scores, axis="columns")

In [None]:
# Place the F1 columns next to the original columns (column-wise concatenation)
final_results = pd.concat(objs=[data, f1_results], axis="columns")

In [None]:
# Write the combined table to a new CSV file, leaving out the index column
final_results.to_csv(path_or_buf='Data/q_and_a_F1.csv', index=False)