# Import libraries

In [3]:
### Pip install the necessary packages
# !pip install nltk
# !pip install rouge_score
# !pip install openai
# !pip install google-generativeai




In [4]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np
import nltk
import openai
from nltk.tokenize import word_tokenize

In [11]:
# Download NLTK data
# nltk.download('punkt')

# Import data

In [3]:
# File path
input = r'test_predict_beit3.csv'
output_folder = r'output/'

In [5]:
# Save to pandas dataframe
df = pd.read_csv(input)
df

Unnamed: 0,Image,Question,Answer,Predict
0,ff557f4ea977d3a,What color is the top of the enchiladas,golden-brown,golden-brown
1,ff557f4ea977d3a,What type of dish is positioned on the left si...,enchiladas,salad
2,ff557f4ea977d3a,What color is the filling of the enchiladas,white,white
3,ff557f4ea977d3a,Where are the enchiladas located on the plate,on the left side,on the left side
4,ff557f4ea977d3a,What vegetable is scattered on top of the lettuce,tomatoes,tomatoes
...,...,...,...,...
3694,e95d2afc654400d,"Where are the turkey, spaghetti, and zoodles s...",throughout the entire pan,throughout
3695,73c6313f570a1cd,What color are the grilled zucchini sticks,yellow,green
3696,73c6313f570a1cd,What vegetable is shown on the platter,zucchini,zucchini
3697,73c6313f570a1cd,What color is the pepper sprinkled on the zucc...,black,green


# Code

## Bleu

Evaluates the similarity between a predicted text and a reference text based on n-gram precision.
How it is calculated:
- Tokenize each sentence.
- Count matching n-gram (usually 1-4).
- Applies a penalty if `len(prediction)` < `len(answer)`.

In this code:
- BLEU-4 is used via `sentence_bleu` function from nltk: Each n-gram from 1 to 4 contributes equally to the final score.
- `reference`: The target (goal) answers.
- `hypothesis`: Generated answers.
- `smoothie`: From nltk library to handle short sentences where high n-grams return no matches.

The BLEU score is computed as:
$$ BLEU = BP * exp(\sum^{N}_{n=1}w_n log(p_n)) $$
- BP: The penalty where:
    + BP = 1 if $l_{pred} \geq l_{ref}$
    + BP = $e^{1 - \frac{l_{ref}}{l_{pred}}}$ if $l_{pred} < l_{ref}$
    + $l_X$: length of X
- $p_n$: Count matching n-gram between predictions and reference.
    + Clip (fit) the count to the maximum number of times each n-gram appears in the reference to avoid overcounting.
    + $p_n$ = (sum of clipped n-gram counts) / (total n-grams in prediction).
- $w_n$: Weight of each n-gram. In BLEU-4, each n-gram has equal weight of 0.25.
- The exp function converts it back to a score of 0-1.

In [6]:
def calculate_bleu(reference, hypothesis):
    """Calculate BLEU score for a single pair of sentences"""
    ref_words = str(reference).strip().split()
    hyp_words = str(hypothesis).strip().split()
    
    if not ref_words or not hyp_words:
        return 0.0
        
    smoothie = SmoothingFunction().method4
    return sentence_bleu([ref_words], hyp_words, smoothing_function=smoothie)


In [8]:
# Change the column named 'answer' and 'predicted_answer' if necessary
if df is not None:
    bleu_scores = [calculate_bleu(row['Answer'], row['Predict']) 
                  for _, row in df.iterrows()]
    avg_bleu = np.mean(bleu_scores)
    
    # Create results DataFrame
    bleu_results = pd.DataFrame({
        'answer': df['Answer'], # Reference - Change here
        'predicted_answer': df['Predict'], # Hypothesis - Change here
        'bleu_score': bleu_scores
    })
    
    print(f"Average BLEU Score: {avg_bleu:.4f}")
    display(bleu_results.head())
    bleu_results.to_csv(output_folder + 'bleu_results.csv', index=False)

Average BLEU Score: 0.4797


Unnamed: 0,answer,predicted_answer,bleu_score
0,golden-brown,golden-brown,1.0
1,enchiladas,salad,0.0
2,white,white,1.0
3,on the left side,on the left side,1.0
4,tomatoes,tomatoes,1.0


## Rouge

Evaluates the similarity between a predicted text and a reference text based on recall of n-grams or longest common subsequences. How it works:
- Tokenize each sentence into words.
- Compute overlap between predictions and references:
    + `ROUGE-1` measures overlap of unigrams based on recall.
    + `ROUGE-2` measures overlap of bigrams.
    + `ROUGE-L` measures the longest common subsequence (LCS).
- Focuses on recall (fraction of reference content captured), reports F1 scores (harmonic mean of precision and recall).

In this code:
- `rouge_scorer` is used from the `rouge_score` library with stemming to normalize words.
- Calculates three variants:
    + rouge1: Unigram overlap.
    + rouge2: Bigram overlap.
    + rougeL: LCS-based similarity.
- `reference`: The target (goal) answers.
- `hypothesis`: Generated answers.

The ROUGE score is computed as:

ROUGE-N = (Number of overlapping N-grams) / (Total N-grams in reference)
- This is the recall score. In practice, F1 is reported:
F1 = 2*(Precision * Recall) / (Precision + Recall).
    + Precision = (Number of overlapping N-grams) / (Total N-grams in reference)
    + Recall = (Number of overlapping N-grams) / (Total N-grams in reference)

ROUGE-L = LCS(reference, prediction) / (Total words in reference)
- LCS: Length of LCS
- F1 is also computed using precision (LCS length / prediction length) and recall (LCS length / reference length).
- Scores range from 0-1.

In [9]:
def calculate_rouge(reference, hypothesis):
    """Calculate ROUGE scores for a single pair of sentences"""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(str(reference), str(hypothesis))
    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure
    }

In [10]:
if df is not None:
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for _, row in df.iterrows():
        scores = calculate_rouge(row['Answer'], row['Predict']) # Change here
        rouge1_scores.append(scores['rouge1'])
        rouge2_scores.append(scores['rouge2'])
        rougeL_scores.append(scores['rougeL'])
    
    # Calculate averages
    avg_rouge1 = np.mean(rouge1_scores)
    avg_rouge2 = np.mean(rouge2_scores)
    avg_rougeL = np.mean(rougeL_scores)
    
    # Create results DataFrame
    rouge_results = pd.DataFrame({
        'answer': df['Answer'], # Reference - Change here
        'predicted_answer': df['Predict'], # Hypothesis - Change here
        'rouge1': rouge1_scores,
        'rouge2': rouge2_scores,
        'rougeL': rougeL_scores
    })
    
    print(f"Average ROUGE-1 Score: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2 Score: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L Score: {avg_rougeL:.4f}")
    display(rouge_results.head())
    rouge_results.to_csv(output_folder + 'beit3_rouge_results.csv', index=False)

Average ROUGE-1 Score: 0.5781
Average ROUGE-2 Score: 0.0853
Average ROUGE-L Score: 0.5780


Unnamed: 0,answer,predicted_answer,rouge1,rouge2,rougeL
0,golden-brown,golden-brown,1.0,1.0,1.0
1,enchiladas,salad,0.0,0.0,0.0
2,white,white,1.0,0.0,1.0
3,on the left side,on the left side,1.0,1.0,1.0
4,tomatoes,tomatoes,1.0,0.0,1.0


## Gpt

### Api keys:

In [None]:
gpt_api_key = "apikey" # Changew here

### GPTScore

## How GPTScore Works

Refer to this <a href="https://arxiv.org/pdf/2302.04166">paper</a>.

### Evaluation Protocol
- Define the **task** (e.g., summarization) and the **aspect** to evaluate (e.g., fluency) using specific instructions.
- Example instruction: "Generate a fluent summary for this text."

### Input Construction
- Combine the following components into a single prompt:
  - **Source Text**: The context, such as the original document or dialogue history (if applicable).
  - **Instruction**: A natural language description of the evaluation aspect.
  - **Demonstrations**: Optional exemplar samples (e.g., reference-hypothesis pairs with scores) to guide the model via in-context learning.

### Scoring
- Use a generative model to compute the likelihood of the **hypothesis text** (the generated text) given the constructed prompt.
- **Interpretation**: A higher likelihood indicates higher quality for the specified aspect.
- The score reflects how naturally the hypothesis aligns with the context and instruction.

### Meta-Evaluation
- Validate the computed scores by correlating them with human judgments.
- Common correlation measures include:
  - **Spearman Correlation**: Assesses monotonic relationships.
  - **Pearson Correlation**: Assesses linear relationships.

### Conceptual Formula
The GPTScore is conceptually defined as the conditional probability of the hypothesis given the context, instruction, and demonstrations:

$$
\text{GPTScore} = P(\text{hypothesis} \mid \text{context, instruction, demonstrations})
$$

Where:
- **context**: The source text or dialogue history providing the basis for evaluation.
- **instruction**: A natural language description specifying the evaluation aspect (e.g., fluency, relevance).
- **demonstrations**: Optional examples included in the prompt to enhance model performance through in-context learning.

### Implementation Note
- The paper approximates this probability using **log-likelihoods** or **normalized scores** derived from the generative model’s output.
- Practical implementations may vary, such as using API-based ratings when direct log probabilities are unavailable (e.g., with newer OpenAI APIs).

In [None]:
openai.api_key = gpt_api_key

def calculate_gptscore(reference, hypothesis, aspect="relevance", model="gpt-4o", demonstrations=None):
    """
    Calculate GPTScore for a hypothesis text given a reference and evaluation aspect.
    
    Args:
        reference (str): The reference or source text.
        hypothesis (str): The generated text to evaluate.
        aspect (str): The evaluation aspect (e.g., "fluency", "relevance").
        model (str): The GPT model to use (e.g., "gpt-4o", "gpt-4-turbo").
        demonstrations (list): Optional list of (ref, hypo, score) tuples for in-context learning.
    
    Returns:
        float: Normalized score between 0 and 1.
    """
    # Define aspect-specific instruction
    aspect_instructions = {
        "fluency": "Rate the fluency of the hypothesis based on the reference (0-1 scale).",
        "relevance": "Rate how relevant the hypothesis is to the reference (0-1 scale).",
        "informativeness": "Rate how informative the hypothesis is compared to the reference (0-1 scale)."
    }
    
    instruction = aspect_instructions.get(aspect, "Evaluate the quality of this text.")
    prompt = f"{instruction}\n\nReference: {reference}\nHypothesis: {hypothesis}\nScore:"

    # Add demonstrations if provided
    messages = [{"role": "system", "content": "You are a helpful assistant that evaluates text similarity. Respond with only the score and nothing else."}]
    
    if demonstrations:
        for demo_ref, demo_hypo, demo_score in demonstrations:
            messages.append({"role": "user", "content": f"Reference: {demo_ref}\nHypothesis: {demo_hypo}\nScore: {demo_score}"})
    
    messages.append({"role": "user", "content": prompt})

    # Call the OpenAI API
    client = openai.OpenAI(api_key=gpt_api_key)
    
    response = client.chat.completions.create(
        model=model,  # Correct model
        messages=messages,  # Correct API format
        max_tokens=5,  # Short response expected (score)
        temperature=0
    )
    
    # Extract score
    score_text = response.choices[0].message.content.strip()
    
    try:
        score = float(score_text)
        normalized_score = min(max(score, 0), 1)  # Clip to [0,1]
    except ValueError:
        normalized_score = 0.5  # Default score if GPT fails to return a valid number
    
    return normalized_score

In [None]:
demonstrations = [
    ("The cat sits on the mat.", "The cat rests on the rug.", 0.9),
    ("The dog barks loudly.", "The loud dog barks.", 0.8)
]

gpt_scores = []
for _, row in df.iterrows():
    score = calculate_gptscore(
        reference=row['answer'], # Change here 
        hypothesis=row['predicted_answer'], # Change here
        model="gpt-4o-mini",
        demonstrations=demonstrations
    )
    print(_, score)
    gpt_scores.append(score)

# Calculate average GPT score
avg_gpt = np.mean(gpt_scores)
print(f"Average GPTScore: {avg_gpt:.4f}")

# Add scores to DataFrame and save
df['gpt_score'] = gpt_scores
print(df)
df.to_csv(output_folder + 'gptscore_results.csv', index=False)

Average GPTScore: 0.5000
                                              answer  \
0            The sun rises in the east every morning   
1            The sun rises in the east every morning   
2           Python is a popular programming language   
3           Python is a popular programming language   
4  Artificial intelligence is transforming the world   
5           Python is a popular programming language   
6  Artificial intelligence is transforming the world   
7            The sun rises in the east every morning   
8            The sun rises in the east every morning   
9            The sun rises in the east every morning   

                                 predicted_answer  gpt_score  
0          The sun rises the a east every morning        0.5  
1               The sun the in east every morning        0.5  
2               popular a is programming language        0.5  
3        Python is a popular programming language        0.5  
4            Artificial the is transforming

## Evaluation Metrics: EM, Precision, Recall, and F1

These metrics assess the quality of predicted text against reference text at a token level, complementing n-gram-based (BLEU, ROUGE) and semantic (GPTScore) evaluations.

### Exact Match (EM)
- **Definition**: Measures if the predicted text exactly matches the reference text (case-insensitive, stripped of extra whitespace).
- **Formula**: \( EM = 1 \) if \( \text{predicted} == \text{reference} \), else \( 0 \).
- **Use**: Strict measure of correctness, useful for tasks requiring identical outputs.

### Precision
- **Definition**: Fraction of tokens in the predicted text that appear in the reference text.
- **Formula**: 
\[
\text{Precision} = \frac{|\text{predicted tokens} \cap \text{reference tokens}|}{|\text{predicted tokens}|}
\]
- **Use**: Highlights over-generation (extra tokens in prediction).

### Recall
- **Definition**: Fraction of tokens in the reference text captured by the predicted text.
- **Formula**: 
\[
\text{Recall} = \frac{|\text{predicted tokens} \cap \text{reference tokens}|}{|\text{reference tokens}|}
\]
- **Use**: Highlights under-generation (missing tokens from reference).

### F1 Score
- **Definition**: Harmonic mean of precision and recall, balancing both metrics.
- **Formula**: 
\[
F1 = 2 \cdot \frac{\text{Precision} \cdot \text{Recall}}{\text{Precision} + \text{Recall}}
\]
- **Use**: Provides a single score for overall token-level performance.

In [5]:
def calculate_em(reference, hypothesis):
    """Calculate Exact Match score."""
    ref_clean = " ".join(reference.strip().lower().split())
    hyp_clean = " ".join(hypothesis.strip().lower().split())
    return 1.0 if ref_clean == hyp_clean else 0.0

def calculate_precision_recall_f1(reference, hypothesis):
    """Calculate Precision, Recall, and F1 scores based on token overlap."""
    ref_tokens = set(word_tokenize(reference.lower()))
    hyp_tokens = set(word_tokenize(hypothesis.lower()))
    
    if not hyp_tokens:  # Avoid division by zero
        precision = 0.0 if ref_tokens else 1.0
    else:
        precision = len(ref_tokens & hyp_tokens) / len(hyp_tokens)
    
    if not ref_tokens:  # Avoid division by zero
        recall = 1.0 if not hyp_tokens else 0.0
    else:
        recall = len(ref_tokens & hyp_tokens) / len(ref_tokens)
    
    if precision + recall == 0:  # Avoid division by zero
        f1 = 0.0
    else:
        f1 = 2 * (precision * recall) / (precision + recall)
    
    return precision, recall, f1

In [None]:
# Load or create sample data
def create_test_csv(filename="test_data.csv"):
    data = {
        "answer": [
            "The quick brown fox jumps over the lazy dog",
            "I enjoy coding in Python",
            "Machine learning is fascinating"
        ],
        "predicted_answer": [
            "The quick brown fox leaps over the idle dog",
            "I like coding in Python daily",
            "Machine learning is interesting"
        ]
    }
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)
    return df

# Load data
try:
    df = pd.read_csv(input) 
except FileNotFoundError:
    df = create_test_csv("test_data.csv")

# Compute metrics
em_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

for _, row in df.iterrows():
    ref = row['answer']
    hyp = row['predicted_answer']
    
    # Exact Match
    em = calculate_em(ref, hyp)
    em_scores.append(em)
    
    # Precision, Recall, F1
    precision, recall, f1 = calculate_precision_recall_f1(ref, hyp)
    precision_scores.append(precision)
    recall_scores.append(recall)
    f1_scores.append(f1)

# Calculate averages
avg_em = np.mean(em_scores)
avg_precision = np.mean(precision_scores)
avg_recall = np.mean(recall_scores)
avg_f1 = np.mean(f1_scores)

# Print results
print(f"Average Exact Match (EM): {avg_em:.4f}")
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1 Score: {avg_f1:.4f}")

# Save results to DataFrame and CSV
results_df = df.copy()
results_df['EM'] = em_scores
results_df['Precision'] = precision_scores
results_df['Recall'] = recall_scores
results_df['F1'] = f1_scores
results_df.to_csv(output_folder + "evaluation_results_em_prf1.csv", index=False) # Save to CSV
print("Results saved to 'evaluation_results_em_prf1.csv'")

Average Exact Match (EM): 0.0000
Average Precision: 0.7222
Average Recall: 0.7667
Average F1 Score: 0.7424
Results saved to 'evaluation_results_em_prf1.csv'
