# Import libraries

In [32]:
### Pip install the necessary packages
# !pip install nltk
# !pip install rouge_score
# !pip install openai
# !pip install google-generativeai

In [1]:
import pandas as pd
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import numpy as np
import nltk
from openai import OpenAI
import google.generativeai as genai

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download NLTK data
# nltk.download('punkt')

# Import data

In [2]:
# File path
input = r'test_predict_beit3.csv'
output_folder = r'output/'

In [3]:
# Save to pandas dataframe
df = pd.read_csv(input)
df

Unnamed: 0,Image,Question,Answer,Predict
0,ff557f4ea977d3a,What color is the top of the enchiladas,golden-brown,golden-brown
1,ff557f4ea977d3a,What type of dish is positioned on the left si...,enchiladas,salad
2,ff557f4ea977d3a,What color is the filling of the enchiladas,white,white
3,ff557f4ea977d3a,Where are the enchiladas located on the plate,on the left side,on the left side
4,ff557f4ea977d3a,What vegetable is scattered on top of the lettuce,tomatoes,tomatoes
...,...,...,...,...
3694,e95d2afc654400d,"Where are the turkey, spaghetti, and zoodles s...",throughout the entire pan,throughout
3695,73c6313f570a1cd,What color are the grilled zucchini sticks,yellow,green
3696,73c6313f570a1cd,What vegetable is shown on the platter,zucchini,zucchini
3697,73c6313f570a1cd,What color is the pepper sprinkled on the zucc...,black,green


# Code

## Bleu

Evaluates the similarity between a predicted text and a reference text based on n-gram precision.
How it is calculated:
- Tokenize each sentence.
- Count matching n-gram (usually 1-4).
- Applies a penalty if `len(prediction)` < `len(answer)`.

In this code:
- BLEU-4 is used via `sentence_bleu` function from nltk: Each n-gram from 1 to 4 contributes equally to the final score.
- `reference`: The target (goal) answers.
- `hypothesis`: Generated answers.
- `smoothie`: From nltk library to handle short sentences where high n-grams return no matches.

The BLEU score is computed as:
$$ BLEU = BP * exp(\sum^{N}_{n=1}w_n log(p_n)) $$
- BP: The penalty where:
    + BP = 1 if $l_{pred} \geq l_{ref}$
    + BP = $e^{1 - \frac{l_{ref}}{l_{pred}}}$ if $l_{pred} < l_{ref}$
    + $l_X$: length of X
- $p_n$: Count matching n-gram between predictions and reference.
    + Clip (fit) the count to the maximum number of times each n-gram appears in the reference to avoid overcounting.
    + $p_n$ = (sum of clipped n-gram counts) / (total n-grams in prediction).
- $w_n$: Weight of each n-gram. In BLEU-4, each n-gram has equal weight of 0.25.
- The exp function converts it back to a score of 0-1.

In [20]:
def calculate_bleu(reference, hypothesis):
    """Calculate BLEU score for a single pair of sentences"""
    ref_words = str(reference).strip().split()
    hyp_words = str(hypothesis).strip().split()
    
    if not ref_words or not hyp_words:
        return 0.0
        
    smoothie = SmoothingFunction().method4
    return sentence_bleu([ref_words], hyp_words, smoothing_function=smoothie)


In [37]:
# Change the column named 'answer' and 'predicted_answer' if necessary
if df is not None:
    bleu_scores = [calculate_bleu(row['answer'], row['predicted_answer']) 
                  for _, row in df.iterrows()]
    avg_bleu = np.mean(bleu_scores)
    
    # Create results DataFrame
    bleu_results = pd.DataFrame({
        'answer': df['answer'], # Reference - Change here
        'predicted_answer': df['predicted_answer'], # Hypothesis - Change here
        'bleu_score': bleu_scores
    })
    
    print(f"Average BLEU Score: {avg_bleu:.4f}")
    display(bleu_results.head())
    bleu_results.to_csv(output_folder + 'bleu_results.csv', index=False)

Average BLEU Score: 0.3230


Unnamed: 0,answer,predicted_answer,bleu_score
0,The sun rises in the east every morning,The sun rises the a east every morning,0.28854
1,The sun rises in the east every morning,The sun the in east every morning,0.228941
2,Python is a popular programming language,popular a is programming language,0.124787
3,Python is a popular programming language,Python is a popular programming language,1.0
4,Artificial intelligence is transforming the world,Artificial the is transforming world,0.124787


## Rouge

Evaluates the similarity between a predicted text and a reference text based on recall of n-grams or longest common subsequences. How it works:
- Tokenize each sentence into words.
- Compute overlap between predictions and references:
    + `ROUGE-1` measures overlap of unigrams based on recall.
    + `ROUGE-2` measures overlap of bigrams.
    + `ROUGE-L` measures the longest common subsequence (LCS).
- Focuses on recall (fraction of reference content captured), reports F1 scores (harmonic mean of precision and recall).

In this code:
- `rouge_scorer` is used from the `rouge_score` library with stemming to normalize words.
- Calculates three variants:
    + rouge1: Unigram overlap.
    + rouge2: Bigram overlap.
    + rougeL: LCS-based similarity.
- `reference`: The target (goal) answers.
- `hypothesis`: Generated answers.

The ROUGE score is computed as:

ROUGE-N = (Number of overlapping N-grams) / (Total N-grams in reference)
- This is the recall score. In practice, F1 is reported:
F1 = 2*(Precision * Recall) / (Precision + Recall).
    + Precision = (Number of overlapping N-grams) / (Total N-grams in reference)
    + Recall = (Number of overlapping N-grams) / (Total N-grams in reference)

ROUGE-L = LCS(reference, prediction) / (Total words in reference)
- LCS: Length of LCS
- F1 is also computed using precision (LCS length / prediction length) and recall (LCS length / reference length).
- Scores range from 0-1.

In [22]:
def calculate_rouge(reference, hypothesis):
    """Calculate ROUGE scores for a single pair of sentences"""
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    scores = scorer.score(str(reference), str(hypothesis))
    return {
        'rouge1': scores['rouge1'].fmeasure,
        'rouge2': scores['rouge2'].fmeasure,
        'rougeL': scores['rougeL'].fmeasure
    }

In [38]:
if df is not None:
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []
    
    for _, row in df.iterrows():
        scores = calculate_rouge(row['answer'], row['predicted_answer']) # Change here
        rouge1_scores.append(scores['rouge1'])
        rouge2_scores.append(scores['rouge2'])
        rougeL_scores.append(scores['rougeL'])
    
    # Calculate averages
    avg_rouge1 = np.mean(rouge1_scores)
    avg_rouge2 = np.mean(rouge2_scores)
    avg_rougeL = np.mean(rougeL_scores)
    
    # Create results DataFrame
    rouge_results = pd.DataFrame({
        'answer': df['answer'], # Reference - Change here
        'predicted_answer': df['predicted_answer'], # Hypothesis - Change here
        'rouge1': rouge1_scores,
        'rouge2': rouge2_scores,
        'rougeL': rougeL_scores
    })
    
    print(f"Average ROUGE-1 Score: {avg_rouge1:.4f}")
    print(f"Average ROUGE-2 Score: {avg_rouge2:.4f}")
    print(f"Average ROUGE-L Score: {avg_rougeL:.4f}")
    display(rouge_results.head())
    rouge_results.to_csv(output_folder + 'rouge_results.csv', index=False)

Average ROUGE-1 Score: 0.9286
Average ROUGE-2 Score: 0.4157
Average ROUGE-L Score: 0.7160


Unnamed: 0,answer,predicted_answer,rouge1,rouge2,rougeL
0,The sun rises in the east every morning,The sun rises the a east every morning,0.875,0.571429,0.875
1,The sun rises in the east every morning,The sun the in east every morning,0.933333,0.461538,0.8
2,Python is a popular programming language,popular a is programming language,0.909091,0.222222,0.545455
3,Python is a popular programming language,Python is a popular programming language,1.0,1.0,1.0
4,Artificial intelligence is transforming the world,Artificial the is transforming world,0.909091,0.222222,0.727273


## Gpt/ Gemini

### Api keys:

In [None]:
gpt_api_key = ""
gemini_api_key = "apikey" # Change here

GPTScore

In [5]:

def get_gpt_score(reference, hypothesis):
    """Gets a GPT-based similarity score between reference and hypothesis."""
    prompt = f"""
    You are an expert evaluator. Rate the similarity of the following hypothesis to the reference on a scale of 0 to 1.
    
    Reference: "{reference}"
    Hypothesis: "{hypothesis}"
    
    Provide only the score as a number, nothing else.
    """
    client = OpenAI(api_key=gpt_api_key)

    response = client.chat.completions.create(
        model="gpt-4o-mini", # Change model here
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0  # Ensures consistent scoring
    )
    
    score = response.choices[0].message.content.strip()
    return float(score)  # Convert to float


In [6]:
if df is not None:
    gpt_scores = []  # Store GPT scores
    
    for _, row in df.iterrows():
        score = get_gpt_score(row['Answer'], row['Predict'])  # Change column names as needed
        print(_, score)
        gpt_scores.append(score)

    # Calculate average GPT score
    avg_gpt = np.mean(gpt_scores)

    # Create results DataFrame
    gpt_results = pd.DataFrame({
        'reference': df['Answer'],  # Change here
        'hypothesis': df['Predict'],  # Change here
        'gpt_score': gpt_scores
    })
    
    print(f"Average GPT Score: {avg_gpt:.4f}")
    display(gpt_results.head())
    
    gpt_results.to_csv(output_folder + 'beit3_old_gptscore.csv', index=False)

0 1.0
1 0.1
2 1.0
3 1.0
4 1.0
5 1.0
6 1.0
7 1.0
8 1.0
9 0.1
10 1.0
11 1.0
12 1.0
13 0.2
14 1.0
15 0.6
16 0.7
17 1.0
18 0.9
19 0.1
20 0.2
21 0.5
22 1.0
23 0.2
24 1.0
25 0.1
26 1.0
27 0.8
28 0.7
29 0.9
30 1.0
31 1.0
32 1.0
33 0.6
34 1.0
35 1.0
36 1.0
37 1.0
38 1.0
39 0.0
40 0.3
41 0.2
42 0.0
43 1.0
44 1.0
45 0.2
46 0.8
47 0.2
48 1.0
49 0.1
50 0.7
51 1.0
52 1.0
53 1.0
54 1.0
55 1.0
56 0.5
57 1.0
58 1.0
59 1.0
60 0.4
61 0.2
62 1.0
63 0.0
64 0.1
65 0.2
66 1.0
67 1.0
68 1.0
69 0.7
70 0.5
71 1.0
72 1.0
73 0.8
74 1.0
75 1.0
76 0.1
77 1.0
78 0.1
79 0.2
80 1.0
81 0.8
82 1.0
83 0.2
84 1.0
85 1.0
86 0.6
87 1.0
88 1.0
89 0.0
90 0.2
91 1.0
92 1.0
93 0.7
94 1.0
95 0.6
96 1.0
97 0.4
98 0.2
99 1.0
100 0.7
101 1.0
102 1.0
103 1.0
104 1.0
105 0.8
106 1.0
107 0.3
108 0.2
109 0.5
110 1.0
111 1.0
112 0.2
113 0.2
114 1.0
115 0.7
116 1.0
117 1.0
118 1.0
119 1.0
120 1.0
121 1.0
122 0.0
123 1.0
124 0.2
125 0.8
126 0.2
127 0.8
128 1.0
129 1.0
130 0.1
131 1.0
132 1.0
133 1.0
134 0.7
135 0.9
136 1.0
137 1.0
138 1.

Unnamed: 0,reference,hypothesis,gpt_score
0,golden-brown,golden-brown,1.0
1,enchiladas,salad,0.1
2,white,white,1.0
3,on the left side,on the left side,1.0
4,tomatoes,tomatoes,1.0


Geminiscore

In [39]:
genai.configure(api_key="your-api-key")

def get_gemini_score(reference, hypothesis):
    """Gets a similarity score using Google's Gemini model."""
    prompt = f"""
    You are an expert evaluator. Rate the similarity of the following hypothesis to the reference on a scale of 0 to 1.
    
    Reference: "{reference}"
    Hypothesis: "{hypothesis}"
    
    Provide only the score as a number, nothing else.
    """

    model = genai.GenerativeModel("gemini-pro")
    response = model.generate_content(prompt)
    
    score = response.text.strip()
    return float(score)  # Convert to float

In [None]:
if df is not None:
    gemini_scores = []  # Store Gemini scores
    
    for _, row in df.iterrows():
        score = get_gemini_score(row['reference'], row['hypothesis'])  # Change column names as needed
        gemini_scores.append(score)

    # Calculate average GeminiScore
    avg_gemini = np.mean(gemini_scores)

    # Create results DataFrame
    gemini_results = pd.DataFrame({
        'reference': df['reference'],  # Change here
        'hypothesis': df['hypothesis'],  # Change here
        'gemini_score': gemini_scores
    })
    
    print(f"Average Gemini Score: {avg_gemini:.4f}")
    display(gemini_results.head())
    
    gemini_results.to_csv(output_folder + 'gemini_results.csv', index=False)