In [3]:
import ast
import csv

import pandas as pd
import torch
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import meteor_score
from PIL import Image
from rouge_score import rouge_scorer
from transformers import CLIPProcessor, CLIPModel

Best caption: A horse and a man standing


In [None]:
# Load the CLIP processor (text/image preprocessing) and the CLIP model
# from the same pretrained checkpoint so the two always match.
clip_checkpoint = "openai/clip-vit-base-patch32"
clip_processor, clip_model = (
    CLIPProcessor.from_pretrained(clip_checkpoint),
    CLIPModel.from_pretrained(clip_checkpoint),
)

In [None]:
# For every test image, let CLIP pick the best of the three model captions.
# Produces `clip_predictions`: one [image, reference captions, chosen caption]
# row per test image, matching the column names in `header`.
header = ["Image", "Reference_Captions", "Chosen_Caption"]
clip_predictions = []

file_name = "Test-Images.csv"
df = pd.read_csv(file_name)
df = df.reset_index()

root = "./Predictions/"


def first_caption_per_image(predictions_df):
    """Return a Series mapping each 'Image' value to its first 'Caption'.

    Keeps only the first row for every image (the same row the previous
    ``frame[frame['Image'] == image]['Caption'].iloc[0]`` lookup selected),
    but builds the mapping once instead of scanning the frame for every image.
    """
    return predictions_df.drop_duplicates(subset="Image").set_index("Image")["Caption"]


vgg_predicted = root + "VGG-Transformer.csv"
vgg_df = pd.read_csv(vgg_predicted)

bilstm_predicted = root + "ViT-BiLSTM.csv"
bilstm_df = pd.read_csv(bilstm_predicted)

roberta_predicted = root + "ViT-Roberta.csv"
roberta_df = pd.read_csv(roberta_predicted)

vgg_captions = first_caption_per_image(vgg_df)
bilstm_captions = first_caption_per_image(bilstm_df)
roberta_captions = first_caption_per_image(roberta_df)

for index, row in df.iterrows():
    image = row['Image']
    actual_captions = row['Captions']

    # A missing image raises KeyError here (the old positional lookup raised
    # IndexError); either way a prediction file that lacks an image is fatal.
    vgg = vgg_captions.loc[image]
    bilstm = bilstm_captions.loc[image]
    roberta = roberta_captions.loc[image]

    candidate_captions = [vgg, bilstm, roberta]

    # Open the image in a context manager so its file handle is released as
    # soon as the processor has converted it to tensors.
    with Image.open(image) as pil_image:
        inputs = clip_processor(text=candidate_captions, images=pil_image,
                                return_tensors="pt", padding=True)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = clip_model(**inputs)

    # Image-to-text similarity logits; with a single image the flat argmax
    # is the position of the best-matching candidate caption.
    logits = outputs.logits_per_image

    best_caption_index = torch.argmax(logits).item()
    best_caption = candidate_captions[best_caption_index]

    clip_predictions.append([image, actual_captions, best_caption])

In [None]:
# Persist the chosen captions: one header row followed by one row per image.
clip_predictions_file_name = "Clip-Predictions.csv"

# newline="" is what the csv module expects for files it writes; the encoding
# is pinned to UTF-8 so the file reads back identically with pd.read_csv
# (which defaults to UTF-8) regardless of the platform's default encoding.
with open(clip_predictions_file_name, 'w', newline="", encoding="utf-8") as file:
    csvwriter = csv.writer(file)
    csvwriter.writerow(header)
    csvwriter.writerows(clip_predictions)

In [None]:
def parse_reference_captions(raw):
    """Turn one ``Reference_Captions`` CSV cell into a list of caption strings.

    ``pd.read_csv`` returns this column as plain text. Iterating such a cell
    directly walks it one character at a time, so the metric cells would
    compare each candidate against single-character "references".

    NOTE(review): assumes a cell holds a Python-literal list/tuple of strings,
    e.g. "['a dog runs', 'a dog on grass']" -- confirm against Test-Images.csv.
    Text that is not such a literal is kept whole as a single reference.

    Returns:
        list[str]: the reference captions; empty for a missing (non-string) cell.
    """
    if isinstance(raw, (list, tuple)):
        return [str(caption) for caption in raw]
    if not isinstance(raw, str):
        return []  # missing value such as NaN: no references available
    try:
        parsed = ast.literal_eval(raw.strip())
    except (ValueError, SyntaxError):
        return [raw]  # ordinary sentence, not a Python literal
    if isinstance(parsed, (list, tuple)):
        return [str(caption) for caption in parsed]
    if isinstance(parsed, str):
        return [parsed]
    return [raw]


# Reload the saved predictions and restore the reference captions to lists so
# that later cells can iterate over whole captions.
clip_df = pd.read_csv(clip_predictions_file_name)
clip_df["Reference_Captions"] = clip_df["Reference_Captions"].apply(parse_reference_captions)
clip_df.head()

In [None]:
# Calculating BLEU scores: the mean sentence-level BLEU-1..4 over all rows of
# clip_df, reported as percentages.

# Cumulative n-gram weights. Each tuple sums to 1; BLEU-3 uses exact thirds
# (weights of 0.3 sum to 0.9 and inflate the score).
bleu_weights = {
    1: (1.0, 0, 0, 0),
    2: (0.5, 0.5, 0, 0),
    3: (1 / 3, 1 / 3, 1 / 3, 0),
    4: (0.25, 0.25, 0.25, 0.25),
}
bleu_totals = {order: 0.0 for order in bleu_weights}

# Not applied to the sums any more: each score is at most 1, so reducing the
# running totals modulo 1e9 + 7 never changed them. The name stays defined in
# case another cell still reads it.
mod = 1e9 + 7
# Average over the rows actually scored (at least 1 to avoid dividing by zero).
length = max(len(clip_df), 1)

for index, row in clip_df.iterrows():
    # Each Reference_Captions value must be an iterable of caption strings;
    # a plain string here would be iterated one character at a time.
    reference_captions = row["Reference_Captions"]
    reference_captions_list = [element.split() for element in reference_captions]
    candidate_caption = row["Chosen_Caption"]
    candidate_caption_list = candidate_caption.split()
    for order, weights in bleu_weights.items():
        bleu_totals[order] += sentence_bleu(
            reference_captions_list, candidate_caption_list, weights=weights
        )

bleu1, bleu2, bleu3, bleu4 = (bleu_totals[order] for order in (1, 2, 3, 4))

for order in (1, 2, 3, 4):
    print(f"BLEU-{order} score: {(bleu_totals[order]/length) * 100}")

In [None]:
# Calculating the METEOR metric: the mean per-row score over clip_df,
# reported as a percentage.

meteor = 0
# Average over the rows actually scored (at least 1 to avoid dividing by zero).
length = max(len(clip_df), 1)
# Not applied to the sum any more: each score is at most 1, so reducing the
# running total modulo 1e9 + 7 never changed it. The name stays defined in
# case another cell still reads it.
mod = 1e9 + 7

for index, row in clip_df.iterrows():
    # Each Reference_Captions value must be an iterable of caption strings;
    # a plain string here would be iterated one character at a time.
    reference_captions = row["Reference_Captions"]
    reference_captions_list = [element.split() for element in reference_captions]
    candidate_caption = row["Chosen_Caption"]
    candidate_caption_list = candidate_caption.split()
    # References and hypothesis are both passed as lists of tokens.
    meteor += meteor_score(reference_captions_list, candidate_caption_list)

print(f"METEOR score: {(meteor/length) * 100}")


In [None]:
# Calculating the ROUGE metric: for every row, take the best precision, recall
# and f-measure over its references (each component maximised independently,
# so the three maxima may come from different references), then average over
# all rows and report percentages.

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

rouge_types = ['rouge1', 'rouge2', 'rougeL']

# obj[rouge_type] holds the running sums [precision, recall, f-measure].
obj = {rouge_type: [0, 0, 0] for rouge_type in rouge_types}

for index, row in clip_df.iterrows():
    # Each Reference_Captions value must be an iterable of caption strings;
    # a plain string here would be iterated one character at a time.
    reference_captions = row["Reference_Captions"]
    candidate_caption = row["Chosen_Caption"]

    # Best [precision, recall, f-measure] seen for this row, per ROUGE type.
    row_best = {rouge_type: [0, 0, 0] for rouge_type in rouge_types}
    for ref in reference_captions:
        temp_scores = scorer.score(ref, candidate_caption)
        for rouge_type in rouge_types:
            score = temp_scores[rouge_type]
            components = (score.precision, score.recall, score.fmeasure)
            row_best[rouge_type] = [
                max(best, value)
                for best, value in zip(row_best[rouge_type], components)
            ]

    for rouge_type in rouge_types:
        obj[rouge_type] = [
            total + best
            for total, best in zip(obj[rouge_type], row_best[rouge_type])
        ]


# Average over the rows actually scored (at least 1 to avoid dividing by zero).
size = max(len(clip_df), 1)
for label, rouge_type in (('Rouge1', 'rouge1'), ('Rouge2', 'rouge2'), ('RougeL', 'rougeL')):
    precision, recall, f_measure = (total * 100 / size for total in obj[rouge_type])
    print(f'{label}: Precision = {precision}, Recall = {recall}, f_measure = {f_measure}')