# Evaluation

In [1]:
# import libraries

import json
import time

from PIL import Image
from IPython.display import display
from transformers import BlipProcessor, BlipForConditionalGeneration

In [2]:
# data folder
# Root data folder, given as a path relative to the notebook's working directory.
# Later cells read `results/<index>.json` (scores) and `images/<index>.jpg`
# (input images) from underneath it.

data_folder = '../../data/social_media'

In [3]:
# grid of sampling parameters: one record per (temperature, top_p) pair,
# each starting with a total score of zero

temperature_values = [0.3, 0.7, 1.0, 1.5]
top_p_values = [0.3, 0.6, 0.9]

# temperature varies slowest, top_p fastest (same order as a nested loop)
combinations = [
    {'temperature': temp, 'top_p': nucleus, 'total_score': 0}
    for temp in temperature_values
    for nucleus in top_p_values
]

In [4]:
# sum scores for each combination

num_images = 20

# Start from zero on every run: the totals live inside `combinations`, so without
# this reset re-executing the cell would add the same scores a second time.
for combination in combinations:
    combination['total_score'] = 0

for index in range(num_images):
    with open(f"{data_folder}/results/{index}.json", 'r') as file:
        results = json.load(file)
    # Scores are attributed to combinations purely by position, so every results
    # file must hold exactly one entry per combination. Fail loudly on a mismatch
    # instead of silently crediting scores to the wrong parameters.
    if len(results) != len(combinations):
        raise ValueError(
            f"{data_folder}/results/{index}.json has {len(results)} entries, "
            f"expected {len(combinations)}"
        )
    for combination, item in zip(combinations, results):
        combination['total_score'] += item['score']

# Highest total first; sorted() is stable, so tied combinations keep grid order.
sorted_combinations = sorted(combinations, key=lambda x: x['total_score'], reverse=True)

best_combination = sorted_combinations[0]
print('Best values for parameters:')
print(f"- temperature = {best_combination['temperature']}")
print(f"- top_p = {best_combination['top_p']}")

sorted_combinations

Best values for parameters:
- temperature = 0.3
- top_p = 0.9


[{'temperature': 0.3, 'top_p': 0.9, 'total_score': 30},
 {'temperature': 0.3, 'top_p': 0.6, 'total_score': 29},
 {'temperature': 0.7, 'top_p': 0.3, 'total_score': 29},
 {'temperature': 0.3, 'top_p': 0.3, 'total_score': 28},
 {'temperature': 1.5, 'top_p': 0.3, 'total_score': 28},
 {'temperature': 1.0, 'top_p': 0.3, 'total_score': 25},
 {'temperature': 0.7, 'top_p': 0.6, 'total_score': 24},
 {'temperature': 1.0, 'top_p': 0.6, 'total_score': 22},
 {'temperature': 1.5, 'top_p': 0.6, 'total_score': 21},
 {'temperature': 0.7, 'top_p': 0.9, 'total_score': 20},
 {'temperature': 1.0, 'top_p': 0.9, 'total_score': 17},
 {'temperature': 1.5, 'top_p': 0.9, 'total_score': 13}]

In [5]:
# load BLIP model

# the processor and the captioning model come from the same pretrained checkpoint
blip_checkpoint = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(blip_checkpoint)
model = BlipForConditionalGeneration.from_pretrained(blip_checkpoint)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [12]:
def generate_caption(
    image: Image.Image,
    temperature: float = 0.3,
    top_p: float = 0.9,
    prompt: str = 'a photograph of',
    max_new_tokens: int = 50,
) -> str:
    '''
    Generates a caption for the image using the BLIP model

    The caption is sampled (do_sample=True), so repeated calls on the same
    image may return different text.

    Parameters:
        image: PIL image to describe.
        temperature: sampling temperature passed to `model.generate`.
        top_p: nucleus-sampling threshold passed to `model.generate`.
        prompt: text passed to the processor together with the image; in this
            notebook's printed captions the returned text starts with it.
        max_new_tokens: upper bound on the number of generated tokens.

    The default temperature and top_p are the pair reported as best by the
    score aggregation earlier in this notebook.

    Returns:
        The decoded caption with special tokens removed.
    '''
    # module-level `processor` and `model` are the BLIP objects loaded earlier
    inputs = processor(image, prompt, return_tensors="pt")
    outputs = model.generate(
        **inputs,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        max_new_tokens=max_new_tokens,
        repetition_penalty=1.2,
    )
    # only the first returned sequence is decoded
    caption = processor.decode(outputs[0], skip_special_tokens=True)
    return caption

In [13]:
# measure image captioning times

times = []
captions = []

for index in range(num_images):
    # Open and convert outside the timed region so only captioning is measured;
    # the context manager releases the file handle as soon as the RGB copy exists.
    with Image.open(f"{data_folder}/images/{index}.jpg") as image_file:
        image = image_file.convert('RGB')
    # perf_counter() is a monotonic, high-resolution clock meant for measuring
    # intervals; time.time() follows the wall clock and can jump if it is adjusted.
    start_time = time.perf_counter()
    caption = generate_caption(image)
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    times.append(elapsed_time)
    captions.append(caption)

In [14]:
# captions and respective times

# iterate over what was actually measured rather than re-deriving indices
for elapsed, caption in zip(times, captions):
    print(f"[{round(elapsed, 1)} s] {caption}")

# Derive the count from the collected timings so the summary stays correct if
# the number of images changes; guard the average against an empty run.
num_captions = len(times)
total = sum(times)
average = total / num_captions if num_captions else 0.0
print(f"\nTotal time for {num_captions} captions: {round(total, 1)} s")
print(f"Average time per caption: {round(average, 1)} s")

[5.1 s] a photograph of a chocolate cake with two slices missing and a slice cut out
[4.4 s] a photograph of a man sitting on top of a chair in a gym
[3.8 s] a photograph of two people standing on a dock near the ocean
[3.8 s] a photograph of a woman standing on a railing near a river
[4.1 s] a photograph of a woman in black jacket and goggles on skis
[4.0 s] a photograph of a bowl of mushroom soup with bread and parsley
[3.7 s] a photograph of two women doing yoga in a gym room
[3.5 s] a photograph of two children playing in a play tunnel
[4.0 s] a photograph of a man is sitting in the cockpit of a helicopter
[3.8 s] a photograph of two people laying on the grass near each other
[4.1 s] a photograph of two dogs are sitting on the couch looking out the window
[4.0 s] a photograph of a pizza in a box with pepperoni on it
[4.6 s] a photograph of a woman standing on top of a building with a city in the background
[3.7 s] a photograph of a blue background with a quote on it
[3.7 s] a photo