In [1]:
import sys
import time
from PIL import Image
from transformers import BlipProcessor, BlipForQuestionAnswering

# get model and processor
processor = BlipProcessor.from_pretrained("Salesforce/blip-vqa-base")
model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")

# raw_image = Image.open("Scene images/Hospital/hospital1.png").convert('RGB')
# question = "Is the person in the image wearing a safety goggle?"
# # preprocess input data
# inputs = processor(raw_image, question, return_tensors="pt")

# start = time.perf_counter()
# # perform generation
# out = model.generate(**inputs)
# end = time.perf_counter() - start

# # postprocess result
# answer = processor.decode(out[0], skip_special_tokens=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
images = []

for i in range(1, 51):
    file_name = f"Scene images/Hospital/hospital{i}.png"
    raw_image = Image.open(file_name).convert('RGB')
    images.append(raw_image)

In [4]:
question1 = "Is the person in the image wearing a safety goggle?"
question2 = "Is the person in the image wearing gloves?"
question3 = "Is the person in the image wearing shoe covers?"
question4 = "Is the person in the image wearing a protective suit?"
question5 = "Is the person in the image wearing a mask?"

In [2]:
image_paths = [f"Scene images/Hospital/hospital{i}.png" for i in range(1, 51)]
questions = [
    "Is the person in the image wearing a safety goggle?",
    "Is the person in the image wearing gloves?",
    "Is the person in the image wearing shoe covers?",
    "Is the person in the image wearing a protective suit?",
    "Is the person in the image wearing a mask?"
]

results = []

for image_path in image_paths:
    raw_image = Image.open(image_path).convert('RGB')
    for question in questions:
        inputs = processor(raw_image, question, return_tensors="pt")

        start = time.perf_counter()
        out = model.generate(**inputs)
        end = time.perf_counter() - start

        answer = processor.decode(out[0], skip_special_tokens=True)
        results.append({
            "image": image_path,
            "question": question,
            "answer": answer,
            # "time_taken": end
        })

output_file = "results_BLIP1.txt"
with open(output_file, "w") as f:
    for result in results:
        f.write(f"Image: {result['image']}\n")
        f.write(f"Question: {result['question']}\n")
        f.write(f"Answer: {result['answer']}\n")
        #f.write(f"Time taken: {result['time_taken']:.4f} seconds\n")
        f.write("\n")

print(f"Results saved to {output_file}")




Results saved to results_BLIP1.txt


# Evaluation

In [36]:
def read_results(file_path):
    results = []
    with open(file_path, "r") as file:
        lines = file.readlines()
        for i in range(0, len(lines), 4):  
            try:
                image = lines[i].strip().split(": ")[1]
                question = lines[i+1].strip().split(": ")[1]
                answer = lines[i+2].strip().split(": ")[1]
                results.append((image, question, answer))
            except IndexError:
                print(f"Warning: Skipping malformed record at lines {i}-{i+3}")
                continue
    return results

ground_truth = read_results("ground truth.txt")
model_output = read_results("results.txt")

correct_count = 0
total_count = min(len(ground_truth), len(model_output))

for gt, mo in zip(ground_truth, model_output):
    if gt == mo:
        correct_count += 1

accuracy = correct_count / total_count

print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 76.40%
