In [1]:
import sys
import time
from PIL import Image
from transformers import AutoProcessor, LlavaForConditionalGeneration

# get model and processor
model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:06<00:00,  2.05s/it]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
from transformers import AutoProcessor, LlavaForConditionalGeneration, BitsAndBytesConfig
import torch
from PIL import Image
from accelerate import init_empty_weights, infer_auto_device_map
from transformers import AutoModelForSeq2SeqLM

# Check if CUDA is available
device = "cuda" if torch.cuda.is_available() else "cpu"

# Set up 8-bit quantization configuration using BitsAndBytesConfig
quantization_config = BitsAndBytesConfig(
    load_in_8bit=True,   # You can change this to load_in_4bit=True for 4-bit quantization
    bnb_8bit_use_double_quant=True,
    bnb_8bit_quant_type="nf4",
    bnb_8bit_min_version=False,
)

# Initialize model with empty weights, which allows accelerated loading later
with init_empty_weights():
    model = LlavaForConditionalGeneration.from_pretrained(
        "llava-hf/llava-1.5-7b-hf", 
        quantization_config=quantization_config
    )

# Tie the weights to ensure consistency before using `accelerate`
model.tie_weights()

# Automatically determine which parts of the model go to which device (CPU or GPU)
device_map = infer_auto_device_map(model, max_memory={0: "20GiB", "cpu": "30GiB"})

# Load model with device mapping and quantization
model = LlavaForConditionalGeneration.from_pretrained(
    "llava-hf/llava-1.5-7b-hf", 
    device_map=device_map,
    quantization_config=quantization_config
)

processor = AutoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")

raw_image = Image.open("C:/Users/zhc23013/Desktop/Safety Detection Benchmark/Scene images/Hospital/hospital24.png")

In [None]:
prompt = "USER: <image> What are the functions of the mask worn by the person in the picture supposed to have? Answer:"

# Process inputs
inputs = processor(text=prompt, images=raw_image, return_tensors="pt").to(device)

# Extract only the required fields for generation
input_ids = inputs['input_ids'].to(device)
attention_mask = inputs['attention_mask'].to(device)

# Generate the output
outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_new_tokens=30)

# Decode and print the output
generated_text = processor.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(generated_text)

In [38]:
raw_image = Image.open("Scene images/Hospital/hospital2.png")
prompt = "USER: <image>\nIs the person in the image wearing a safety goggle? Answer:"

inputs = processor(text=prompt, images=raw_image, return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=15)

generated_text = processor.batch_decode(outputs, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(generated_text)

USER:  
Is the person in the image wearing a safety goggle? Answer: No


In [36]:
if "yes" in generated_text.lower():
    answer = "yes"
elif "no" in generated_text.lower():
    answer = "no"
else:
    answer = "unknown"

print(answer)

yes


In [3]:
# images = []

# for i in range(1, 51):
#     file_name = f"Scene images/Hospital/hospital{i}.png"
#     raw_image = Image.open(file_name).convert('RGB')
#     images.append(raw_image)

In [4]:
# question1 = "Is the person in the image wearing a safety goggle? Answer:"
# question2 = "Is the person in the image wearing gloves? Answer:"
# question3 = "Is the person in the image wearing shoe covers? Answer:"
# question4 = "Is the person in the image wearing a protective suit? Answer:"
# question5 = "Is the person in the image wearing a mask? Answer:"

In [2]:
image_paths = [f"Scene images/Hospital/hospital{i}.png" for i in range(1, 51)]
questions = [
    "USER: <image>\nIs the person in the image wearing a safety goggle? Answer:",
    "USER: <image>\nIs the person in the image wearing gloves? Answer:",
    "USER: <image>\nIs the person in the image wearing shoe covers? Answer:",
    "USER: <image>\nIs the person in the image wearing a protective suit? Answer:",
    "USER: <image>\nIs the person in the image wearing a mask? Answer:"
]

results = []

for image_path in image_paths:
    raw_image = Image.open(image_path)
    for question in questions:
        inputs = processor(text=question, images=raw_image, return_tensors="pt")

        #start = time.perf_counter()
        out = model.generate(**inputs, max_new_tokens=15)
        #end = time.perf_counter() - start

        generated_text = processor.batch_decode(out, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]

        if "yes" in generated_text.lower():
            answer = "yes"
        elif "no" in generated_text.lower():
            answer = "no"
        else:
            answer = "unknown"

        results.append({
            "image": image_path,
            "question": question,
            "answer": answer,
            #"time_taken": end
        })

output_file = "results_llava1.5.txt"
with open(output_file, "w") as f:
    for result in results:
        f.write(f"Image: {result['image']}\n")
        f.write(f"Question: {result['question']}\n")
        f.write(f"Answer: {result['answer']}\n")
        #f.write(f"Time taken: {result['time_taken']:.4f} seconds\n")
        f.write("\n")

print(f"Results saved to {output_file}")


Results saved to results_llava1.5.txt


# Evaluation

In [47]:
def read_results(file_path):
    results = []
    with open(file_path, "r") as file:
        lines = file.readlines()
        for i in range(0, len(lines), 4):  
            try:
                #image = lines[i].strip().split(": ")[1]
                #question = lines[i+1].strip().split(": ")[1]
                answer = lines[i+2].strip().split(": ")[1]
                results.append((answer))
            except IndexError:
                print(f"Warning: Skipping malformed record at lines {i}-{i+3}")
                continue
    return results

ground_truth = read_results("ground truth.txt")
model_output = read_results("results_llava1.5.txt")

correct_count = 0
total_count = min(len(ground_truth), len(model_output))

for gt, mo in zip(ground_truth, model_output):
    if gt == mo:
        correct_count += 1

accuracy = correct_count / total_count

print(f"Accuracy: {accuracy * 100:.2f}%")


Accuracy: 74.40%
