# **Setup**

In [1]:
!pip install -q torch transformers pillow bitsandbytes accelerate open_clip_torch matplotlib pandas seaborn tqdm

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m62.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import os
from PIL import Image
import matplotlib.pyplot as plt

data_dir = "chest_xray_samples"
normal_dir = os.path.join(data_dir, "normal")
pneumonia_dir = os.path.join(data_dir, "pneumonia")

IMAGE_EXTENSIONS = ('.jpeg', '.jpg', '.png')
IMAGES_PER_CONDITION = 5


def load_condition_images(directory, condition, limit=IMAGES_PER_CONDITION):
    """Load up to `limit` images from `directory`, keyed as '<condition>_<n>'.

    Files are taken in sorted filename order and must end in one of
    IMAGE_EXTENSIONS (case-sensitive). Each entry holds the file path, the
    image converted to RGB, and the condition label.

    Raises:
        FileNotFoundError: if `directory` does not exist.
    """
    if not os.path.isdir(directory):
        raise FileNotFoundError(
            f"image directory not found: {directory!r} (cwd: {os.getcwd()!r}); "
            f"place the {condition} chest X-ray samples there before running this cell"
        )

    filenames = sorted(f for f in os.listdir(directory) if f.endswith(IMAGE_EXTENSIONS))[:limit]
    loaded = {}
    for index, filename in enumerate(filenames, start=1):
        path = os.path.join(directory, filename)
        # convert() returns an independent in-memory copy, so the file handle
        # can be released as soon as the conversion is done.
        with Image.open(path) as handle:
            rgb_image = handle.convert("RGB")
        loaded[f"{condition}_{index}"] = {"path": path, "image": rgb_image, "condition": condition}
    return loaded


images = {}
images.update(load_condition_images(normal_dir, "normal"))
images.update(load_condition_images(pneumonia_dir, "pneumonia"))

print(f"loaded {len(images)} images")

# Preview grid: normal images on the top row, pneumonia on the bottom row.
fig, axes = plt.subplots(2, IMAGES_PER_CONDITION, figsize=(15, 6))
for ax in axes.flat:
    # Hide every panel's axes up front so slots without an image stay blank.
    ax.axis('off')
for name, data in images.items():
    row = 0 if data["condition"] == "normal" else 1
    col = int(name.split("_")[1]) - 1
    # The images are RGB, for which imshow ignores any colormap.
    axes[row, col].imshow(data["image"])
    axes[row, col].set_title(name)
plt.tight_layout()
plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'chest_xray_samples/normal'

# **Prompts**

In [None]:
def get_prompts(condition):
    """Return the yes/no probe questions for an image of the given condition.

    Args:
        condition: "normal" or "pneumonia".

    Returns:
        A list of (prompt, expected_answer_is_yes, prompt_type) tuples, where
        prompt_type is one of "affirmative", "negated" or "left_right".

    Raises:
        ValueError: if `condition` is not one of the two supported labels, so
            a mistyped label is not silently scored with the pneumonia answers.
    """
    if condition == "normal":
        return [
            ("Is there pneumonia in this X-ray?", False, "affirmative"),
            ("Is there a pleural effusion in this X-ray?", False, "affirmative"),
            ("Is there no pneumonia in this X-ray?", True, "negated"),
            ("Is there no pleural effusion in this X-ray?", True, "negated"),
            ("Is there right-sided pneumonia?", False, "left_right"),
            ("Is there left-sided pneumonia?", False, "left_right"),
        ]
    if condition == "pneumonia":
        return [
            ("Is there pneumonia in this X-ray?", True, "affirmative"),
            ("Is there consolidation in this X-ray?", True, "affirmative"),
            ("Is there no pneumonia in this X-ray?", False, "negated"),
            ("Is there no lung abnormality in this X-ray?", False, "negated"),
            # NOTE(review): both laterality questions are marked True for every
            # pneumonia image; no per-image side label is visible in this
            # notebook, so confirm these ground truths before trusting
            # left_right accuracy.
            ("Is there right-sided pneumonia?", True, "left_right"),
            ("Is there left-sided pneumonia?", True, "left_right"),
        ]
    raise ValueError(f"unknown condition: {condition!r} (expected 'normal' or 'pneumonia')")

# Expand every (image, prompt) pair into one flat list of test-case records.
all_test_cases = [
    {
        "image_name": img_name,
        "prompt": question,
        "ground_truth": answer,
        "prompt_type": kind,
        "condition": entry["condition"],
    }
    for img_name, entry in images.items()
    for question, answer, kind in get_prompts(entry["condition"])
]

print(f"total: {len(all_test_cases)} test cases")
# One count line per prompt family, in a fixed order.
for kind in ("affirmative", "negated", "left_right"):
    count = sum(1 for case in all_test_cases if case["prompt_type"] == kind)
    print(f"{kind}: {count}")

# **Loading LLaVA (llava-1.5-7b-hf)**

In [None]:
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaForConditionalGeneration
import torch

model_path = "llava-hf/llava-1.5-7b-hf"

# 4-bit quantization is requested through an explicit BitsAndBytesConfig;
# passing load_in_4bit=True straight to from_pretrained is deprecated.
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

model = LlavaForConditionalGeneration.from_pretrained(
    model_path,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config=quantization_config,
)
processor = AutoProcessor.from_pretrained(model_path)
print(f"loaded {model_path}")

# **Inference**

In [None]:
import re

# An explicit "yes"/"no" as the very first word of the reply.
_LEADING_ANSWER_RE = re.compile(r"^\W*(yes|no)\b")
# Whole-word negation cues. Word boundaries matter: a plain substring test for
# "no" also fires inside "normal", "abnormality", "not", "diagnosis", ...
_NEGATION_RE = re.compile(r"\b(?:no|not)\b")
# Whole-word / whole-phrase affirmative cues.
_AFFIRMATION_RE = re.compile(r"\b(?:yes|there is|present|visible|shows|appears)\b")


def _parse_yes_no(response):
    """Heuristically classify a free-text reply as yes (True) or no (False).

    Priority: (1) a leading "yes"/"no" decides; (2) otherwise any whole-word
    negation decides "no", so "there is no ..." and "not present" are not
    mistaken for the affirmative phrases they contain; (3) otherwise an
    affirmative cue decides "yes". Replies with no cue at all count as "no".
    """
    text = response.lower()
    leading = _LEADING_ANSWER_RE.match(text)
    if leading:
        return leading.group(1) == "yes"
    if _NEGATION_RE.search(text):
        return False
    return _AFFIRMATION_RE.search(text) is not None


def run_inference(model, processor, image, prompt, max_new_tokens=128):
    """Ask the vision-language model one question about one image.

    Args:
        model: generation model exposing `generate`, `compute_transition_scores`
            and `device`.
        processor: matching processor exposing `apply_chat_template`,
            `__call__` and `batch_decode`.
        image: the image to ask about.
        prompt: the question text.
        max_new_tokens: generation length cap.

    Returns:
        (response, mean_logprob, says_yes): the decoded reply, the mean
        log-probability of the generated tokens, and the heuristic yes/no
        reading of the reply.
    """
    conversation = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
    text_prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=image, text=text_prompt, return_tensors="pt").to(model.device)

    with torch.inference_mode():
        outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, do_sample=False, output_scores=True, return_dict_in_generate=True)

    # Keep only the newly generated tokens (drop the echoed prompt).
    generated_ids = outputs.sequences[:, inputs.input_ids.shape[1]:]
    response = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

    transition_scores = model.compute_transition_scores(outputs.sequences, outputs.scores, normalize_logits=True)
    mean_logprob = transition_scores[0].mean().item()

    says_yes = _parse_yes_no(response)

    return response, mean_logprob, says_yes

In [None]:
import pandas as pd
from tqdm import tqdm

# Run every test case through LLaVA and collect one record per case.
results = []

for case in tqdm(all_test_cases):
    xray = images[case["image_name"]]["image"]
    answer_text, avg_logprob, answered_yes = run_inference(model, processor, xray, case["prompt"])

    record = {
        "image": case["image_name"],
        **{field: case[field] for field in ("prompt", "prompt_type", "ground_truth")},
        "response": answer_text[:150],  # keep the table readable
        "model_says_yes": answered_yes,
        "is_correct": answered_yes == case["ground_truth"],
        "mean_logprob": avg_logprob,
    }
    results.append(record)

df = pd.DataFrame(results)
print(f"completed {len(df)} test cases")
df.head(10)

# **Results Analysis**

In [None]:
import seaborn as sns

rule = "=" * 60
print(rule)
print("HALLUCINATION ANALYSIS")
print(rule)

overall_pct = df["is_correct"].mean() * 100
print(f"\noverall accuracy: {overall_pct:.1f}%")

# Accuracy per prompt family.
print("\nby prompt type:")
for ptype in ["affirmative", "negated", "left_right"]:
    subset = df.loc[df["prompt_type"] == ptype]
    acc = subset["is_correct"].mean() * 100
    print(f"  {ptype}: {acc:.1f}%")

# "Hallucination" here = answering yes to a negated prompt whose truth is no.
negated = df.loc[df["prompt_type"] == "negated"]
false_negated = negated.loc[negated["ground_truth"] == False]
if len(false_negated) > 0:
    halluc_rate = false_negated["model_says_yes"].mean() * 100
else:
    halluc_rate = 0
print(f"\nhallucination rate (yes to negated): {halluc_rate:.1f}%")

# Mean token log-probability, split by whether the answer was scored correct.
print("\nlog-probability:")
correct = df.loc[df["is_correct"] == True, "mean_logprob"]
incorrect = df.loc[df["is_correct"] == False, "mean_logprob"]
print(f"  correct: {correct.mean():.4f}")
print(f"  incorrect: {incorrect.mean():.4f}")

In [None]:
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
ax_accuracy, ax_logprob, ax_negated = axes

# Panel 1: accuracy per prompt type.
acc_by_type = df.groupby("prompt_type")["is_correct"].mean() * 100
acc_by_type.plot(kind="bar", ax=ax_accuracy, color=["#2ecc71", "#e74c3c", "#3498db"])
ax_accuracy.set(title="accuracy by prompt type", ylabel="accuracy (%)", ylim=(0, 100))
ax_accuracy.tick_params(axis='x', rotation=45)

# Panel 2: distribution of mean log-probability for correct vs incorrect answers.
correctness_labels = {True: "correct", False: "incorrect"}
df["correctness"] = df["is_correct"].map(correctness_labels)
sns.boxplot(data=df, x="correctness", y="mean_logprob", ax=ax_logprob, palette={"correct": "#2ecc71", "incorrect": "#e74c3c"})
ax_logprob.set_title("log-prob by correctness")

# Panel 3: how often the model says "yes" on negated prompts, split by the expected answer.
negated_df = df[df["prompt_type"] == "negated"]
by_truth = negated_df.groupby("ground_truth")["model_says_yes"].mean() * 100
yes_rates = [by_truth.get(False, 0), by_truth.get(True, 0)]
ax_negated.bar(["expected: no", "expected: yes"], yes_rates, color=["#e74c3c", "#2ecc71"])
ax_negated.set(title="'yes' rate on negated prompts", ylabel="% saying yes", ylim=(0, 100))

plt.tight_layout()
plt.show()

# **BiomedCLIP Similarity**

In [None]:
import open_clip
import torch.nn.functional as F

# Single source of truth for the checkpoint used by both the model and its tokenizer.
BIOMEDCLIP_ID = 'hf-hub:microsoft/BiomedCLIP-PubMedBERT_256-vit_base_patch16_224'

clip_model, _, preprocess = open_clip.create_model_and_transforms(BIOMEDCLIP_ID)
tokenizer = open_clip.get_tokenizer(BIOMEDCLIP_ID)

# Prefer the GPU when one is available.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

clip_model = clip_model.to(device).eval()
print(f"loaded biomedclip on {device}")

In [None]:
def clip_similarity(image, text):
    """Return the cosine similarity between one image and one text string.

    Both inputs are encoded with the loaded CLIP model, L2-normalised along
    the last dimension, and compared with a dot product.
    """
    pixel_batch = preprocess(image).unsqueeze(0).to(device)
    token_batch = tokenizer([text]).to(device)

    with torch.no_grad():
        image_embedding = F.normalize(clip_model.encode_image(pixel_batch), dim=-1)
        text_embedding = F.normalize(clip_model.encode_text(token_batch), dim=-1)
        similarity = image_embedding @ text_embedding.T
        return similarity.item()

# Score every (image, prompt) pair with BiomedCLIP.
clip_results = []
for tc in tqdm(all_test_cases):
    img = images[tc["image_name"]]["image"]
    sim = clip_similarity(img, tc["prompt"])
    clip_results.append({"image": tc["image_name"], "prompt": tc["prompt"], "prompt_type": tc["prompt_type"], "ground_truth": tc["ground_truth"], "clip_similarity": sim})

df_clip = pd.DataFrame(clip_results)

# Drop any clip_similarity column left over from a previous run of this cell so
# a re-run replaces it instead of producing clip_similarity_x / clip_similarity_y.
# validate="many_to_one" fails loudly if (image, prompt) is not unique in
# df_clip, which would otherwise silently duplicate rows of df.
df = df.drop(columns=["clip_similarity"], errors="ignore").merge(
    df_clip[["image", "prompt", "clip_similarity"]],
    on=["image", "prompt"],
    how="left",
    validate="many_to_one",
)

In [None]:
# Mean image-text similarity for prompts whose expected answer is yes vs no.
mean_sim_true = df.loc[df["ground_truth"] == True, "clip_similarity"].mean()
mean_sim_false = df.loc[df["ground_truth"] == False, "clip_similarity"].mean()

print("\nbiomedclip similarity:")
print(f"  true statements: {mean_sim_true:.4f}")
print(f"  false statements: {mean_sim_false:.4f}")

fig, ax = plt.subplots(1, 1, figsize=(6, 4))
truth_names = {True: "true", False: "false"}
df["truth_label"] = df["ground_truth"].map(truth_names)
sns.boxplot(data=df, x="truth_label", y="clip_similarity", ax=ax, palette={"true": "#2ecc71", "false": "#e74c3c"})
ax.set_title("biomedclip similarity by ground truth")
plt.show()

In [None]:
PROMPT_TYPES = ["affirmative", "negated", "left_right"]
RULE = "=" * 70


def _mean_or_na(series):
    """Format the mean of `series` to 4 decimals, or 'n/a' when it is empty."""
    return f"{series.mean():.4f}" if len(series) > 0 else "n/a"


# --- accuracy and yes-rate on false statements, per prompt type ---
print(RULE)
print("HALLUCINATION SUMMARY")
print(RULE)

halluc_data = []
for ptype in PROMPT_TYPES:
    subset = df[df["prompt_type"] == ptype]
    false_stmt = subset[subset["ground_truth"] == False]
    if len(false_stmt) > 0:
        halluc = false_stmt["model_says_yes"].mean() * 100
    else:
        halluc = 0
    accuracy_pct = subset["is_correct"].mean() * 100
    halluc_data.append({
        "prompt_type": ptype,
        "total": len(subset),
        "accuracy": f"{accuracy_pct:.1f}%",
        "halluc_rate": f"{halluc:.1f}%",
    })
print(pd.DataFrame(halluc_data).to_string(index=False))

# --- mean log-probability of correct vs incorrect answers, per prompt type ---
print(f"\n{RULE}")
print("LOG-PROBABILITY DIFFERENCES")
print(RULE)

logprob_data = []
for ptype in PROMPT_TYPES:
    subset = df[df["prompt_type"] == ptype]
    corr = subset[subset["is_correct"] == True]["mean_logprob"]
    incorr = subset[subset["is_correct"] == False]["mean_logprob"]
    logprob_data.append({
        "prompt_type": ptype,
        "logprob_correct": _mean_or_na(corr),
        "logprob_incorrect": _mean_or_na(incorr),
    })
print(pd.DataFrame(logprob_data).to_string(index=False))

# --- BiomedCLIP similarity for true vs false statements, per prompt type ---
print(f"\n{RULE}")
print("BIOMEDCLIP SIMILARITY")
print(RULE)

clip_data = []
for ptype in PROMPT_TYPES:
    subset = df[df["prompt_type"] == ptype]
    true_s = subset[subset["ground_truth"] == True]["clip_similarity"]
    false_s = subset[subset["ground_truth"] == False]["clip_similarity"]
    clip_data.append({
        "prompt_type": ptype,
        "sim_true": _mean_or_na(true_s),
        "sim_false": _mean_or_na(false_s),
    })
print(pd.DataFrame(clip_data).to_string(index=False))

df.to_csv("hallucination_results.csv", index=False)
print("\nsaved to hallucination_results.csv")

# **9. MiniGPT-Med**

In [None]:
!pip install -q psutil==5.9.4 regex==2022.10.31 tqdm==4.64.1 timm==0.6.13 webdataset==0.2.48 omegaconf==2.3.0 opencv-python==4.7.0.72 decord==0.6.0 peft==0.2.0 sentence-transformers gradio==3.47.1 accelerate==0.20.3 scikit-image visual-genome wandb

In [None]:
import os
if not os.path.exists("MiniGPT-Med"):
    !git clone https://github.com/Vision-CAIR/MiniGPT-Med.git
%cd MiniGPT-Med

In [None]:
import sys
sys.path.insert(0, ".")

from minigpt4.common.config import Config
from minigpt4.common.registry import registry
from minigpt4.conversation.conversation import Chat, CONV_VISION_minigptv2

import argparse

# config paths - adjust these for your setup
cfg_path = "eval_configs/minigptv2_eval.yaml"
model_ckpt = "/kaggle/input/minigpt-med/pytorch/default/1/miniGPT_Med.pth"  # adjust path
llama_path = "Llama-2-7b-chat-hf"  # adjust path

# update config (rewrites the eval YAML in place with the paths above)
import yaml
with open(cfg_path, 'r') as f:
    config = yaml.safe_load(f)
config['model']['ckpt'] = model_ckpt
# llama_path was previously defined but never applied, so the base LLM location
# fell back to whatever the repository's default config contained.
# NOTE(review): assumes the model config reads the key `llama_model` — confirm
# against the MiniGPT-Med model YAML.
config['model']['llama_model'] = llama_path
with open(cfg_path, 'w') as f:
    yaml.dump(config, f)

args = argparse.Namespace(cfg_path=cfg_path, gpu_id=0, options=[])
cfg = Config(args)

model_config = cfg.model_cfg
model_config.device_8bit = 0
model_cls = registry.get_model_class(model_config.arch)
minigpt_model = model_cls.from_config(model_config).to('cuda:0')
# Evaluation only: switch off training-time behaviour such as dropout.
minigpt_model.eval()

vis_processor_cfg = cfg.datasets_cfg.cc_sbu_align.vis_processor.train
vis_processor = registry.get_processor_class(vis_processor_cfg.name).from_config(vis_processor_cfg)

chat = Chat(minigpt_model, vis_processor, device='cuda:0')
print("loaded minigpt-med")

In [None]:
import re

# An explicit "yes"/"no" as the very first word of the reply.
_MINIGPT_LEADING_ANSWER_RE = re.compile(r"^\W*(yes|no)\b")
# Whole-word negation cues. Word boundaries matter: a plain substring test for
# "no" also fires inside "normal", "abnormality", "not", "diagnosis", ...
_MINIGPT_NEGATION_RE = re.compile(r"\b(?:no|not)\b")
# Whole-word / whole-phrase affirmative cues.
_MINIGPT_AFFIRMATION_RE = re.compile(r"\b(?:yes|there is|present|visible|shows|appears)\b")


def _minigpt_says_yes(response):
    """Heuristically classify a free-text reply as yes (True) or no (False).

    Priority: (1) a leading "yes"/"no" decides; (2) otherwise any whole-word
    negation decides "no", so "there is no ..." and "not present" are not
    mistaken for the affirmative phrases they contain; (3) otherwise an
    affirmative cue decides "yes". Replies with no cue at all count as "no".
    """
    text = response.lower()
    leading = _MINIGPT_LEADING_ANSWER_RE.match(text)
    if leading:
        return leading.group(1) == "yes"
    if _MINIGPT_NEGATION_RE.search(text):
        return False
    return _MINIGPT_AFFIRMATION_RE.search(text) is not None


def run_minigpt_inference(chat, image, prompt):
    """Ask MiniGPT-Med one question about one image.

    Args:
        chat: the Chat wrapper around the loaded model.
        image: the image to ask about.
        prompt: the question text.

    Returns:
        (response, says_yes): the reply text and its heuristic yes/no reading.
    """
    # Fresh conversation state per question so earlier turns cannot leak in.
    chat_state = CONV_VISION_minigptv2.copy()
    img_list = []

    chat.upload_img(image, chat_state, img_list)
    chat.encode_img(img_list)
    chat.ask(prompt, chat_state)

    # NOTE(review): whether decoding is greedy or sampled depends on the
    # internals of Chat.answer, which are not visible here — confirm before
    # treating these results as deterministic.
    response = chat.answer(
        conv=chat_state,
        img_list=img_list,
        num_beams=1,
        temperature=1.0,
        max_new_tokens=128,
        max_length=2000
    )[0]

    says_yes = _minigpt_says_yes(response)

    return response, says_yes

In [None]:
# reload images (paths may have changed after cd)
%cd ..
from PIL import Image as PILImage

images_minigpt = {}
for name in ["normal_1", "normal_2", "normal_3", "normal_4", "normal_5"]:
    idx = int(name.split("_")[1]) - 1
    f = sorted([f for f in os.listdir("chest_xray_samples/normal") if f.endswith(('.jpeg', '.jpg', '.png'))])[idx]
    images_minigpt[name] = PILImage.open(f"chest_xray_samples/normal/{f}").convert("RGB")

for name in ["pneumonia_1", "pneumonia_2", "pneumonia_3", "pneumonia_4", "pneumonia_5"]:
    idx = int(name.split("_")[1]) - 1
    f = sorted([f for f in os.listdir("chest_xray_samples/pneumonia") if f.endswith(('.jpeg', '.jpg', '.png'))])[idx]
    images_minigpt[name] = PILImage.open(f"chest_xray_samples/pneumonia/{f}").convert("RGB")

print(f"loaded {len(images_minigpt)} images for minigpt-med")

In [None]:
# Run every test case through MiniGPT-Med and collect one record per case.
minigpt_results = []

for case in tqdm(all_test_cases):
    xray = images_minigpt[case["image_name"]]
    answer_text, answered_yes = run_minigpt_inference(chat, xray, case["prompt"])

    record = {
        "image": case["image_name"],
        **{field: case[field] for field in ("prompt", "prompt_type", "ground_truth")},
        "response": answer_text[:150],  # keep the table readable
        "model_says_yes": answered_yes,
        "is_correct": answered_yes == case["ground_truth"],
        "model": "MiniGPT-Med",
    }
    minigpt_results.append(record)

df_minigpt = pd.DataFrame(minigpt_results)
print(f"completed {len(df_minigpt)} test cases for minigpt-med")

In [None]:
rule = "=" * 60
print(rule)
print("MINIGPT-MED RESULTS")
print(rule)

overall_pct = df_minigpt["is_correct"].mean() * 100
print(f"\noverall accuracy: {overall_pct:.1f}%")

# Accuracy per prompt family.
print("\nby prompt type:")
for ptype in ["affirmative", "negated", "left_right"]:
    subset = df_minigpt.loc[df_minigpt["prompt_type"] == ptype]
    acc = subset["is_correct"].mean() * 100
    print(f"  {ptype}: {acc:.1f}%")

# "Hallucination" here = answering yes to a negated prompt whose truth is no.
negated = df_minigpt.loc[df_minigpt["prompt_type"] == "negated"]
false_negated = negated.loc[negated["ground_truth"] == False]
if len(false_negated) > 0:
    halluc_rate = false_negated["model_says_yes"].mean() * 100
else:
    halluc_rate = 0
print(f"\nhallucination rate (yes to negated): {halluc_rate:.1f}%")

# **10. Compare LLaVA vs MiniGPT-Med**

In [None]:
# Stack both models' results into one long table with a shared column set.
df["model"] = "LLaVA"
shared_columns = ["image", "prompt", "prompt_type", "ground_truth", "response", "model_says_yes", "is_correct", "model"]
df_combined = pd.concat([df[shared_columns], df_minigpt], ignore_index=True)

rule = "=" * 70
print(rule)
print("MODEL COMPARISON")
print(rule)


def _accuracy_pct(frame, ptype):
    """Percent of rows of `frame` with the given prompt type that are correct."""
    return frame[frame["prompt_type"] == ptype]["is_correct"].mean() * 100


comparison = []
for model_name in ["LLaVA", "MiniGPT-Med"]:
    subset = df_combined[df_combined["model"] == model_name]

    # Yes-rate on negated prompts whose expected answer is no.
    negated_rows = subset[subset["prompt_type"] == "negated"]
    false_neg = negated_rows[negated_rows["ground_truth"] == False]
    if len(false_neg) > 0:
        halluc = false_neg["model_says_yes"].mean() * 100
    else:
        halluc = 0

    row = {
        "model": model_name,
        "overall_acc": f"{subset['is_correct'].mean() * 100:.1f}%",
    }
    for ptype in ("affirmative", "negated", "left_right"):
        row[f"{ptype}_acc"] = f"{_accuracy_pct(subset, ptype):.1f}%"
    row["halluc_rate"] = f"{halluc:.1f}%"
    comparison.append(row)

print(pd.DataFrame(comparison).to_string(index=False))

df_combined.to_csv("hallucination_results_both_models.csv", index=False)
print("\nsaved to hallucination_results_both_models.csv")

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
ax_accuracy, ax_halluc = axes

# Left panel: accuracy per model, one bar per prompt type.
acc_data = df_combined.groupby(["model", "prompt_type"])["is_correct"].mean().unstack() * 100
acc_data.plot(kind="bar", ax=ax_accuracy, color=["#2ecc71", "#e74c3c", "#3498db"])
ax_accuracy.set(title="accuracy by model and prompt type", ylabel="accuracy (%)", ylim=(0, 100))
ax_accuracy.tick_params(axis='x', rotation=0)
ax_accuracy.legend(title="prompt type")


def _false_negated_yes_rate(model_name):
    """Percent of 'yes' answers on negated prompts whose expected answer is no."""
    rows = df_combined[df_combined["model"] == model_name]
    picked = rows[(rows["prompt_type"] == "negated") & (rows["ground_truth"] == False)]
    return picked["model_says_yes"].mean() * 100 if len(picked) > 0 else 0


# Right panel: hallucination rate per model.
halluc_by_model = [_false_negated_yes_rate(model_name) for model_name in ["LLaVA", "MiniGPT-Med"]]

ax_halluc.bar(["LLaVA", "MiniGPT-Med"], halluc_by_model, color=["#3498db", "#9b59b6"])
ax_halluc.set(title="hallucination rate (yes to false negated)", ylabel="% saying yes", ylim=(0, 100))

plt.tight_layout()
plt.savefig("model_comparison.png", dpi=150)
plt.show()
print("saved to model_comparison.png")