In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from huggingface_hub import InferenceClient
from PIL import Image
import base64
import io
from tqdm import tqdm
import time

# Load 1000-row CSV
df = pd.read_csv("/content/drive/MyDrive/vision_benchmark/metadata/disguises_pairs_sample1000.csv")

# HuggingFace model client
client = InferenceClient(model="Qwen/Qwen2.5-VL-72B-Instruct")

# Encode image to base64
def encode_image(img_path):
    with open(img_path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf-8")

# Collect results
results = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    try:
        img1 = encode_image(row["img1_path"])
        img2 = encode_image(row["img2_path"])
        prompt = "Do these two people look like the same person, or is one in disguise?"

        response = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img1}"}},
                        {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{img2}"}},
                    ]
                }
            ]
        )

        prediction = response.choices[0].message.content

        results.append({
            "img1": row["img1_path"],
            "img2": row["img2_path"],
            "ground_truth": row["label"],
            "prediction": prediction
        })

    except Exception as e:
        results.append({
            "img1": row["img1_path"],
            "img2": row["img2_path"],
            "ground_truth": row["label"],
            "prediction": f"ERROR: {str(e)}"
        })

    time.sleep(1)  # avoid throttling

# Save output
results_df = pd.DataFrame(results)
out_path = "/content/drive/MyDrive/vision_benchmark/metadata/qwen2.5vl_sample1000_results.csv"
results_df.to_csv(out_path, index=False)

print(f"\n✅ Saved results to: {out_path}")


100%|██████████| 1000/1000 [1:06:29<00:00,  3.99s/it]


✅ Saved results to: /content/drive/MyDrive/vision_benchmark/metadata/qwen2.5vl_sample1000_results.csv





In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, f1_score

# Load the CSV
df = pd.read_csv("/content/drive/MyDrive/vision_benchmark/metadata/disguise_qwen2.5vl_sample1000_results.csv")

# Define broader matching logic for predictions
def flexible_map_prediction(text):
    text = text.strip().lower()
    if any(phrase in text for phrase in [
        "show the same person",
        "appear to be the same person",
        "appear to show the same person"
    ]):
        return 1
    elif any(phrase in text for phrase in [
        "do not appear to be the same person",
        "do not appear to be the same individual"
    ]):
        return 0
    else:
        return -1  # unclear or noisy response

# Apply the mapping
df["flexible_pred"] = df["prediction"].apply(flexible_map_prediction)

# Filter only usable predictions
filtered_df = df[df["flexible_pred"] != -1]
y_true = filtered_df["ground_truth"]
y_pred = filtered_df["flexible_pred"]

# Compute and print metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"F1 Score: {f1:.4f}")


Accuracy: 0.3414
Precision: 0.3202
F1 Score: 0.4850


In [2]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, f1_score
from datetime import datetime
import os

# Paths to disguise result CSVs for each model
model_paths = {
    "Qwen-2.5": "/content/drive/MyDrive/vision_benchmark/metadata/disguise_qwen2.5vl_sample1000_results.csv",
    "Meta-Llama4-Scout": "/content/drive/MyDrive/vision_benchmark/metadata/disguise_llama4_scout_sample1000_results.csv",
    "Meta-Llama4-Maverick": "/content/drive/MyDrive/vision_benchmark/metadata/disguise_llama4_maverick_sample1000_results.csv"
    # Add more models here if needed
}

# Mapping logic for flexible responses
def flexible_map_prediction(text):
    text = str(text).strip().lower()
    if any(phrase in text for phrase in [
        "show the same person",
        "appear to be the same person",
        "appear to show the same person"
    ]):
        return 1
    elif any(phrase in text for phrase in [
        "do not appear to be the same person",
        "do not appear to be the same individual"
    ]):
        return 0
    else:
        return -1

# List to store result rows
results = []

# Evaluate each model
for model_name, path in model_paths.items():
    try:
        df = pd.read_csv(path)
        df["flexible_pred"] = df["prediction"].apply(flexible_map_prediction)
        filtered_df = df[df["flexible_pred"] != -1]

        y_true = filtered_df["ground_truth"]
        y_pred = filtered_df["flexible_pred"]

        results.append({
            "Model": model_name,
            "Dataset": "Disguise",
            "Accuracy": round(accuracy_score(y_true, y_pred), 4),
            "Precision": round(precision_score(y_true, y_pred), 4),
            "F1_Score": round(f1_score(y_true, y_pred), 4)
        })

        print(f"✅ Done: {model_name}")

    except Exception as e:
        print(f"❌ Error with {model_name}: {e}")

# Save to CSV
results_df = pd.DataFrame(results)
results_path = "/content/drive/MyDrive/vision_benchmark/metadata/disguise_comparison_final.csv"
results_df.to_csv(results_path, mode='a', header=not os.path.exists(results_path), index=False)
print(f"\n📄 All results saved to: {results_path}")


✅ Done: Qwen-2.5
❌ Error with Meta-Llama4-Scout: [Errno 2] No such file or directory: '/content/drive/MyDrive/vision_benchmark/metadata/disguise_llama4_scout_sample1000_results.csv'
❌ Error with Meta-Llama4-Maverick: [Errno 2] No such file or directory: '/content/drive/MyDrive/vision_benchmark/metadata/disguise_llama4_maverick_sample1000_results.csv'

📄 All results saved to: /content/drive/MyDrive/vision_benchmark/metadata/disguise_comparison_final.csv
