In [1]:
# Mount Google Drive at /content/drive so that files stored on Drive can be
# opened by path from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import base64
import mimetypes
import os
import re
import traceback

import pandas as pd
from huggingface_hub import InferenceClient
from PIL import Image
from sklearn.metrics import accuracy_score, classification_report

In [None]:
def encode_image_base64(image_path):
    """Read an image file and return its contents as a base64 ``data:`` URL.

    The MIME type placed in the URL is guessed from the file extension, so a
    PNG is labelled ``image/png`` rather than always being labelled as JPEG.
    When the extension is unknown, or maps to a non-image type, the URL falls
    back to ``image/jpeg``.

    Args:
        image_path: Path to the image file on disk.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    mime_type, _ = mimetypes.guess_type(image_path)
    if not mime_type or not mime_type.startswith("image/"):
        # Unknown or non-image extension: keep the historical JPEG label.
        mime_type = "image/jpeg"

    with open(image_path, "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{payload}"

In [None]:
# Hugging Face model id that every chat request in this notebook is sent to.
MODEL_ID = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
client = InferenceClient(model=MODEL_ID)

In [None]:
# Load data
df = pd.read_csv("/content/drive/MyDrive/vision_benchmark/metadata/twin_face_pairs_absolute.csv")  # or use uploaded file path
prompt = (
    "Look closely at both images such as face shape, eyes, nose, mouth, and hairstyle. Based on these observations, do the two people appear to be twins? Respond only with 'Yes' or 'No'."
)


def parse_yes_no(answer):
    """Map a free-text model reply to a binary twin prediction.

    The reply is lower-cased and split into alphabetic words, so "yes" and
    "no" are matched as whole words only. A plain substring test would also
    find "no" inside words such as "not" or "know" and misread the reply.

    Args:
        answer: Text returned by the model.

    Returns:
        1 when the reply contains the word "yes" and not the word "no";
        0 otherwise (a plain "no", a reply containing both words, or a reply
        containing neither).
    """
    words = set(re.findall(r"[a-z]+", answer.lower()))
    return 1 if "yes" in words and "no" not in words else 0


# One entry per row of df. A prediction of None marks a row whose request
# failed, so that it is not scored as if the model had answered "No".
preds = []
gts = df["is_same_twin"].tolist()
results = []

for _, row in df.iterrows():
    try:
        img1_b64 = encode_image_base64(row["img1"])
        img2_b64 = encode_image_base64(row["img2"])

        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": img1_b64}},
                {"type": "image_url", "image_url": {"url": img2_b64}},
                {"type": "text", "text": prompt}
            ]
        }]

        response = client.chat.completions.create(messages=messages)
        answer = response.choices[0].message.content.strip().lower()
        pred = parse_yes_no(answer)

    except Exception as e:
        # Keep the run going, but record the failure instead of inventing a
        # prediction; the error text is kept in the model_response column.
        answer = f"error: {e}"
        pred = None

    preds.append(pred)
    results.append({
        "img1": row["img1"],
        "img2": row["img2"],
        "is_same_twin": row["is_same_twin"],
        "model_response": answer,
        "prediction": pred
    })

# Save predictions
df_results = pd.DataFrame(results)
if not df_results.empty:
    # Nullable integer dtype keeps 0/1 written as integers in the CSV even
    # when some rows have no prediction (those cells are left blank).
    df_results["prediction"] = df_results["prediction"].astype("Int64")
df_results.to_csv("/content/drive/MyDrive/vision_benchmark/metadata/twin_test_output_metallama4.csv", index=False)

# Print evaluation metrics, computed only over the rows that got a prediction
scored = [(gt, p) for gt, p in zip(gts, preds) if p is not None]
n_failed = len(preds) - len(scored)
if n_failed:
    print(f"\n{n_failed} of {len(preds)} pairs failed and are excluded from the metrics (see the model_response column).")

if scored:
    scored_gts = [gt for gt, _ in scored]
    scored_preds = [p for _, p in scored]
    print("\n✅ Accuracy:", accuracy_score(scored_gts, scored_preds))
    print("\n📋 Classification Report:\n", classification_report(scored_gts, scored_preds, digits=4))
else:
    print("\nNo pairs were scored; metrics were not computed.")


✅ Accuracy: 0.9139344262295082

📋 Classification Report:
               precision    recall  f1-score   support

           0     0.9106    0.9180    0.9143       122
           1     0.9174    0.9098    0.9136       122

    accuracy                         0.9139       244
   macro avg     0.9140    0.9139    0.9139       244
weighted avg     0.9140    0.9139    0.9139       244

