In [None]:
from huggingface_hub import InferenceClient
import pandas as pd
from sklearn.metrics import f1_score
from tqdm import tqdm
import base64, os

# 📂 Mount Drive
from google.colab import drive
drive.mount('/content/drive')

# 📄 Load the metadata and draw a reproducible sample of up to SAMPLE_SIZE rows
SAMPLE_SIZE = 1200
df = pd.read_csv("/content/drive/MyDrive/vision_benchmark/metadata/wildlife_dataset.csv")
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (sampling without replacement), so cap the request at what is available.
sample_df = df.sample(min(SAMPLE_SIZE, len(df)), random_state=42).reset_index(drop=True)

# 🧠 Model setup
client = InferenceClient("Qwen/Qwen2.5-VL-72B-Instruct")

# 🏷️ All valid labels (lower-cased, de-duplicated)
# Missing labels are dropped first: a NaN entry is a float, not a usable
# label string, and must not end up in the candidate list.
known_labels = df["label"].dropna().str.lower().unique().tolist()

# 🧠 Base64 encoding
def encode_image_base64(image_path):
    """Return the image file at *image_path* as a base64 ``data:`` URL.

    The MIME type in the URL prefix is guessed from the file extension so
    that PNG/WebP/GIF files are not mislabelled as JPEG. When the extension
    is unknown or does not map to an image type, ``image/jpeg`` is used.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A string of the form ``data:<mime>;base64,<payload>``.

    Raises:
        FileNotFoundError: If *image_path* does not exist.
    """
    import mimetypes  # local import: only this helper needs it

    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    with open(image_path, "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{payload}"

# 🧠 Extract predicted label from response
def extract_predicted_label(response, known_labels):
    """Return the known label mentioned in *response*, or ``"unknown"``.

    The response is converted to a lower-cased string and each label is
    looked up as a substring. Labels are compared as given, so callers
    should pass them lower-cased.

    Compared with a plain first-match scan this:
      * ignores empty labels (an empty string is a substring of everything)
        and non-string entries such as NaN;
      * prefers the most specific match when one matched label is contained
        in another (e.g. "polar bear" wins over "bear"), independent of the
        order of *known_labels*.
    Otherwise the first matching label in *known_labels* order is returned.

    Args:
        response: Model output; any object, converted with ``str()``.
        known_labels: Iterable of candidate label strings.

    Returns:
        The selected label, or ``"unknown"`` when no label occurs.
    """
    text = str(response).lower()
    matches = [
        label
        for label in known_labels
        if isinstance(label, str) and label and label in text
    ]
    for label in matches:
        # Skip a label that is merely a fragment of a longer matched label.
        if not any(label != other and label in other for other in matches):
            return label
    return "unknown"

# 🧪 Run inference
import time
import traceback

# Transient upstream failures (rate limiting, gateway errors) are retried a
# few times instead of immediately being scored as "unknown".
RETRYABLE_STATUS_CODES = {429, 500, 502, 503, 504}
MAX_ATTEMPTS = 3
RETRY_BASE_DELAY_S = 2.0

# One dict per sampled row: inputs, ground truth, raw answer, parsed label.
results = []

for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    image_path = row["image_path"]
    prompt = row["prompt"]
    ground_truth = row["label"].lower()

    try:
        img_b64 = encode_image_base64(image_path)

        # Single-turn chat message: the image first, then the text prompt.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": img_b64}},
                {"type": "text", "text": prompt}
            ]
        }]

        for attempt in range(1, MAX_ATTEMPTS + 1):
            try:
                response = client.chat.completions.create(messages=messages, max_tokens=100)
                break
            except Exception as api_error:
                # NOTE(review): relies on the raised HTTP error exposing
                # `.response.status_code`; errors without it are not retried.
                status = getattr(getattr(api_error, "response", None), "status_code", None)
                if status not in RETRYABLE_STATUS_CODES or attempt == MAX_ATTEMPTS:
                    raise
                # Exponential backoff: 2s, 4s, ...
                time.sleep(RETRY_BASE_DELAY_S * 2 ** (attempt - 1))

        answer = response.choices[0].message.content.strip().lower()
        predicted_label = extract_predicted_label(answer, known_labels)

    except Exception as e:
        # Best effort: log the failure, record it, and keep the run going.
        traceback.print_exc()
        answer = f"error: {e}"
        predicted_label = "unknown"

    results.append({
        "image_path": image_path,
        "prompt": prompt,
        "ground_truth": ground_truth,
        "model_response": answer,
        "predicted_label": predicted_label
    })

# 💾 Save output
df_out = pd.DataFrame(results)
output_path = "/content/drive/MyDrive/vision_benchmark/metadata/wildlife_Qwen2.5_result.csv"
df_out.to_csv(output_path, index=False)

# 📊 Evaluation (includes 'unknown')
# Rows whose prediction is "unknown" (no label found, or the request failed)
# stay in the evaluation and count against the scores.
y_true = df_out["ground_truth"]
y_pred = df_out["predicted_label"]
f1_macro = f1_score(y_true, y_pred, average="macro", zero_division=0)
f1_micro = f1_score(y_true, y_pred, average="micro", zero_division=0)

print("\n✅ Evaluation Done")
# Name the averaging mode so the two numbers can be told apart.
print(f"F1 Score (macro): {f1_macro:.4f}")
print(f"F1 Score (micro): {f1_micro:.4f}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


 21%|██        | 250/1200 [22:05<1:36:01,  6.06s/it]Traceback (most recent call last):
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_http.py", line 409, in hf_raise_for_status
    response.raise_for_status()
  File "/usr/local/lib/python3.11/dist-packages/requests/models.py", line 1024, in raise_for_status
    raise HTTPError(http_error_msg, response=self)
requests.exceptions.HTTPError: 502 Server Error: Bad Gateway for url: https://router.huggingface.co/nebius/v1/chat/completions

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "/tmp/ipython-input-2316079989.py", line 55, in <cell line: 0>
    response = client.chat.completions.create(messages=messages, max_tokens=100)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/huggingface_hub/inference/_client.py", line 923, in chat_completion
    data = self._inner_post(request