In [None]:
from huggingface_hub import InferenceClient
import pandas as pd
from sklearn.metrics import f1_score
from tqdm import tqdm
import base64, os

# 📂 Mount Drive (Colab only) so the dataset CSV and the images are readable
from google.colab import drive
drive.mount('/content/drive')

# 📄 Load the metadata and draw a reproducible sample of at most 1200 rows.
# Capping the size at len(df) keeps df.sample from raising ValueError when the
# CSV holds fewer rows than requested; with >= 1200 rows the sample is the same
# as before because the seed is unchanged.
df = pd.read_csv("/content/drive/MyDrive/vision_benchmark/metadata/wildlife_dataset.csv")
sample_df = df.sample(min(1200, len(df)), random_state=42).reset_index(drop=True)

# 🧠 Model setup
client = InferenceClient("meta-llama/Llama-4-Scout-17B-16E-Instruct")

# 🏷️ All valid labels, lower-cased and de-duplicated.
# dropna() removes the NaN that .str.lower() yields for missing labels; a float
# NaN in this list would make `label in response` raise TypeError later on.
known_labels = df["label"].str.lower().dropna().unique().tolist()

# 🧠 Base64 encoding
def encode_image_base64(image_path):
    """Return the file at ``image_path`` as a base64 ``data:`` URL.

    The MIME type in the URL prefix is guessed from the file extension, so a
    ``.png`` or ``.webp`` file is no longer labelled as JPEG. When the
    extension is missing or does not map to an ``image/*`` type, the prefix
    falls back to ``image/jpeg`` (the value that used to be hard-coded).

    Args:
        image_path: Path of the image file to read.

    Returns:
        str: ``"data:<mime type>;base64,<base64 payload>"``.

    Raises:
        FileNotFoundError: If ``image_path`` does not exist.
    """
    import mimetypes  # local import keeps this definition self-contained

    if not os.path.exists(image_path):
        raise FileNotFoundError(f"Image not found: {image_path}")

    mime_type, _ = mimetypes.guess_type(str(image_path))
    if not mime_type or not mime_type.startswith("image/"):
        mime_type = "image/jpeg"

    with open(image_path, "rb") as f:
        payload = base64.b64encode(f.read()).decode("utf-8")
    return f"data:{mime_type};base64,{payload}"

# 🧠 Extract predicted label from response
def extract_predicted_label(response, known_labels):
    """Map a free-text model response to one of ``known_labels``.

    The response is converted with ``str()`` and lower-cased; labels are
    compared as given, so callers should pass them lower-cased.

    Matching runs in two passes, each trying longer labels first so that a
    specific label ("polar bear") wins over one it contains ("bear"):

    1. whole-word match, so "bat" is not found inside "wombat";
    2. plain substring match, which keeps hits such as "cats" -> "cat" that
       the whole-word pass misses.

    Entries of ``known_labels`` that are not non-empty strings (for example a
    NaN from a missing value) are ignored instead of raising ``TypeError`` or
    matching every response.

    Args:
        response: Model output; any object, converted with ``str()``.
        known_labels: Iterable of candidate labels.

    Returns:
        str: The matched label, or ``"unknown"`` when nothing matches.
    """
    import re  # local import keeps this definition self-contained

    text = str(response).lower()

    # sorted() is stable, so labels of equal length keep their input order.
    candidates = sorted(
        (label for label in known_labels if isinstance(label, str) and label),
        key=len,
        reverse=True,
    )

    # Pass 1: the label must not be glued to other word characters.
    for label in candidates:
        if re.search(r"(?<!\w)" + re.escape(label) + r"(?!\w)", text):
            return label

    # Pass 2: fall back to the original substring rule (plurals, compounds).
    for label in candidates:
        if label in text:
            return label

    return "unknown"

# 🧪 Run inference
import traceback  # imported once here instead of inside the except handler on every failure

# Fail fast: the loop below indexes these columns and calls .lower() on every
# label, so a missing column or a non-string label (e.g. NaN) would otherwise
# abort the run part-way through and discard everything collected so far.
missing_columns = {"image_path", "prompt", "label"} - set(sample_df.columns)
if missing_columns:
    raise KeyError(f"sample_df is missing required columns: {sorted(missing_columns)}")

bad_label_rows = [i for i, value in enumerate(sample_df["label"]) if not isinstance(value, str)]
if bad_label_rows:
    raise ValueError(
        f"{len(bad_label_rows)} sampled rows have a non-string label "
        f"(first positions: {bad_label_rows[:10]})"
    )

results = []
error_count = 0  # requests that raised; they are still recorded, as 'unknown'

for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    image_path = row["image_path"]
    prompt = row["prompt"]
    ground_truth = row["label"].lower()

    try:
        img_b64 = encode_image_base64(image_path)

        # One user turn carrying the image (as a data URL) followed by the prompt text.
        messages = [{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": img_b64}},
                {"type": "text", "text": prompt}
            ]
        }]

        response = client.chat.completions.create(messages=messages, max_tokens=100)
        answer = response.choices[0].message.content.strip().lower()
        predicted_label = extract_predicted_label(answer, known_labels)

    except Exception as e:
        # Best effort: log the failure and keep going so one bad image or
        # request does not stop the whole benchmark.
        traceback.print_exc()
        answer = f"error: {e}"
        predicted_label = "unknown"
        error_count += 1

    results.append({
        "image_path": image_path,
        "prompt": prompt,
        "ground_truth": ground_truth,
        "model_response": answer,
        "predicted_label": predicted_label
    })

# Failures are scored as wrong answers downstream, so make their number visible.
if error_count:
    print(f"\n⚠️ {error_count} of {len(sample_df)} requests failed and were recorded as 'unknown'")

# 💾 Save output: one row per sampled image (path, prompt, truth, raw response, parsed label)
df_out = pd.DataFrame(results)
output_path = "/content/drive/MyDrive/vision_benchmark/metadata/wildlife_preds_llama4_scout_sample1200.csv"
df_out.to_csv(output_path, index=False)

# 📊 Evaluation (includes 'unknown')
# No explicit label list is passed, so 'unknown' predictions take part in the
# scores like any other predicted class.
f1_macro = f1_score(df_out["ground_truth"], df_out["predicted_label"], average="macro", zero_division=0)
f1_micro = f1_score(df_out["ground_truth"], df_out["predicted_label"], average="micro", zero_division=0)

# Label the two numbers distinctly; both used to be printed as "F1 Score :".
print("\n✅ Evaluation Done")
print(f"Macro F1 : {f1_macro:.4f}")
print(f"Micro F1 : {f1_micro:.4f}")


Mounted at /content/drive


100%|██████████| 1200/1200 [1:43:36<00:00,  5.18s/it]


✅ Evaluation Done
F1 Score : 0.3228
F1 Score : 0.3275





In [1]:
# Mount Google Drive at /content/drive (Colab only); the prediction CSVs read
# by the comparison cell live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.metrics import f1_score
from datetime import datetime
import os

# 📁 Paths to already-saved prediction CSVs (display name -> file)
model_paths = {
    "Meta-Llama4-Scout": "/content/drive/MyDrive/vision_benchmark/metadata/wildlife_preds_llama4_scout_sample1200.csv",
    "Meta-Llama4-Maverick": "/content/drive/MyDrive/vision_benchmark/metadata/wildlife_preds_llama_maverick_sample1200.csv"
    # Add more models here
}

# 📊 Store metrics: one dict per model that could be evaluated
results = []

# Computed once so every row written by this run carries the same date.
run_date = datetime.today().strftime('%Y-%m-%d')

for model_name, path in model_paths.items():
    try:
        df = pd.read_csv(path)
        f1_macro = f1_score(df["ground_truth"], df["predicted_label"], average="macro", zero_division=0)
        f1_micro = f1_score(df["ground_truth"], df["predicted_label"], average="micro", zero_division=0)
        # Named `accuracy` rather than `accuracy_score`: a notebook-global float
        # with that name would shadow sklearn.metrics.accuracy_score for any
        # cell that imports and calls it.
        accuracy = (df["ground_truth"] == df["predicted_label"]).mean()

        results.append({
            "Model": model_name,
            "Dataset": "Wildlife",
            "Date": run_date,
            "F1_Macro": round(f1_macro, 4),
            "F1_Micro": round(f1_micro, 4),
            "Accuracy": round(accuracy, 4)
        })

        # All three metrics use the same 4-decimal format.
        print(f"✅ {model_name} - Macro F1: {f1_macro:.4f}, Micro F1: {f1_micro:.4f}, Accuracy: {accuracy:.4f}")

    except Exception as e:
        # Best effort: report the model that failed (missing file, missing
        # column, ...) and continue with the remaining ones.
        print(f"❌ Failed for {model_name}: {e}")

# 💾 Save all metrics to summary CSV (appended, so earlier runs are kept)
summary_df = pd.DataFrame(results)
summary_path = "/content/drive/MyDrive/vision_benchmark/metadata/wildlife_comparison_final.csv"

if summary_df.empty:
    # Every model failed: writing now would create the file without a usable
    # header row, and later appends skip the header once the file exists.
    print("\n⚠️ No metrics were computed; summary file left unchanged.")
else:
    # Write the header when the file is new or exists but is still empty.
    write_header = not os.path.exists(summary_path) or os.path.getsize(summary_path) == 0
    # NOTE: re-running this cell appends the same rows again.
    summary_df.to_csv(summary_path, mode='a', header=write_header, index=False)
    print(f"\n📄 Summary saved to: {summary_path}")


✅ Meta-Llama4-Scout - Macro F1: 0.3228, Micro F1: 0.3275, Accuracy: 0.3275
✅ Meta-Llama4-Maverick - Macro F1: 0.3806, Micro F1: 0.3875, Accuracy: 0.3875

📄 Summary saved to: /content/drive/MyDrive/vision_benchmark/metadata/wildlife_comparison_final.csv
