# In [None]:
# -------------------------------
# Multi-model batch inference (LLaMA + Phi) + save results
# -------------------------------

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
import gc
import time
import os

# -------------------------------
# 0. Memory cleanup
# -------------------------------
# Collect unreachable Python objects first, then ask PyTorch to release
# its unused cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

# -------------------------------
# 1. Load dataset
# -------------------------------
final_datasheet = "finalDataset.csv"

# Only the read itself can raise UnicodeDecodeError, so keep the try body minimal.
try:
    data = pd.read_csv(final_datasheet, encoding="utf-8")
except UnicodeDecodeError:
    print("UTF-8 failed, trying ISO-8859-1...")
    data = pd.read_csv(final_datasheet, encoding="ISO-8859-1")

# Fail fast: the inference loop reads these columns, and discovering that one
# is missing only after the model has been loaded wastes time.
required_columns = ("Corrected Question", "Answer")
missing_columns = [col for col in required_columns if col not in data.columns]
if missing_columns:
    raise KeyError(
        f"{final_datasheet} is missing required column(s): {missing_columns}"
    )

# Add empty columns for storing results
data['ModelAnswer'] = ""
data['Correct'] = ""

# -------------------------------
# 2. Choose model
# -------------------------------
# Options: "3.1-8B-instruct", "3.1-8B-IT", "3.2-3B", "phi-mini"
MODEL_CHOICE = "phi-mini"  # change this variable to switch models

# Short alias -> Hugging Face Hub repository id.
# NOTE(review): the repo ids are used verbatim when loading; confirm each one
# exists on the Hub (the "3.1-8B-IT" entry in particular).
model_dict = {
    "3.1-8B-instruct": "meta-llama/Llama-3.1-8b-instruct",
    "3.1-8B-IT": "meta-llama/Llama-3.1-8b-IT",
    "3.2-3B": "meta-llama/Llama-3.2-3b",
    "phi-mini": "microsoft/Phi-4-mini-flash-reasoning"
}

# Give a readable error for a mistyped alias instead of a bare KeyError.
if MODEL_CHOICE not in model_dict:
    raise KeyError(
        f"Unknown MODEL_CHOICE '{MODEL_CHOICE}'. "
        f"Valid options: {', '.join(model_dict)}"
    )

model_name = model_dict[MODEL_CHOICE]

# -------------------------------
# 3. Load token (optional, safe check)
# -------------------------------
# The token is read from the environment so it never has to be hard-coded.
huggingface_token = os.getenv("HF_TOKEN")

# Aliases that this script refuses to load without a token.
llama_models = ["3.1-8B-instruct", "3.1-8B-IT", "3.2-3B"]

if MODEL_CHOICE in llama_models and not huggingface_token:
    raise ValueError(
        f"⚠️ The model '{MODEL_CHOICE}' requires a Hugging Face token. "
        "Please set the HF_TOKEN environment variable to access it."
    )
elif huggingface_token:
    print("✅ Using Hugging Face token for authentication.")
else:
    print("ℹ️ No Hugging Face token found. Will try public model download if available (Phi-mini only).")

# Extra keyword arguments forwarded to from_pretrained(); empty when no token
# is set. `token` replaces the deprecated `use_auth_token` keyword.
token_kwargs = {"token": huggingface_token} if huggingface_token else {}

# -------------------------------
# 4. Load tokenizer and model
# -------------------------------
print(f"Loading tokenizer for {MODEL_CHOICE}...")
tokenizer = AutoTokenizer.from_pretrained(model_name, **token_kwargs)

# Device the tokenized inputs are moved to in generate_answer().
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
on_gpu = device.type == "cuda"

# Half precision is only a safe default on GPU; on the CPU fallback path use
# float32, since float16 kernels on CPU may be missing or very slow.
model_dtype = torch.float16 if on_gpu else torch.float32
target_name = "GPU" if on_gpu else "CPU"

print(f"Loading model {MODEL_CHOICE} on {target_name}...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    **token_kwargs,
    device_map="auto",
    torch_dtype=model_dtype,
    low_cpu_mem_usage=True
)

# -------------------------------
# 5. Batch inference function
# -------------------------------
def generate_answer(prompt):
    """Generate a greedy completion for ``prompt`` and return only the new text.

    Relies on the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        prompt: Full prompt text fed to the model.

    Returns:
        The decoded continuation (at most 100 new tokens), with special
        tokens removed and surrounding whitespace stripped. The prompt
        itself is never part of the returned string.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    prompt_length = inputs["input_ids"].shape[-1]

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            # Greedy decoding; `temperature` only applies when sampling, so it
            # is intentionally not passed.
            do_sample=False,
        )

    # The output sequence starts with the prompt tokens. Cutting by token
    # count is exact, whereas removing the prompt with str.replace() fails
    # whenever decoding does not reproduce the prompt text character for
    # character, which would leave the question inside the "answer".
    generated_ids = outputs[0][prompt_length:]
    return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

# -------------------------------
# 6. Loop through dataset
# -------------------------------
start_time = time.time()
total_questions = len(data)
correct_answers = 0

# enumerate() drives the progress counter so it stays correct even when the
# DataFrame index is not a plain 0..n-1 range.
for position, (idx, row) in enumerate(data.iterrows(), start=1):
    prompt = f"""
You are a person from India with deep knowledge and lived experience of Indian culture.
Now, answer the following question using your expertise in Indian culture by identifying the specific cultural element being referred to.
Respond only with the name of the cultural element (e.g., Indian) — no additional text, questions, or explanations.

Question: {row['Corrected Question']}
"""
    model_answer = generate_answer(prompt)

    # Empty CSV cells load as NaN (a float) and numeric answers are not
    # strings, so normalise to text before calling string methods.
    gold_answer = row['Answer']
    expected = "" if pd.isna(gold_answer) else str(gold_answer).strip().lower()

    # An empty expected answer must not count as correct: "" is a substring
    # of every string.
    prediction_correctness = bool(expected) and expected in model_answer.lower()
    if prediction_correctness:
        correct_answers += 1

    data.at[idx, 'ModelAnswer'] = model_answer
    data.at[idx, 'Correct'] = str(prediction_correctness)

    print(f"Progress: {position}/{total_questions} | Correct so far: {correct_answers}")

# -------------------------------
# 7. Summary & save
# -------------------------------
# Guard against an empty dataset, which would otherwise divide by zero.
accuracy = (correct_answers / total_questions) * 100 if total_questions else 0.0
print(f"Total correct: {correct_answers}/{total_questions}")
print(f"Accuracy: {accuracy:.2f}%")

# One results file per model alias; dashes become underscores in the filename.
output_file = f"model_answers_results_{MODEL_CHOICE.replace('-', '_')}.csv"
data.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")
# start_time is taken just before the inference loop, so this excludes model loading.
print(f"Total runtime: {time.time() - start_time:.2f} seconds")

# -------------------------------
# 8. Cleanup
# -------------------------------
gc.collect()
torch.cuda.empty_cache()
