<div style="background-color:rgb(223, 119, 160); padding: 30px; border-radius: 20px; box-shadow: 0 4px 15px rgba(255, 105, 180, 0.3); color: #F8BBD0; font-family: 'Times New Roman', serif;">

<h1 style="text-align: center; font-size: 38px; color: white; font-weight: bold;">🎀 Evaluate The Chatbots 🎀</h1>
</div>


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from evaluate import load
import pandas as pd
import torch
from tqdm import tqdm

# 1. Load the tokenizer and model from disk and prepare them for inference
model_path = "./interview_model"  # Update this if different
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Place the weights on the selected device, then switch to evaluation mode.
model = AutoModelForCausalLM.from_pretrained(model_path).to(device)
model.eval()

# 2. Load the dataset, keeping only rows that have both a question and an answer
df = pd.read_csv("./Q&A_data.csv")
df = df.dropna(subset=["question", "answer"])

# 3. Clean the text
def clean(text):
    """Return *text* as a single-line string without surrounding whitespace.

    Leading and trailing whitespace is stripped and every remaining newline
    is replaced by a single space.

    Args:
        text: The value to clean. Non-string values (for example a number
            read from a CSV cell) are converted with ``str`` first, so they
            no longer raise ``AttributeError``.

    Returns:
        The cleaned string.
    """
    return str(text).strip().replace("\n", " ")

# 4. Load the ROUGE and BLEU metrics
rouge, bleu = load("rouge"), load("bleu")

# Accumulators filled in by the evaluation loop: one score per evaluated row,
# plus (question, reference, prediction) tuples kept for display.
rouge_scores, bleu_scores, examples = [], [], []

# 5. Evaluation loop
# For every (question, answer) row: build the prompt, generate an answer with
# the model, score it against the reference with ROUGE-L and BLEU, and keep
# the first few generations for manual inspection.
print("🔍 Evaluating...")
for _, row in tqdm(df.iterrows(), total=len(df)):
    question = clean(row["question"])
    reference = clean(row["answer"])

    # Prompt for inference
    prompt = f"<|startoftext|>\n### Question:\n{question}\n\n### Answer:\n"

    inputs = tokenizer(prompt, return_tensors="pt").to(device)

    output = model.generate(
        **inputs,
        max_new_tokens=100,
        # temperature/top_p are only used in sampling mode; without
        # do_sample=True they are ignored and decoding falls back to the
        # generation config's default strategy.
        # NOTE(review): sampling makes scores vary between runs unless the
        # RNG is seeded.
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.2,
        pad_token_id=tokenizer.eos_token_id,
    )

    # The decoded sequence still contains the prompt, so keep only the text
    # that follows the last "### Answer:" marker.
    prediction = tokenizer.decode(output[0], skip_special_tokens=True)
    prediction = prediction.split("### Answer:")[-1].strip()

    # Compute metrics
    rouge_score = rouge.compute(predictions=[prediction], references=[reference])["rougeL"]
    if prediction and reference:
        bleu_score = bleu.compute(predictions=[prediction], references=[[reference]])["bleu"]
    else:
        # BLEU divides by the prediction/reference token counts, so an empty
        # generation (or empty reference) would raise ZeroDivisionError and
        # abort the whole run; score such rows as 0 instead.
        bleu_score = 0.0

    rouge_scores.append(rouge_score)
    bleu_scores.append(bleu_score)

    # Store a few example predictions
    if len(examples) < 5:
        examples.append((question, reference, prediction))

# 6. Print results
# Report the mean ROUGE-L and BLEU over all evaluated rows, followed by the
# stored example predictions.
print("\n📊 Evaluation Results:")
if rouge_scores:
    # Both score lists are appended to together, so one check covers both.
    print(f"🔸 Average ROUGE-L: {sum(rouge_scores)/len(rouge_scores):.4f}")
    print(f"🔸 Average BLEU: {sum(bleu_scores)/len(bleu_scores):.4f}")
else:
    # No rows were scored (e.g. the dataset was empty after dropping
    # incomplete rows); avoid dividing by zero.
    print("⚠️ No examples were evaluated; averages are unavailable.")

print("\n📝 Example predictions:")
for q, real, pred in examples:
    print(f"Q: {q}")
    print(f"✅ Real: {real}")
    print(f"🤖 Pred: {pred}")
    print("-" * 50)
