In [8]:
!pip install rouge_score
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch, time, psutil, numpy as np, gc, re, json
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Load model and tokenizer
model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name, trust_remote_code=True,
    torch_dtype=torch.float16, device_map="auto"
)
device = model.device
torch.cuda.empty_cache()

# Load ARC-Challenge dataset
dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test[:500]")

smoother = SmoothingFunction().method4
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def evaluate_phi15_on_arc(dataset, batch_size=4, max_new_tokens=50):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    process = psutil.Process()
    results = {
        "accuracies": [], "f1_scores": [], "latencies": [], "tokens_per_sec": [], "memory_usage": [],
        "retrieval_latencies": [], "query_times": [], "memory_reductions": [],
        "bleu_scores": [], "rouge1_scores": [], "rougeL_scores": [],
        "knowledge_retentions": [], "accuracy_drops": [], "compression_ratios": []
    }

    initial_memory = process.memory_info().rss / 1024**3

    for i in range(0, len(dataset), batch_size):
        prompts, correct_answers, references = [], [], []

        for idx in range(i, min(i + batch_size, len(dataset))):
            item = dataset[idx]
            question = item["question"]
            choices = item["choices"]
            labels = choices["label"]
            texts = choices["text"]
            answer_key = item["answerKey"]

            prompt = f"Question: {question}\n"
            for label, choice in zip(labels, texts):
                prompt += f"{label}: {choice}\n"
            prompt += "Answer:"
            explanation = f"{question} " + " ".join(texts)

            prompts.append(prompt)
            correct_answers.append(answer_key)
            references.append(explanation)

        start_retrieval = time.time()
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        retrieval_latency = time.time() - start_retrieval
        results["retrieval_latencies"].append(retrieval_latency)

        start = time.time()
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
        latency = time.time() - start
        results["latencies"].append(latency)

        generated_tokens = sum(len(out) - len(inp) for out, inp in zip(outputs, inputs['input_ids']))
        results["tokens_per_sec"].append(generated_tokens / latency if latency > 0 else 0)
        results["query_times"].append(time.time() - start)

        final_memory = process.memory_info().rss / 1024**3
        results["memory_usage"].append(final_memory)
        results["memory_reductions"].append(max(0, initial_memory - final_memory))

        generated_texts = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
        pred_answers = []
        for text in generated_texts:
            match = re.search(r"\b([A-E])\b", text.split("Answer")[-1])
            pred_answers.append(match.group(1).strip().upper() if match else "")

        try:
            accuracy = accuracy_metric.compute(predictions=pred_answers, references=correct_answers)["accuracy"]
            f1 = f1_metric.compute(predictions=pred_answers, references=correct_answers, average="macro")["f1"]
        except:
            accuracy, f1 = 0, 0
        results["accuracies"].append(accuracy)
        results["f1_scores"].append(f1)

        for gen, ref in zip(generated_texts, references):
            gen_words = gen.split()
            ref_words = ref.split()

            try:
                bleu = sentence_bleu([ref_words], gen_words, weights=(0.5, 0.5, 0.0, 0.0), smoothing_function=smoother)
                results["bleu_scores"].append(bleu)
            except:
                results["bleu_scores"].append(0)

            try:
                rouge = scorer.score(ref, gen)
                results["rouge1_scores"].append(rouge['rouge1'].fmeasure)
                results["rougeL_scores"].append(rouge['rougeL'].fmeasure)
                results["knowledge_retentions"].append(rouge['rougeL'].fmeasure)
                results["accuracy_drops"].append(1 - rouge['rouge1'].fmeasure)
            except:
                results["rouge1_scores"].append(0)
                results["rougeL_scores"].append(0)
                results["knowledge_retentions"].append(0)
                results["accuracy_drops"].append(0)

            try:
                input_tokens = len(inputs['input_ids'][0])
                output_tokens = len(outputs[0])
                results["compression_ratios"].append(input_tokens / output_tokens if output_tokens > 0 else 1)
            except:
                results["compression_ratios"].append(1)

        gc.collect()
        torch.cuda.empty_cache()

    # Print full metric summary
    summary = {k: np.mean(v) if v else 0 for k, v in results.items()}
    print(f"Avg latency: {summary['latencies']:.3f} sec")
    print(f"Tokens per sec: {summary['tokens_per_sec']:.2f}")
    print(f"BLEU Score: {summary['bleu_scores']:.3f}")
    print(f"ROUGE-1 Score: {summary['rouge1_scores']:.3f}")
    print(f"ROUGE-L Score: {summary['rougeL_scores']:.3f}")
    print(f"Memory usage (GB): {summary['memory_usage']:.3f}")
    print(f"Retrieval Latency (sec): {summary['retrieval_latencies']:.3f}")
    print(f"F1 Score: {summary['f1_scores']:.3f}")
    print(f"Knowledge Retention: {summary['knowledge_retentions']:.3f}")
    print(f"Memory Reduction (GB): {summary['memory_reductions']:.2f}")
    print(f"Query Processing Time (sec): {summary['query_times']:.3f}")
    print(f"Accuracy Drop: {summary['accuracy_drops']:.3f}")
    print(f"Compression Ratio: {summary['compression_ratios']:.2f}")
    print(f"Accuracy: {summary['accuracies']:.3f}")

    with open("/kaggle/working/arc_phi15_metrics.json", "w") as f:
        json.dump(summary, f, indent=2)

    return summary

# Run evaluation
arc_metrics = evaluate_phi15_on_arc(dataset, batch_size=4, max_new_tokens=50)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=71b63d43b649b7d3f10f0bc1cb32d7e29f9d7acd2f8dde6de8981f2839a01db3
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Avg latency: 1.724 sec
Tokens per sec: 116.08
BLEU Score: 0.446
ROUGE-1 Score: 0.639
ROUGE-L Score: 0.639
Memory usage (GB): 2.057
Retrieval Latency (sec): 0.002
F1 Score: 0.000
Knowledge Retention: 0.639
Memory Reduction (GB): 0.00
Query Processing Time (sec): 1.724
Accuracy Drop: 0.361
Compression Ratio: 0.64
Accuracy: 0.000
