<a href="https://www.kaggle.com/code/ebadshabbir/phi1-5-arc-test?scriptVersionId=234992074" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install rouge_score evaluate
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch, time, psutil, numpy as np, gc, re, json
import evaluate
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer

# Load model and tokenizer
model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name, trust_remote_code=True,
    torch_dtype=torch.float16, device_map="auto"
)
device = model.device
torch.cuda.empty_cache()

# Load ARC-Challenge dataset
dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test[:500]")

smoother = SmoothingFunction().method4
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

def evaluate_phi15_on_arc(dataset, batch_size=4, max_new_tokens=50):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    process = psutil.Process()
    results = {
        "accuracies": [], "f1_scores": [], "latencies": [], "tokens_per_sec": [], "memory_usage": [],
        "retrieval_latencies": [], "query_times": [], "memory_reductions": [],
        "bleu_scores": [], "rouge1_scores": [], "rougeL_scores": [],
        "knowledge_retentions": [], "accuracy_drops": [], "compression_ratios": []
    }

    initial_memory = process.memory_info().rss / 1024**3

    for i in range(0, len(dataset), batch_size):
        prompts, correct_answers, references = [], [], []

        for idx in range(i, min(i + batch_size, len(dataset))):
            item = dataset[idx]
            question = item["question"]
            choices = item["choices"]
            labels = choices["label"]
            texts = choices["text"]
            answer_key = item["answerKey"]

            prompt = f"Question: {question}\n"
            for label, choice in zip(labels, texts):
                prompt += f"{label}: {choice}\n"
            prompt += "Answer:"
            explanation = f"{question} " + " ".join(texts)

            prompts.append(prompt)
            correct_answers.append(answer_key)
            references.append(explanation)

        start_retrieval = time.time()
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        retrieval_latency = time.time() - start_retrieval
        results["retrieval_latencies"].append(retrieval_latency)

        start = time.time()
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
        latency = time.time() - start
        results["latencies"].append(latency)

        generated_tokens = sum(len(out) - len(inp) for out, inp in zip(outputs, inputs['input_ids']))
        results["tokens_per_sec"].append(generated_tokens / latency if latency > 0 else 0)
        results["query_times"].append(time.time() - start)

        final_memory = process.memory_info().rss / 1024**3
        results["memory_usage"].append(final_memory)
        results["memory_reductions"].append(max(0, initial_memory - final_memory))

        generated_texts = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
        pred_answers = []
        for text in generated_texts:
            match = re.search(r"\b([A-E])\b", text.split("Answer")[-1])
            pred_answers.append(match.group(1).strip().upper() if match else "")

        try:
            accuracy = accuracy_metric.compute(predictions=pred_answers, references=correct_answers)["accuracy"]
            f1 = f1_metric.compute(predictions=pred_answers, references=correct_answers, average="macro")["f1"]
        except:
            accuracy, f1 = 0, 0
        results["accuracies"].append(accuracy)
        results["f1_scores"].append(f1)

        for gen, ref in zip(generated_texts, references):
            gen_words = gen.split()
            ref_words = ref.split()

            try:
                bleu = sentence_bleu([ref_words], gen_words, weights=(0.5, 0.5, 0.0, 0.0), smoothing_function=smoother)
                results["bleu_scores"].append(bleu)
            except:
                results["bleu_scores"].append(0)

            try:
                rouge = scorer.score(ref, gen)
                results["rouge1_scores"].append(rouge['rouge1'].fmeasure)
                results["rougeL_scores"].append(rouge['rougeL'].fmeasure)
                results["knowledge_retentions"].append(rouge['rougeL'].fmeasure)
                results["accuracy_drops"].append(1 - rouge['rouge1'].fmeasure)
            except:
                results["rouge1_scores"].append(0)
                results["rougeL_scores"].append(0)
                results["knowledge_retentions"].append(0)
                results["accuracy_drops"].append(0)

            try:
                input_tokens = len(inputs['input_ids'][0])
                output_tokens = len(outputs[0])
                results["compression_ratios"].append(input_tokens / output_tokens if output_tokens > 0 else 1)
            except:
                results["compression_ratios"].append(1)

        gc.collect()
        torch.cuda.empty_cache()

    # Print full metric summary
    summary = {k: np.mean(v) if v else 0 for k, v in results.items()}
    print(f"Avg latency: {summary['latencies']:.3f} sec")
    print(f"Tokens per sec: {summary['tokens_per_sec']:.2f}")
    print(f"BLEU Score: {summary['bleu_scores']:.3f}")
    print(f"ROUGE-1 Score: {summary['rouge1_scores']:.3f}")
    print(f"ROUGE-L Score: {summary['rougeL_scores']:.3f}")
    print(f"Memory usage (GB): {summary['memory_usage']:.3f}")
    print(f"Retrieval Latency (sec): {summary['retrieval_latencies']:.3f}")
    print(f"F1 Score: {summary['f1_scores']:.3f}")
    print(f"Knowledge Retention: {summary['knowledge_retentions']:.3f}")
    print(f"Memory Reduction (GB): {summary['memory_reductions']:.2f}")
    print(f"Query Processing Time (sec): {summary['query_times']:.3f}")
    print(f"Accuracy Drop: {summary['accuracy_drops']:.3f}")
    print(f"Compression Ratio: {summary['compression_ratios']:.2f}")
    print(f"Accuracy: {summary['accuracies']:.3f}")

    with open("/kaggle/working/arc_phi15_metrics.json", "w") as f:
        json.dump(summary, f, indent=2)

    return summary

# Run evaluation
arc_metrics = evaluate_phi15_on_arc(dataset, batch_size=4, max_new_tokens=50)


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting fsspec>=2021.05.0 (from fsspec[http]>=2021.05.0->evaluate)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.whl (183 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=ef2d5eb2aa64308b12f8ce29c0d4320bf9e26f2fc878f620cdf6094a98706b3f
  Stored i

2025-04-20 09:21:42.305745: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745140902.623138      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745140902.706088      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


tokenizer_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/736 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.84G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/9.00k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


train-00000-of-00001.parquet:   0%|          | 0.00/190k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


test-00000-of-00001.parquet:   0%|          | 0.00/204k [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


validation-00000-of-00001.parquet:   0%|          | 0.00/55.7k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1119 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1172 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/299 [00:00<?, ? examples/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Avg latency: 1.726 sec
Tokens per sec: 116.34
BLEU Score: 0.446
ROUGE-1 Score: 0.639
ROUGE-L Score: 0.639
Memory usage (GB): 2.018
Retrieval Latency (sec): 0.002
F1 Score: 0.000
Knowledge Retention: 0.639
Memory Reduction (GB): 0.00
Query Processing Time (sec): 1.726
Accuracy Drop: 0.361
Compression Ratio: 0.64
Accuracy: 0.000
