In [13]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch, time, psutil, numpy as np, gc, re, json
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge_score import rouge_scorer
import evaluate

# Load model and tokenizer
model_name = "microsoft/phi-1_5"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.padding_side = "left"
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    model_name, trust_remote_code=True,
    torch_dtype=torch.float16, device_map="auto"
)
device = model.device
torch.cuda.empty_cache()

# Load dataset
dataset = load_dataset("gsm8k", "main", split="test[:500]")

# BLEU smoothing
smoother = SmoothingFunction().method4

def extract_answer(text):
    match = re.search(r"(?:Answer:)?\s*([-]?\d*\.?\d+|\d+/\d+|[-]?\d+|\d+\s*\w+)", text)
    return match.group(1) if match else text.strip()

def evaluate_phi15_on_gsm8k(dataset, batch_size=8, max_new_tokens=50):
    accuracy_metric = evaluate.load("accuracy")
    f1_metric = evaluate.load("f1")
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)
    process = psutil.Process()

    results = {k: [] for k in [
        "accuracies", "f1_scores", "latencies", "tokens_per_sec", "memory_usage",
        "perplexities", "bleu_scores", "rouge1_scores", "rougeL_scores",
        "retrieval_latencies", "memory_reductions", "query_times",
        "accuracy_drops", "compression_ratios", "knowledge_retentions"
    ]}

    initial_memory = process.memory_info().rss / 1024**3

    for i in range(0, len(dataset), batch_size):
        batch = dataset[i:i+batch_size]
        prompts = [f"Solve: {q}\nAnswer: " for q in batch["question"]]
        references = [str(a).split("#### ")[-1].strip() for a in batch["answer"]]
        ref_explanations = [str(a).split("#### ")[0].strip() for a in batch["answer"]]

        start_retrieval = time.time()
        inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=512)
        inputs = {k: v.to(model.device) for k, v in inputs.items()}
        retrieval_latency = time.time() - start_retrieval
        results["retrieval_latencies"].append(retrieval_latency)

        start = time.time()
        with torch.no_grad():
            outputs = model.generate(**inputs, max_new_tokens=max_new_tokens, pad_token_id=tokenizer.eos_token_id)
        latency = time.time() - start
        results["latencies"].append(latency)

        generated_tokens = sum(len(out) - len(inp) for out, inp in zip(outputs, inputs['input_ids']))
        results["tokens_per_sec"].append(generated_tokens / latency if latency > 0 else 0)
        results["query_times"].append(time.time() - start)

        final_memory = process.memory_info().rss / 1024**3
        results["memory_usage"].append(final_memory)
        results["memory_reductions"].append(max(0, initial_memory - final_memory))

        generated_texts = [tokenizer.decode(out, skip_special_tokens=True) for out in outputs]
        pred_answers = [extract_answer(text) for text in generated_texts]

        try:
            accuracy = accuracy_metric.compute(predictions=pred_answers, references=references)["accuracy"]
            f1 = f1_metric.compute(predictions=pred_answers, references=references, average="macro")["f1"]
        except:
            accuracy, f1 = 0, 0
        results["accuracies"].append(accuracy)
        results["f1_scores"].append(f1)

        for gen, ref_exp, ref_ans in zip(generated_texts, ref_explanations, references):
            gen_words = gen.split()
            ref_words = ref_exp.split() if ref_exp.strip() else ref_ans.split()

            try:
                bleu = sentence_bleu([ref_words], gen_words, weights=(0.5, 0.5, 0.0, 0.0), smoothing_function=smoother)
                results["bleu_scores"].append(bleu)
            except:
                results["bleu_scores"].append(0)

            try:
                rouge = scorer.score(ref_exp if ref_exp.strip() else ref_ans, gen)
                results["rouge1_scores"].append(rouge['rouge1'].fmeasure)
                results["rougeL_scores"].append(rouge['rougeL'].fmeasure)
                results["knowledge_retentions"].append(rouge['rougeL'].fmeasure)
                results["accuracy_drops"].append(1 - rouge['rouge1'].fmeasure)
            except:
                results["rouge1_scores"].append(0)
                results["rougeL_scores"].append(0)
                results["knowledge_retentions"].append(0)
                results["accuracy_drops"].append(0)

            try:
                input_tokens = len(inputs['input_ids'][0])
                output_tokens = len(outputs[0])
                results["compression_ratios"].append(input_tokens / output_tokens if output_tokens > 0 else 1)
            except:
                results["compression_ratios"].append(1)

        gc.collect()
        torch.cuda.empty_cache()

    # Final metrics
    summary = {k: np.mean(v) if v else 0 for k, v in results.items()}
    for k, v in summary.items():
        print(f"{k.replace('_', ' ').title()}: {v:.3f}")

    with open("/kaggle/working/gsm8k_phi15_results.json", "w") as f:
        json.dump(summary, f, indent=2)

    return summary

# Run evaluation
gsm8k_metrics = evaluate_phi15_on_gsm8k(dataset, batch_size=8, max_new_tokens=50)


Accuracies: 0.012
F1 Scores: 0.007
Latencies: 1.755
Tokens Per Sec: 226.146
Memory Usage: 1.826
Perplexities: 0.000
Bleu Scores: 0.161
Rouge1 Scores: 0.397
Rougel Scores: 0.259
Retrieval Latencies: 0.002
Memory Reductions: 0.000
Query Times: 1.755
Accuracy Drops: 0.603
Compression Ratios: 0.651
Knowledge Retentions: 0.259
