In [10]:
!pip install evaluate rouge_score

import time
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM
from evaluate import load as load_metric
import psutil
import os

# Load dataset (first 500 test examples of ARC-Challenge)
dataset = load_dataset("allenai/ai2_arc", "ARC-Challenge", split="test[:500]")

# Load TinyLlama model and tokenizer
model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16, device_map="auto")
model.eval()

# Load evaluation metrics
bleu_metric = load_metric("bleu")
rouge_metric = load_metric("rouge")
# f1_metric = load_metric("f1")  # Uncomment if using a specific f1 implementation

# Accumulators
total_latency = 0
num_tokens = 0
total_perplexity = 0
bleu_scores = []
rouge1_scores = []
rougeL_scores = []
memory_usages = []
retrieval_latencies = []
f1_scores = []
knowledge_retentions = []
query_times = []
accuracy_drops = []
compression_ratios = []
flop_reductions = []
memory_reductions = []

# Evaluation loop
for i, sample in enumerate(dataset):
    question = sample["question"]
    answer = sample["answerKey"]
    input_text = f"Question: {question}\nAnswer:"
    
    # Tokenize inputs
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True).to(model.device)

    memory_before = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 3)
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=64)
    end_time = time.time()
    memory_after = psutil.Process(os.getpid()).memory_info().rss / (1024 ** 3)

    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    latency = end_time - start_time
    total_latency += latency
    query_times.append(latency)

    num_tokens += outputs[0].shape[-1]
    memory_usages.append(memory_after - memory_before)

    # Collect BLEU and ROUGE scores
    bleu = bleu_metric.compute(predictions=[output_text], references=[[answer]])  # References wrapped in a list
    bleu_scores.append(bleu["bleu"])

    rouge = rouge_metric.compute(predictions=[output_text], references=[[answer]])  # References wrapped in a list
    rouge1_scores.append(rouge["rouge1"])  # Direct access to the score, not using fmeasure
    rougeL_scores.append(rouge["rougeL"])  # Direct access to the score, not using fmeasure

    # Accumulate other metrics (placeholders)
    f1_scores.append(0.0)  # Placeholder
    knowledge_retentions.append(0.0)
    accuracy_drops.append(0.0)
    compression_ratios.append(0.0)
    flop_reductions.append(0.0)
    memory_reductions.append(0.0)

# Averages
n = len(dataset)
avg_latency = total_latency / n
avg_tps = num_tokens / total_latency
avg_perplexity = total_perplexity / n if total_perplexity > 0 else 0
avg_bleu = sum(bleu_scores) / n
avg_rouge1 = sum(rouge1_scores) / n
avg_rougeL = sum(rougeL_scores) / n
avg_memory = sum(memory_usages) / n
avg_f1 = sum(f1_scores) / n
avg_knowledge_retention = sum(knowledge_retentions) / n
avg_query_time = sum(query_times) / n
avg_accuracy_drop = sum(accuracy_drops) / n
avg_compression_ratio = sum(compression_ratios) / n
avg_flop_reduction = sum(flop_reductions) / n
avg_memory_reduction = sum(memory_reductions) / n

# Print results
print(f"Avg latency: {avg_latency:.3f} sec")
print(f"Tokens per sec: {avg_tps:.2f}")
print(f"Avg perplexity: {avg_perplexity:.2f}")
print(f"BLEU Score: {avg_bleu:.3f}")
print(f"ROUGE-1 Score: {avg_rouge1:.3f}")
print(f"ROUGE-L Score: {avg_rougeL:.3f}")
print(f"Memory usage (GB): {avg_memory:.3f}")
print(f"FLOP Reduction (%): {avg_flop_reduction:.2f}")
print(f"Retrieval Latency (sec): {avg_query_time:.3f}")
print(f"F1 Score: {avg_f1:.3f}")
print(f"Knowledge Retention: {avg_knowledge_retention:.3f}")
print(f"Memory Reduction (%): {avg_memory_reduction:.2f}")
print(f"Query Processing Time (sec): {avg_query_time:.3f}")
print(f"Accuracy Drop: {avg_accuracy_drop:.3f}")
print(f"Compression Ratio: {avg_compression_ratio:.2f}")


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Avg latency: 1.857 sec
Tokens per sec: 47.60
Avg perplexity: 0.00
BLEU Score: 0.000
ROUGE-1 Score: 0.006
ROUGE-L Score: 0.006
Memory usage (GB): 0.000
FLOP Reduction (%): 0.00
Retrieval Latency (sec): 1.857
F1 Score: 0.000
Knowledge Retention: 0.000
Memory Reduction (%): 0.00
Query Processing Time (sec): 1.857
Accuracy Drop: 0.000
Compression Ratio: 0.00
