In [2]:
# Install dependencies
!pip install transformers datasets torch nltk rouge_score psutil gpustat

# Set environment variable to reduce memory fragmentation
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

# Hugging Face login
from huggingface_hub import login
login("HUUGING FACE CODE")  # Replace with your actual Hugging Face token

import time
import torch
import psutil
import gpustat
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from nltk.translate.bleu_score import sentence_bleu
from rouge_score import rouge_scorer
import numpy as np
from tqdm import tqdm

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-3B")
model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.2-3B",
    device_map="auto",  # Use "0" for single GPU if OOM persists
    torch_dtype=torch.float16
)
model.eval()

# Load GSM8K dataset (first 100 samples)
dataset = load_dataset("gsm8k", "main")["train"].select(range(100))

# Initialize metrics
latencies, tokens_per_sec, perplexities, bleus, rouge1s, rougeLs, memories, f1s = [], [], [], [], [], [], [], []
scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

# Evaluation loop
for sample in tqdm(dataset, desc="Evaluating"):
    question = sample["question"]
    reference = sample["answer"].split("#### ")[-1].strip()
    prompt = f"Solve this math problem: {question}\nProvide the final answer as a number or concise phrase."

    # Tokenize input
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    input_length = inputs.input_ids.size(1)

    # Clear GPU memory
    torch.cuda.empty_cache()

    # Measure latency and generate
    start_time = time.time()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=20,  # Reduced to minimize memory
            do_sample=False
        )
    latency = time.time() - start_time

    # Decode output
    generated = tokenizer.decode(outputs[0][input_length:], skip_special_tokens=True).strip()
    output_length = outputs[0][input_length:].size(0)

    # Latency and tokens per second
    latencies.append(latency)
    tokens_per_sec.append(output_length / latency if latency > 0 else 0)

    # Perplexity (forward pass, skip for very short sequences)
    if output_length > 2:
        with torch.no_grad():
            input_ids = outputs[:, input_length:].to("cuda")
            labels = input_ids.clone().to("cuda")
            outputs_forward = model(input_ids, labels=labels)
            perplexity = torch.exp(outputs_forward.loss).item()
        perplexities.append(perplexity)
    else:
        perplexities.append(float('inf'))

    # BLEU and ROUGE
    bleu = sentence_bleu([reference.split()], generated.split())
    bleus.append(bleu)
    rouge_scores = scorer.score(reference, generated)
    rouge1s.append(rouge_scores['rouge1'].fmeasure)
    rougeLs.append(rouge_scores['rougeL'].fmeasure)

    # Memory usage (GPU)
    gpu_stats = gpustat.new_query().gpus[0]
    memory_used = gpu_stats.memory_used / 1024  # Convert MB to GB
    memories.append(memory_used)

    # F1 Score (binary: correct or not)
    is_correct = generated == reference
    f1 = 1.0 if is_correct else 0.0
    f1s.append(f1)

    # Log memory usage for debugging
    print(f"Sample {len(latencies)}: GPU Memory Used: {memory_used:.3f} GB")

# Compute averages
avg_latency = np.mean(latencies)
avg_tps = np.mean(tokens_per_sec)
avg_perplexity = np.mean([p for p in perplexities if p != float('inf')]) if perplexities else float('inf')
avg_bleu = np.mean(bleus)
avg_rouge1 = np.mean(rouge1s)
avg_rougeL = np.mean(rougeLs)
avg_memory = np.mean(memories)
avg_f1 = np.mean(f1s)
avg_knowledge_retention = avg_f1
avg_flop_reduction = 0.0
avg_retrieval_latency = 0.0
avg_memory_reduction = 0.0
avg_query_time = avg_latency
avg_accuracy_drop = 0.0
avg_compression_ratio = 1.0

# Print results
print(f"Avg latency: {avg_latency:.3f} sec")
print(f"Tokens per sec: {avg_tps:.2f}")
print(f"Avg perplexity: {avg_perplexity:.2f}")
print(f"BLEU Score: {avg_bleu:.3f}")
print(f"ROUGE-1 Score: {avg_rouge1:.3f}")
print(f"ROUGE-L Score: {avg_rougeL:.3f}")
print(f"Memory usage (GB): {avg_memory:.3f}")
print(f"FLOP Reduction (%): {avg_flop_reduction:.2f}")
print(f"Retrieval Latency (sec): {avg_retrieval_latency:.3f}")
print(f"F1 Score: {avg_f1:.3f}")
print(f"Knowledge Retention: {avg_knowledge_retention:.3f}")
print(f"Memory Reduction (%): {avg_memory_reduction:.2f}")
print(f"Query Processing Time (sec): {avg_query_time:.3f}")
print(f"Accuracy Drop: {avg_accuracy_drop:.3f}")
print(f"Compression Ratio: {avg_compression_ratio:.2f}")



tokenizer_config.json:   0%|          | 0.00/50.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/301 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/844 [00:00<?, ?B/s]

2025-04-21 18:03:45.483793: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745258625.756047      31 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745258625.834301      31 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/185 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/7.94k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/2.31M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/419k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1319 [00:00<?, ? examples/s]

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Evaluating:   1%|          | 1/100 [00:01<01:41,  1.02s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 1: GPU Memory Used: 3.143 GB


Evaluating:   2%|▏         | 2/100 [00:01<00:56,  1.72it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 2: GPU Memory Used: 3.143 GB


Evaluating:   3%|▎         | 3/100 [00:01<00:43,  2.23it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 3: GPU Memory Used: 3.143 GB


Evaluating:   4%|▍         | 4/100 [00:01<00:36,  2.61it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 4: GPU Memory Used: 3.143 GB


Evaluating:   5%|▌         | 5/100 [00:03<01:07,  1.40it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 5: GPU Memory Used: 3.160 GB


Evaluating:   6%|▌         | 6/100 [00:03<00:53,  1.76it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 6: GPU Memory Used: 3.145 GB


Evaluating:   7%|▋         | 7/100 [00:03<00:44,  2.09it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 7: GPU Memory Used: 3.145 GB


Evaluating:   8%|▊         | 8/100 [00:04<01:05,  1.40it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 8: GPU Memory Used: 3.170 GB


Evaluating:   9%|▉         | 9/100 [00:05<00:53,  1.72it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 9: GPU Memory Used: 3.148 GB


Evaluating:  10%|█         | 10/100 [00:06<01:09,  1.29it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 10: GPU Memory Used: 3.164 GB


Evaluating:  11%|█         | 11/100 [00:06<00:55,  1.60it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 11: GPU Memory Used: 3.146 GB


Evaluating:  12%|█▏        | 12/100 [00:07<00:46,  1.91it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 12: GPU Memory Used: 3.148 GB


Evaluating:  13%|█▎        | 13/100 [00:07<00:38,  2.24it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 13: GPU Memory Used: 3.145 GB


Evaluating:  14%|█▍        | 14/100 [00:07<00:34,  2.52it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 14: GPU Memory Used: 3.146 GB


Evaluating:  15%|█▌        | 15/100 [00:08<00:53,  1.58it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 15: GPU Memory Used: 3.162 GB


Evaluating:  16%|█▌        | 16/100 [00:10<01:08,  1.23it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 16: GPU Memory Used: 3.166 GB


Evaluating:  17%|█▋        | 17/100 [00:10<00:54,  1.53it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 17: GPU Memory Used: 3.145 GB


Evaluating:  18%|█▊        | 18/100 [00:10<00:44,  1.85it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 18: GPU Memory Used: 3.146 GB


Evaluating:  19%|█▉        | 19/100 [00:10<00:37,  2.15it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 19: GPU Memory Used: 3.146 GB


Evaluating:  20%|██        | 20/100 [00:12<00:55,  1.45it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 20: GPU Memory Used: 3.164 GB


Evaluating:  21%|██        | 21/100 [00:12<00:45,  1.75it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 21: GPU Memory Used: 3.146 GB


Evaluating:  22%|██▏       | 22/100 [00:12<00:37,  2.07it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 22: GPU Memory Used: 3.145 GB


Evaluating:  23%|██▎       | 23/100 [00:12<00:32,  2.36it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 23: GPU Memory Used: 3.145 GB


Evaluating:  24%|██▍       | 24/100 [00:13<00:29,  2.61it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 24: GPU Memory Used: 3.145 GB


Evaluating:  25%|██▌       | 25/100 [00:13<00:26,  2.83it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 25: GPU Memory Used: 3.146 GB


Evaluating:  26%|██▌       | 26/100 [00:13<00:24,  2.99it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 26: GPU Memory Used: 3.148 GB


Evaluating:  27%|██▋       | 27/100 [00:14<00:23,  3.12it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 27: GPU Memory Used: 3.145 GB


Evaluating:  28%|██▊       | 28/100 [00:14<00:22,  3.21it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 28: GPU Memory Used: 3.148 GB


Evaluating:  29%|██▉       | 29/100 [00:14<00:21,  3.29it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 29: GPU Memory Used: 3.145 GB


Evaluating:  30%|███       | 30/100 [00:14<00:20,  3.34it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 30: GPU Memory Used: 3.145 GB


Evaluating:  31%|███       | 31/100 [00:15<00:20,  3.39it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 31: GPU Memory Used: 3.145 GB


Evaluating:  32%|███▏      | 32/100 [00:15<00:19,  3.42it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 32: GPU Memory Used: 3.146 GB


Evaluating:  33%|███▎      | 33/100 [00:15<00:19,  3.35it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 33: GPU Memory Used: 3.150 GB


Evaluating:  34%|███▍      | 34/100 [00:16<00:19,  3.38it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 34: GPU Memory Used: 3.146 GB


Evaluating:  35%|███▌      | 35/100 [00:16<00:18,  3.46it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 35: GPU Memory Used: 3.145 GB


Evaluating:  36%|███▌      | 36/100 [00:16<00:18,  3.48it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 36: GPU Memory Used: 3.145 GB


Evaluating:  37%|███▋      | 37/100 [00:16<00:17,  3.54it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 37: GPU Memory Used: 3.146 GB


Evaluating:  38%|███▊      | 38/100 [00:17<00:17,  3.52it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 38: GPU Memory Used: 3.146 GB


Evaluating:  39%|███▉      | 39/100 [00:17<00:17,  3.51it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 39: GPU Memory Used: 3.146 GB


Evaluating:  40%|████      | 40/100 [00:17<00:17,  3.49it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 40: GPU Memory Used: 3.146 GB


Evaluating:  41%|████      | 41/100 [00:18<00:16,  3.56it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 41: GPU Memory Used: 3.145 GB


Evaluating:  42%|████▏     | 42/100 [00:18<00:16,  3.53it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 42: GPU Memory Used: 3.146 GB


Evaluating:  43%|████▎     | 43/100 [00:18<00:16,  3.52it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 43: GPU Memory Used: 3.146 GB


Evaluating:  44%|████▍     | 44/100 [00:18<00:15,  3.51it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 44: GPU Memory Used: 3.145 GB


Evaluating:  45%|████▌     | 45/100 [00:19<00:15,  3.51it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 45: GPU Memory Used: 3.146 GB


Evaluating:  46%|████▌     | 46/100 [00:19<00:15,  3.49it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 46: GPU Memory Used: 3.148 GB


Evaluating:  47%|████▋     | 47/100 [00:19<00:14,  3.55it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 47: GPU Memory Used: 3.145 GB


Evaluating:  48%|████▊     | 48/100 [00:20<00:14,  3.54it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 48: GPU Memory Used: 3.145 GB


Evaluating:  49%|████▉     | 49/100 [00:20<00:14,  3.52it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 49: GPU Memory Used: 3.145 GB


Evaluating:  50%|█████     | 50/100 [00:20<00:14,  3.52it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 50: GPU Memory Used: 3.145 GB


Evaluating:  51%|█████     | 51/100 [00:20<00:14,  3.50it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 51: GPU Memory Used: 3.146 GB


Evaluating:  52%|█████▏    | 52/100 [00:21<00:13,  3.48it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 52: GPU Memory Used: 3.146 GB


Evaluating:  53%|█████▎    | 53/100 [00:21<00:13,  3.47it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 53: GPU Memory Used: 3.145 GB


Evaluating:  54%|█████▍    | 54/100 [00:21<00:13,  3.48it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 54: GPU Memory Used: 3.145 GB


Evaluating:  55%|█████▌    | 55/100 [00:22<00:12,  3.48it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 55: GPU Memory Used: 3.146 GB


Evaluating:  56%|█████▌    | 56/100 [00:22<00:12,  3.54it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 56: GPU Memory Used: 3.145 GB


Evaluating:  57%|█████▋    | 57/100 [00:22<00:12,  3.53it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 57: GPU Memory Used: 3.145 GB


Evaluating:  58%|█████▊    | 58/100 [00:22<00:11,  3.50it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 58: GPU Memory Used: 3.145 GB


Evaluating:  59%|█████▉    | 59/100 [00:23<00:11,  3.49it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 59: GPU Memory Used: 3.146 GB


Evaluating:  60%|██████    | 60/100 [00:23<00:11,  3.46it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 60: GPU Memory Used: 3.148 GB


Evaluating:  61%|██████    | 61/100 [00:23<00:11,  3.47it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 61: GPU Memory Used: 3.145 GB


Evaluating:  62%|██████▏   | 62/100 [00:24<00:11,  3.45it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 62: GPU Memory Used: 3.150 GB


Evaluating:  63%|██████▎   | 63/100 [00:24<00:10,  3.53it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 63: GPU Memory Used: 3.145 GB


Evaluating:  64%|██████▍   | 64/100 [00:24<00:10,  3.51it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 64: GPU Memory Used: 3.148 GB


Evaluating:  65%|██████▌   | 65/100 [00:25<00:16,  2.15it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 65: GPU Memory Used: 3.164 GB


Evaluating:  66%|██████▌   | 66/100 [00:25<00:13,  2.45it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 66: GPU Memory Used: 3.145 GB


Evaluating:  67%|██████▋   | 67/100 [00:26<00:12,  2.70it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 67: GPU Memory Used: 3.146 GB


Evaluating:  68%|██████▊   | 68/100 [00:26<00:11,  2.90it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 68: GPU Memory Used: 3.146 GB


Evaluating:  69%|██████▉   | 69/100 [00:26<00:10,  3.05it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 69: GPU Memory Used: 3.145 GB


Evaluating:  70%|███████   | 70/100 [00:26<00:09,  3.22it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 70: GPU Memory Used: 3.143 GB


Evaluating:  71%|███████   | 71/100 [00:27<00:08,  3.37it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 71: GPU Memory Used: 3.143 GB


Evaluating:  72%|███████▏  | 72/100 [00:27<00:08,  3.40it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 72: GPU Memory Used: 3.146 GB


Evaluating:  73%|███████▎  | 73/100 [00:27<00:07,  3.42it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 73: GPU Memory Used: 3.148 GB


Evaluating:  74%|███████▍  | 74/100 [00:28<00:07,  3.44it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 74: GPU Memory Used: 3.146 GB


Evaluating:  75%|███████▌  | 75/100 [00:28<00:07,  3.46it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 75: GPU Memory Used: 3.145 GB


Evaluating:  76%|███████▌  | 76/100 [00:28<00:06,  3.47it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 76: GPU Memory Used: 3.145 GB


Evaluating:  77%|███████▋  | 77/100 [00:28<00:06,  3.54it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 77: GPU Memory Used: 3.143 GB


Evaluating:  78%|███████▊  | 78/100 [00:29<00:06,  3.54it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 78: GPU Memory Used: 3.145 GB


Evaluating:  79%|███████▉  | 79/100 [00:29<00:05,  3.54it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 79: GPU Memory Used: 3.145 GB


Evaluating:  80%|████████  | 80/100 [00:30<00:11,  1.78it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 80: GPU Memory Used: 3.162 GB


Evaluating:  81%|████████  | 81/100 [00:30<00:09,  2.08it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 81: GPU Memory Used: 3.145 GB


Evaluating:  82%|████████▏ | 82/100 [00:31<00:07,  2.36it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 82: GPU Memory Used: 3.148 GB


Evaluating:  83%|████████▎ | 83/100 [00:31<00:06,  2.65it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 83: GPU Memory Used: 3.145 GB


Evaluating:  84%|████████▍ | 84/100 [00:31<00:05,  2.86it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 84: GPU Memory Used: 3.145 GB


Evaluating:  85%|████████▌ | 85/100 [00:32<00:04,  3.03it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 85: GPU Memory Used: 3.145 GB


Evaluating:  86%|████████▌ | 86/100 [00:32<00:04,  3.15it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 86: GPU Memory Used: 3.146 GB


Evaluating:  87%|████████▋ | 87/100 [00:32<00:03,  3.30it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 87: GPU Memory Used: 3.145 GB


Evaluating:  88%|████████▊ | 88/100 [00:33<00:05,  2.28it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 88: GPU Memory Used: 3.164 GB


Evaluating:  89%|████████▉ | 89/100 [00:33<00:04,  2.57it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 89: GPU Memory Used: 3.145 GB


Evaluating:  90%|█████████ | 90/100 [00:33<00:03,  2.79it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 90: GPU Memory Used: 3.145 GB


Evaluating:  91%|█████████ | 91/100 [00:34<00:04,  2.04it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 91: GPU Memory Used: 3.145 GB


Evaluating:  92%|█████████▏| 92/100 [00:36<00:05,  1.39it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 92: GPU Memory Used: 3.166 GB


Evaluating:  93%|█████████▎| 93/100 [00:36<00:04,  1.69it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 93: GPU Memory Used: 3.146 GB


Evaluating:  94%|█████████▍| 94/100 [00:37<00:03,  1.58it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 94: GPU Memory Used: 3.164 GB


Evaluating:  95%|█████████▌| 95/100 [00:38<00:04,  1.23it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 95: GPU Memory Used: 3.162 GB


Evaluating:  96%|█████████▌| 96/100 [00:38<00:02,  1.53it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 96: GPU Memory Used: 3.145 GB


Evaluating:  97%|█████████▋| 97/100 [00:39<00:02,  1.22it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 97: GPU Memory Used: 3.166 GB


Evaluating:  98%|█████████▊| 98/100 [00:40<00:01,  1.51it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 98: GPU Memory Used: 3.150 GB


Evaluating:  99%|█████████▉| 99/100 [00:41<00:00,  1.20it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Sample 99: GPU Memory Used: 3.166 GB


Evaluating: 100%|██████████| 100/100 [00:41<00:00,  2.40it/s]

Sample 100: GPU Memory Used: 3.148 GB
Avg latency: 0.194 sec
Tokens per sec: 15.20
Avg perplexity: 30.51
BLEU Score: 0.000
ROUGE-1 Score: 0.003
ROUGE-L Score: 0.003
Memory usage (GB): 3.148
FLOP Reduction (%): 0.00
Retrieval Latency (sec): 0.000
F1 Score: 0.000
Knowledge Retention: 0.000
Memory Reduction (%): 0.00
Query Processing Time (sec): 0.194
Accuracy Drop: 0.000
Compression Ratio: 1.00



