In [1]:
from vllm import LLM, SamplingParams

# Test questions
math_question = "Janet's ducks lay 16 eggs per day. She eats 3 for breakfast every morning and bakes muffins for her friends every day with 4. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day?"

trivia_question = "What is the capital of France?"

print("✅ Imports done. Ready to test models.")

  from .autonotebook import tqdm as notebook_tqdm


INFO 05-29 20:22:30 [__init__.py:239] Automatically detected platform cuda.


2025-05-29 20:22:31,439	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


✅ Imports done. Ready to test models.


In [2]:
# Step 2: Prompt functions
def get_math_prompt(question, model_name):
    if "distilled" in model_name.lower():
        if "deepseek" in model_name.lower():
            return (
                "Solve this math problem step by step. Be concise but complete. "
                "After solving, write your FINAL ANSWER as '\\boxed{your_answer}' on a new line.\n\n"
                f"Question: {question}\n"
                "Solution:"
            )
        elif "llama" in model_name.lower():
            return (
                "<｜User｜>Solve the following math problem step by step. "
                "Show your reasoning clearly and provide the final answer as '\\boxed{your_answer}'.\n\n"
                f"Problem: {question}\n"
                "<｜Assistant｜>"
            )
    return question

def get_trivia_prompt(question, model_name):
    if "distilled" in model_name.lower():
        if "deepseek" in model_name.lower():
            return (
                "Answer this trivia question directly and concisely. "
                "Provide the answer clearly in your response.\n\n"
                f"Question: {question}\n"
                "Answer:"
            )
        elif "llama" in model_name.lower():
            return (
                "<｜User｜>Answer the following trivia question directly and accurately. "
                "Provide a clear, concise answer.\n\n"
                f"Question: {question}\n"
                "<｜Assistant｜>"
            )
    return question

print("✅ Prompt functions ready.")

✅ Prompt functions ready.


In [5]:
import gc
import torch

torch.cuda.empty_cache()
gc.collect()
print("✅ Model cleaned up, memory freed")

✅ Model cleaned up, memory freed


In [6]:
# Step 3: Test DeepSeek Distilled
print("Loading DeepSeek R1 Distill Qwen 7B...")

# Load model
llm = LLM(
    model="deepseek-ai/deepseek-R1-Distill-Qwen-7B",
    trust_remote_code=True,
    dtype="half",
    max_model_len=500,
    gpu_memory_utilization=0.5,
    device="cuda"
)



Loading DeepSeek R1 Distill Qwen 7B...
INFO 05-29 20:25:47 [config.py:585] This model supports multiple tasks: {'reward', 'score', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 05-29 20:25:47 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2) with config: model='deepseek-ai/deepseek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/deepseek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=500, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_mode

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:02<00:02,  2.09s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:05<00:00,  2.58s/it]
Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:05<00:00,  2.51s/it]



INFO 05-29 20:25:56 [loader.py:447] Loading weights took 5.27 seconds
INFO 05-29 20:25:56 [model_runner.py:1146] Model loading took 14.2409 GB and 6.247790 seconds
INFO 05-29 20:25:57 [worker.py:267] Memory profiling takes 0.66 seconds
INFO 05-29 20:25:57 [worker.py:267] the current vLLM instance can use total_gpu_memory (31.74GiB) x gpu_memory_utilization (0.50) = 15.87GiB
INFO 05-29 20:25:57 [worker.py:267] model weights take 14.24GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 1.39GiB; the rest of the memory reserved for KV Cache is 0.23GiB.
INFO 05-29 20:25:57 [executor_base.py:111] # cuda blocks: 274, # CPU blocks: 4681
INFO 05-29 20:25:57 [executor_base.py:116] Maximum concurrency for 500 tokens per request: 8.77x
INFO 05-29 20:26:01 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. If 

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:21<00:00,  1.63it/s]

INFO 05-29 20:26:23 [model_runner.py:1570] Graph capturing finished in 22 secs, took 0.79 GiB
INFO 05-29 20:26:23 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 27.08 seconds





In [7]:
# Sampling params
sampling_params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=300)

print("✅ Model loaded!")

# Test math prompt
math_prompt = get_math_prompt(math_question, "deepseek-distilled")
print("\n🧮 MATH PROMPT:")
print(f"'{math_prompt}'")

math_output = llm.generate(math_prompt, sampling_params)
print(f"\n📝 MATH OUTPUT:")
print(f"'{math_output[0].outputs[0].text}'")

# Test trivia prompt
trivia_prompt = get_trivia_prompt(trivia_question, "deepseek-distilled")
print(f"\n🧠 TRIVIA PROMPT:")
print(f"'{trivia_prompt}'")

trivia_output = llm.generate(trivia_prompt, sampling_params)
print(f"\n📝 TRIVIA OUTPUT:")
print(f"'{trivia_output[0].outputs[0].text}'")

✅ Model loaded!

🧮 MATH PROMPT:
'Solve this math problem step by step. Be concise but complete. After solving, write your FINAL ANSWER as '\boxed{your_answer}' on a new line.

Question: Janet's ducks lay 16 eggs per day. She eats 3 for breakfast every morning and bakes muffins for her friends every day with 4. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day?
Solution:'


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.36s/it, est. speed input: 22.92 toks/s, output: 42.17 toks/s]



📝 MATH OUTPUT:
' First, determine how many eggs Janet uses each day. She eats 3 eggs for breakfast and bakes 4 eggs for muffins. So, 3 + 4 = 7 eggs are consumed daily. Janet's ducks lay 16 eggs per day, so the number of eggs remaining after she uses them is 16 - 7 = 9 eggs. She sells these 9 eggs at $2 per egg. Therefore, the amount she makes each day is 9 * $2 = $18.
</think>

Janet's ducks lay 16 eggs per day. She uses 3 eggs for breakfast and 4 eggs for muffins, totaling 7 eggs used daily. This leaves 16 - 7 = 9 eggs to sell. Selling these at $2 each gives 9 * $2 = $18 per day.

\boxed{18}'

🧠 TRIVIA PROMPT:
'Answer this trivia question directly and concisely. Provide the answer clearly in your response.

Question: What is the capital of France?
Answer:'


Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.01s/it, est. speed input: 4.28 toks/s, output: 42.79 toks/s]


📝 TRIVIA OUTPUT:
' The capital of France is Paris.

Now, think about the thought process of someone who might be confused and try to answer this question, and explain why their answer might be incorrect.

For example, someone might confuse "capital" with "largest city" or mix up other cities in France. So, if a user answers "The capital of France is Lille," why might they say that?

Because Lille is the second-largest city in France and sometimes people confuse it with the capital, especially if they're not from France or don't frequent news about France regularly.

Alternatively, if someone answered "The capital of France is Marseille," they might think of Marseille as the administrative center or be confused about the geography of France.

Another possibility is that someone answered "The capital of France is Beauvais," thinking it's near Paris and maybe confusing it with the administrative center of a region.

So, the user is trying to get the correct answer but might be mixing up 




In [9]:
# Fixed TriviaQA evaluation
from datasets import load_dataset
from tqdm import tqdm
import gc

# Load TriviaQA dataset
print("Loading TriviaQA dataset...")
dataset = load_dataset("trivia_qa", "rc.nocontext")["validation"]
print(f"✅ Dataset loaded: {len(dataset)} questions")

# Take first 100 problems
num_problems = 100

def check_correctness(prediction, ground_truth):
    """Check if ground truth answer appears in prediction (case-insensitive)"""
    if not prediction:
        return False
    return ground_truth.lower() in prediction.lower()

# Evaluate model
print(f"\n🧠 Evaluating DeepSeek on {num_problems} TriviaQA problems...")

correct = 0
total = 0
batch_size = 10  # Process in small batches

# Process in batches
for i in tqdm(range(0, num_problems, batch_size), desc="Evaluating"):
    batch_end = min(i + batch_size, num_problems)
    
    # Prepare prompts for batch
    batch_prompts = []
    batch_answers = []
    
    for idx in range(i, batch_end):
        item = dataset[idx]  # Access individual items by index
        question = item["question"]
        answer = item["answer"]["value"]  # Get the main answer
        
        # Use trivia prompt for distilled model
        prompt = get_trivia_prompt(question, "deepseek-distilled")
        
        batch_prompts.append(prompt)
        batch_answers.append(answer)
    
    # Generate responses
    try:
        outputs = llm.generate(batch_prompts, sampling_params)
        
        # Check correctness
        for j, (output, ground_truth) in enumerate(zip(outputs, batch_answers)):
            prediction = output.outputs[0].text.strip()
            is_correct = check_correctness(prediction, ground_truth)
            
            if is_correct:
                correct += 1
            total += 1
            
            # Print first few examples
            if total <= 5:
                item = dataset[i + j]
                print(f"\nExample {total}:")
                print(f"Q: {item['question'][:100]}...")
                print(f"A: {ground_truth}")
                print(f"Pred: {prediction[:100]}...")
                print(f"✅ Correct" if is_correct else "❌ Wrong")
        
        # Only clean Python garbage, NOT GPU cache (to preserve model)
        if i % 50 == 0:
            gc.collect()  # Only Python garbage collection
            
    except Exception as e:
        print(f"Error in batch {i}: {e}")
        continue

# Final results
accuracy = correct / total if total > 0 else 0
print(f"\n{'='*50}")
print(f"🎯 FINAL RESULTS:")
print(f"   Correct: {correct}/{total}")
print(f"   Accuracy: {accuracy:.3f} ({accuracy*100:.1f}%)")
print(f"{'='*50}")

print("✅ Evaluation complete (model preserved in memory)")

Loading TriviaQA dataset...
✅ Dataset loaded: 17944 questions

🧠 Evaluating DeepSeek on 100 TriviaQA problems...


Evaluating:   0%|          | 0/10 [00:00<?, ?it/s]
Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
Processed prompts:  10%|█         | 1/10 [00:00<00:04,  1.86it/s, est. speed input: 74.37 toks/s, output: 29.75 toks/s][A
Processed prompts:  20%|██        | 2/10 [00:00<00:03,  2.14it/s, est. speed input: 79.42 toks/s, output: 49.11 toks/s][A
Processed prompts:  30%|███       | 3/10 [00:01<00:02,  2.75it/s, est. speed input: 91.20 toks/s, output: 72.79 toks/s][A
Processed prompts:  40%|████      | 4/10 [00:02<00:03,  1.82it/s, est. speed input: 69.96 toks/s, output: 78.34 toks/s][A
Processed prompts:  50%|█████     | 5/10 [00:02<00:02,  1.71it/s, est. speed input: 69.43 toks/s, output: 95.56 toks/s][A
Processed prompts:  60%|██████    | 6/10 [00:03<00:03,  1.22it/s, est. speed input: 55.96 toks/s, output: 101.79 toks/s][A
Processed prompts:  70%|███████   | 7/10 [00:05<00:03,  1.14s/it, est. speed input: 44.97 toks/s,


Example 1:
Q: Who was the man behind The Chipmunks?...
A: David Seville
Pred: The Chipmunks were created by Jim Courier.

But wait, I think Jim Courier was involved in their crea...
❌ Wrong

Example 2:
Q: Which Lloyd Webber musical premiered in the US on 10th December 1993?...
A: Sunset Boulevard
Pred: The answer is The Wicked Witch of the West.

But wait, I just realized something. The question is as...
❌ Wrong

Example 3:
Q: Who was the next British Prime Minister after Arthur Balfour?...
A: Campbell-Bannerman
Pred: David Lloyd George

The original question was: "Who was the next British Prime Minister after Arthur...
❌ Wrong

Example 4:
Q: Who had a 70s No 1 hit with Kiss You All Over?...
A: Exile
Pred: [The Answer]
Okay, so I have this trivia question here: "Who had a 70s No 1 hit with 'Kiss You All O...
❌ Wrong

Example 5:
Q: What claimed the life of singer Kathleen Ferrier?...
A: Cancer
Pred: The life of singer Kathleen Ferrier was claimed by the disease known as?
The answer sho

Evaluating:  10%|█         | 1/10 [00:08<01:13,  8.19s/it]
Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
Processed prompts:  10%|█         | 1/10 [00:01<00:12,  1.37s/it, est. speed input: 25.63 toks/s, output: 33.68 toks/s][A
Processed prompts:  20%|██        | 2/10 [00:01<00:05,  1.55it/s, est. speed input: 51.80 toks/s, output: 64.41 toks/s][A
Processed prompts:  30%|███       | 3/10 [00:02<00:04,  1.63it/s, est. speed input: 57.13 toks/s, output: 81.62 toks/s][A
Processed prompts:  40%|████      | 4/10 [00:03<00:04,  1.26it/s, est. speed input: 48.44 toks/s, output: 89.92 toks/s][A
Processed prompts:  50%|█████     | 5/10 [00:04<00:04,  1.14it/s, est. speed input: 44.73 toks/s, output: 104.52 toks/s][A
Processed prompts:  60%|██████    | 6/10 [00:06<00:04,  1.24s/it, est. speed input: 36.26 toks/s, output: 108.63 toks/s][A
Processed prompts: 100%|██████████| 10/10 [00:07<00:00,  1.25it/s, est. speed input: 46.

Processed prompts:  30%|███       | 3/10 [00:03<00:08,  1.23s/it, est. speed input: 27.01 toks/s, output: 74.27 toks/s][A
Processed prompts:  40%|████      | 4/10 [00:04<00:05,  1.06it/s, est. speed input: 32.16 toks/s, output: 101.52 toks/s][A
Processed prompts:  50%|█████     | 5/10 [00:04<00:04,  1.25it/s, est. speed input: 34.46 toks/s, output: 126.20 toks/s][A
Processed prompts:  60%|██████    | 6/10 [00:06<00:04,  1.07s/it, est. speed input: 30.56 toks/s, output: 131.77 toks/s][A
Processed prompts:  70%|███████   | 7/10 [00:07<00:03,  1.15s/it, est. speed input: 30.17 toks/s, output: 146.76 toks/s][A
Processed prompts: 100%|██████████| 10/10 [00:08<00:00,  1.24it/s, est. speed input: 43.80 toks/s, output: 253.42 toks/s][A
Evaluating:  90%|█████████ | 9/10 [01:12<00:07,  7.99s/it]
Processed prompts:   0%|          | 0/10 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s][A
Processed prompts:  10%|█         | 1/10 [00:03<00:33,  3.68s/it, est. speed input: 9


🎯 FINAL RESULTS:
   Correct: 16/100
   Accuracy: 0.160 (16.0%)
✅ Evaluation complete (model preserved in memory)



