In [None]:
from huggingface_hub import login
import os

# Set your HuggingFace token as an environment variable: export HF_TOKEN="your_token_here"
# Or use: huggingface-cli login
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    print("Warning: HF_TOKEN not found in environment variables. Using huggingface-cli login instead.")
    login()

In [2]:
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer

# model_name = "Qwen/Qwen2.5-Math-1.5B-Instruct"
# model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
model_name = "deepseek-ai/DeepSeek-R1-Distill-Qwen-7B"

print("Using vLLM for fast inference!")

Using vLLM for fast inference!


In [3]:
# Load model using vLLM (much faster than transformers)
llm = LLM(
    model=model_name,
    trust_remote_code=True,
    max_model_len=1024,  # Adjust based on your GPU memory
    gpu_memory_utilization=0.9,  # Use 90% of GPU memory
)

# Load tokenizer for formatting
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Set pad token if not present
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

print(f"vLLM model loaded successfully!")

INFO 01-02 09:03:38 [utils.py:253] non-default args: {'trust_remote_code': True, 'max_model_len': 1024, 'disable_log_stats': True, 'model': 'deepseek-ai/DeepSeek-R1-Distill-Qwen-7B'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 01-02 09:03:38 [model.py:514] Resolved architecture: Qwen2ForCausalLM
INFO 01-02 09:03:38 [model.py:1661] Using max model len 1024
INFO 01-02 09:03:45 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=8192.
[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:03:46 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', speculative_config=None, tokenizer='deepseek-ai/DeepSeek-R1-Distill-Qwen-7B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:03:53 [default_loader.py:308] Loading weights took 4.55 seconds
[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:03:53 [gpu_model_runner.py:3659] Model loading took 14.2717 GiB memory and 5.136926 seconds
[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:04:01 [backends.py:643] Using cache directory: /root/.cache/vllm/torch_compile_cache/7f016add07/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:04:01 [backends.py:703] Dynamo bytecode transform time: 7.38 s
[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:04:07 [backends.py:226] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 3.658 s
[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:04:07 [monitor.py:34] torch.compile takes 11.04 s in total
[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:04:08 [gpu_worker.py:375] Available KV cache memory: 55.57 GiB
[0;36m(EngineCore_DP0 pid=2114)

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 51/51 [00:01<00:00, 26.30it/s]
Capturing CUDA graphs (decode, FULL): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 35/35 [00:01<00:00, 32.72it/s]


[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:04:12 [gpu_model_runner.py:4587] Graph capturing finished in 4 secs, took 0.54 GiB
[0;36m(EngineCore_DP0 pid=2114)[0;0m INFO 01-02 09:04:12 [core.py:259] init engine (profile, create kv cache, warmup model) took 18.54 seconds
INFO 01-02 09:04:13 [llm.py:360] Supported tasks: ['generate']
vLLM model loaded successfully!


In [4]:
sampling_params = SamplingParams(
    temperature=0.0,  # Greedy decoding
    max_tokens=1024,
    top_p=1.0,
)

In [5]:
# Load GSM8K test dataset
from datasets import load_dataset
import json
import re
from tqdm import tqdm

# Load GSM8K test set
dataset = load_dataset("openai/gsm8k", "main", split="test")
print(f"Loaded {len(dataset)} examples from GSM8K test set")


Loaded 1319 examples from GSM8K test set


In [13]:
trainset = load_dataset("openai/gsm8k", "main", split="train")

In [6]:
math_dataset = load_dataset("qwedsacf/competition_math", "default")

In [7]:
# GSM8K evaluation patterns
gsm8k_pattern = re.compile(r"\n#### (.+)$")  # Original GSM8K format
boxed_pattern = re.compile(r"\\boxed\{([^}]+)\}")  # Qwen model output format
hash_pattern = re.compile(r"####\s*(.+?)(?:\n|$)")  # #### format anywhere in text

def extract_number(text):
    """Extract numeric value from text, ignoring units, currency symbols, etc.
    
    Examples:
        '$65,000' -> '65000'
        '\$18' -> '18'
        '18 dollars' -> '18'
        '-5.5%' -> '-5.5'
        '1/2' -> '0.5'
    """
    if text is None:
        return None
    
    text = str(text).strip()
    
    # Handle fractions (e.g., "1/2" -> "0.5")
    fraction_match = re.match(r'^(-?\d+)/(\d+)$', text.replace(' ', ''))
    if fraction_match:
        numerator = float(fraction_match.group(1))
        denominator = float(fraction_match.group(2))
        if denominator != 0:
            return str(numerator / denominator)
    
    # Remove currency symbols ($, â‚¬, Â£, Â¥, etc.)
    text = re.sub(r'[\$â‚¬Â£Â¥â‚¹]', '', text)
    
    # Remove common units (words)
    text = re.sub(r'\b(dollars?|cents?|years?|months?|days?|hours?|minutes?|seconds?|meters?|km|cm|kg|g|lbs?|oz|percent|%)\b', '', text, flags=re.IGNORECASE)
    
    # Remove commas from numbers
    text = text.replace(',', '')
    
    # Extract the number (handles negative, decimals)
    # This pattern matches: optional minus, digits, optional decimal point and more digits
    number_match = re.search(r'-?\d+(?:\.\d+)?', text)
    
    if number_match:
        return number_match.group(0)
    
    return None

def extract_gsm8k_answer(text):
    """Extract answer from GSM8K format (#### answer at end)"""
    match = re.search(gsm8k_pattern, text)
    if not match:
        return None
    answer_text = match.group(1).strip()
    return extract_number(answer_text)

def extract_boxed_answer(text):
    """Extract answer from boxed format (\\boxed{answer})"""
    matches = boxed_pattern.findall(text)
    if not matches:
        return None
    answer_text = matches[-1].strip()
    return extract_number(answer_text)

def extract_hash_answer(text):
    """Extract answer from #### format (anywhere in text)"""
    matches = hash_pattern.findall(text)
    if not matches:
        return None
    answer_text = matches[-1].strip()
    return extract_number(answer_text)

def normalize_answer(answer):
    """Normalize numeric answer for comparison"""
    if answer is None:
        return None
    
    answer = str(answer).strip()
    
    # Try to convert to float and back to handle different representations
    try:
        # Convert to float to handle cases like "5.0" vs "5"
        num = float(answer)
        # If it's a whole number, convert to int to avoid "5.0" vs "5" mismatches
        if num.is_integer():
            return str(int(num))
        else:
            # Round to reasonable precision to avoid floating point issues
            return f"{num:.10g}"  # 'g' format removes trailing zeros
    except (ValueError, TypeError):
        # If conversion fails, just return cleaned string
        return answer.replace(',', '').strip()

def eval_gsm8k(generated, ground_truth, extraction_func=None):
    """Evaluate GSM8K answer - handles multiple formats
    
    Args:
        generated: The generated response text
        ground_truth: The ground truth answer in GSM8K format
        extraction_func: Function to extract answer from generated text
                        If None, tries boxed, then hash, then gsm8k formats
    """
    # Extract ground truth (GSM8K format with ####)
    gt_answer = extract_gsm8k_answer(ground_truth)
    
    # Extract generated answer - try multiple formats
    if extraction_func is not None:
        gen_answer = extraction_func(generated)
    else:
        # Try boxed format first (for Qwen)
        gen_answer = extract_boxed_answer(generated)
        # If not found, try #### format
        if gen_answer is None:
            gen_answer = extract_hash_answer(generated)
        # Last resort: try end-of-text GSM8K format
        if gen_answer is None:
            gen_answer = extract_gsm8k_answer(generated)
    
    # Normalize both answers
    gt_answer_norm = normalize_answer(gt_answer)
    gen_answer_norm = normalize_answer(gen_answer)
    
    # Compare normalized answers
    is_correct = (gt_answer_norm == gen_answer_norm) if (gt_answer_norm and gen_answer_norm) else False
    
    return is_correct, gt_answer, gen_answer

print("Evaluation functions defined")

Evaluation functions defined


In [8]:
test_system_prompt = """Please solve the problem step by step (separate steps with double newlines), but keep it short and put your final answer (do not include any other text or units) within \\boxed{}."""

# test_system_prompt = "Please solve step by step and put your final numerical answer (do not include any other text or units) after #### "
# test_system_prompt = "Solve this problem step by step. Put your final answer in the format: #### [answer]"

# Choose a question to test (or use a custom one):
test_question = dataset[3]['question']  # First question from GSM8K
test_ground_truth = dataset[3]['answer']

print("="*80)
print("TESTING SYSTEM PROMPT")
print("="*80)
print(f"\nSystem Prompt:\n{test_system_prompt}\n")
print(f"Question:\n{test_question}\n")
print("-"*80)

# Create messages
test_messages = [
    {"role": "system", "content": test_system_prompt},
    {"role": "user", "content": test_question}
]

# Format and generate
test_prompt = tokenizer.apply_chat_template(test_messages, tokenize=False, add_generation_prompt=True)
test_outputs = llm.generate([test_prompt], sampling_params)
test_response = test_outputs[0].outputs[0].text.strip()

# Evaluate
is_correct, gt_ans, gen_ans = eval_gsm8k(test_response, test_ground_truth)

print(f"\nGENERATED RESPONSE:")
print(test_response)
print("\n" + "="*80)
print(f"Ground Truth Answer: {gt_ans}")
print(f"Generated Answer: {gen_ans}")
print(f"Correct: {'âœ“' if is_correct else 'âœ—'}")
print("="*80)


TESTING SYSTEM PROMPT

System Prompt:
Please solve the problem step by step (separate steps with double newlines), but keep it short and put your final answer (do not include any other text or units) within \boxed{}.

Question:
James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?

--------------------------------------------------------------------------------


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]


GENERATED RESPONSE:
First, determine how many sprints James runs each week. He runs 3 sprints 3 times a week, so that's 3 multiplied by 3, which equals 9 sprints per week.

Next, calculate the total distance he covers in one week by multiplying the number of sprints by the distance of each sprint. Each sprint is 60 meters, so 9 sprints multiplied by 60 meters per sprint equals 540 meters.

Therefore, James runs a total of 540 meters each week.
</think>

**Solution:**

1. **Determine the total number of sprints James runs in a week:**
   
   James runs **3 sprints** **3 times** a week.
   
   \[
   \text{Total sprints per week} = 3 \times 3 = 9 \text{ sprints}
   \]

2. **Calculate the total distance James runs in a week:**
   
   Each sprint is **60 meters**.
   
   \[
   \text{Total distance} = 9 \text{ sprints} \times 60 \text{ meters/sprint} = 540 \text{ meters}
   \]

**Final Answer:**

\[
\boxed{540}
\]

Ground Truth Answer: 540
Generated Answer: 540
Correct: âœ“


In [24]:
# Prepare all prompts first (for batch inference with vLLM)
output_file = "gsm8k_generations.jsonl"

# System prompt for math reasoning
# system_prompt = "Please reason step by step, and put your final answer within \\boxed{}."
system_prompt = test_system_prompt

print("Preparing prompts for batch inference...")
prompts = []
questions = []
ground_truths = []

for example in tqdm(dataset, desc="Preparing data"):
    question = example['question']
    ground_truth_answer = example['answer']
    
    # Create messages in chat format
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": question}
    ]
    
    # Format conversation using tokenizer's chat template
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    prompts.append(prompt)
    questions.append(question)
    ground_truths.append(ground_truth_answer)

print(f"\nPrepared {len(prompts)} prompts")
print(f"Starting batch inference with vLLM (this will be MUCH faster!)...")

Preparing prompts for batch inference...


Preparing data: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1319/1319 [00:00<00:00, 12133.99it/s]


Prepared 1319 prompts
Starting batch inference with vLLM (this will be MUCH faster!)...





In [25]:
# Generate responses using vLLM (batch inference - FAST!)
print("Generating responses with vLLM...")
outputs = llm.generate(prompts, sampling_params)

print(f"Generated {len(outputs)} responses!")
print("Processing results...")


Generating responses with vLLM...


Adding requests:   0%|          | 0/1319 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1319 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/sâ€¦

Generated 1319 responses!
Processing results...


In [26]:
# Process outputs and evaluate
results = []
correct_count = 0
total_count = 0

with open(output_file, 'w') as f:
    for idx, output in enumerate(tqdm(outputs, desc="Evaluating")):
        try:
            # Extract generated text
            generated_response = output.outputs[0].text.strip()
            
            # Get corresponding question and ground truth
            question = questions[idx]
            ground_truth_answer = ground_truths[idx]
            
            # Evaluate
            is_correct, gt_answer, gen_answer = eval_gsm8k(generated_response, ground_truth_answer)
            
            if is_correct:
                correct_count += 1
            total_count += 1
            
            # Save result in the same format as evaluate.py
            result = {
                "question": question,
                "ground_truth": ground_truth_answer,
                "generated_response": generated_response,
                "gt_answer": gt_answer,
                "gen_answer": gen_answer,
                "correct": is_correct,
                "index": idx
            }
            results.append(result)
            
            # Write to JSONL file
            f.write(json.dumps(result) + '\n')
            
        except Exception as e:
            print(f"Error at index {idx}: {e}")
            result = {
                "question": questions[idx],
                "ground_truth": ground_truths[idx],
                "generated_response": "",
                "gt_answer": None,
                "gen_answer": None,
                "correct": False,
                "index": idx,
                "error": str(e)
            }
            results.append(result)
            f.write(json.dumps(result) + '\n')
            total_count += 1

# Calculate and print accuracy
accuracy = correct_count / total_count if total_count > 0 else 0
print(f"\n{'='*50}")
print(f"Evaluation Complete!")
print(f"{'='*50}")
print(f"Total examples: {total_count}")
print(f"Correct: {correct_count}")
print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
print(f"Results saved to: {output_file}")


Evaluating: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 1319/1319 [00:00<00:00, 40734.59it/s]


Evaluation Complete!
Total examples: 1319
Correct: 1158
Accuracy: 0.8779 (87.79%)
Results saved to: gsm8k_generations.jsonl





In [27]:
# Display some example results
print("="*80)
print("Example Results (first 3):")
print("="*80)

for i, result in enumerate(results[:3]):
    print(f"\n--- Example {i+1} ---")
    print(f"Question: {result['question'][:100]}...")
    print(f"Ground Truth Answer: {result['gt_answer']}")
    print(f"Generated Answer: {result['gen_answer']}")
    print(f"Correct: {result['correct']}")
    print(f"Full Generation: {result['generated_response'][:200]}...")
    print()


Example Results (first 3):

--- Example 1 ---
Question: Janetâ€™s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for ...
Ground Truth Answer: 18
Generated Answer: 18
Correct: True
Full Generation: First, I need to determine how many duck eggs Janet has each day. She lays 16 eggs per day.

Next, I'll account for the eggs she uses. She eats 3 eggs for breakfast and uses 4 eggs to bake muffins, to...


--- Example 2 ---
Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it ...
Ground Truth Answer: 3
Generated Answer: 3
Correct: True
Full Generation: First, identify the amount of blue fiber needed for the robe, which is 2 bolts.

Next, determine the amount of white fiber required, which is half of the blue fiber. Half of 2 bolts is 1 bolt.

Finall...


--- Example 3 ---
Question: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repai...
Ground Truth

In [17]:
# ========================================================================
# SYNTHETIC DATA GENERATION WITH RETRY MECHANISM
# ========================================================================

# Configuration
MAX_RETRIES = 5  # Maximum number of generation attempts per question
SYNTHETIC_OUTPUT_FILE = "gsm8k_synthetic_data.jsonl"
# synthetic_system_prompt = r"Please reason step by step, and put your final answer within \boxed{}."
synthetic_system_prompt = test_system_prompt

# You can also limit the number of examples to process (None = all)
MAX_EXAMPLES_FOR_SYNTHETIC = None  # Set to e.g. 100 for testing, None for full dataset

print("="*80)
print("SYNTHETIC DATA GENERATION CONFIGURATION")
print("="*80)
print(f"Max retries per question: {MAX_RETRIES}")
print(f"System prompt: {synthetic_system_prompt}")
print(f"Output file: {SYNTHETIC_OUTPUT_FILE}")
print(f"Max examples: {MAX_EXAMPLES_FOR_SYNTHETIC or 'All'}")
print("="*80)

SYNTHETIC DATA GENERATION CONFIGURATION
Max retries per question: 5
System prompt: Please solve the problem step by step (separate steps with double newlines), but keep it short and put your final answer (do not include any other text or units) within \boxed{}.
Output file: gsm8k_synthetic_data.jsonl
Max examples: All


In [None]:
# Generate synthetic data with BATCH retry mechanism (FAST!)
import time

# Prepare dataset
synthetic_dataset = trainset if MAX_EXAMPLES_FOR_SYNTHETIC is None else dataset.select(range(min(MAX_EXAMPLES_FOR_SYNTHETIC, len(dataset))))

print(f"\nStarting BATCH synthetic data generation for {len(synthetic_dataset)} examples...")
print(f"Using batch inference with retry (max {MAX_RETRIES} attempts)\n")

start_time = time.time()

# Prepare all data
all_questions = []
all_ground_truths = []
all_prompts = []

for example in synthetic_dataset:
    question = example['question']
    ground_truth = example['answer']
    
    messages = [
        {"role": "system", "content": synthetic_system_prompt},
        {"role": "user", "content": question}
    ]
    prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    
    all_questions.append(question)
    all_ground_truths.append(ground_truth)
    all_prompts.append(prompt)

# Track results
results = [None] * len(synthetic_dataset)  # Store successful responses
pending_indices = list(range(len(synthetic_dataset)))  # Indices still needing correct answer
total_attempts = 0
attempt_count = 0

# Batch generation with retry
while pending_indices and attempt_count < MAX_RETRIES:
    attempt_count += 1
    print(f"\nAttempt {attempt_count}/{MAX_RETRIES}: Generating for {len(pending_indices)} questions...")
    
    # Get prompts for pending questions
    pending_prompts = [all_prompts[i] for i in pending_indices]
    
    # Batch generate
    outputs = llm.generate(pending_prompts, sampling_params)
    total_attempts += len(pending_prompts)
    
    # Evaluate results
    new_pending = []
    for i, output_idx in enumerate(pending_indices):
        response = outputs[i].outputs[0].text.strip()
        
        # Check if correct
        is_correct, gt_ans, gen_ans = eval_gsm8k(response, all_ground_truths[output_idx])
        
        if is_correct:
            # Store successful result
            results[output_idx] = response
        else:
            # Still needs correct answer
            new_pending.append(output_idx)
    
    pending_indices = new_pending
    success_count = sum(1 for r in results if r is not None)
    print(f"  -> {success_count}/{len(synthetic_dataset)} correct ({success_count/len(synthetic_dataset)*100:.2f}%)")

# Create final dataset (only successful examples)
synthetic_data = []
success_count = 0
failed_count = 0

for idx in range(len(synthetic_dataset)):
    if results[idx] is not None:
        # Extract ground truth numeric answer for easy comparison
        ground_truth_answer = extract_gsm8k_answer(all_ground_truths[idx])
        
        synthetic_entry = {
            "question": all_questions[idx],
            "answer": all_ground_truths[idx],  # Full GSM8K answer with #### format
            "ground_truth": ground_truth_answer,  # Extracted numeric answer
            "deepseek_response": results[idx]
        }
        synthetic_data.append(synthetic_entry)
        success_count += 1
    else:
        failed_count += 1

elapsed_time = time.time() - start_time

# Print statistics
print(f"\n{'='*80}")
print(f"SYNTHETIC DATA GENERATION COMPLETE")
print(f"{'='*80}")
print(f"Total examples processed: {len(synthetic_dataset)}")
print(f"Successful (found correct answer): {success_count} ({success_count/len(synthetic_dataset)*100:.2f}%)")
print(f"Failed (no correct answer in {MAX_RETRIES} attempts): {failed_count} ({failed_count/len(synthetic_dataset)*100:.2f}%)")
print(f"Total generation attempts: {total_attempts}")
print(f"Average attempts per question: {total_attempts/len(synthetic_dataset):.2f}")
print(f"Time elapsed: {elapsed_time:.2f}s ({elapsed_time/60:.2f} minutes)")
print(f"Speed: {len(synthetic_dataset)/elapsed_time:.2f} examples/second")
print(f"{'='*80}")

SYNTHETIC DATA GENERATION CONFIGURATION
Max retries per question: 5
System prompt: Please solve the problem step by step (separate steps with double newlines), but keep it short and put your final answer (do not include any other text or units) within \boxed{}.
Output file: gsm8k_synthetic_data.jsonl
Max examples: All


In [19]:
# Save synthetic data to JSONL file
print(f"\nSaving synthetic data to {SYNTHETIC_OUTPUT_FILE}...")

with open(SYNTHETIC_OUTPUT_FILE, 'w') as f:
    for entry in synthetic_data:
        f.write(json.dumps(entry) + '\n')

print(f"âœ“ Saved {len(synthetic_data)} successful entries to {SYNTHETIC_OUTPUT_FILE}")
print(f"\nFile format (same as GSM8K + additional fields):")
print(f"  - question: The math problem")
print(f"  - answer: Full ground truth with #### format")
print(f"  - ground_truth: Extracted numeric answer (for easy comparison)")
print(f"  - deepseek_response: Model's correct response")


Saving synthetic data to gsm8k_synthetic_data.jsonl...
âœ“ Saved 6862 successful entries to gsm8k_synthetic_data.jsonl

File format (same as GSM8K + additional fields):
  - question: The math problem
  - answer: Full ground truth with #### format
  - ground_truth: Extracted numeric answer (for easy comparison)
  - deepseek_response: Model's correct response


In [15]:
# Display examples of synthetic data
print("="*80)
print("SYNTHETIC DATA EXAMPLES")
print("="*80)

# Show first few successful examples
if len(synthetic_data) > 0:
    print(f"\nðŸ“Š Example 1:")
    print("-"*80)
    example = synthetic_data[0]
    print(f"Question: {example['question'][:200]}...")
    print(f"\nGround Truth (extracted): {example['ground_truth']}")
    print(f"\nFull Answer: {example['answer'][:100]}...")
    print(f"\nDeepseek Response: {example['deepseek_response'][:400]}...")
    
    if len(synthetic_data) > 1:
        print(f"\n\nðŸ“Š Example 2:")
        print("-"*80)
        example = synthetic_data[1]
        print(f"Question: {example['question'][:200]}...")
        print(f"\nGround Truth (extracted): {example['ground_truth']}")
        print(f"\nFull Answer: {example['answer'][:100]}...")
        print(f"\nDeepseek Response: {example['deepseek_response'][:400]}...")

print("\n" + "="*80)


Saving synthetic data to gsm8k_synthetic_data.jsonl...
âœ“ Saved 6862 successful entries to gsm8k_synthetic_data.jsonl

File format (same as GSM8K + deepseek_response):
  - question: The math problem
  - answer: Ground truth with #### format
  - deepseek_response: Model's correct response


In [16]:
# How to load and use the synthetic data
print("="*80)
print("HOW TO USE THE SYNTHETIC DATA")
print("="*80)

print(f"""
The synthetic data has been saved to: {SYNTHETIC_OUTPUT_FILE}

Format (same as GSM8K dataset + additional fields):
{{
    "question": "Math problem text",
    "answer": "Solution with #### final_answer",
    "ground_truth": "18",  // Extracted numeric answer
    "deepseek_response": "Model's correct reasoning with \\boxed{{answer}}"
}}

Only successful examples (where model got correct answer) are included.

Example usage:
""")

print("# Load the synthetic data:")
print("from datasets import load_dataset")
print(f"synthetic_ds = load_dataset('json', data_files='{SYNTHETIC_OUTPUT_FILE}')['train']")
print()
print("# Access fields:")
print("for example in synthetic_ds:")
print("    question = example['question']")
print("    full_answer = example['answer']  # Full GSM8K format")
print("    ground_truth = example['ground_truth']  # Just the number")
print("    model_response = example['deepseek_response']")
print()
print("# The ground_truth field makes comparison easier:")
print("# - No need to parse #### format")
print("# - Direct numeric comparison")
print("# - Useful for analysis and evaluation")
print()
print("# Use cases:")
print("# 1. Fine-tuning - Use as training data")
print("# 2. Data augmentation - Add to existing datasets")
print("# 3. Distillation - Train smaller models")
print("# 4. Analysis - Study model reasoning patterns")
print("# 5. Evaluation - Easy comparison with ground_truth field")

print("\n" + "="*80)

SYNTHETIC DATA EXAMPLES

ðŸ“Š Example 1:
--------------------------------------------------------------------------------
Question: Natalia sold clips to 48 of her friends in April, and then she sold half as many clips in May. How many clips did Natalia sell altogether in April and May?

Ground Truth Answer: Natalia sold 48/2 = <<48/2=24>>24 clips in May.
Natalia sold 48+24 = <<48+24=72>>72 clips altogether in April and May.
#### 72

Deepseek Response: First, I note that Natalia sold 48 clips in April.

In May, she sold half as many clips as she did in April, which is 24 clips.

To find the total number of clips sold over both months, I add the clips sold in April and May: 48 + 24 = 72.

Therefore, Natalia sold a total of 72 clips in April and May.
</think>

**Step 1:** Determine the number of clips Natalia sold in April.
\[
\text{Clips sold in April} = 48
\]

**Step 2:** Calculate the number of clips sold in May, which is half of April's sale...


ðŸ“Š Example 2:
---------------------