In [3]:
%%capture
!pip install vllm

In [4]:
from vllm import LLM, SamplingParams
import json
from typing import List, Dict

INFO 04-12 17:08:33 [__init__.py:239] Automatically detected platform cuda.


In [5]:
# Configuration
llm_names = ["google/gemma-3-1b-it", "google/gemma-3-4b-it"]

from datasets import load_dataset

ds = load_dataset("HuggingFaceH4/MATH-500")
dataset = ds['problem'][:5]
k_responses = 5  # Number of responses per question
temperature = 0.7  # Sampling temperature for diversity
max_tokens = 512  # Maximum tokens per response

In [6]:
# Initialize the model and output file dynamically
def initialize_model_and_output_file(llm_id):
    llm_name = llm_names[llm_id]
    output_file = f"{llm_name.replace('/', '_')}_responses.json"
    return llm_name, output_file

In [7]:
# Generate responses using vLLM
def generate_responses(llm_name, prompts, k_responses):
    # Initialize the LLM model
    llm = LLM(model=llm_name, max_model_len=2048, dtype="auto")

    # Set sampling parameters for diverse responses
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=max_tokens,
        n=k_responses  # Specify the number of responses to generate per prompt
    )

    # Generate responses for each prompt
    all_responses = []
    for prompt in prompts:
        predictions = llm.generate([prompt], sampling_params)
        responses = [prediction.outputs[0].text for prediction in predictions]  # Access generated text directly
        all_responses.append({"prompt": prompt, "responses": responses})

    return all_responses

In [8]:
# Save responses to a JSON file
def save_responses_to_file(responses, output_file):
    with open(output_file, "w") as f:
        json.dump(responses, f, indent=4)
    print(f"Responses saved to {output_file}")

In [9]:
llm_id = 0

llm_name, output_file = initialize_model_and_output_file(llm_id)

print(f"Generating responses using {llm_name}...")

# Generate responses for the dataset prompts
responses = generate_responses(llm_name, dataset, k_responses)

# Save the generated responses to a file
save_responses_to_file(responses, output_file)

Generating responses using google/gemma-3-1b-it...
INFO 04-12 17:08:53 [config.py:600] This model supports multiple tasks: {'embed', 'reward', 'classify', 'generate', 'score'}. Defaulting to 'generate'.
INFO 04-12 17:08:53 [config.py:1780] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 04-12 17:08:58 [core.py:61] Initializing a V1 LLM engine (v0.8.3) with config: model='google/gemma-3-1b-it', speculative_config=None, tokenizer='google/gemma-3-1b-it', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfi

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 04-12 17:09:01 [loader.py:447] Loading weights took 0.75 seconds
INFO 04-12 17:09:02 [gpu_model_runner.py:1273] Model loading took 1.9148 GiB and 2.198860 seconds
INFO 04-12 17:09:14 [backends.py:416] Using cache directory: /root/.cache/vllm/torch_compile_cache/b77175cda9/rank_0_0 for vLLM's torch.compile
INFO 04-12 17:09:14 [backends.py:426] Dynamo bytecode transform time: 12.39 s


[rank0]:W0412 17:09:17.527000 6556 torch/_inductor/utils.py:1137] [0/0] Not enough SMs to use max_autotune_gemm mode


INFO 04-12 17:09:21 [backends.py:132] Cache the graph of shape None for later use
INFO 04-12 17:10:00 [backends.py:144] Compiling a graph for general shape takes 44.68 s
INFO 04-12 17:10:26 [monitor.py:33] torch.compile takes 57.07 s in total
INFO 04-12 17:10:28 [kv_cache_utils.py:578] GPU KV cache size: 331,440 tokens
INFO 04-12 17:10:28 [kv_cache_utils.py:581] Maximum concurrency for 2,048 tokens per request: 161.84x
INFO 04-12 17:11:06 [gpu_model_runner.py:1608] Graph capturing finished in 39 secs, took 0.43 GiB
INFO 04-12 17:11:07 [core.py:162] init engine (profile, create kv cache, warmup model) took 124.66 seconds


Processed prompts: 100%|██████████| 5/5 [00:06<00:00,  1.34s/it, est. speed input: 3.73 toks/s, output: 382.30 toks/s]
Processed prompts: 100%|██████████| 5/5 [00:06<00:00,  1.25s/it, est. speed input: 3.99 toks/s, output: 408.70 toks/s]
Processed prompts: 100%|██████████| 5/5 [00:06<00:00,  1.25s/it, est. speed input: 3.99 toks/s, output: 408.24 toks/s]


Responses saved to google_gemma-3-1b-it_responses.json
