In [1]:
# Model identifier that main() forwards to generate_text_with_llm(), which in
# turn passes it as `model=` to vllm.LLM. Change it here to try another model.
model_name = "Qwen/Qwen2.5-Coder-1.5B-Instruct"

In [2]:
import nest_asyncio
import asyncio
import vllm
from concurrent.futures import ThreadPoolExecutor


In [3]:
# Patch asyncio so an already-running event loop can be re-entered.
# NOTE(review): presumably needed because the notebook kernel already runs a
# loop; the cells shown here use only the synchronous vLLM API, so confirm
# whether this call is still required.
nest_asyncio.apply()

In [4]:
# Engines already loaded in this kernel, keyed by model name. Building a
# vllm.LLM loads the weights and captures CUDA graphs (several seconds in the
# log below) and reserves a share of GPU memory, so each model is built once
# and reused by later calls.
_LLM_ENGINES = {}


def generate_text_with_llm(model_name, prompt):
    """Generate one completion for ``prompt`` with a vLLM engine.

    The engine for ``model_name`` is created on first use and cached, so
    calling this function repeatedly does not reload the model or try to
    reserve a second share of GPU memory.

    Args:
        model_name: Model identifier passed to ``vllm.LLM(model=...)``.
        prompt: Raw prompt text. It is sent as-is; no chat template is applied.

    Returns:
        The text of the first completion for the prompt.

    Side effects:
        Prints the prompt and the generated text to stdout.
    """
    llm = _LLM_ENGINES.get(model_name)
    if llm is None:
        llm = vllm.LLM(model=model_name,
                       max_model_len=2048,
                       gpu_memory_utilization=0.6)
        _LLM_ENGINES[model_name] = llm

    sampling_params = vllm.SamplingParams(
        temperature=0.5,
        top_p=0.95,
        max_tokens=512,
        stop=["</s>", "Human:", "Assistant:"]
    )

    # A single prompt is submitted, so only the first request and its first
    # candidate completion are read back.
    outputs = llm.generate([prompt], sampling_params)
    generated_code = outputs[0].outputs[0].text
    print(f"prompt: {prompt}")
    print(f"output: {generated_code}")
    return generated_code

In [5]:
def main():
    """Ask the model for a random test-input generator script.

    Builds one fixed prompt describing the input variables and constraints of
    a competitive-programming task and sends it, together with the
    module-level ``model_name``, to ``generate_text_with_llm``.

    Returns:
        None. The prompt and the completion are printed by
        ``generate_text_with_llm``.
    """
    prompt = """
    You are a code assistant specializing in algorithmic problem solving.
    Please create a Python code that generates random input examples 
    based on specified variables and constraints.

    Function Requirements:
    1. Variables:
       - Input:
          - t: an integer representing the number of test cases (1 ≤ t ≤ 10^4).
          - For each test case:
              - n: an integer representing the number of different car models (1 ≤ n ≤ 5×10^5).
              - x: an integer representing the maximum number of cars Karel can sell to a single customer (1 ≤ x ≤ 10).
              - a: an array of n integers where each element ai represents the number of cars of the i-th model (1 ≤ ai ≤ 10^9).

    2. Example Input and Expected Output:
      - Input:
        - t = 1
        - n = 3
        - x = 2
        - a = [3, 4, 5]

    Requirements for `generate_inputs()`:
    - Generate 100 sets of input examples in the specified format.
    - Ensure a variety of examples by considering:
      - **Variable values within defined constraints**
      - **Boundary cases**
      - **Mixed cases with both duplicate and unique values**
      - **A mix of small and large values for n**
    - Set a random seed at the beginning of the code for reproducibility.
    - Write each generated input set to a separate .txt file. Generate a 
    total of 100 files, each named `input_example_{i}.txt`, where i is the example number (from 1 to 100).
"""
    # generate_text_with_llm already echoes both the prompt and the completion,
    # so the result is not printed a second time here.
    generate_text_with_llm(model_name, prompt)


In [6]:
# Run the generation once. The output below is the vLLM engine start-up log
# followed by the prompt and completion echoed by generate_text_with_llm().
main()

INFO 11-01 02:52:23 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='Qwen/Qwen2.5-Coder-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-Coder-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=Qwen/Qwen2.5-Coder-1.5B-Instruct, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stre

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 11-01 02:52:25 model_runner.py:1067] Loading model weights took 2.8875 GB
INFO 11-01 02:52:25 gpu_executor.py:122] # GPU blocks: 22327, # CPU blocks: 9362
INFO 11-01 02:52:25 gpu_executor.py:126] Maximum concurrency for 2048 tokens per request: 174.43x
INFO 11-01 02:52:26 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-01 02:52:26 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-01 02:52:33 model_runner.py:1523] Graph capturing finished in 7 secs.


Processed prompts: 100%|██████████| 1/1 [00:02<00:00,  2.04s/it, est. speed input: 175.93 toks/s, output: 208.85 toks/s]

prompt: 
    You are a code assistant specializing in algorithmic problem solving.
    Please create a Python code that generates random input examples 
    based on specified variables and constraints.

    Function Requirements:
    1. Variables:
       - Input:
          - t: an integer representing the number of test cases (1 ≤ t ≤ 10^4).
          - For each test case:
              - n: an integer representing the number of different car models (1 ≤ n ≤ 5×10^5).
              - x: an integer representing the maximum number of cars Karel can sell to a single customer (1 ≤ x ≤ 10).
              - a: an array of n integers where each element ai represents the number of cars of the i-th model (1 ≤ ai ≤ 10^9).

    2. Example Input and Expected Output:
      - Input:
        - t = 1
        - n = 3
        - x = 2
        - a = [3, 4, 5]

    Requirements for `generate_inputs()`:
    - Generate 100 sets of input examples in the specified format.
    - Ensure a variety of examples by 


