In [1]:
# Model identifier that main() passes to generate_text_with_llm(), which in turn
# forwards it to vllm.LLM(model=...). Looks like a Hugging Face repo id.
model_name = "microsoft/Phi-3.5-mini-instruct"

In [2]:
import nest_asyncio
import asyncio
import vllm
from concurrent.futures import ThreadPoolExecutor


In [3]:
# Patch asyncio so an event loop can be re-entered while the notebook's own
# loop is already running.
# NOTE(review): no visible cell awaits anything or drives an event loop
# directly -- confirm this patch is still required.
nest_asyncio.apply()

In [4]:
# Engines already built in this kernel, keyed by model name. Constructing a
# vllm.LLM loads the model weights and reserves GPU memory, so it is done once
# per model rather than once per call.
_LLM_ENGINES = {}


def _get_llm_engine(model_name):
    """Return the cached vLLM engine for ``model_name``, creating it on first use."""
    engine = _LLM_ENGINES.get(model_name)
    if engine is None:
        engine = vllm.LLM(
            model=model_name,
            max_model_len=2048,
            gpu_memory_utilization=0.6,
        )
        _LLM_ENGINES[model_name] = engine
    return engine


def generate_text_with_llm(model_name, prompt):
    """Generate one completion for ``prompt`` using the given model.

    The engine for ``model_name`` is created on the first call and reused on
    later calls, so repeated calls in the same kernel do not reload the
    weights or request GPU memory a second time.

    Args:
        model_name: Model identifier forwarded to ``vllm.LLM(model=...)``.
        prompt: Prompt text, sent to the engine exactly as given (no chat
            template is applied here).

    Returns:
        The text of the first completion produced for the prompt.
    """
    llm = _get_llm_engine(model_name)
    sampling_params = vllm.SamplingParams(
        temperature=0.3,
        top_p=0.95,
        max_tokens=1024,
        stop=["</s>", "Human:", "Assistant:"],
    )

    outputs = llm.generate([prompt], sampling_params)
    # A single prompt was submitted, so take the first result and its first
    # completion candidate.
    return outputs[0].outputs[0].text

In [None]:
def main():
    """Ask the model for a structured analysis of one hard-coded problem statement.

    Assembles a prompt from a role line, the "B. Kar Salesman" problem text and
    a four-item question list, sends it to ``generate_text_with_llm`` together
    with the notebook-level ``model_name``, and prints the generated text.
    """
    role = "You are Specification Analyzer"

    spec = """
    B. Kar Salesman
    time limit per test: 1 second
    memory limit per test: 256 megabytes

    Karel is a salesman in a car dealership. The dealership has n different models of cars. There are a_i cars of the i-th model. Karel is an excellent salesperson and can convince customers to buy up to x cars (of Karel's choice), as long as the cars are from different models.

    Determine the minimum number of customers Karel has to bring in to sell all the cars.

    Input:
    Each test contains multiple test cases. The first line contains the number of test cases t (1 ≤ t ≤ 10^4). The description of the test cases follows.

    The first line of each test case contains two integers n and x (1 ≤ n ≤ 5⋅10^5; 1 ≤ x ≤ 10) — the number of different models of cars and the maximum number of cars Karel can convince a customer to buy.

    The second line contains n integers a1, a2, …, an (1 ≤ a_i ≤ 10^9) — the number of cars of each model.

    It is guaranteed that the sum of n over all test cases does not exceed 5⋅10^5.

    Output:
    For each test case, output the minimum possible number of customers needed to sell all the cars.
    """

    # Headings the model is asked to fill in, one per numbered item.
    questions = """
        1. Input and output variables with constraints: \n
        2. Definition of problem: \n
        3. Time and memory constraints: \n
        4. Test case:"""

    # Fixed labels interleaved with the three variable parts, in prompt order.
    pieces = [
        "\nSystem Prompt: ", role,
        "\nSpecification: ", spec,
        "\nAnswer the following questions and don't make a code.\nQuestion: ", questions,
        "\nAnswer: ",
    ]
    prompt = "".join(pieces)

    print(generate_text_with_llm(model_name, prompt))

In [None]:
# Run the analysis: main() builds the prompt, calls generate_text_with_llm()
# (which constructs the vLLM engine), and prints the model's answer below.
main()

INFO 11-01 04:42:11 config.py:107] Replacing legacy 'type' key with 'rope_type'
INFO 11-01 04:42:13 llm_engine.py:237] Initializing an LLM engine (v0.6.3.post1) with config: model='microsoft/Phi-3.5-mini-instruct', speculative_config=None, tokenizer='microsoft/Phi-3.5-mini-instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=microsoft/Phi-3.5-mini-i

  @torch.library.impl_abstract("xformers_flash::flash_fwd")
  @torch.library.impl_abstract("xformers_flash::flash_bwd")


INFO 11-01 04:42:13 model_runner.py:1056] Starting to load model microsoft/Phi-3.5-mini-instruct...
INFO 11-01 04:42:13 selector.py:247] Cannot use FlashAttention-2 backend due to sliding window.
INFO 11-01 04:42:13 selector.py:115] Using XFormers backend.
INFO 11-01 04:42:14 weight_utils.py:243] Using model weights format ['*.safetensors']


Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 11-01 04:42:15 model_runner.py:1067] Loading model weights took 7.1659 GB
INFO 11-01 04:42:15 gpu_executor.py:122] # GPU blocks: 1108, # CPU blocks: 682
INFO 11-01 04:42:15 gpu_executor.py:126] Maximum concurrency for 2048 tokens per request: 8.66x
INFO 11-01 04:42:16 model_runner.py:1395] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 11-01 04:42:16 model_runner.py:1399] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 11-01 04:42:22 model_runner.py:1523] Graph capturing finished in 6 secs.


Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.53s/it, est. speed input: 91.56 toks/s, output: 93.10 toks/s]



1. Input and output variables with constraints:

- Input:
    - t: The number of test cases (1 ≤ t ≤ 10^4)
    - n: The number of different car models (1 ≤ n ≤ 5*10^5)
    - x: The maximum number of cars Karel can sell to a single customer (1 ≤ x ≤ 10)
    - a: An array of integers representing the number of cars of each model (1 ≤ a_i ≤ 10^9)

- Output:
    - The minimum number of customers needed to sell all cars for each test case

2. Definition of problem:

The problem requires us to find the minimum number of customers Karel needs to convince to buy cars from the dealership, given that he can sell up to x cars per customer and can only sell cars from different models. The goal is to sell all the cars available in the dealership.

3. Time and memory constraints:

- Time limit per test: 1 second
- Memory limit per test: 256 megabytes

4. Test case:

Example:

Input:
    t = 2
    n = 3
    x = 2
    a = [3, 4, 5]

Output:
    2

Explanation:

- For the first test case, Karel can s




: 