In [1]:
!pip install vllm

Collecting vllm
  Downloading vllm-0.6.2-cp38-abi3-manylinux1_x86_64.whl.metadata (2.4 kB)
Collecting transformers>=4.45.0 (from vllm)
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=1.40.0 (from vllm)
  Downloading openai-1.50.0-py3-none-any.whl.metadata (24 kB)
Collecting uvicorn[standard] (from vllm)
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting lm-format-enforcer==0.10.6 (from vllm)
  Downloadi

In [2]:
import time
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import numpy as np

def test_quantized_model_with_vllm():
    model_id = "ohmyhong/llama-3-Korean-Bllossom-8B-awq"

    # vLLM 모델 로드 (양자화 적용)
    llm = LLM(model=model_id, quantization="awq", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    PROMPT = '''You are a helpful AI assistant. Please answer the user's questions kindly. 당신은 유능한 AI 어시스턴트 입니다. 사용자의 질문에 대해 친절하게 답변해주세요.'''
    instructions = [
        "서울의 유명한 관광 코스를 만들어줄래?",
        "한국 전통 음식 중 외국인에게 추천할 만한 것은?",
        "기후 변화에 대응하기 위해 개인이 할 수 있는 일은?"
    ]

    sampling_params = SamplingParams(
        temperature=0.6,
        top_p=0.95,
        max_tokens=2048,
        stop=[tokenizer.eos_token, "<|eot_id|>"]
    )

    results = []

    for instruction in instructions:
        messages = [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": instruction}
        ]

        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        start_time = time.time()
        outputs = llm.generate([prompt], sampling_params)
        end_time = time.time()

        generated_text = outputs[0].outputs[0].text
        metrics = outputs[0].metrics

        # TTFT 계산
        ttft = metrics.first_token_time - metrics.arrival_time

        inference_time = end_time - start_time
        token_count = len(tokenizer.encode(generated_text))
        tokens_per_second = token_count / inference_time

        result = {
            "instruction": instruction,
            "generated_text": generated_text,
            "inference_time": inference_time,
            "token_count": token_count,
            "tokens_per_second": tokens_per_second,
            "ttft": ttft
        }
        results.append(result)

    # 모델 크기 계산
    # model_size_mb = calculate_model_size(llm.model)

    return results

# Run the benchmark and keep the per-prompt measurements for the report below.
results = test_quantized_model_with_vllm()

banner = f"{'='*50}"
print(banner)
print("Quantized Model with vLLM Evaluation Results")
print(banner)

# Model size is not reported: nothing in this cell computes it.

# Average each numeric metric over all benchmarked prompts.
avg_inference_time, avg_tokens_per_second, avg_ttft = (
    np.mean([r[key] for r in results])
    for key in ("inference_time", "tokens_per_second", "ttft")
)

print(
    f"Average Inference Time: {avg_inference_time:.2f} seconds",
    f"Average Tokens/Second: {avg_tokens_per_second:.2f}",
    f"Average Time to First Token: {avg_ttft:.4f} seconds",
    sep="\n",
)

# Per-sample breakdown, numbered from 1, including the full generated text.
print("\nFull Outputs:")
for idx, result in enumerate(results, 1):
    print(
        f"\n{'-'*40}",
        f"Sample {idx}:",
        f"Instruction: {result['instruction']}",
        f"Generated Text:\n{result['generated_text']}",
        f"Inference Time: {result['inference_time']:.2f} seconds",
        f"Token Count: {result['token_count']}",
        f"Tokens/Second: {result['tokens_per_second']:.2f}",
        f"Time to First Token: {result['ttft']:.4f} seconds",
        sep="\n",
    )

print("\nTest completed.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/997 [00:00<?, ?B/s]

INFO 09-27 05:51:09 awq_marlin.py:94] Detected that the model can run with awq_marlin, however you specified quantization=awq explicitly, so forcing awq. Use quantization=awq_marlin for faster inference
INFO 09-27 05:51:09 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='ohmyhong/llama-3-Korean-Bllossom-8B-awq', speculative_config=None, tokenizer='ohmyhong/llama-3-Korean-Bllossom-8B-awq', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=awq, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfi

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

INFO 09-27 05:51:15 model_runner.py:1014] Starting to load model ohmyhong/llama-3-Korean-Bllossom-8B-awq...
INFO 09-27 05:51:16 weight_utils.py:242] Using model weights format ['*.safetensors']


model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]


INFO 09-27 05:51:38 model_runner.py:1025] Loading model weights took 5.3440 GB
INFO 09-27 05:51:39 gpu_executor.py:122] # GPU blocks: 14536, # CPU blocks: 2048
INFO 09-27 05:51:41 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-27 05:51:41 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-27 05:52:05 model_runner.py:1456] Graph capturing finished in 24 secs.


Processed prompts: 100%|██████████| 1/1 [00:06<00:00,  6.53s/it, est. speed input: 10.57 toks/s, output: 94.64 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:04<00:00,  4.39s/it, est. speed input: 16.17 toks/s, output: 96.10 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:05<00:00,  5.78s/it, est. speed input: 12.45 toks/s, output: 96.32 toks/s]

Quantized Model with vLLM Evaluation Results
Average Inference Time: 5.57 seconds
Average Tokens/Second: 95.40
Average Time to First Token: 0.0425 seconds

Full Outputs:

----------------------------------------
Sample 1:
Instruction: 서울의 유명한 관광 코스를 만들어줄래?
Generated Text:
서울의 유명한 관광 코스를 만들어 드리겠습니다. 서울은 다양한 문화, 역사, 자연, 쇼핑, 음식 등 다양한 관광 자원이 풍부한 도시이기 때문에, 다양한 주제를 고려하여 코스를 구성할 수 있습니다. 다음과 같은 코스를 추천합니다:

### 코스 1: 역사와 문화

1. **경복궁**: 조선시대의 궁전으로, 서울의 가장 오래된 궁전 중 하나입니다.
2. **창덕궁**: 조선시대의 궁전으로, 특히 '창덕궁 옹의 전망대'에서 서울의 전경을 볼 수 있습니다.
3. **인사동**: 전통 문화예술을 체험할 수 있는 곳으로, 다양한 전통 시장과 문화 공간이 있습니다.
4. **국립중앙박물관**: 한국의 역사, 미술, 과학을 전시하는 국립 박물관입니다.
5. **서울역사박물관**: 서울의 역사와 문화를 체험할 수 있는 박물관입니다.

### 코스 2: 자연과 녹지

1. **남산국립공원**: 서울에서 가장 큰 공원으로, 숲길을 걸으며 휴식을 취할 수 있습니다.
2. **서울숲**: 도시 내에서 자연을 느낄 수 있는 공간으로, 다양한 생태관광을 즐길 수 있습니다.
3. **한강시민공원**: 한강변의 녹지공원으로, 걷기, 자전거 타기 등 다양한 활동을 즐길 수 있습니다.
4. **서울랜드**: 주말에는 다양한 이벤트와 공연이 열리는 곳으로, 가족과 함께 즐길 수 있습니다.

### 코스 3: 쇼핑과 음식

1. **명동**: 쇼핑과 음식을 즐길 수 있는 명동 상가입니다.
2. **홍대시장**: 전통 


