In [1]:
!pip install vllm

Collecting vllm
  Downloading vllm-0.6.2-cp38-abi3-manylinux1_x86_64.whl.metadata (2.4 kB)
Collecting transformers>=4.45.0 (from vllm)
  Downloading transformers-4.45.1-py3-none-any.whl.metadata (44 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
Collecting openai>=1.40.0 (from vllm)
  Downloading openai-1.50.0-py3-none-any.whl.metadata (24 kB)
Collecting uvicorn[standard] (from vllm)
  Downloading uvicorn-0.30.6-py3-none-any.whl.metadata (6.6 kB)
Collecting prometheus-fastapi-instrumentator>=7.0.0 (from vllm)
  Downloading prometheus_fastapi_instrumentator-7.0.0-py3-none-any.whl.metadata (13 kB)
Collecting tiktoken>=0.6.0 (from vllm)
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting lm-format-enforcer==0.10.6 (from vllm)
  Downloadi

In [2]:
import time
import torch
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import numpy as np

def test_original_model_with_vllm(
    model_id="MLP-KTLim/llama-3-Korean-Bllossom-8B",
    instructions=None,
):
    """Benchmark a model served through vLLM, one prompt at a time.

    Each instruction is wrapped in a chat template together with a fixed
    system prompt, sent to ``llm.generate`` as a single-request batch, and
    timed individually.

    Args:
        model_id: Hugging Face model identifier passed to both ``LLM`` and
            ``AutoTokenizer.from_pretrained``. Defaults to the original
            Bllossom 8B checkpoint.
        instructions: Optional iterable of user prompts. When ``None``, the
            three built-in Korean sample questions are used.

    Returns:
        A list with one dict per instruction containing the keys
        ``instruction``, ``generated_text``, ``inference_time`` (seconds),
        ``token_count``, ``tokens_per_second`` and ``ttft`` (seconds between
        request arrival and first token, or NaN when vLLM reports no timing).
    """
    # Load the original model with vLLM, plus its tokenizer for chat templating.
    llm = LLM(model=model_id, trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    PROMPT = '''You are a helpful AI assistant. Please answer the user's questions kindly. 당신은 유능한 AI 어시스턴트 입니다. 사용자의 질문에 대해 친절하게 답변해주세요.'''
    if instructions is None:
        instructions = [
            "서울의 유명한 관광 코스를 만들어줄래?",
            "한국 전통 음식 중 외국인에게 추천할 만한 것은?",
            "기후 변화에 대응하기 위해 개인이 할 수 있는 일은?"
        ]

    sampling_params = SamplingParams(
        temperature=0.6,
        top_p=0.95,
        max_tokens=2048,
        stop=[tokenizer.eos_token, "<|eot_id|>"]
    )

    results = []

    for instruction in instructions:
        messages = [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": instruction}
        ]
        # NOTE(review): the rendered template string is tokenized again by
        # vLLM; confirm this does not produce a duplicated BOS token.
        prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

        # perf_counter is monotonic, so the interval cannot be skewed by
        # system clock adjustments the way time.time() can.
        start_time = time.perf_counter()
        outputs = llm.generate([prompt], sampling_params)
        end_time = time.perf_counter()

        request_output = outputs[0]
        completion = request_output.outputs[0]
        generated_text = completion.text
        metrics = request_output.metrics

        # Time to first token; vLLM may leave the timing fields unset, in
        # which case NaN is reported instead of raising a TypeError.
        if metrics is not None and metrics.first_token_time is not None:
            ttft = metrics.first_token_time - metrics.arrival_time
        else:
            ttft = float("nan")

        inference_time = end_time - start_time
        # Count the tokens vLLM actually generated. Re-encoding the decoded
        # text can add special tokens and is not guaranteed to round-trip.
        token_count = len(completion.token_ids)
        if inference_time > 0:
            tokens_per_second = token_count / inference_time
        else:
            tokens_per_second = float("nan")

        result = {
            "instruction": instruction,
            "generated_text": generated_text,
            "inference_time": inference_time,
            "token_count": token_count,
            "tokens_per_second": tokens_per_second,
            "ttft": ttft
        }
        results.append(result)

    return results

# Run the benchmark once; everything below only reports on `results`.
results = test_original_model_with_vllm()

# Header banner.
print(
    f"{'='*50}",
    "Original Model with vLLM Evaluation Results",
    f"{'='*50}",
    sep="\n",
)

# Aggregate the per-sample metrics into simple means.
avg_inference_time = np.mean([sample["inference_time"] for sample in results])
avg_tokens_per_second = np.mean([sample["tokens_per_second"] for sample in results])
avg_ttft = np.mean([sample["ttft"] for sample in results])

print(
    f"Average Inference Time: {avg_inference_time:.2f} seconds",
    f"Average Tokens/Second: {avg_tokens_per_second:.2f}",
    f"Average Time to First Token: {avg_ttft:.4f} seconds",
    sep="\n",
)

# Per-sample details, numbered from 1.
print("\nFull Outputs:")
for idx, result in enumerate(results, 1):
    print(
        f"\n{'-'*40}",
        f"Sample {idx}:",
        f"Instruction: {result['instruction']}",
        f"Generated Text:\n{result['generated_text']}",
        f"Inference Time: {result['inference_time']:.2f} seconds",
        f"Token Count: {result['token_count']}",
        f"Tokens/Second: {result['tokens_per_second']:.2f}",
        f"Time to First Token: {result['ttft']:.4f} seconds",
        sep="\n",
    )

print("\nTest completed.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/710 [00:00<?, ?B/s]

INFO 09-27 06:34:33 llm_engine.py:226] Initializing an LLM engine (v0.6.1.dev238+ge2c6e0a82) with config: model='MLP-KTLim/llama-3-Korean-Bllossom-8B', speculative_config=None, tokenizer='MLP-KTLim/llama-3-Korean-Bllossom-8B', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, rope_scaling=None, rope_theta=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=0, served_model_name=MLP-KTLim/llama-3-Korean-Bllossom-8B, use_v2_block_manager=False, num_scheduler_ste

tokenizer_config.json:   0%|          | 0.00/51.1k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/444 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

INFO 09-27 06:34:37 model_runner.py:1014] Starting to load model MLP-KTLim/llama-3-Korean-Bllossom-8B...
INFO 09-27 06:34:37 weight_utils.py:242] Using model weights format ['*.safetensors']


model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]


INFO 09-27 06:35:37 model_runner.py:1025] Loading model weights took 14.9595 GB
INFO 09-27 06:35:39 gpu_executor.py:122] # GPU blocks: 9689, # CPU blocks: 2048
INFO 09-27 06:35:41 model_runner.py:1329] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 09-27 06:35:41 model_runner.py:1333] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
INFO 09-27 06:36:05 model_runner.py:1456] Graph capturing finished in 24 secs.


Processed prompts: 100%|██████████| 1/1 [00:10<00:00, 10.22s/it, est. speed input: 6.75 toks/s, output: 68.71 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:07<00:00,  7.68s/it, est. speed input: 9.24 toks/s, output: 69.00 toks/s]
Processed prompts: 100%|██████████| 1/1 [00:09<00:00,  9.38s/it, est. speed input: 7.68 toks/s, output: 69.02 toks/s]

Original Model with vLLM Evaluation Results
Average Inference Time: 9.10 seconds
Average Tokens/Second: 68.79
Average Time to First Token: 0.0398 seconds

Full Outputs:

----------------------------------------
Sample 1:
Instruction: 서울의 유명한 관광 코스를 만들어줄래?
Generated Text:
물론입니다! 서울은 다양한 문화, 역사, 자연, 그리고 현대적인 매력을 동시에 자랑하는 도시로, 많은 관광객들이 찾는 인기 있는 여행지입니다. 아래는 서울의 유명한 관광 코스입니다.

### 1단계: 역사와 문화 탐방
1. **경복궁 (Gyeongbokgung Palace)**: 조선 시대의 대표적인 궁궐로, 조선 왕조의 역사와 건축 양식을 체험할 수 있습니다.
2. **창덕궁 (Changdeokgung Palace)**: 경복궁과 함께 조선 왕조의 궁궐로, 특별히 유명한 후원과 경회루를 방문해 보세요.
3. **북촌 한옥마을 (Bukchon Hanok Village)**: 전통 한옥이 있는 마을로, 조선 시대의 전통 주거지와 문화를 체험할 수 있습니다.

### 2단계: 현대적인 서울
1. **명동 (Myeongdong)**: 쇼핑과 음식으로 유명한 거리로, 다양한 브랜드 매장과 전통 시장, 그리고 다양한 음식점이 있습니다.
2. **인사동 (Insa-dong)**: 전통 예술과 문화를 체험할 수 있는 거리로, 전통 한옥과 갤러리, 전통 음식점이 많이 있습니다.
3. **홍대 (Hongdae)**: 젊은층을 중심으로 한 문화와 예술이 활성화된 지역으로, 스트리트 아티스트와 다양한 카페가 있습니다.

### 3단계: 자연과 휴식
1. **남산 서울타워 (Namsan Seoul Tower)**: 서울의 전경을 한눈에 볼 수 있는 전망대로, 산책로와 공원도 즐길 수 있습니다.
2. **


