In [1]:
!pip install autoawq



In [2]:
!pip show torch transformers

Name: torch
Version: 2.3.1
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, jinja2, networkx, nvidia-cublas-cu12, nvidia-cuda-cupti-cu12, nvidia-cuda-nvrtc-cu12, nvidia-cuda-runtime-cu12, nvidia-cudnn-cu12, nvidia-cufft-cu12, nvidia-curand-cu12, nvidia-cusolver-cu12, nvidia-cusparse-cu12, nvidia-nccl-cu12, nvidia-nvtx-cu12, sympy, triton, typing-extensions
Required-by: accelerate, autoawq, autoawq_kernels, fastai, torchaudio, torchvision
---
Name: transformers
Version: 4.45.1
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transfo

In [3]:
!pip install --upgrade torch transformers

Collecting torch
  Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl.metadata (26 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting triton==3.0.0 (from torch)
  Downloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)
Downloading torch-2.4.1-cp310-cp310-manylinux1_x86_64.whl (797.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m797.1/797.1 MB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl (664.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading triton-3.0.0-1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (209.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.4/209.4 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00

In [4]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
from threading import Thread
import numpy as np

def test_quantized_model():
    model_id = "ohmyhong/llama-3-Korean-Bllossom-8B-awq"

    # 양자화된 모델과 토크나이저 로드
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    model.eval()

    PROMPT = '''You are a helpful AI assistant. Please answer the user's questions kindly. 당신은 유능한 AI 어시스턴트 입니다. 사용자의 질문에 대해 친절하게 답변해주세요.'''
    instructions = [
        "서울의 유명한 관광 코스를 만들어줄래?",
        "한국 전통 음식 중 외국인에게 추천할 만한 것은?",
        "기후 변화에 대응하기 위해 개인이 할 수 있는 일은?"
    ]

    results = []

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    for instruction in instructions:
        messages = [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": instruction}
        ]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

        start_time = time.time()
        ttft = None
        generated_text = ""

        def generation():
            with torch.no_grad():
                model.generate(
                    input_ids,
                    max_new_tokens=2048,
                    eos_token_id=terminators,
                    do_sample=True,
                    temperature=0.6,
                    top_p=0.9,
                    repetition_penalty=1.1,
                    no_repeat_ngram_size=3,
                    streamer=streamer
                )

        thread = Thread(target=generation)
        thread.start()

        for new_text in streamer:
            if ttft is None:
                ttft = time.time() - start_time
            generated_text += new_text

        thread.join()
        end_time = time.time()

        inference_time = end_time - start_time
        token_count = len(tokenizer.encode(generated_text))
        tokens_per_second = token_count / inference_time

        result = {
            "instruction": instruction,
            "generated_text": generated_text,
            "inference_time": inference_time,
            "token_count": token_count,
            "tokens_per_second": tokens_per_second,
            "ttft": ttft
        }
        results.append(result)

    return results


# 테스트 실행
results = test_quantized_model()

print(f"{'='*50}")
print("Original Model Evaluation Results")
print(f"{'='*50}")

# print(f"Model Size: {model_size:.2f} GB")

avg_inference_time = np.mean([r["inference_time"] for r in results])
avg_tokens_per_second = np.mean([r["tokens_per_second"] for r in results])
avg_ttft = np.mean([r["ttft"] for r in results])

print(f"Average Inference Time: {avg_inference_time:.2f} seconds")
print(f"Average Tokens/Second: {avg_tokens_per_second:.2f}")
print(f"Average Time to First Token: {avg_ttft:.4f} seconds")

print("\nFull Outputs:")
for idx, result in enumerate(results, 1):
    print(f"\n{'-'*40}")
    print(f"Sample {idx}:")
    print(f"Instruction: {result['instruction']}")
    print(f"Generated Text:\n{result['generated_text']}")
    print(f"Inference Time: {result['inference_time']:.2f} seconds")
    print(f"Token Count: {result['token_count']}")
    print(f"Tokens/Second: {result['tokens_per_second']:.2f}")
    print(f"Time to First Token: {result['ttft']:.4f} seconds")

print("\nTest completed.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.


model.safetensors.index.json:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's 

Original Model Evaluation Results
Average Inference Time: 27.35 seconds
Average Tokens/Second: 15.87
Average Time to First Token: 0.4035 seconds

Full Outputs:

----------------------------------------
Sample 1:
Instruction: 서울의 유명한 관광 코스를 만들어줄래?
Generated Text:
물론입니다! 서울은 세계적으로 유명하고 다양한 문화, 역사, 자연 등을 경험할 수 있는 도시입니다. 다음은 서울의 주요 관광코스로 추천하는 순서입니다:

### 1. 경복궁 (Gyeongbokgung Palace)
- **주소:** 서울 중구 새문안로1
- **설명:** 조선 왕조 최초의 궁전으로, 한국의 전통과 역사를 느낄 수 있습니다.

### 2. 창덕고등학교 (Changdeokguk School)
-   **주소:** 서울특별시 종로구 창덕가길 21
-   **설정:** 조선 시대의 교육 기관으로, 역사적 가치가 높습니다.

###   3. 불국사 (Bulguksa Temple)
    - **주소:**
        + 부산광역시 금산면 일산리 707 (본원)
        + 서울시 동대문구 서강로 7 (분당벼룩아동박물관)
    -
    -   ** 설명:** 불국사는 대한민국에서 가장 중요한 불교 사찰로, 건축과 미술 모두 뛰어납니다.

###     4. 남산 (Namsan Park)
    -

    -  **주소**: 서울특별시 용산구 남산5단지10번길
    -  
    -    **설정이:** 남산에는 여러 명소들이 있으며, 특히 남산타워(N Seoul Tower)는 사랑을 기념하기 위한 곳으로 유명합니다.

###       5. 홍dae University Street
    -

      -  **
        **주소**
        : 서울특별시 마포구 서

In [None]:
import time
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import numpy as np

def test_quantized_model():
    model_id = "ohmyhong/llama-3-Korean-Bllossom-8B-awq"

    # 양자화된 모델과 토크나이저 로드
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True
    )
    model.eval()

    PROMPT = '''You are a helpful AI assistant. Please answer the user's questions kindly. 당신은 유능한 AI 어시스턴트 입니다. 사용자의 질문에 대해 친절하게 답변해주세요.'''
    instructions = [
        "서울의 유명한 관광 코스를 만들어줄래?",
        "한국 전통 음식 중 외국인에게 추천할 만한 것은?",
        "기후 변화에 대응하기 위해 개인이 할 수 있는 일은?"
    ]

    results = []

    terminators = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>")
    ]

    for instruction in instructions:
        messages = [
            {"role": "system", "content": PROMPT},
            {"role": "user", "content": instruction}
        ]

        input_ids = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            return_tensors="pt"
        ).to(model.device)

        start_time = time.time()
        outputs = model.generate(
            input_ids,
            max_new_tokens=2048,
            eos_token_id=terminators,
            do_sample=True,
            temperature=0.6,
            top_p=0.9
        )
        end_time = time.time()

        generated_text = tokenizer.decode(outputs[0][input_ids.shape[-1]:], skip_special_tokens=True)

        inference_time = end_time - start_time
        token_count = len(tokenizer.encode(generated_text))
        tokens_per_second = token_count / inference_time

        result = {
            "instruction": instruction,
            "generated_text": generated_text,
            "inference_time": inference_time,
            "token_count": token_count,
            "tokens_per_second": tokens_per_second
        }
        results.append(result)

    return results

# 테스트 실행
results = test_quantized_model()

print(f"{'='*50}")
print("Quantized Model Evaluation Results (without vLLM)")
print(f"{'='*50}")

avg_inference_time = np.mean([r["inference_time"] for r in results])
avg_tokens_per_second = np.mean([r["tokens_per_second"] for r in results])

print(f"Average Inference Time: {avg_inference_time:.2f} seconds")
print(f"Average Tokens/Second: {avg_tokens_per_second:.2f}")

print("\nFull Outputs:")
for idx, result in enumerate(results, 1):
    print(f"\n{'-'*40}")
    print(f"Sample {idx}:")
    print(f"Instruction: {result['instruction']}")
    print(f"Generated Text:\n{result['generated_text']}")
    print(f"Inference Time: {result['inference_time']:.2f} seconds")
    print(f"Token Count: {result['token_count']}")
    print(f"Tokens/Second: {result['tokens_per_second']:.2f}")

print("\nTest completed.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
We suggest you to set `torch_dtype=torch.float16` for better efficiency with AWQ.


model.safetensors.index.json:   0%|          | 0.00/63.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.68G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.05G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/172 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's 

Quantized Model Evaluation Results (without vLLM)
Average Inference Time: 34.95 seconds
Average Tokens/Second: 15.94

Full Outputs:

----------------------------------------
Sample 1:
Instruction: 서울의 유명한 관광 코스를 만들어줄래?
Generated Text:
서울의 유명한 관광 코스를 만들어보겠습니다. 이 코스는 서울의 주요 관광지와 역사적인 지역을 포함합니다. 이 코스는 서울시내 주요 교통 노선으로 이동할 수 있게 구성되어 있습니다.

### 코스 1: 역사와 문화

1. **광화문** (Gwanghwamun Square)
   - 광화문은 한국의 역사와 문화를 대표하는 장소입니다. 광화문 광장에는 광화문 광불, 광화문 사자, 광화문 광등 등 다양한 문화유산이 있습니다.
2. **창경궁** (Changdeokgung Palace)
   - 창경궁은 조선시대의 궁궐 중 하나로, 한국의 역사와 궁궐 문화를 느낄 수 있는 곳입니다.
3. **인사동** (Insa-dong)
   - 인사동은 서울의 대표적인 문화 관광지로, 전통의상, 고유의품, 예술 작품 등을 판매하는 작은 상점들이 있습니다.
4. **노학사** (Namsan Library)
   - 노학사는 서울의 대표적인 도서관으로, 다양한 책과 자료를 제공합니다. 또한, 주변에 있는 노학사 공원은 휴식과 명소를 제공합니다.

### 코스 2: 자연과 엔터테인먼트

1. **서울랜드** (Seoul Land)
   - 서울랜드는 가족과 함께 즐길 수 있는 테마파크입니다. 다양한 놀이기구와 이벤트가 있습니다.
2. **숭산 자연사박물관** (National Museum of Natural History)
   - 숭산 자연사박물관은 자연과 생명에 대한 이해를 높일 수 있는 곳입니다. 다양한 생물 표본과 영상을 제공합니다.
3. **한강시민공원** (Han