In [1]:
import transformers
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Identifier of the checkpoint handed to both `from_pretrained` calls below.
model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"

# Ask bitsandbytes to load the weights in 4-bit, with bfloat16 as the
# configured compute dtype.
# Reference: https://huggingface.co/blog/4bit-transformers-bitsandbytes
quantization_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_compute_dtype=torch.bfloat16)

# Load the quantized causal LM with device_map="cuda".
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=quantization_config,
    device_map="cuda",
)

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_id)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [2]:
# Wrap the already-instantiated `model` and `tokenizer` in a text-generation
# pipeline.
#
# `torch_dtype` and `device_map` are intentionally NOT passed: they are
# load-time options that only matter when the pipeline itself loads a model
# from a name/path. `model` is an object that was created earlier with its own
# quantization config and device_map, so repeating (different) values here
# would be ignored and would only mislead the reader about the precision and
# placement actually in use.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

In [3]:
def query_llama(prompt, max_length=400, top_k=10, num_return_sequences=1):
    """Sample text for ``prompt`` from the notebook-level ``pipeline`` and print it.

    The prompt is sent as raw text with a single newline appended; it is not
    wrapped in any chat/message structure.

    Args:
        prompt: Text to send to the pipeline.
        max_length: Forwarded to the pipeline as ``max_length``. Defaults to
            400, the value that used to be hard-coded.
        top_k: Forwarded to the pipeline as ``top_k`` (sampling is always
            enabled via ``do_sample=True``). Defaults to 10.
        num_return_sequences: Forwarded to the pipeline as
            ``num_return_sequences``. Defaults to 1.

    Returns:
        None. Every sequence returned by the pipeline is printed as
        ``Result: <generated_text>``.
    """
    eos_id = tokenizer.eos_token_id

    sequences = pipeline(
        f'{prompt}\n',
        do_sample=True,
        top_k=top_k,
        num_return_sequences=num_return_sequences,
        eos_token_id=eos_id,
        # Pass the padding id explicitly (same value as the EOS id) so that
        # generation does not have to pick a fallback and log the
        # "Setting `pad_token_id` to `eos_token_id`" notice on every call.
        pad_token_id=eos_id,
        truncation=True,
        max_length=max_length,
    )

    for seq in sequences:
        print(f"Result: {seq['generated_text']}")

In [4]:
# Try the helper on a short sample prompt; the generated text is printed below.
query_llama(prompt="Dong ngu ngon?")

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


Result: Dong ngu ngon?
Dong ngu ngon? - 1
Dong ngu ngon? - 2
Dong ngu ngon? - 3
Dong ngu ngon? - 4
Dong ngu ngon? - 5
Dong ngu ngon? - 6
Dong ngu ngon? - 7
Dong ngu ngon? - 8
Dong ngu ngon? - 9
Dong ngu ngon? - 10
Dong ngu ngon? - 11
Dong ngu ngon? - 12
Dong ngu ngon? - 13
Dong ngu ngon? - 14
Dong ngu ngon? - 15
Dong ngu ngon? - 16
Dong ngu ngon? - 17
Dong ngu ngon? - 18
Dong ngu ngon? - 19
Dong ngu ngon? - 20
Dong ngu ngon? - 21
Dong ngu ngon? - 22
Dong ngu ngon? - 23
Dong ngu ngon? - 24
Dong ngu ngon? - 25
Dong ngu ngon? - 26
Dong ngu ngon? - 27
Dong ngu ngon? - 28
Dong ngu ngon? - 29
Dong ngu ngon? - 30
Dong ngu ngon? - 31
Dong ngu ngon? - 32
Dong ngu ngon? - 33
Dong ngu ngon? - 34
Dong ngu ngon? - 35
Dong ngu ngon? - 36
Dong ngu ngon? - 37
Dong ngu ngon? - 38
Dong ngu ngon? - 39
Dong ngu ngon? - 40
Dong ngu ngon? - 41
Dong ngu ngon? - 42
Dong ngu ngon? - 43
Dong ngu ngon? - 
