# Install requirements

In [None]:
%%capture
%pip install -U pip
%pip install torch==2.1.2 torchvision==0.16.2 torchaudio==2.1.2 --index-url https://download.pytorch.org/whl/cu118
%pip install transformers==4.41.2
%pip install bitsandbytes==0.43.1

# Load Llama 3 8B Instruct

Log in to Hugging Face (this is required to download the model).

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with Hugging Face before the model
# download further down is attempted.
notebook_login()

Load the model and the tokenizer, using a BitsAndBytes configuration for 4-bit quantization.

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# Hugging Face Hub identifier of the checkpoint used throughout the notebook.
model_path = "meta-llama/Meta-Llama-3-8B-Instruct"

# 4-bit NF4 quantization with double quantization enabled; bfloat16 is the
# dtype used for computation.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Tokenizer: reuse the end-of-sequence token as the padding token.
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# Quantized model, with device placement delegated to `device_map="auto"`.
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
)

# Generate User Stories

Prepare the prompt. You have to use the [Meta Llama 3 Instruct prompt format](https://llama.meta.com/docs/model-cards-and-prompt-formats/meta-llama-3).

The tokenized prompt tensor should contain these special tokens (token = ID):
- `<|begin_of_text|>` = 128000
- `<|start_header_id|>` = 128006
- `<|end_header_id|>` = 128007
- `<|eot_id|>` = 128009

In [None]:
from transformers import set_seed

# Seed the random number generators so the sampled generations that follow are
# reproducible when the notebook is run top to bottom.
set_seed(42)

user_prompt = (
    "Considering the following machine learning technique: "
    "nearest neighbour search in the field of machine learning. "
    "Can you provide me with specific user stories for the following application domains? "
    "Finance and Marketing"
)

# Conversation in chat-message form: a system instruction followed by the user turn.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant"},
    {"role": "user", "content": user_prompt},
]

# Tokenize the conversation with the tokenizer's chat template and return a
# PyTorch tensor of token ids. The tensor is moved to the device the model was
# actually placed on (instead of a hard-coded "cuda") so the inputs follow
# whatever `device_map = "auto"` decided.
model_inputs = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)

# Number of prompt tokens; used later to slice the prompt off the generated ids.
input_length = model_inputs.shape[1]

Generate some tokens.

In [None]:
# Sample a completion for the prepared prompt.
# `attention_mask` and `pad_token_id` are passed explicitly so `generate` does
# not have to guess them: the batch holds one unpadded prompt, so every position
# is attended to (all ones), and padding uses the tokenizer's pad token.
generated_ids = model.generate(
    input_ids=model_inputs,
    attention_mask=torch.ones_like(model_inputs),
    pad_token_id=tokenizer.pad_token_id,
    min_new_tokens=0,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
)

# Decode only the newly generated part (everything after the prompt tokens),
# keeping special tokens visible in the printed text.
print(tokenizer.batch_decode(generated_ids[:, input_length:], skip_special_tokens=False)[0])


Use [TextStreamer](https://huggingface.co/docs/transformers/internal/generation_utils#transformers.TextStreamer) to print the token(s) to stdout as soon as entire words are formed.

In [None]:
from transformers import TextStreamer

# Stream decoded text to stdout during generation; `skip_prompt = True` keeps the
# prompt itself out of the streamed output.
streamer = TextStreamer(tokenizer, skip_prompt=True)

# Same sampling settings as the non-streaming call. `attention_mask` and
# `pad_token_id` are passed explicitly so `generate` does not have to guess
# them: the batch holds one unpadded prompt, so the mask is all ones.
# The returned ids are discarded because the streamer already prints the text.
_ = model.generate(
    input_ids=model_inputs,
    attention_mask=torch.ones_like(model_inputs),
    pad_token_id=tokenizer.pad_token_id,
    min_new_tokens=0,
    max_new_tokens=1024,
    do_sample=True,
    temperature=0.7,
    top_k=50,
    top_p=0.9,
    streamer=streamer,
)

