In [1]:
import os
import sys

# PyTorch
import torch

# 🤗 Hugging Face transformers
# https://huggingface.co/docs/transformers/index
from transformers import (
    AutoTokenizer,
    BitsAndBytesConfig,
    LlamaForCausalLM,
    pipeline,
)

In [3]:
# Confirm that PyTorch can see a GPU *before* querying it: calling
# torch.cuda.get_device_name() without a usable device raises, which would
# hide the friendlier message below.
if not torch.cuda.is_available():
    print("No CUDA device found.")
    sys.exit()

# set device to 'cuda' for ROCm GPUs
device = torch.device("cuda")

# Confirm the correct device is being used
# E.g. 'AMD Radeon Pro w7800'
print(f"Device name: {torch.cuda.get_device_name(0)}")

# verify the device is set to 'cuda'
print(f"Device: {device}")

In [4]:
# Location of the fine-tuned checkpoint, relative to the directory the
# notebook is launched from.
checkpoint_subdir = "fine-tuning-llama-3/Llama-Math-Single-GPU/checkpoint-8000"

# Absolute path to the local model, anchored at the current working directory.
path_to_model = os.getcwd() + "/" + checkpoint_subdir
print(path_to_model)

In [5]:
# Load the tokenizer stored with the fine-tuned checkpoint.
my_tokenizer = AutoTokenizer.from_pretrained(path_to_model)

# bitsandbytes 4-bit quantization settings applied while the weights are loaded.
fp4_config = BitsAndBytesConfig(
    load_in_4bit=True,  # Use 4bit quantization
    bnb_4bit_quant_type="fp4",  # Use FP4 datatype ("nf4" alternative)
    bnb_4bit_use_double_quant=True,  # Nested quantization
    bnb_4bit_compute_dtype=torch.float16,  # Computational type might be different than input type
    bnb_4bit_quant_storage=torch.float16,  # dtype used to store the packed 4-bit weights
)

# device_map is a model-loading option, so it is given here rather than to
# pipeline(): once a model object already exists, pipeline() does not use
# device_map to (re)place its weights.
my_model = LlamaForCausalLM.from_pretrained(
    path_to_model,
    quantization_config=fp4_config,
    device_map="auto",
)

# Wrap the already-placed, quantized model and its tokenizer for text generation.
adapted_pipeline = pipeline(
    "text-generation",
    model=my_model,
    tokenizer=my_tokenizer,
)

# Fixed seed so the sampled completions below are the same on every
# Restart & Run All; change the value to explore different samples.
SEED = 42
torch.manual_seed(SEED)

# Word problem sent to the fine-tuned model.
prompt = (
    "Johnny has three apples. Jane has fourteen oranges. "
    "Jane says that she will trade three oranges for one apple. "
    "What is the maximum number of oranges that Johnny could trade for?"
)

# Sample ten candidate completions with top-k sampling.
sequences = adapted_pipeline(
    text_inputs=prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=10,
    eos_token_id=my_tokenizer.eos_token_id,
    max_new_tokens=200,
)

# Print each sampled completion with its index.
for i, seq in enumerate(sequences):
    print(f"sequence: {i}")
    print(seq["generated_text"])