In [None]:
# Authors: Fluid Numerics LLC
#          Garrett Byrd             (garrett@fluidnumerics.com)
#          Dr. Joseph Schoonover    (joe@fluidnumerics.com)

In [None]:
import transformers
import torch

In [None]:
# Confirm the correct device is being used
# E.g. 'AMD Instinct MI210'
#
# torch.cuda.get_device_name() raises when no CUDA/ROCm device is visible,
# so check availability first; the notebook falls back to the CPU later on.
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_name(0)
else:
    device_name = "none detected (running on CPU)"
print(f"Device name: {device_name}")

In [None]:
# Path to the local model directory.
path_to_model = "/home/garrett/amd/misc/Meta-Llama-3-8B-Instruct"

# If not using a local model, this can be set as the name of a model on hugging face, e.g.
# path_to_model = "meta-llama/Meta-Llama-3-8B-Instruct"
# https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct

# Report which device PyTorch sees: 'cuda' for ROCm GPUs, else 'cpu'.
# This value is only printed as a sanity check; the actual placement of the
# model is handled by device_map="auto" in the pipeline call below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# verify the device is set to 'cuda'
print(f"--------\nDevice: {device}\n--------\n")

# AutoTokenizer is a generic tokenizer class that will be instantiated as one of the tokenizer classes
# of the library when created with the AutoTokenizer.from_pretrained(pretrained_model_name_or_path) class method.
# https://huggingface.co/transformers/v3.0.2/model_doc/auto.html#transformers.AutoTokenizer

# Instantiate one of the tokenizer classes of the library from a pre-trained model vocabulary.
tokenizer = transformers.AutoTokenizer.from_pretrained(path_to_model)

# A pipeline tokenizes the input, feeds it to the model, and generates output.
# https://huggingface.co/docs/transformers/en/main_classes/pipelines
pipeline = transformers.pipeline(
    "text-generation",          # What type of model are we running?
    model=path_to_model,        # path to local model
    tokenizer=tokenizer,        # reuse the tokenizer loaded above instead of loading it a second time
    torch_dtype=torch.float16,  # set precision of tensors used by the model
    device_map="auto",          # uses the 'accelerate' package
)

# Provide an input and generate a response
prompt = 'I like listening to Snarky Puppy and Frank Zappa. What are some other musicians I might like?\n'

# Arguments passed to the pipeline call below:
#
# text_inputs:
#       the input text
#
# do_sample:
#       if False:
#           Use "greedy selection";
#           do not sample a token probabilistically,
#           always use the token most likely to come next.
#       if True:
#           Sample a token probabilistically.
#
# top_k:
#       When generating a token,
#       only sample the tokens with the 'top_k' highest probabilities.
#
# num_return_sequences:
#       How many token sequences (responses) to generate.
#
# eos_token_id:
#       What is the ID of the end of sequence token?
#       This can differ per model (e.g. in Llama 2 it is "2"),
#       so it is important to read it from the tokenizer rather than hard-coding it.
#
# https://huggingface.co/docs/transformers/en/internal/generation_utils
sequences = pipeline(
    text_inputs=prompt,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

# For more about big models (the device_map / 'accelerate' machinery used when loading the pipeline)
# https://huggingface.co/docs/accelerate/en/usage_guides/big_modeling

# Print the response.
for seq in sequences:
    print(f"\nResult:\n{seq['generated_text']}")

In [None]:
# Ask the model about AMD, sampling each token from the 10 most probable candidates.
generation_options = {
    "do_sample": True,
    "top_k": 10,
    "num_return_sequences": 1,
    "eos_token_id": tokenizer.eos_token_id,
}
sequences = pipeline('Can you tell me about AMD?\n', **generation_options)

# Show every generated response.
for response in sequences:
    generated_text = response['generated_text']
    print(f"\nResult:\n{generated_text}")

In [None]:
# Ask the model about Fluid Numerics, sampling each token from the 10 most probable candidates.
generation_options = dict(
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)
sequences = pipeline(
    'Can you tell me about the company Fluid Numerics?\n',
    **generation_options,
)

# Show every generated response.
for response in sequences:
    generated_text = response['generated_text']
    print(f"\nResult:\n{generated_text}")

$\Huge\text{softmax}(\textbf{x}, T)_i = \frac{\exp(x_i/T)}{\sum_{n=1}^{N}\exp(x_n/T)}$

In [None]:
# In the formula above,
#       x is a vector,
#       T is the temperature.
#       I.e., softmax turns an arbitrary vector into a probability distribution.

# temperature:
#       Controls how random the token selection is.
#
#       As the temperature tends to zero,
#       sampling behaves like greedy selection.
#
#       As the temperature tends to infinity,
#       the probability distribution flattens towards
#       a uniform distribution.
#
#       transformers uses a temperature of 1 (one) unless the call
#       or the model's generation config says otherwise.

# Sampling settings shared by every run of the sweep.
sampling_options = dict(
    max_new_tokens=200,
    do_sample=True,
    top_k=10,
    num_return_sequences=1,
    eos_token_id=tokenizer.eos_token_id,
)

# Sweep the temperature over 10^-1, 10^-0.5, 10^0, 10^0.5 and 10^1.
for temp in (10.0 ** (half_exponent / 2) for half_exponent in range(-2, 3)):
    sequences = pipeline(
        'What is two plus two?',
        temperature=temp,
        **sampling_options,
    )

    print(f"\n-------- temperature = {temp}")
    for response in sequences:
        generated_text = response['generated_text']
        print(f"\nResult:\n{generated_text}")