In [1]:
!pip install flash-attn transformers accelerate

import time
from datetime import timedelta

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.utils import is_flash_attn_2_available

# Seed torch's global RNG so any sampling done later is repeatable.
torch.random.manual_seed(0)

# Single source of truth for the checkpoint so the model and tokenizer cannot
# drift apart.
MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# `device_map="cuda"` already places the weights on the GPU while loading, so
# no follow-up `.to("cuda")` is needed.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
    # attn_implementation="flash_attention_2",  # uncomment to request FlashAttention-2
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Reports whether the flash-attn backend could be used; it does not enable it.
print("flash_attn_2 available:", is_flash_attn_2_available())



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


flash_attn_2 available: True


In [10]:
def gen(text, max_new_tokens=1024):
    """Generate a reply to ``text`` and print wall-clock timing statistics.

    The timing covers prompt tokenization, generation and decoding.

    Args:
        text: User message; it is inserted into the ``<|user|>`` /
            ``<|assistant|>`` prompt template below.
        max_new_tokens: Upper bound on the number of generated tokens.
            Defaults to 1024, the value that used to be hard-coded.

    Returns:
        The decoded generated text, without the prompt and without a
        trailing stop token (if generation ended on one).
    """
    duration_start = time.perf_counter()
    prompt = "<|user|>\n{} <|end|>\n<|assistant|>".format(text)
    # Follow the model's device instead of assuming "cuda".
    tokens = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        tokens, max_new_tokens=max_new_tokens, return_dict_in_generate=True
    )
    # Keep only what was generated after the prompt.
    output_gen_tokens = outputs.sequences[0][tokens.shape[-1] :]

    # Drop the final token only when it really is a stop token. When generation
    # is cut off by max_new_tokens the last token is real content and must be
    # kept. Presumably <|end|> is among the configured EOS ids for this
    # checkpoint -- TODO confirm against its generation config.
    eos_ids = model.generation_config.eos_token_id
    if eos_ids is None:
        stop_ids = set()
    elif isinstance(eos_ids, int):
        stop_ids = {eos_ids}
    else:
        stop_ids = set(eos_ids)
    if len(output_gen_tokens) > 0 and output_gen_tokens[-1].item() in stop_ids:
        output_gen_tokens = output_gen_tokens[:-1]

    output_string = tokenizer.decode(output_gen_tokens)
    duration_seconds = time.perf_counter() - duration_start

    num_tokens = len(output_gen_tokens)
    # Guard against an empty generation: per-token latency is undefined then.
    if num_tokens:
        per_token = timedelta(seconds=duration_seconds / num_tokens)
    else:
        per_token = "n/a"
    print(
        "== took {} ({} toks: {}/tok; {} toks/sec) ==".format(
            timedelta(seconds=duration_seconds),
            num_tokens,
            per_token,
            num_tokens / duration_seconds,
        )
    )
    return output_string


# Run the benchmark prompts in order, printing each reply after its timing line.
for question in (
    "What is the closest star to the Sun?",
    "What is the difference between hue, saturation, and value?",
    "Where is Waldo?",
):
    print(gen(question))

took 0:00:04.113437 (113 toks: 0:00:00.036402/tok; 27.470943151399975 toks/sec)
The closest star to the Sun is Proxima Centauri. It is part of the Alpha Centauri star system, which also includes Alpha Centauri A and Alpha Centauri B. Proxima Centauri is approximately 4.24 light-years away from the Sun. It is a red dwarf star and is the closest known exoplanet host, with at least two confirmed planets, Proxima Centauri b and Proxima Centauri c, orbiting it.
took 0:00:16.295132 (442 toks: 0:00:00.036867/tok; 27.12466469573394 toks/sec)
Hue, saturation, and value are three components of the color model known as HSV (Hue, Saturation, Value), which is also referred to as HSB (Hue, Saturation, Brightness) in some contexts. These components are used to describe and manipulate colors in digital imaging and color theory. Here's a breakdown of each component:

1. Hue: Hue refers to the color itself, or the dominant wavelength of light that determines the color we perceive. It is measured in degr