In [1]:
# Upgrade pip itself before installing anything else.
%pip install -Uq pip
# Remove any existing optimum / optimum-intel (presumably so the versions installed below take effect).
%pip uninstall -q -y optimum optimum-intel
# Pre-release OpenVINO and OpenVINO Tokenizers, with the OpenVINO nightly wheel index as an extra source.
%pip install --pre -Uq openvino openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly
# Remaining dependencies; optimum-intel is installed from its GitHub repository and the PyTorch CPU wheel index is added as an extra source.
%pip install -q "torch>=2.1" "nncf>=2.7" "transformers>=4.36.0" onnx "optimum>=1.16.1" "accelerate" "datasets>=2.14.6" "gradio>=4.19" "git+https://github.com/huggingface/optimum-intel.git" --extra-index-url https://download.pytorch.org/whl/cpu

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
from pathlib import Path
import requests

# Fetch the shared helper module of the OpenVINO notebooks repository.
r = requests.get(
    url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/notebook_utils.py",
)
# Fail loudly on an HTTP error instead of saving an error page as a Python module.
r.raise_for_status()
# A context manager guarantees the file is flushed and closed before it is imported below;
# an explicit encoding keeps the written source independent of the platform default.
with open("notebook_utils.py", "w", encoding="utf-8") as utils_file:
    utils_file.write(r.text)
from notebook_utils import download_file

# The model catalogue lives in a separate config.py; download it only when it is not already present.
if not Path("./config.py").exists():
    download_file(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/notebooks/llm-question-answering/config.py")
from config import SUPPORTED_LLM_MODELS
import ipywidgets as widgets

In [3]:
# Every key of the downloaded model catalogue becomes a dropdown option.
model_ids = [*SUPPORTED_LLM_MODELS]

# The second catalogue entry is preselected.
model_id = widgets.Dropdown(
    description="Model:",
    options=model_ids,
    value=model_ids[1],
    disabled=False,
)

model_id

Dropdown(description='Model:', index=1, options=('tiny-llama-1b', 'phi-2', 'dolly-v2-3b', 'red-pajama-instruct…

In [4]:
# Look up the configuration entry of the model currently chosen in the dropdown.
model_configuration = SUPPORTED_LLM_MODELS[model_id.value]
print(f"Selected model {model_id.value}")

Selected model phi-2


In [5]:
# Repeats the lookup from the previous cell and shows the resulting configuration dict as the cell output.
model_configuration = SUPPORTED_LLM_MODELS[model_id.value]
model_configuration

{'model_id': 'susnato/phi-2',
 'prompt_template': 'Instruct:{instruction}\nOutput:'}

In [6]:
from IPython.display import display

# One checkbox per target precision; only INT4 is ticked by default.
prepare_int4_model = widgets.Checkbox(description="Prepare INT4 model", value=True, disabled=False)
prepare_int8_model = widgets.Checkbox(description="Prepare INT8 model", value=False, disabled=False)
prepare_fp16_model = widgets.Checkbox(description="Prepare FP16 model", value=False, disabled=False)

# Render the three checkboxes in order.
display(prepare_int4_model, prepare_int8_model, prepare_fp16_model)

Checkbox(value=True, description='Prepare INT4 model')

Checkbox(value=False, description='Prepare INT8 model')

Checkbox(value=False, description='Prepare FP16 model')

In [8]:
from pathlib import Path
import logging
import openvino as ov
import nncf
from optimum.intel.openvino import OVModelForCausalLM, OVWeightQuantizationConfig
import gc


# Only show NNCF messages at ERROR level.
nncf.set_log_level(logging.ERROR)

# Model id passed to from_pretrained for export, plus one output directory per precision,
# all placed under a folder named after the selected model key.
pt_model_id = model_configuration["model_id"]
fp16_model_dir = Path(model_id.value) / "FP16"
int8_model_dir = Path(model_id.value) / "INT8_compressed_weights"
int4_model_dir = Path(model_id.value) / "INT4_compressed_weights"

core = ov.Core()


def convert_to_fp16():
    """
    Export the selected model to OpenVINO IR with FP16 weights.

    Does nothing when `openvino_model.xml` is already present in `fp16_model_dir`.
    Otherwise the model is exported without 8-bit loading and without compilation,
    converted with `half()`, saved to `fp16_model_dir`, and released from memory.
    """
    already_exported = (fp16_model_dir / "openvino_model.xml").exists()
    if not already_exported:
        ov_model = OVModelForCausalLM.from_pretrained(
            pt_model_id,
            export=True,
            compile=False,
            load_in_8bit=False,
        )
        ov_model.half()
        ov_model.save_pretrained(fp16_model_dir)
        # Drop the exported model explicitly and collect garbage right away.
        del ov_model
        gc.collect()


def convert_to_int8():
    """
    Export the selected model to OpenVINO IR with 8-bit loading enabled.

    Does nothing when `openvino_model.xml` is already present in `int8_model_dir`.
    Otherwise the model is exported with `load_in_8bit=True` (uncompiled), saved to
    `int8_model_dir`, and released from memory.
    """
    already_exported = (int8_model_dir / "openvino_model.xml").exists()
    if not already_exported:
        ov_model = OVModelForCausalLM.from_pretrained(
            pt_model_id,
            export=True,
            compile=False,
            load_in_8bit=True,
        )
        ov_model.save_pretrained(int8_model_dir)
        # Drop the exported model explicitly and collect garbage right away.
        del ov_model
        gc.collect()


def convert_to_int4():
    """
    Export the selected model to OpenVINO IR with 4-bit weight quantization.

    Does nothing when `openvino_model.xml` is already present in `int4_model_dir`.
    Otherwise the quantization parameters (`sym`, `group_size`, `ratio`) are looked up
    by the selected model key, falling back to the "default" entry, and passed together
    with `bits=4` to `OVWeightQuantizationConfig`. The exported model is saved to
    `int4_model_dir` and released from memory.
    """
    if (int4_model_dir / "openvino_model.xml").exists():
        return

    # Per-model quantization presets; "default" applies to every key not listed here.
    # NOTE(review): the model dropdown output shows a key starting with "red-pajama-instruct",
    # so "red-pajama-3b-instruct" may never match and silently use "default" - confirm against config.py.
    compression_configs = {
        "mistral-7b": {"sym": True, "group_size": 64, "ratio": 0.6},
        "red-pajama-3b-instruct": {"sym": False, "group_size": 128, "ratio": 0.5},
        "dolly-v2-3b": {"sym": False, "group_size": 32, "ratio": 0.5},
        "llama-3-8b-instruct": {"sym": True, "group_size": 128, "ratio": 1.0},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }
    fallback_params = compression_configs["default"]
    model_compression_params = compression_configs.get(model_id.value, fallback_params)

    quantization_config = OVWeightQuantizationConfig(bits=4, **model_compression_params)
    ov_model = OVModelForCausalLM.from_pretrained(
        pt_model_id,
        export=True,
        compile=False,
        quantization_config=quantization_config,
    )
    ov_model.save_pretrained(int4_model_dir)
    # Drop the exported model explicitly and collect garbage right away.
    del ov_model
    gc.collect()


# Run only the conversions whose checkbox is ticked; each converter returns early
# when its openvino_model.xml already exists.
if prepare_fp16_model.value:
    convert_to_fp16()
if prepare_int8_model.value:
    convert_to_int8()
if prepare_int4_model.value:
    convert_to_int4()

In [9]:
# Weight files of each exported precision; any of them may be absent.
fp16_weights = fp16_model_dir / "openvino_model.bin"
int8_weights = int8_model_dir / "openvino_model.bin"
int4_weights = int4_model_dir / "openvino_model.bin"

if fp16_weights.exists():
    print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB")

# Report the size of each compressed model that exists, and its ratio to FP16 when FP16 exists too.
for precision, compressed_weights in ((8, int8_weights), (4, int4_weights)):
    if not compressed_weights.exists():
        continue
    print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB")
    if fp16_weights.exists():
        print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}")

Size of model with INT8 compressed weights is 2656.55 MB


In [10]:
core = ov.Core()

# Every device reported by OpenVINO except "NPU" is selectable; "AUTO" is appended as an extra choice.
support_devices = [name for name in core.available_devices if name != "NPU"]

device = widgets.Dropdown(
    description="Device:",
    options=[*support_devices, "AUTO"],
    value="CPU",
    disabled=False,
)

device

Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')

In [None]:
# Bare expression: shows the device dropdown again as this cell's output.
device

In [11]:
# Offer only the precisions whose exported IR file is actually on disk. Checking for
# openvino_model.xml (the same file the convert_* guards look for) rather than the bare
# directory avoids listing a folder left behind by an interrupted export.
available_models = [
    precision_label
    for precision_label, exported_dir in (
        ("INT4", int4_model_dir),
        ("INT8", int8_model_dir),
        ("FP16", fp16_model_dir),
    )
    if (exported_dir / "openvino_model.xml").exists()
]

# Without any converted model the dropdown cannot have a default value; explain what to do
# instead of failing with a bare IndexError on available_models[0].
if not available_models:
    raise RuntimeError("No converted model found. Tick at least one precision checkbox and run the conversion cell first.")

model_to_run = widgets.Dropdown(
    options=available_models,
    value=available_models[0],
    description="Model to run:",
    disabled=False,
)

model_to_run

Dropdown(description='Model to run:', options=('INT8',), value='INT8')

In [12]:
from transformers import AutoTokenizer

# Map the chosen precision label to its export directory; any label other than
# "INT4" / "INT8" falls back to the FP16 directory.
model_dir = {
    "INT4": int4_model_dir,
    "INT8": int8_model_dir,
}.get(model_to_run.value, fp16_model_dir)
print(f"Loading model from {model_dir}")

model_name = model_configuration["model_id"]
print(model_name)

# Options forwarded to OpenVINO through the `ov_config` argument below.
ov_config = {
    "PERFORMANCE_HINT": "LATENCY",
    "NUM_STREAMS": "1",
    "CACHE_DIR": "",
}

# NOTE(review): `tok` is not referenced by any other cell visible here; a second tokenizer
# is created later under the name `tokenizer`.
tok = AutoTokenizer.from_pretrained(model_name)

ov_model = OVModelForCausalLM.from_pretrained(model_dir, device=device.value, ov_config=ov_config)

Loading model from phi-2\INT8_compressed_weights
susnato/phi-2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Compiling the model to CPU ...


In [13]:
from threading import Thread
from time import perf_counter
from typing import List
import gradio as gr
from transformers import AutoTokenizer, TextIteratorStreamer
import numpy as np

In [14]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Extra keyword arguments for tokenizer calls. The correctly spelled key is preferred;
# the historical misspelling "toeknizer_kwargs" is still honoured so configurations that
# use it keep working.
tokenizer_kwargs = model_configuration.get("tokenizer_kwargs", model_configuration.get("toeknizer_kwargs", {}))


def get_special_token_id(tokenizer: "AutoTokenizer", key: str) -> int:
    """
    Gets the token ID for a given string that has been added to the tokenizer as a special token.

    Args:
        tokenizer: the tokenizer; only its `encode` method is used
        key (str): the key to convert to a single token

    Raises:
        ValueError: if encoding the key does not produce exactly one token ID
            (either several IDs or none at all)

    Returns:
        int: the token ID for the given key
    """
    token_ids = tokenizer.encode(key)
    # Reject the empty result as well as multi-token results: indexing an empty list below
    # would otherwise surface as an IndexError that callers catching ValueError miss.
    if len(token_ids) != 1:
        raise ValueError(f"Expected only a single token for '{key}' but found {token_ids}")
    return token_ids[0]


response_key = model_configuration.get("response_key")
tokenizer_response_key = None

if response_key is not None:
    # First additional special token of the tokenizer that starts with the configured response key, if any.
    tokenizer_response_key = next(
        (token for token in tokenizer.additional_special_tokens if token.startswith(response_key)),
        None,
    )

end_key_token_id = None
if tokenizer_response_key:
    end_key = model_configuration.get("end_key")
    if end_key:
        try:
            # Ensure generation stops once it generates the configured end key (e.g. "### End")
            end_key_token_id = get_special_token_id(tokenizer, end_key)
        except ValueError:
            # Best effort: the end key is not a single token for this tokenizer,
            # so fall back to the tokenizer's EOS token below.
            end_key_token_id = None

prompt_template = model_configuration.get("prompt_template", "{instruction}")
# Compare against None explicitly: 0 is a valid token id and must not be treated as "missing",
# which a plain `or` fallback would do.
if end_key_token_id is None:
    end_key_token_id = tokenizer.eos_token_id
pad_token_id = end_key_token_id if end_key_token_id is not None else tokenizer.pad_token_id

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
def run_generation(
    user_text: str,
    top_p: float,
    temperature: float,
    top_k: int,
    max_new_tokens: int,
    perf_text: str,
):
    """
    Text generation function. This is a generator: it yields the accumulated
    response and the performance text after every chunk received from the streamer.

    Parameters:
      user_text (str): User-provided instruction for a generation.
      top_p (float):  Nucleus sampling. If set to < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for a generation.
      temperature (float): The value used to modulate the logits distribution.
      top_k (int): The number of highest probability vocabulary tokens to keep for top-k-filtering.
      max_new_tokens (int): Maximum length of generated sequence.
      perf_text (str): Content of text field for printing performance results.
    Yields:
      model_output (str) - model-generated text accumulated so far
      perf_text (str) - updated perf text field content
    """

    # Prepare input prompt according to model expected template
    prompt_text = prompt_template.format(instruction=user_text)

    # Tokenize the user text.
    model_inputs = tokenizer(prompt_text, return_tensors="pt", **tokenizer_kwargs)

    # Start generation on a separate thread, so that we don't block the UI. The text is pulled from the streamer
    # in the calling thread.
    # NOTE(review): no `timeout` is passed to the streamer here, so if the generation thread fails
    # the loop below may wait indefinitely - confirm whether a timeout should be added.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
    generate_kwargs = dict(
        model_inputs,
        streamer=streamer,
        max_new_tokens=max_new_tokens,
        do_sample=True,
        top_p=top_p,
        temperature=float(temperature),
        top_k=top_k,
        eos_token_id=end_key_token_id,
        pad_token_id=pad_token_id,
    )
    t = Thread(target=ov_model.generate, kwargs=generate_kwargs)
    t.start()

    # Pull the generated text from the streamer, and update the model output.
    model_output = ""
    per_token_time = []
    num_tokens = 0
    start = perf_counter()
    for new_text in streamer:
        # Time elapsed since the previous chunk was handed to the caller (or since the loop started).
        current_time = perf_counter() - start
        model_output += new_text
        perf_text, num_tokens = estimate_latency(current_time, perf_text, new_text, per_token_time, num_tokens)
        yield model_output, perf_text
        # Restart the clock only after the consumer resumes the generator.
        start = perf_counter()
    # In a generator this value travels on StopIteration; plain iteration over the generator never sees it.
    return model_output, perf_text

In [16]:
def estimate_latency(
    current_time: float,
    current_perf_text: str,
    new_gen_text: str,
    per_token_time: List[float],
    num_tokens: int,
):
    """
    Helper function for performance estimation

    Parameters:
      current_time (float): This step time in seconds.
      current_perf_text (str): Current content of performance UI field.
      new_gen_text (str): New generated text.
      per_token_time (List[float]): history of per-step speeds (tokens per second) from previous steps;
        a new sample is appended in place.
      num_tokens (int): Total number of generated tokens.

    Returns:
      update for performance text field
      update for a total number of tokens
    """
    num_current_toks = len(tokenizer.encode(new_gen_text))
    num_tokens += num_current_toks
    if current_time <= 0:
        # No measurable time elapsed for this step: skip the speed sample instead of
        # dividing by zero, but still account for the generated tokens.
        return current_perf_text, num_tokens
    per_token_time.append(num_current_toks / current_time)
    # Refresh the message on every 4th sample once more than 10 samples are collected.
    if len(per_token_time) > 10 and len(per_token_time) % 4 == 0:
        # NOTE(review): this slice averages every sample EXCEPT the 10 most recent ones;
        # if a window of the latest 10 was intended it should be [-10:] - confirm.
        current_bucket = per_token_time[:-10]
        return (
            f"Average generation speed: {np.mean(current_bucket):.2f} tokens/s. Total generated tokens: {num_tokens}",
            num_tokens,
        )
    return current_perf_text, num_tokens


def reset_textbox(instruction: str, response: str, perf: str):
    """
    Clear all three text fields of the demo.

    The current field contents are accepted but ignored.

    Parameters:
      instruction (str): Content of user instruction field.
      response (str): Content of model response field.
      perf (str): Content of performance info field

    Returns:
      a tuple of three empty strings, one per text field
    """
    cleared = ""
    return cleared, cleared, cleared

In [26]:
# Predefined instructions offered below the input box via gr.Examples.
examples = [
    "Give me a recipe for pizza with pineapple",
    "Write me a tweet about the new OpenVINO release",
    "Explain the difference between CPU and GPU",
    "Give five ideas for a great weekend with family",
    "Do Androids dream of Electric sheep?",
    "Who is Dolly?",
    "Please give me advice on how to write resume?",
    "Name 3 advantages to being a cat",
    "Write instructions on how to become a good AI engineer",
    "Write a love letter to my best friend",
]


# Build the UI: a wide column with instruction / response / performance fields and buttons,
# and a narrow column with the sampling controls.
with gr.Blocks() as demo:
    gr.Markdown(
        "# Question Answering with " + model_id.value + " and OpenVINO.\n"
        "Provide instruction which describes a task below or select among predefined examples and model writes response that performs requested task."
    )

    with gr.Row():
        with gr.Column(scale=4):
            user_text = gr.Textbox(
                placeholder="Write an email about an alpaca that likes flan",
                label="User instruction",
            )
            model_output = gr.Textbox(label="Model response", interactive=False)
            performance = gr.Textbox(label="Performance", lines=1, interactive=False)
            with gr.Column(scale=1):
                button_clear = gr.Button(value="Clear")
                button_submit = gr.Button(value="Submit")
            gr.Examples(examples, user_text)
        with gr.Column(scale=1):
            # Generation parameters; the slider values are passed to run_generation on submit.
            max_new_tokens = gr.Slider(
                minimum=1,
                maximum=1000,
                value=256,
                step=1,
                interactive=True,
                label="Max New Tokens",
            )
            top_p = gr.Slider(
                minimum=0.05,
                maximum=1.0,
                value=0.92,
                step=0.05,
                interactive=True,
                label="Top-p (nucleus sampling)",
            )
            top_k = gr.Slider(
                minimum=0,
                maximum=50,
                value=0,
                step=1,
                interactive=True,
                label="Top-k",
            )
            temperature = gr.Slider(
                minimum=0.1,
                maximum=5.0,
                value=0.8,
                step=0.1,
                interactive=True,
                label="Temperature",
            )

    # Pressing Enter in the instruction box and clicking "Submit" both start generation
    # with the same inputs and outputs.
    user_text.submit(
        run_generation,
        [user_text, top_p, temperature, top_k, max_new_tokens, performance],
        [model_output, performance],
    )
    button_submit.click(
        run_generation,
        [user_text, top_p, temperature, top_k, max_new_tokens, performance],
        [model_output, performance],
    )
    # "Clear" empties the instruction, response and performance fields.
    button_clear.click(
        reset_textbox,
        [user_text, model_output, performance],
        [user_text, model_output, performance],
    )

if __name__ == "__main__":
    demo.queue()
    try:
        demo.launch(height=800)
    except Exception:
        # Fallback: if the first launch raises for any reason, retry with share=True.
        # NOTE(review): share=True creates a public link to this demo - confirm this fallback is intended.
        demo.launch(share=True, height=800)

# If you are launching remotely, specify server_name and server_port
# EXAMPLE: `demo.launch(server_name='your server name', server_port='server port in int')`
# To learn more please refer to the Gradio docs: https://gradio.app/docs/

Running on local URL:  http://127.0.0.1:7870

To create a public link, set `share=True` in `launch()`.
