In [None]:
Running GENAI

# Installation and Setup

This cell installs the required packages and sets up the environment for the notebook.


In [None]:
!pip install --upgrade pip
!pip uninstall -y optimum optimum-intel

!pip install --pre -U openvino openvino-tokenizers[transformers] --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly

!pip install --extra-index-url https://download.pytorch.org/whl/cpu \
    git+https://github.com/huggingface/optimum-intel.git \
    git+https://github.com/openvinotoolkit/nncf.git \
    torch>=2.1 \
    datasets \
    accelerate \
    gradio>=4.19 \
    onnx \
    einops \
    transformers_stream_generator \
    tiktoken \
    transformers>=4.38.1 \
    bitsandbytes


# Model Configuration

This cell loads the model configuration and sets up the model ID and language.


In [1]:
from pathlib import Path
import requests
import shutil
import os
import ipywidgets as widgets
from IPython.display import display

config_shared_path = Path("../../utils/llm_config.py")
config_dst_path = Path("llm_config.py")

if not config_dst_path.exists():
    if config_shared_path.exists():
        try:
            os.symlink(config_shared_path, config_dst_path)
        except Exception:
            shutil.copy(config_shared_path, config_dst_path)
    else:
        r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py")
        with open("llm_config.py", "w") as f:
            f.write(r.text)
elif not os.path.islink(config_dst_path):
    print("LLM config will be updated")
    if config_shared_path.exists():
        shutil.copy(config_shared_path, config_dst_path)
    else:
        r = requests.get(url="https://raw.githubusercontent.com/openvinotoolkit/openvino_notebooks/latest/utils/llm_config.py")
        with open("llm_config.py", "w") as f:
            f.write(r.text)

from llm_config import SUPPORTED_LLM_MODELS

model_language = "English"
model_id_value = "neural-chat-7b-v3-1"

model_configuration = SUPPORTED_LLM_MODELS[model_language][model_id_value]
print(f"Selected model {model_id_value}")


Selected model neural-chat-7b-v3-1


# Model Preparation

This cell prepares the model for conversion to different formats (FP16, INT8, INT4) and displays toggle buttons for model preparation options.


In [2]:
from IPython.display import display, Markdown

# Directories for different model formats
model_dir = Path(model_id_value)
fp16_model_dir = model_dir / "FP16"
int8_model_dir = model_dir / "INT8_compressed_weights"
int4_model_dir = model_dir / "INT4_compressed_weights"

# Function to convert model to FP16 format
def convert_to_fp16():
    if (fp16_model_dir / "openvino_model.xml").exists():
        return
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = f"optimum-cli export openvino --model {model_configuration['model_id']} --task text-generation-with-past --weight-format fp16"
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + " " + str(fp16_model_dir)
    display(Markdown("**Export command:**"))
    display(Markdown(f"`{export_command}`"))
    os.system(export_command)

# Function to convert model to INT8 format
def convert_to_int8():
    if (int8_model_dir / "openvino_model.xml").exists():
        return
    int8_model_dir.mkdir(parents=True, exist_ok=True)
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = f"optimum-cli export openvino --model {model_configuration['model_id']} --task text-generation-with-past --weight-format int8"
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + " " + str(int8_model_dir)
    display(Markdown("**Export command:**"))
    display(Markdown(f"`{export_command}`"))
    os.system(export_command)

# Function to convert model to INT4 format
def convert_to_int4():
    compression_configs = {
        "neural-chat-7b-v3-1": {"sym": True, "group_size": 64, "ratio": 0.6},
        "default": {"sym": False, "group_size": 128, "ratio": 0.8},
    }

    model_compression_params = compression_configs.get(model_id_value, compression_configs["default"])
    if (int4_model_dir / "openvino_model.xml").exists():
        return
    remote_code = model_configuration.get("remote_code", False)
    export_command_base = f"optimum-cli export openvino --model {model_configuration['model_id']} --task text-generation-with-past --weight-format int4"
    int4_compression_args = f" --group-size {model_compression_params['group_size']} --ratio {model_compression_params['ratio']}"
    if model_compression_params["sym"]:
        int4_compression_args += " --sym"
    export_command_base += int4_compression_args
    if remote_code:
        export_command_base += " --trust-remote-code"
    export_command = export_command_base + " " + str(int4_model_dir)
    display(Markdown("**Export command:**"))
    display(Markdown(f"`{export_command}`"))
    os.system(export_command)

# Toggle buttons for model preparation options
prepare_fp16_model = widgets.ToggleButton(
    value=True,
    description="Prepare FP16 model",
    disabled=False,
)
prepare_int8_model = widgets.ToggleButton(
    value=False,
    description="Prepare INT8 model",
    disabled=False,
)
prepare_int4_model = widgets.ToggleButton(
    value=False,
    description="Prepare INT4 model",
    disabled=False,
)

# Display toggle buttons
display(prepare_fp16_model)
display(prepare_int8_model)
display(prepare_int4_model)

# Function to handle toggling of model preparation options
def handle_toggle(change):
    if prepare_fp16_model.value:
        convert_to_fp16()
    if prepare_int8_model.value:
        convert_to_int8()
    if prepare_int4_model.value:
        convert_to_int4()

# Attach toggle button event handlers
prepare_fp16_model.observe(handle_toggle, names='value')
prepare_int8_model.observe(handle_toggle, names='value')
prepare_int4_model.observe(handle_toggle, names='value')

# Display toggled models
handle_toggle(None)


ToggleButton(value=True, description='Prepare FP16 model')

ToggleButton(value=False, description='Prepare INT8 model')

ToggleButton(value=False, description='Prepare INT4 model')

# Model Size and Compression Rate

This cell calculates and displays the size of the model in different formats and the compression rate.


In [3]:
fp16_weights = fp16_model_dir / "openvino_model.bin"
int8_weights = int8_model_dir / "openvino_model.bin"
int4_weights = int4_model_dir / "openvino_model.bin"

if fp16_weights.exists():
    print(f"Size of FP16 model is {fp16_weights.stat().st_size / 1024 / 1024:.2f} MB")

for precision, compressed_weights in zip([8, 4], [int8_weights, int4_weights]):
    if compressed_weights.exists():
        print(f"Size of model with INT{precision} compressed weights is {compressed_weights.stat().st_size / 1024 / 1024:.2f} MB")
    if compressed_weights.exists() and fp16_weights.exists():
        print(f"Compression rate for INT{precision} model: {fp16_weights.stat().st_size / compressed_weights.stat().st_size:.3f}")


Size of FP16 model is 13828.51 MB
Size of model with INT8 compressed weights is 6943.14 MB
Compression rate for INT8 model: 1.992
Size of model with INT4 compressed weights is 5069.90 MB
Compression rate for INT4 model: 2.728


## Device Selection

This cell allows the user to select the device (CPU or GPU) for running the inference.


In [4]:
import openvino as ov

core = ov.Core()

support_devices = core.available_devices
if "NPU" in support_devices:
    support_devices.remove("NPU")

device = widgets.Dropdown(
    options=support_devices + ["AUTO"],
    value="CPU",
    description="Device:",
    disabled=False,
)

display(device)

available_models = []
if int4_model_dir.exists():
    available_models.append("INT4")
if int8_model_dir.exists():
    available_models.append("INT8")
if fp16_model_dir.exists():
    available_models.append("FP16")

model_to_run = widgets.Dropdown(
    options=available_models,
    value=available_models[0],
    description="Model to run:",
    disabled=False,
)

display(model_to_run)


Dropdown(description='Device:', options=('CPU', 'AUTO'), value='CPU')

Dropdown(description='Model to run:', options=('INT4', 'INT8', 'FP16'), value='INT4')

## OpenVINO-based Inference

This cell runs the inference using the selected OpenVINO-optimized model, device, and prompt, and displays the output and accuracy score, leveraging OpenVINO's optimized inference engine for efficient and accelerated performance.


In [5]:
from transformers import AutoConfig, AutoTokenizer
from optimum.intel.openvino import OVModelForCausalLM
from IPython.display import display, HTML
import torch
import ipywidgets as widgets
import contextlib
import io
import sys

# Dummy values for example purposes
model_dirs = {
    "INT4": "neural-chat-7b-v3-1/INT4_compressed_weights",
    "INT8": "neural-chat-7b-v3-1/INT8_compressed_weights",
    "FP16": "neural-chat-7b-v3-1/FP16"
}

# Add CSS for custom styles and animations
display(HTML("""
<style>
    .custom-dropdown {
        width: 300px;
        margin: 0 auto;
    }
    .custom-text {
        width: 80%;
        margin: 0 auto;
    }
    .custom-button {
        width: 30%;
        margin: 20px auto;
        font-size: 16px;
    }
    .output-area {
        border: 2px solid #4CAF50;
        padding: 20px;
        margin: 20px auto;
        width: 80%;
        background-color: #f9f9f9;
        font-family: 'Courier New', Courier, monospace;
        font-size: 14px;
    }
    .output-text {
        color: #0000FF;
    }
    @keyframes fadeIn {
        from { opacity: 0; }
        to { opacity: 1; }
    }
    .fade-in {
        animation: fadeIn 2s;
    }
</style>
"""))

# Widgets for user input
model_dropdown = widgets.Dropdown(
    options=["INT4", "INT8", "FP16"],
    value="INT8",
    description="Model Type:",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='300px')
)

device_dropdown = widgets.Dropdown(
    options=["CPU", "GPU"],
    value="CPU",
    description="Device:",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='300px')
)

prompt_text = widgets.Text(
    value="What is a constructor?",
    description="Prompt:",
    style={'description_width': 'initial'},
    layout=widgets.Layout(width='80%')
)

run_button = widgets.Button(
    description="Run Inference",
    button_style='success',
    layout=widgets.Layout(width='30%')
)

output_area = widgets.Output(layout=widgets.Layout(width='80%', padding='20px', border='2px solid #4CAF50', background_color='#f9f9f9'))

# Function to suppress stdout and stderr
@contextlib.contextmanager
def suppress_output():
    new_stdout = io.StringIO()
    new_stderr = io.StringIO()
    old_stdout = sys.stdout
    old_stderr = sys.stderr
    try:
        sys.stdout = new_stdout
        sys.stderr = new_stderr
        yield
    finally:
        sys.stdout = old_stdout
        sys.stderr = old_stderr

# Function to run the inference
def run_inference(b):
    with output_area:
        output_area.clear_output()
        model_to_run = model_dropdown.value
        device = device_dropdown.value
        prompt = prompt_text.value

        model_dir = model_dirs.get(model_to_run, "default/path")

        ov_config = {"PERFORMANCE_HINT": "LATENCY", "NUM_STREAMS": "1", "CACHE_DIR": ""}

        # Load tokenizer
        try:
            with suppress_output():
                tok = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)
        except Exception as e:
            output_area.append_stdout(f"Error loading tokenizer: {e}\n")
            return

        # Load OVModelForCausalLM
        try:
            with suppress_output():
                ov_model = OVModelForCausalLM.from_pretrained(
                    model_dir,
                    device=device,
                    ov_config=ov_config,
                    config=AutoConfig.from_pretrained(model_dir, trust_remote_code=True),
                )
        except Exception as e:
            output_area.append_stdout(f"Error loading OVModelForCausalLM: {e}\n")
            return

        # Prepare prompt and initial inputs
        with suppress_output():
            tokens = tok.encode(prompt, return_tensors="pt")

        # Perform inference
        try:
            with suppress_output():
                res = ov_model.generate(
                    tokens,
                    do_sample=False,
                    temperature=0.9,
                    repetition_penalty=1.1,
                    top_k=1,
                    max_new_tokens=100,
                )
                output = tok.decode(res[0], skip_special_tokens=True)
        except Exception as e:
            output_area.append_stdout(f"Error during inference: {e}\n")
            return

        # Display output
        output_area.append_stdout(f"Output:\n")
        output_area.append_display_data(HTML(f'<div class="fade-in output-text">{output}</div>'))

        # Accuracy check (example)
        expected_keywords = ["constructor", "class", "object"]
        accuracy_score = sum([1 for kw in expected_keywords if kw in output.lower()]) / len(expected_keywords)
        output_area.append_display_data(HTML(f'<div class="fade-in">Accuracy score: {accuracy_score:.2f}</div>'))

run_button.on_click(run_inference)

# Organize the layout
input_widgets = widgets.VBox([model_dropdown, device_dropdown, prompt_text, run_button], layout=widgets.Layout(align_items='center'))
main_layout = widgets.VBox([input_widgets, output_area], layout=widgets.Layout(align_items='center'))

# Display the main layout
display(main_layout)


  _torch_pytree._register_pytree_node(


INFO:nncf:NNCF initialized successfully. Supported frameworks detected: torch, tensorflow, onnx, openvino


2024-07-14 05:07:56.415198: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-14 05:07:56.488995: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-07-14 05:07:56.847066: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


VBox(children=(VBox(children=(Dropdown(description='Model Type:', index=1, layout=Layout(width='300px'), optio…

# FINE TUNING META-LLAMA-2-7b-hf using Openvino

# Fine-Tuning a Language Model with Hugging Face Transformers and Trainer API:

Defines model and dataset details.
Loads a portion of the "wikitext" dataset using Hugging Face datasets library.
Sets up training arguments and initializes Trainer for fine-tuning.
Trains the model, saves it, and exports it to ONNX format for deployment.


In [2]:
import os
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset

# Define model and dataset details
model_name = "meta-llama/Llama-2-7b-hf"
dataset_name = "wikitext"
subset_name = "wikitext-2-raw-v1"
text_column = "text"

# Load a smaller portion of the dataset
dataset = load_dataset(dataset_name, subset_name, split='train[:1%]')

# Check column names
print("Dataset columns:", dataset.column_names)

# Set the Hugging Face token as an environment variable
os.environ["HUGGING_FACE_HUB_TOKEN"] = "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"

try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_auth_token=True)
    model = AutoModelForCausalLM.from_pretrained(model_name, use_auth_token=True)
except OSError as e:
    print(f"Error loading model {model_name}: {e}")
    raise

# Add a padding token if it doesn't exist
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    model.resize_token_embeddings(len(tokenizer))

# Define the tokenize function
def tokenize_function(examples):
    return tokenizer(examples[text_column], padding="max_length", truncation=True, max_length=128)

# Tokenize the dataset
tokenized_datasets = dataset.map(tokenize_function, batched=True, remove_columns=[text_column])

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./new",
    per_device_train_batch_size=4,  # Reduce batch size
    per_device_eval_batch_size=4,
    num_train_epochs=1,  # Set to 1 epoch for quick testing
    evaluation_strategy="no",
    save_strategy="epoch",
    logging_dir="./old",
    logging_steps=10,
    learning_rate=5e-5,
    weight_decay=0.01,
    save_total_limit=1,
    load_best_model_at_end=False,  # Set to False since evaluation is not performed
)

# Define the data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("./llama")
tokenizer.save_pretrained("./llama")

print("Fine-tuning completed within the time limit.")



Dataset columns: ['text']


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
10,2.7324
20,2.4076
30,2.4474
40,2.1416
50,2.4605
60,2.2383
70,2.3425
80,1.7444
90,2.1473


Fine-tuning completed within the time limit.


OverflowError: cannot fit 'int' into an index-sized integer

## Wrapping Fine-Tuned Model for ONNX Export:

Defines a custom model wrapper class to handle input and output formats.
Loads the fine-tuned Llama model using Hugging Face Transformers.
Wraps the model with the custom wrapper for compatibility with ONNX export.
Exports the wrapped model to ONNX format using opset version 14.

In [2]:
import torch
import torch.onnx
import transformers
import os

class ModelWrapper(torch.nn.Module):
    def __init__(self, model):
        super(ModelWrapper, self).__init__()
        self.model = model

    def forward(self, input_ids, attention_mask):
        return self.model(input_ids=input_ids, attention_mask=attention_mask).logits

# Load your fine-tuned Llama model
model_path = "./llama/fine_tuned_model"
model = transformers.LlamaForCausalLM.from_pretrained(model_path)

# Wrap the model with the custom wrapper
model_wrapper = ModelWrapper(model)

# Dummy input for the export
dummy_input_ids = torch.zeros((1, 10), dtype=torch.long)
dummy_attention_mask = torch.ones((1, 10), dtype=torch.long)

# Export to ONNX using opset version 14
onnx_model_path = os.path.join("./llama", "fine_tuned_model.onnx")
torch.onnx.export(model_wrapper, (dummy_input_ids, dummy_attention_mask), onnx_model_path, 
                  input_names=["input_ids", "attention_mask"], output_names=["logits"], opset_version=14)
print(f"Model exported to {onnx_model_path}.")


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-operations-when-using-tensor-shape-in-tracing-mode
In-place op on output of tensor.shape. See https://pytorch.org/docs/master/onnx.html#avoid-inplace-o

Model exported to ./llama/fine_tuned_model.onnx.


## Converting ONNX Model to OpenVINO IR and Performing Inference:

Loads the tokenizer for the fine-tuned Llama model.
Converts the exported ONNX model to OpenVINO Intermediate Representation (IR) format using Model Optimizer.
Loads the optimized model into the Inference Engine (IE) of OpenVINO.
Defines a prompt for text generation, encodes it using the tokenizer, and prepares input for OpenVINO.
Performs inference using OpenVINO and benchmarks the inference time and throughput.

In [14]:
import os
import numpy as np
import torch
from transformers import AutoTokenizer
from openvino.inference_engine import IECore
import subprocess

# Load the tokenizer
model_path = "./llama"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Convert the model to OpenVINO IR format
onnx_model_path = os.path.join(model_path, "fine_tuned_model.onnx")
model_path = os.path.join(model_path, "optimized_model")
mo_command = f"python -m openvino.tools.mo --input_model {onnx_model_path} --output_dir {model_path} --static_shape"
subprocess.run(mo_command.split())

# Load the optimized model
ie = IECore()

# Adjust CPU configuration for inference
ie.set_config({'CPU_THROUGHPUT_STREAMS': '1', 'CPU_BIND_THREAD': 'YES'}, 'CPU')

# Load the network
model_xml_path = os.path.join(model_path, "fine_tuned_model.xml")
model_bin_path = os.path.join(model_path, "fine_tuned_model.bin")
net = ie.read_network(model=model_xml_path, weights=model_bin_path)

# Reduce batch size if necessary (example, change 1 to a smaller number)
net.batch_size = 1

# Load the network to the device (CPU in this case)
exec_net = ie.load_network(network=net, device_name="CPU")

# Define the prompt for text generation
prompt = "go to market"

# Encode the prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=10, padding="max_length", truncation=True)

# Prepare the input for OpenVINO
input_dict = {"input_ids": input_ids.cpu().numpy().astype(np.int32)}

# Perform inference
output = exec_net.infer(inputs=input_dict)

# Convert logits to probabilities and sample
logits = output["logits"]
probs = torch.nn.functional.softmax(torch.tensor(logits), dim=-1)
generated_ids = torch.argmax(probs, dim=-1)

# Decode and print the generated text
generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
generated_text = generated_text.strip()  # Remove leading/trailing spaces
generated_text = generated_text.replace("[PAD]", "")  # Remove padding tokens if any

print("Generated text:", generated_text)

# Save the generated text to a file
output_dir = "llama/generated"
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, "generated_text.txt"), "w") as file:
    file.write(generated_text)

generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
generated_text = generated_text.strip()  # Remove leading/trailing spaces
generated_text = generated_text.replace("[PAD]", "")  # Remove padding tokens if any

print("Generated text:", generated_text)

# Save the generated text to a file
output_dir = "llama/generated"
os.makedirs(output_dir, exist_ok=True)
with open(os.path.join(output_dir, "generated_text.txt"), "w") as file:
    file.write(generated_text)

[ INFO ] Generated IR will be compressed to FP16. If you get lower accuracy, please consider disabling compression explicitly by adding argument --compress_to_fp16=False.
Find more information about compression to FP16 at https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_FP16_Compression.html
[ INFO ] The model was converted to IR v11, the latest model format that corresponds to the source DL framework input/output format. While IR v11 is backwards compatible with OpenVINO Inference Engine API v1.0, please use API v2.0 (as of 2022.1) to take advantage of the latest improvements in IR v11.
Find more information about API v2.0 and IR v11 at https://docs.openvino.ai/2023.0/openvino_2_0_transition_guide.html
[ INFO ] MO command line tool is considered as the legacy conversion API as of OpenVINO 2023.2 release. Please use OpenVINO Model Converter (OVC). OVC represents a lightweight alternative of MO and provides simplified model conversion API. 
Find more information about transition from

## Benchmarking Inference Performance with OpenVINO:

Loads the fine-tuned Llama model and tokenizer.
Defines an input prompt, encodes it using the tokenizer, and prepares input for OpenVINO.
Converts the ONNX model to OpenVINO IR format using Model Optimizer.
Loads the optimized model into the OpenVINO Inference Engine (IE) and sets CPU as the device.
Performs benchmarking to measure inference time and throughput over 100 iterations.

In [12]:
import os
import numpy as np
import torch
from transformers import AutoTokenizer
from openvino.inference_engine import IECore

# Load the fine-tuned model and tokenizer
model_path = "./llama"
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Define the input prompt
prompt = "what is constructor"

# Encode the prompt
input_ids = tokenizer.encode(prompt, return_tensors="pt", max_length=10, padding="max_length", truncation=True)

# Convert the model to OpenVINO IR format
onnx_model_path = os.path.join(model_path, "fine_tuned_model.onnx")
model_xml_path = os.path.join(model_path, "fine_tuned_model.xml")
model_bin_path = os.path.join(model_path, "fine_tuned_model.bin")
mo_command = f"python -m openvino.tools.mo --input_model {onnx_model_path} --output_dir {model_path} --static_shape"
os.system(mo_command)

# Load the optimized model
ie = IECore()
net = ie.read_network(model=model_xml_path, weights=model_bin_path)

# Set the device (CPU)
exec_net = ie.load_network(network=net, device_name="CPU")

# Prepare the input for OpenVINO
input_dict = {"input_ids": input_ids.cpu().numpy().astype(np.int32)}

# Benchmarking
batch_size = 1
num_iterations = 100

start_time = time.time()
for _ in range(num_iterations):
    output = exec_net.infer(inputs=input_dict)
end_time = time.time()

# Calculate the inference time
inference_time = (end_time - start_time) / num_iterations
print(f"Inference time: {inference_time:.4f} seconds")

# Calculate the throughput (sequences per second)
throughput = batch_size / inference_time
print(f"Throughput: {throughput:.2f} sequences per second")

[ INFO ] Generated IR will be compressed to FP16. If you get lower accuracy, please consider disabling compression explicitly by adding argument --compress_to_fp16=False.
Find more information about compression to FP16 at https://docs.openvino.ai/2023.0/openvino_docs_MO_DG_FP16_Compression.html
[ INFO ] The model was converted to IR v11, the latest model format that corresponds to the source DL framework input/output format. While IR v11 is backwards compatible with OpenVINO Inference Engine API v1.0, please use API v2.0 (as of 2022.1) to take advantage of the latest improvements in IR v11.
Find more information about API v2.0 and IR v11 at https://docs.openvino.ai/2023.0/openvino_2_0_transition_guide.html
[ INFO ] MO command line tool is considered as the legacy conversion API as of OpenVINO 2023.2 release. Please use OpenVINO Model Converter (OVC). OVC represents a lightweight alternative of MO and provides simplified model conversion API. 
Find more information about transition from