# LLM optimization examples with NNCF and OpenVINO

>**Note:** Make sure that you have enough disk space, as the environment and the model can take several gigabytes of your space.

## Install pre-requisites

In [None]:
# Install the notebook's dependencies with the %pip magic.
# NOTE(review): diffusers is not imported by any cell visible here - confirm it is still required.
%pip install diffusers
%pip install openvino nncf
# optimum and optimum-intel are installed directly from their GitHub repositories.
%pip install git+https://github.com/huggingface/optimum.git
%pip install git+https://github.com/huggingface/optimum-intel.git

## Floating-point model inference

In [None]:
import time

from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM
import openvino as ov

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# export=True converts the checkpoint to OpenVINO on load; load_in_8bit=False is
# passed so the weights are not loaded in 8-bit for this floating-point run.
model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, load_in_8bit=False, trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Wrap the question in the <|user|> / <|assistant|> markers to build the prompt.
template = "<|user|>\n{}<|end|>\n<|assistant|>"
question = "Hey, model! Tell me about LLM?"
prompt = template.format(question)
inputs = tokenizer(prompt, return_tensors="pt")

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
output = model.generate(**inputs, max_new_tokens=256)
elapsed = time.perf_counter() - start

# generate() returns the prompt tokens followed by the new ones, so only the
# newly generated tokens are counted; max() avoids dividing by zero if the
# model stops immediately.
num_new_tokens = max(output.shape[1] - inputs["input_ids"].shape[1], 1)
print("Average latency per token: ", elapsed / num_new_tokens)

answer = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
print("Answer: ", answer)

## 8-bit weight quantization

In [None]:
import time

from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM
import openvino as ov

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Export the checkpoint to OpenVINO and request 8-bit weight quantization.
model = OVModelForCausalLM.from_pretrained(MODEL_ID, export=True, quantization_config=dict(bits=8), trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Wrap the question in the <|user|> / <|assistant|> markers to build the prompt.
template = "<|user|>\n{}<|end|>\n<|assistant|>"
question = "Hey, model! Tell me about LLM?"
prompt = template.format(question)
inputs = tokenizer(prompt, return_tensors="pt")

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
output = model.generate(**inputs, max_new_tokens=256)
elapsed = time.perf_counter() - start

# generate() returns the prompt tokens followed by the new ones, so only the
# newly generated tokens are counted; max() avoids dividing by zero if the
# model stops immediately.
num_new_tokens = max(output.shape[1] - inputs["input_ids"].shape[1], 1)
print("Average latency per token: ", elapsed / num_new_tokens)

answer = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
print("Answer: ", answer)

## 4-bit data-aware weight quantization

In [None]:
import time  # imported here so the cell does not depend on an earlier cell having run

from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM
import openvino as ov

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# 4-bit symmetric weight quantization with ratio=0.8, using the "ptb" dataset
# for the data-aware part.
model = OVModelForCausalLM.from_pretrained(
    MODEL_ID,
    export=True,
    compile=False,
    quantization_config=dict(bits=4, sym=True, ratio=0.8, dataset="ptb", trust_remote_code=True),
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Wrap the question in the <|user|> / <|assistant|> markers to build the prompt.
template = "<|user|>\n{}<|end|>\n<|assistant|>"
question = "Hey, model! Tell me about LLM?"
prompt = template.format(question)
inputs = tokenizer(prompt, return_tensors="pt")

# The model was loaded with compile=False, so compile it explicitly here;
# otherwise compilation would happen inside the timed generate() call.
model.compile()

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
output = model.generate(**inputs, max_new_tokens=256)
elapsed = time.perf_counter() - start

# generate() returns the prompt tokens followed by the new ones, so only the
# newly generated tokens are counted; max() avoids dividing by zero if the
# model stops immediately.
num_new_tokens = max(output.shape[1] - inputs["input_ids"].shape[1], 1)
print("Average latency per token: ", elapsed / num_new_tokens)

answer = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
print("Answer: ", answer)

## Dynamic Quantization and KV-cache Quantization

In [None]:
import time  # imported here so the cell does not depend on an earlier cell having run

from transformers import AutoTokenizer
from optimum.intel import OVModelForCausalLM
import openvino as ov

MODEL_ID = "microsoft/Phi-3-mini-4k-instruct"

# Same 4-bit weight quantization settings as the data-aware example, plus
# OpenVINO runtime properties: u8 KV-cache precision, dynamic quantization
# with group size 32, and the LATENCY performance hint.
model = OVModelForCausalLM.from_pretrained(
    MODEL_ID,
    export=True,
    compile=False,
    quantization_config=dict(bits=4, sym=True, ratio=0.8, dataset="ptb"),
    ov_config={"KV_CACHE_PRECISION": "u8", "DYNAMIC_QUANTIZATION_GROUP_SIZE": "32", "PERFORMANCE_HINT": "LATENCY"},
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)

# Wrap the question in the <|user|> / <|assistant|> markers to build the prompt.
# The prompt is built in this cell so it does not reuse a stale `prompt`
# variable left over from an earlier cell.
template = "<|user|>\n{}<|end|>\n<|assistant|>"
question = "Hey, model! Tell me about LLM?"
prompt = template.format(question)
inputs = tokenizer(prompt, return_tensors="pt")

# The model was loaded with compile=False, so compile it explicitly here;
# otherwise compilation would happen inside the timed generate() call.
model.compile()

# perf_counter() is monotonic and high-resolution, unlike wall-clock time.time().
start = time.perf_counter()
output = model.generate(**inputs, max_new_tokens=256)
elapsed = time.perf_counter() - start

# generate() returns the prompt tokens followed by the new ones, so only the
# newly generated tokens are counted; max() avoids dividing by zero if the
# model stops immediately.
num_new_tokens = max(output.shape[1] - inputs["input_ids"].shape[1], 1)
print("Average latency per token: ", elapsed / num_new_tokens)

answer = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
print("Answer: ", answer)