# Quantization of a Large Language Model (LLM)

In [None]:
# Load environment variables from a .env file
from dotenv import load_dotenv, find_dotenv
import os

load_dotenv(find_dotenv())

In [2]:
import sys

# The project root is whatever precedes the first "notebooks" substring of
# the current working directory (the whole path when it does not occur).
project_root_directory = os.getcwd().partition("notebooks")[0]
notebook_path = os.path.join(project_root_directory, "notebooks")

# Prepend both locations in one step: the notebooks directory ends up first
# on sys.path, immediately followed by the project root.
sys.path[:0] = [notebook_path, project_root_directory]

In [None]:
from azure.ai.ml import MLClient
from azure.identity import (
    DefaultAzureCredential,
    InteractiveBrowserCredential,
)
from azure.ai.ml.entities import AmlCompute
import time

# Prefer the default credential chain; fall back to an interactive browser
# login when a management-plane token cannot be obtained from it.
try:
    credential = DefaultAzureCredential()
    # Request a token right away so a non-working credential is detected here
    credential.get_token("https://management.azure.com/.default")
except Exception:
    credential = InteractiveBrowserCredential()

# Build the workspace client from a local config file when possible;
# otherwise read the workspace coordinates from environment variables.
# `except Exception` (not a bare `except`) keeps Ctrl-C / SystemExit working.
try:
    workspace_ml_client = MLClient.from_config(credential=credential)
except Exception:
    workspace_ml_client = MLClient(
        credential,
        subscription_id=os.environ["SUBSCRIPTION_ID"],
        resource_group_name=os.environ["RESOURCE_GROUP"],
        workspace_name=os.environ["WORKSPACE_NAME"],
    )

# the models, fine tuning pipelines and environments are available in the AzureML system registry, "azureml"
registry_ml_client = MLClient(credential, registry_name="azureml")

# generating a unique timestamp that can be used for names and versions that need to be unique
timestamp = str(int(time.time()))

## 1 - Quantization of a Model from Marketplace/Hugging Face 

When using a PEFT (Parameter-Efficient Fine-Tuning) model, it is essential to utilize the `convert_lora.py` script. This script is specifically designed to handle the conversion and quantization of PEFT models, ensuring optimal performance and efficiency.

- The first step is to clone the llama.cpp repository, because its conversion and quantization tools are used in the steps below.
- More info about build: [llama.cpp build documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

In [8]:
from src.core.quantize_model import clone_repo, convert_model_to_gguf, quantize_model, build_project
clone_repo()

In [None]:
build_project()

### Download the model - from your personal registry

In [5]:
local_dir = "./model"

In [None]:
from azureml.core import Workspace, Model

# Connect to Azure ML workspace
workspace = Workspace.from_config()

# Specify the model name you want to download
model_name = "registered_model"

# Get the model
# NOTE(review): version="x" looks like a placeholder — replace it with the
# registered model version you want before running this cell.
model = Model(workspace, name=model_name, version="x")

# Download the model into local_dir
model.download(target_dir=local_dir, exist_ok=True)

print(f"Model {model_name} downloaded to {local_dir} directory.")

#### Using Hugging Face Library

- Alternatively, we can download the model directly using the Hugging Face library.

In [None]:
from huggingface_hub import snapshot_download

# Access token read from the TOKEN environment variable (None when unset)
TOKEN = os.getenv("TOKEN")
base_model_id = "teknium/OpenHermes-2.5-Mistral-7B"

# Download a snapshot of the model repository into local_dir
snapshot_download(
    repo_id=base_model_id,
    local_dir=local_dir,
    token=TOKEN,
)
# or using component import_model = registry_ml_client.components.get(name="download_model", label="latest")

In [None]:
# Look up the latest version of the model in the registry and report it
model_name = "t5-large"
foundation_model = registry_ml_client.models.get(model_name, label="latest")
print(
    f"\n\nUsing model name: {foundation_model.name}, "
    f"version: {foundation_model.version}, "
    f"id: {foundation_model.id} for fine tuning"
)

## 2 - Convert HF to GGUF

In [6]:
model_name = "original_model"
original_model_path = "./model/"
#original_model_path = "./model/mlflow_model_folder/data/model/"
quantized_model_path = "./model_quantized/"

In [None]:
convert_model_to_gguf(original_model_path, quantized_model_path)

In [18]:
# Join the components with os.path.join so the path is well-formed whether or
# not project_root_directory ends with a separator (and has no doubled slash).
quantized_model_path = os.path.join(project_root_directory, "notebooks", "model_quantized")

In [None]:
quantized_model_path = quantize_model(quantized_model_path, "q4_k_m")

## 3 - How Does the Base Model Perform?

In [None]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

In [4]:
local_dir = "./model"

In [None]:
local_dir or base_model_id 

In [None]:
tokenizer = AutoTokenizer.from_pretrained(local_dir)
model = AutoModelForCausalLM.from_pretrained(local_dir)
pipeline = transformers.pipeline(
    model=model, tokenizer=tokenizer, task="text-generation"
)

In [None]:
model

In [None]:
# Inspect the model's properties
print("Model's device:", model.device)
print("Model's dtype:", model.dtype)
print("Model's max lenght:", tokenizer.model_max_length)
print("Model's parameters:")
# for name, param in model.named_parameters():
#     print(f"  {name}: {param.shape}, {param.dtype}")

In [None]:
# We only input the table and question, since the system prompt is added in the prompt template.
my_prompt = "Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"

In [None]:
# Measure the wall-clock latency of a single generation call
# Use the EOS token id as the padding id in the model's generation config
model.generation_config.pad_token_id = model.generation_config.eos_token_id
start_time = time.time()

with torch.no_grad():
    response = pipeline(
        my_prompt,
        max_new_tokens=256,
        repetition_penalty=1.15,
        return_full_text=False,
        # tokenizer.pad_token_id is None for tokenizers that define no padding
        # token; fall back to the EOS id so an integer is always passed.
        pad_token_id=(
            tokenizer.pad_token_id
            if tokenizer.pad_token_id is not None
            else tokenizer.eos_token_id
        ),
    )

end_time = time.time()
latency = end_time - start_time

# Extract the generated text of the first returned sequence
generated_text = response[0]["generated_text"]

In [None]:
generated_text

## 4 - Quantization and Deployment of a Fine-Tuned Large Language Model (LLM)

In [9]:
from llama_cpp import Llama

# GLOBAL VARIABLES
my_model_path = "./model_quantized/Q4_K_M.gguf"
CONTEXT_SIZE = 512

In [None]:

# LOAD THE MODEL
model_quantized = Llama(model_path=my_model_path, n_ctx=CONTEXT_SIZE)

In [None]:
model_quantized

In [37]:
def generate_text_from_prompt(
    user_prompt,
    max_tokens=256,
    temperature=0.3,
    top_p=0.1,
    echo=True,
    stop=("Q", "\n"),
    *,
    model=None,
):
    """Run a completion for ``user_prompt`` and return the raw model output.

    Args:
        user_prompt: Prompt text passed to the model as the first argument.
        max_tokens: Forwarded to the model as ``max_tokens``.
        temperature: Forwarded to the model as ``temperature``.
        top_p: Forwarded to the model as ``top_p``.
        echo: Forwarded to the model as ``echo``.
        stop: Stop sequences forwarded as ``stop``. The default is an
            immutable tuple that is copied into a fresh list on every call;
            any other value (list, str, None) is forwarded unchanged.
        model: Optional callable used instead of the module-level
            ``model_quantized``; it must accept the prompt plus the keyword
            arguments listed above.

    Returns:
        Whatever the model callable returns.
    """
    if model is None:
        # Default to the quantized model loaded at module level
        model = model_quantized

    # Hand the callee its own list so the shared default can never be mutated
    stop_sequences = list(stop) if isinstance(stop, tuple) else stop

    return model(
        user_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop_sequences,
    )

In [None]:
model_response = generate_text_from_prompt(my_prompt)

print(model_response)

In [None]:
# Keep everything after the first "Summary:" marker: maxsplit=1 avoids cutting
# the text at a second marker, and [-1] returns the whole text (instead of
# raising IndexError) when the marker is absent, e.g. with echo=False.
model_response["choices"][0]["text"].strip().split("Summary:", 1)[-1].strip()

## Convert Model to MLflow Template

We can convert this model into an MLflow template for easier deployment and reproducibility. Below are the steps to achieve this:

1. **Create a Custom Loader**: Provide the custom loader code that is used to load the model.
2. **Prepare the Model**: Ensure the model is in the correct format and directory.
3. **Log the Model**: Use MLflow to log the model with the appropriate signature and parameters.


In [18]:
my_model_path = "./model_quantized/Q4_K_M.gguf"

In [20]:
from src.core.tracking_model import tracking_mlflow_model 

# Resolve the custom loader directory relative to the project root instead of
# a user-specific absolute path, so the cell works on any checkout.
# NOTE(review): assumes project_root_directory is the repository root that
# contains src/core (the same root the `src.core` imports rely on) — confirm.
code_path = os.path.join(project_root_directory, "src", "core", "custom_loader")
conda_path = "conda.yaml"
tracking_mlflow_model(code_path, my_model_path, conda_path)



## Testing Logged Model

In [10]:
import mlflow
import pandas as pd

In [11]:
# Set your run ID from MLflow

run_id = "23ca6ac6-9191-4854-802c-c9cb72689864"

In [None]:
# Load the logged model from the run identified by run_id
mlflow_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
# Look up the most recently started run and print its ID; this is
# informational only — the model above is loaded from run_id, not this run.
latest_run = mlflow.search_runs(order_by=["start_time desc"]).iloc[0]
print(f"Latest run ID: {latest_run.run_id}")

In [None]:
# build the model input: a dict with a single "text" key whose value is a list containing the prompt
data = {"text": [my_prompt]}

data

In [14]:
df = pd.DataFrame.from_dict(data)

In [15]:
unwrapped_model = mlflow_model.unwrap_python_model()

In [None]:
# Call the unwrapped Python model directly with the raw dict and a params dict.
# NOTE(review): both arguments are passed positionally — confirm that the
# custom loader's predict() expects (data, params) in this order and not a
# leading context argument.
pred = unwrapped_model.predict(data, {'max_tokens': 256})

In [None]:
pred['choices'][0]['text']

## Endpoint

In [5]:
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    ProbeSettings,
    OnlineRequestSettings,
)

import datetime

In [11]:

# Derive an endpoint name from the current local time
# (month, day, hour, minute, microseconds).
endpoint_name = "endpt-" + datetime.datetime.now().strftime("%m%d%H%M%f")

# create an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=endpoint_name,
    description="Online endpoint for fine tuned  quantized model",
    auth_mode="key",
)
# Block until the create/update operation has finished
workspace_ml_client.begin_create_or_update(endpoint).wait()

You can find the list of SKUs supported for deployment here - [Managed online endpoints SKU list](https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list)

In [6]:
finetuned_model_name = "quantized_model"
version = "2"

registered_model = workspace_ml_client.models.get(
    name=finetuned_model_name, version=version
)

In [None]:
# Specify the environment name and version you want to retrieve
environment_name = "inferencing-env"
environment_version = "1"

# Get the environment
environment = workspace_ml_client.environments.get(name=environment_name, version=environment_version)

environment

In [8]:
# Create the "blue" deployment of the registered model
demo_deployment = ManagedOnlineDeployment(
    name="blue",
    # Target the endpoint created earlier in this notebook (endpoint_name) so
    # that the traffic assignment on that endpoint can route to this deployment.
    endpoint_name=endpoint_name,
    model=registered_model.id,
    instance_type="Standard_NC48ads_A100_v4",  # GPU instance type for faster inference
    instance_count=1,
    #environment=environment,
    # One request at a time per instance, 90 s request timeout, 500 ms max queue wait
    request_settings=OnlineRequestSettings(
        max_concurrent_requests_per_instance=1,
        request_timeout_ms=90000,
        max_queue_wait_ms=500,
    ),
    liveness_probe=ProbeSettings(
        failure_threshold=49,
        success_threshold=1,
        timeout=299,
        period=180,
        initial_delay=180,
    ),
    # Long initial delay before the first readiness check
    readiness_probe=ProbeSettings(
        failure_threshold=10,
        success_threshold=1,
        timeout=10,
        period=10,
        initial_delay=2000,
    ),
)

In [None]:
workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()

In [None]:
endpoint.traffic = {"blue": 100}
workspace_ml_client.begin_create_or_update(endpoint).result()

### Test the endpoint with sample data

We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. We will then display the scored labels alongside the ground truth labels.

In [18]:
import json
import pandas as pd

In [None]:
# Input data: one prompt per row
data = [
    "Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"
]

# Constructing the required JSON-like structure (index / columns / data).
# The index is derived from the number of rows so that it stays in sync with
# `data` when more prompts are added.
input_data = {
    "input_data": {
        "index": list(range(len(data))),
        "columns": ["text"],
        "data": [[dialog] for dialog in data],
    }
}

# Output the result
print(input_data)


In [26]:

# Save the request payload as sample_score.json in the current working directory
with open("sample_score.json", "w") as f:
    json.dump(input_data, f)

In [22]:
# NOTE(review): hard-coded endpoint name — it must be the endpoint that hosts
# the "blue" deployment invoked below; confirm it matches the endpoint in use.
online_endpoint_name = "endpt-10291635182376"

In [None]:
my_prompt

In [31]:
# score the sample_score.json file using the online endpoint with the azureml endpoint invoke method
response = workspace_ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name="blue",
    request_file="sample_score.json",
)

In [None]:
response

In [None]:
from io import StringIO

# Convert the JSON response text to a pandas DataFrame. The string is wrapped
# in StringIO because passing literal JSON text to pd.read_json is deprecated
# in recent pandas releases.
response_df = pd.read_json(StringIO(response))
# Rename the column labelled 0 to scored_label
response_df = response_df.rename(columns={0: "scored_label"})

In [None]:
workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()

In [None]:
import shutil

# Clean up local artifacts: the downloaded model directory and the llama.cpp
# directory. A path that does not exist is skipped.
for leftover_path in (original_model_path, "llama.cpp"):
    if os.path.exists(leftover_path):
        shutil.rmtree(leftover_path)