# Quantization of a Large Language Model (LLM)

In [1]:
# Load environment variables from a .env file
from dotenv import load_dotenv, find_dotenv
import os

load_dotenv(find_dotenv())

True

In [2]:
import sys
from pathlib import Path

# Locate the project root: the parent of the nearest enclosing directory named exactly
# "notebooks". Matching whole path components (instead of splitting the cwd string on
# the substring "notebooks") avoids cutting the path inside an unrelated component such
# as "my-notebooks-repo", and falls back to the current directory when the notebook is
# not run from inside a "notebooks" folder.
_cwd = Path.cwd()
_project_root = next(
    (
        candidate.parent
        for candidate in (_cwd, *_cwd.parents)
        if candidate.name == "notebooks"
    ),
    _cwd,
)

# Keep the trailing separator that the original string split produced, so code that
# concatenates onto this value keeps working.
project_root_directory = os.path.join(str(_project_root), "")
notebook_path = os.path.join(project_root_directory, "notebooks")

# Make both the project root (for `src.*` imports) and the notebooks folder importable.
sys.path.insert(0, project_root_directory)
sys.path.insert(0, notebook_path)

In [3]:
from azure.ai.ml import MLClient
from azure.identity import (
    DefaultAzureCredential,
    InteractiveBrowserCredential,
)
from azure.ai.ml.entities import AmlCompute
import time

# Prefer the ambient credential chain; requesting a token up front surfaces a
# misconfigured chain immediately instead of at the first service call.
try:
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # No non-interactive credential worked: fall back to an interactive browser sign-in.
    credential = InteractiveBrowserCredential()

# Use a workspace config file when one can be loaded; otherwise build the client from the
# SUBSCRIPTION_ID / RESOURCE_GROUP / WORKSPACE_NAME environment variables.
# `except Exception` (not a bare `except:`) lets KeyboardInterrupt/SystemExit propagate.
try:
    workspace_ml_client = MLClient.from_config(credential=credential)
except Exception:
    workspace_ml_client = MLClient(
        credential,
        subscription_id=os.environ["SUBSCRIPTION_ID"],
        resource_group_name=os.environ["RESOURCE_GROUP"],
        workspace_name=os.environ["WORKSPACE_NAME"],
    )

# the models, fine tuning pipelines and environments are available in the AzureML system registry, "azureml"
registry_ml_client = MLClient(credential, registry_name="azureml")

# generating a unique timestamp that can be used for names and versions that need to be unique
timestamp = str(int(time.time()))

DefaultAzureCredential failed to retrieve a token from the included credentials.
Attempted credentials:
	EnvironmentCredential: EnvironmentCredential authentication unavailable. Environment variables are not fully configured.
Visit https://aka.ms/azsdk/python/identity/environmentcredential/troubleshoot to troubleshoot this issue.
	ManagedIdentityCredential: ManagedIdentityCredential authentication unavailable, no response from the IMDS endpoint.
	SharedTokenCacheCredential: SharedTokenCacheCredential authentication unavailable. No accounts were found in the cache.
	AzureCliCredential: Failed to invoke the Azure CLI
	AzurePowerShellCredential: Az.Account module >= 2.2.0 is not installed
	AzureDeveloperCliCredential: Failed to invoke the Azure Developer CLI
To mitigate this issue, please refer to the troubleshooting guidelines here at https://aka.ms/azsdk/python/identity/defaultazurecredential/troubleshoot.
Found the config file in: C:\Users\karinaa\OneDrive - Microsoft\Documents\codes\a

CredentialUnavailableError: Timed out after waiting 300 seconds for the user to authenticate

## 1 - Quantization of a Model from Marketplace/Hugging Face 

When using a PEFT (Parameter-Efficient Fine-Tuning) model, it is essential to utilize the `convert_lora.py` script. This script is specifically designed to handle the conversion and quantization of PEFT models, ensuring optimal performance and efficiency.

- The first step is to clone the llama.cpp repository, because the conversion and quantization steps below rely on its scripts and tools.
- More info about build: [llama.cpp build documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

In [8]:
from src.core.quantize_model import clone_repo, convert_model_to_gguf, quantize_model, build_project
clone_repo()

In [None]:
build_project()

### Download the model - from your personal registry

In [5]:
local_dir = "./model"

In [None]:
from azureml.core import Workspace, Model

# Connect to Azure ML workspace
# (from_config() is called without arguments, so it relies on a discoverable config file.)
workspace = Workspace.from_config()

# Specify the model name you want to download
# NOTE(review): "registered_model" looks like a placeholder — replace with the real registered name.
model_name = "registered_model"

# Get the model
# NOTE(review): version="x" is a placeholder, not a usable version — set the actual version before running.
# The name `model` is reassigned later in this notebook to the loaded transformers model.
model = Model(workspace, name=model_name, version="x")

# Download the model
# exist_ok=True presumably tolerates an already existing target directory — TODO confirm overwrite semantics.
model.download(target_dir=local_dir, exist_ok=True)

print(f"Model {model_name} downloaded to {local_dir} directory.")

#### Using Hugging Face Library

- Alternatively, we can download the model directly using the Hugging Face library.

In [6]:
from huggingface_hub import snapshot_download

# Hugging Face access token read from the environment; os.environ.get returns None when
# the TOKEN variable is not set.
TOKEN = os.environ.get("TOKEN")
base_model_id = "teknium/OpenHermes-2.5-Mistral-7B"

# Download the files of the model repository into local_dir.
snapshot_download(
    repo_id=base_model_id, local_dir=local_dir, token=TOKEN
)
# Alternative: use the download component from the registry, e.g.
# import_model = registry_ml_client.components.get(name="download_model", label="latest")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 15 files: 100%|██████████| 15/15 [32:05<00:00, 128.38s/it]


'C:\\Users\\karinaa\\OneDrive - Microsoft\\Documents\\codes\\azure-samples\\gbbai-quantization-llm-llamacpp\\notebooks\\model'

In [12]:
model_name = "t5-large"
foundation_model = registry_ml_client.models.get(model_name, label="latest")

# Report which registry model (name, version, id) the "latest" label resolved to.
print(
    f"\n\nUsing model name: {foundation_model.name}, "
    f"version: {foundation_model.version}, "
    f"id: {foundation_model.id} for fine tuning"
)

2025-01-20 19:05:28,687 - INFO - Request URL: 'https://cert-EastUS.experiments.azureml.net/mferp/managementfrontend/subscriptions/6c6683e9-e5fe-4038-8519-ce6ebec2ba15/resourceGroups/registry-builtin-prod-eastus-01/providers/Microsoft.MachineLearningServices/registries/azureml/models/t5-large/versions?api-version=REDACTED&$orderBy=REDACTED&$top=REDACTED'
Request method: 'GET'
Request headers:
    'Accept': 'application/json'
    'x-ms-client-request-id': 'a8a0e8db-d77a-11ef-82c3-8c3b4a55ecfb'
    'User-Agent': 'azsdk-python-mgmt-machinelearningservices/0.1.0 Python/3.12.8 (Windows-11-10.0.22631-SP0)'
    'Authorization': 'REDACTED'
    'traceparent': '00-d10d931fe13e64a2716b4b97428e9ec8-ce198cde9eca2a23-01'
No body was attached to the request
2025-01-20 19:05:29,509 - INFO - Response status: 200
Response headers:
    'Date': 'Mon, 20 Jan 2025 22:05:24 GMT'
    'Content-Type': 'application/json; charset=utf-8'
    'Transfer-Encoding': 'chunked'
    'Connection': 'keep-alive'
    'Vary': 



Using model name: t5-large, version: 18, id: azureml://registries/azureml/models/t5-large/versions/18 for fine tuning


## 2 - Convert the Hugging Face Model to GGUF

In [6]:
# NOTE(review): model_name is not referenced by the conversion/quantization cells visible below.
model_name = "original_model"
# Directory holding the downloaded model; passed as the first argument to convert_model_to_gguf.
original_model_path = "./model/"
# Alternative location, presumably for a model downloaded as an MLflow artifact — TODO confirm:
#original_model_path = "./model/mlflow_model_folder/data/model/"
# Output directory; passed as the second argument to convert_model_to_gguf.
quantized_model_path = "./model_quantized/"

In [None]:
convert_model_to_gguf(original_model_path, quantized_model_path)

In [18]:
# Build the path with os.path.join instead of string concatenation: this avoids the
# doubled "//" separator and mixed separator styles on Windows.
quantized_model_path = os.path.join(project_root_directory, "notebooks", "model_quantized")

In [None]:
quantized_model_path = quantize_model(quantized_model_path, "q4_k_m")

## 3 - How Does the Base Model Perform?

In [None]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import time

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
local_dir = "./model"

In [None]:
local_dir or base_model_id 

In [1]:
# Requires the imports cell above (transformers, AutoTokenizer, AutoModelForCausalLM)
# to have been executed first; otherwise this cell fails with a NameError.
# Load the tokenizer and the model from the local download directory.
tokenizer = AutoTokenizer.from_pretrained(local_dir)
model = AutoModelForCausalLM.from_pretrained(local_dir)
# Wrap both in a text-generation pipeline. The notebook-level name `pipeline` now refers
# to this instance; the factory itself stays reachable as transformers.pipeline.
pipeline = transformers.pipeline(
    model=model, tokenizer=tokenizer, task="text-generation"
)

NameError: name 'AutoTokenizer' is not defined

In [9]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [10]:
# Inspect the model's properties
print("Model's device:", model.device)
print("Model's dtype:", model.dtype)
print("Model's max length:", tokenizer.model_max_length)
# Report the total parameter count instead of an empty "parameters" header;
# listing every tensor would flood the cell output.
print("Model's parameters:", f"{sum(param.numel() for param in model.parameters()):,}")

Model's device: cpu
Model's dtype: torch.float32
Model's max lenght: 1000000000000000019884624838656
Model's parameters:


In [None]:
# Dialogue-summarization prompt: an instruction line, the chat transcript, and a trailing
# "Summary:" cue that the model is expected to complete.
my_prompt = "Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"

In [None]:
# Measure the end-to-end generation latency of the base (unquantized) model
model.generation_config.pad_token_id = model.generation_config.eos_token_id

# Some tokenizers define no pad token. Fall back to EOS so that the call below never
# passes pad_token_id=None, which would undo the generation-config setting above.
pad_token_id = (
    tokenizer.pad_token_id
    if tokenizer.pad_token_id is not None
    else tokenizer.eos_token_id
)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
start_time = time.perf_counter()

with torch.no_grad():
    response = pipeline(
        my_prompt,
        max_new_tokens=256,
        repetition_penalty=1.15,
        return_full_text=False,
        pad_token_id=pad_token_id,
    )

end_time = time.perf_counter()
latency = end_time - start_time

generated_text = response[0]["generated_text"]

# Calculate the number of tokens generated. Re-tokenizing the decoded text is an
# approximation of the number of sampled tokens, but is sufficient for a throughput estimate.
num_generated_tokens = len(
    tokenizer(generated_text, add_special_tokens=False)["input_ids"]
)
print(
    f"Latency: {latency:.2f} s | generated tokens: {num_generated_tokens} | "
    f"throughput: {num_generated_tokens / latency:.2f} tokens/s"
)

In [13]:
generated_text

'> Hannah asked Amanda for a phone number. She checked and couldn\'t find it, but knew who to ask next time. She told Hannah not to worry about asking him since he was really friendly. Then she said "just text him" and gave up on helping.\n\nThis is a good example of how Amanda helps people without being pushy or aggressive. She doesn\'t make them feel bad by saying things like "you should just..." instead she gives advice in a way that makes sense to both parties involved (in this case herself).'

## 4 - Quantization and Deployment of a Fine-Tuned Large Language Model (LLM)

In [9]:
from llama_cpp import Llama

# GLOBAL VARIABLES
my_model_path = "./model_quantized/Q4_K_M.gguf"
CONTEXT_SIZE = 512

In [41]:

# LOAD THE MODEL
model_quantized = Llama(model_path=my_model_path, n_ctx=CONTEXT_SIZE)

llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from ./model_quantized/Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Model
llama_model_loader: - kv   3:                         general.size_label str              = 8.0B
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                       llama.context_length u32              = 8192
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   7:                  llama.feed_forward_length u32       

llm_load_vocab: special_eos_id is not in special_eog_ids - the tokenizer config may be incorrect
llm_load_vocab: special tokens cache size = 257
llm_load_vocab: token to piece cache size = 0.8000 MB
llm_load_print_meta: format           = GGUF V3 (latest)
llm_load_print_meta: arch             = llama
llm_load_print_meta: vocab type       = BPE
llm_load_print_meta: n_vocab          = 128257
llm_load_print_meta: n_merges         = 280147
llm_load_print_meta: vocab_only       = 0
llm_load_print_meta: n_ctx_train      = 8192
llm_load_print_meta: n_embd           = 4096
llm_load_print_meta: n_layer          = 32
llm_load_print_meta: n_head           = 32
llm_load_print_meta: n_head_kv        = 8
llm_load_print_meta: n_rot            = 128
llm_load_print_meta: n_swa            = 0
llm_load_print_meta: n_embd_head_k    = 128
llm_load_print_meta: n_embd_head_v    = 128
llm_load_print_meta: n_gqa            = 4
llm_load_print_meta: n_embd_k_gqa     = 1024
llm_load_print_meta: n_embd_v_gqa     =

In [43]:
model_quantized

<llama_cpp.llama.Llama at 0x7f739329b7a0>

In [37]:
def generate_text_from_prompt(
    user_prompt, max_tokens=256, temperature=0.3, top_p=0.1, echo=True, stop=("Q", "\n")
):
    """Run a completion on the notebook-level ``model_quantized`` and return its raw output.

    Args:
        user_prompt: Prompt text handed to the model.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        top_p: Forwarded as ``top_p``.
        echo: Forwarded as ``echo``; the recorded output shows the prompt repeated in
            the returned text when this is True.
        stop: Stop sequences. The default is an immutable tuple so that no list object
            is shared between calls; a tuple is converted to a fresh list before it is
            forwarded, any other value (list, str, None) is passed through unchanged.

    Returns:
        Whatever ``model_quantized(...)`` returns (a completion dict in this notebook).
    """
    # Hand the model its own list so the default can never be mutated across calls.
    stop_sequences = list(stop) if isinstance(stop, tuple) else stop

    return model_quantized(
        user_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop_sequences,
    )

In [38]:
model_response = generate_text_from_prompt(my_prompt)

print(model_response)

llama_perf_context_print:        load time =    2411.79 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   145 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    43 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    5085.27 ms /   188 tokens


{'id': 'cmpl-f841cb84-436d-4664-afdd-b161f7b62528', 'object': 'text_completion', 'created': 1730126628, 'model': './model_quantized/Q4_K_M.gguf', 'choices': [{'text': "Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n://www.google.com/search?q=Hannah+Betty+Larry+number Hannah wants Betty's number. Amanda can't find it. Hannah doesn't know Larry well. Amanda suggests Hannah to text him.", 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 145, 'completion_tokens': 43, 'total_tokens': 188}}


In [39]:
# Keep everything after the first "Summary:" marker. Limiting the split to one cut keeps
# a summary that itself contains "Summary:" intact, and taking the last piece returns the
# whole text (instead of raising IndexError) when the prompt was not echoed back.
model_response["choices"][0]["text"].split("Summary:", 1)[-1].strip()

"://www.google.com/search?q=Hannah+Betty+Larry+number Hannah wants Betty's number. Amanda can't find it. Hannah doesn't know Larry well. Amanda suggests Hannah to text him."

## Convert Model to MLflow Template

We can convert this model into an MLflow template for easier deployment and reproducibility. Below are the steps to achieve this:

1. **Create a Custom Loader**: Provide the loader code that MLflow uses to load the quantized model.
2. **Prepare the Model**: Ensure the model is in the correct format and directory.
3. **Log the Model**: Use MLflow to log the model with the appropriate signature and parameters.


In [18]:
my_model_path = "./model_quantized/Q4_K_M.gguf"

In [20]:
from src.core.tracking_model import tracking_mlflow_model 

# NOTE(review): absolute path from another machine and repository ("fine-tuning-text-to-sql");
# point this at this project's custom loader directory before running.
code_path = "/home/azureuser/cloudfiles/code/Users/karinaa/fine-tuning-text-to-sql/src/core/custom_loader"
# Relative path — presumably resolved against the notebook's working directory; TODO confirm.
conda_path = "conda.yaml"
# Log the quantized GGUF model (my_model_path) together with the loader code and conda spec.
tracking_mlflow_model(code_path, my_model_path, conda_path)



## Testing Logged Model

In [10]:
import mlflow
import pandas as pd

In [11]:
# Set your run ID from MLflow

run_id = "23ca6ac6-9191-4854-802c-c9cb72689864"

In [12]:
# Load the logged pyfunc model from the run identified by run_id.
mlflow_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
# Print the most recently started run for reference. The model above is loaded from
# run_id, which is not necessarily this run. `.iloc[0]` raises IndexError when the
# search returns no runs.
latest_run = mlflow.search_runs(order_by=["start_time desc"]).iloc[0]
print(f"Latest run ID: {latest_run.run_id}")

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

_load_pyfunc: Entered. data_path=/tmp/tmpcd3dmy_5/model/data/Q4_K_M.gguf


llama_model_loader: loaded meta data with 26 key-value pairs and 291 tensors from /tmp/tmpcd3dmy_5/model/data/Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Model
llama_model_loader: - kv   3:                         general.size_label str              = 8.0B
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                       llama.context_length u32              = 8192
llama_model_loader: - kv   6:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   7:                  llama.feed_forward_length 

_load_pyfunc: llm=<llama_cpp.llama.Llama object at 0x7f394ab469c0>


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | AVX512_BF16 = 0 | FMA = 1 | NEON = 0 | SVE = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | RISCV_VECT = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | 
Model metadata: {'tokenizer.chat_template': "{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{{ '<|start_header_id|>assistant<|end_header_id|>\n\n' }}", 'tokenizer.ggml.add_eos_token': 'true', 'tokenizer.ggml.padding_token_id': '128256', 'tokenizer.ggml.eos_token_id': '128001', 'general.quantization_version': '2', 'tokenizer.ggml.model': 'gpt2', 'llama.rope.dimension_count': '128', 'llama.vocab_size': '128257', 'general.architecture': 'llama', 'llama.rope.freq_bas

In [13]:
# Build the model input: a dict with a single "text" key whose value is a list
# containing the one prompt defined earlier (my_prompt).
data = {"text": [my_prompt]}

data

{'text': ["Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"]}

In [14]:
df = pd.DataFrame.from_dict(data)

In [15]:
unwrapped_model = mlflow_model.unwrap_python_model()

In [30]:
pred = unwrapped_model.predict(data, {'max_tokens': 256})

Llama.generate: 65 prefix-match hit, remaining 148 prompt tokens to eval
llama_perf_context_print:        load time =    3884.49 ms
llama_perf_context_print: prompt eval time =       0.00 ms /   148 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /    43 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =    7309.17 ms /   191 tokens


In [33]:
pred['choices'][0]['text']

"://hannah.com/ Hannah asks Amanda if she has Betty's number. Amanda can't find it so she advises Hannah to text Larry as he had talked to her last time they were at the park together."

## Endpoint

In [5]:
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    ProbeSettings,
    OnlineRequestSettings,
)

import datetime

In [11]:

# Timestamp-based endpoint name: "endpt-" followed by month, day, hour, minute and microseconds.
endpoint_name = "endpt-" + datetime.datetime.now().strftime("%m%d%H%M%f")

# create an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=endpoint_name,
    description="Online endpoint for fine tuned  quantized model",
    auth_mode="key",
)
# Start the create/update operation and block until it has finished.
workspace_ml_client.begin_create_or_update(endpoint).wait()

The list of SKUs supported for deployment is available here: [Managed online endpoints SKU list](https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list)

In [6]:
finetuned_model_name = "quantized_model"
version = "2"

registered_model = workspace_ml_client.models.get(
    name=finetuned_model_name, version=version
)

In [8]:
# Specify the environment name and version you want to retrieve
environment_name = "inferencing-env"
environment_version = "1"

# Get the environment
environment = workspace_ml_client.environments.get(name=environment_name, version=environment_version)

environment

Environment({'arm_type': 'environment_version', 'latest_version': None, 'image': 'mcr.microsoft.com/azureml/mlflow-ubuntu20.04-py38-cpu-inference:20241003.v2', 'intellectual_property': None, 'is_anonymous': False, 'auto_increment_version': False, 'auto_delete_setting': None, 'name': 'inferencing-env', 'description': 'Environment created from a Docker image.', 'tags': {}, 'properties': {'azureml.labels': 'latest'}, 'print_as_yaml': False, 'id': '/subscriptions/e878de60-60e5-4a05-ba42-a9ab14136cc9/resourceGroups/ka-sand-rg/providers/Microsoft.MachineLearningServices/workspaces/ml-sandbox-core/environments/inferencing-env/versions/1', 'Resource__source_path': '', 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/sandboc-ci-v2/code/Users/karinaa/fine-tuning-text-to-sql/notebooks', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7fe4a84d94f0>, 'serialize': <msrest.serialization.Serializer object at 0x7fe4a84dbcb0>, 'version': '1', 'conda_file': {'ch

In [8]:
# Create a deployment
demo_deployment = ManagedOnlineDeployment(
    name="blue",
    # Target the endpoint created earlier in this notebook. A name hard-coded from a
    # previous run never matches the freshly generated, timestamp-based endpoint_name.
    endpoint_name=endpoint_name,
    model=registered_model.id,
    instance_type="Standard_NC48ads_A100_v4",  # GPU instance type for faster inference
    instance_count=1,
    #environment=environment,
    request_settings=OnlineRequestSettings(
        max_concurrent_requests_per_instance=1,
        request_timeout_ms=90000,
        max_queue_wait_ms=500,
    ),
    liveness_probe=ProbeSettings(
        failure_threshold=49,
        success_threshold=1,
        timeout=299,
        period=180,
        initial_delay=180,
    ),
    # NOTE(review): initial_delay=2000 is far larger than the liveness probe's 180 —
    # confirm the unit and that such a long wait before the first readiness check is intended.
    readiness_probe=ProbeSettings(
        failure_threshold=10,
        success_threshold=1,
        timeout=10,
        period=10,
        initial_delay=2000,
    ),
)

In [None]:
workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()

In [17]:
endpoint.traffic = {"blue": 100}
workspace_ml_client.begin_create_or_update(endpoint).result()

Readonly attribute principal_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>
Readonly attribute tenant_id will be ignored in class <class 'azure.ai.ml._restclient.v2022_05_01.models._models_py3.ManagedServiceIdentity'>


ManagedOnlineEndpoint({'public_network_access': 'Enabled', 'provisioning_state': 'Succeeded', 'scoring_uri': 'https://endpt-10291635182376.eastus2.inference.ml.azure.com/score', 'openapi_uri': 'https://endpt-10291635182376.eastus2.inference.ml.azure.com/swagger.json', 'name': 'endpt-10291635182376', 'description': 'Online endpoint for fine tuned  quantized model', 'tags': {}, 'properties': {'createdBy': 'Karina Assini Andreatta', 'createdAt': '2024-10-29T16:35:56.773624+0000', 'lastModifiedAt': '2024-10-29T16:35:56.773624+0000', 'azureml.onlineendpointid': '/subscriptions/e878de60-60e5-4a05-ba42-a9ab14136cc9/resourcegroups/ka-sand-rg/providers/microsoft.machinelearningservices/workspaces/ml-sandbox-core/onlineendpoints/endpt-10291635182376', 'AzureAsyncOperationUri': 'https://management.azure.com/subscriptions/e878de60-60e5-4a05-ba42-a9ab14136cc9/providers/Microsoft.MachineLearningServices/locations/eastus2/mfeOperationsStatus/oeidp:a2578a1c-1232-4bf3-adf2-38108af4b848:2e6168c3-eba3-43

### Test the endpoint with sample data

We will build a sample request from the summarization prompt and submit it to the online endpoint for inference. We will then display the response returned by the endpoint.

In [18]:
import json
import pandas as pd

In [25]:
# Input data: one prompt per row to score
data = [
    "Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"
]

# Constructing the required JSON-like structure (index / columns / data, one row per prompt).
# The index is derived from `data`, so it stays in sync when more prompts are added
# instead of remaining a hard-coded [0].
input_data = {
    "input_data": {
        "index": list(range(len(data))),
        "columns": ["text"],
        "data": [[dialog] for dialog in data],
    }
}

# Output the result
print(input_data)


{'input_data': {'index': [0], 'columns': ['text'], 'data': [["Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"]]}}


In [26]:

# Save the request payload as sample_score.json in the current working directory;
# this file is what gets sent to the endpoint as the request body.
with open("sample_score.json", "w") as f:
    json.dump(input_data, f)

In [22]:
# Reuse the name of the endpoint created earlier in this notebook. A literal name from a
# previous run would send the invoke/delete calls below to a stale or missing endpoint.
online_endpoint_name = endpoint_name

In [29]:
my_prompt

"Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"

In [31]:
# score the sample_score.json file using the online endpoint with the azureml endpoint invoke method
# The request is routed explicitly to the "blue" deployment; as the recorded output shows,
# the call returns the response body as a JSON string.
response = workspace_ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name="blue",
    request_file="sample_score.json",
)

In [32]:
response

'{"id": "cmpl-b2e20cdc-583c-41a5-9e14-3312d06d0915", "object": "text_completion", "created": 1730228203, "model": "/var/azureml-app/azureml-models/quantized_model/2/model/data/Q4_K_M.gguf", "choices": [{"text": "://\\nGPU O1 is a virtual assistant that is available on the website for the company, <name>. It is available for the employees to ask questions in a conversational way to get help or information. This assistant is available in all the 50 states.</INST>", "index": 0, "logprobs": null, "finish_reason": "stop"}], "usage": {"prompt_tokens": 78, "completion_tokens": 56, "total_tokens": 134}}'

In [None]:
# The endpoint returns a JSON completion object, not a table. Parse it explicitly
# (passing a literal JSON string to pd.read_json is deprecated) and flatten the
# "choices" list into one row per completion, exposing the generated text as "scored_label".
response_payload = json.loads(response)
response_df = pd.json_normalize(response_payload["choices"]).rename(
    columns={"text": "scored_label"}
)

  response_df = pd.read_json(response)


In [None]:
workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()

In [None]:
import shutil

# Remove local artifacts if they exist: the downloaded original model directory
# and the llama.cpp checkout.
for leftover_dir in (original_model_path, "llama.cpp"):
    if os.path.exists(leftover_dir):
        shutil.rmtree(leftover_dir)