# Quantization of a Large Language Model (LLM)

In [None]:
# Load environment variables from a .env file.
# find_dotenv() locates the file and load_dotenv() loads it; the call's return
# value is what the notebook shows as this cell's output.
from dotenv import load_dotenv, find_dotenv
import os

load_dotenv(find_dotenv())

False

In [2]:
import sys

# Everything in the working directory before the first "notebooks" occurrence
# is treated as the project root (any trailing separator is kept as-is).
project_root_directory = os.getcwd().partition("notebooks")[0]
notebook_path = os.path.join(project_root_directory, "notebooks")

# Prepend both locations in one step: the notebooks folder ends up first on
# sys.path and the project root second, so `src.*` imports resolve.
sys.path[:0] = [notebook_path, project_root_directory]

In [3]:
from azure.ai.ml import MLClient
from azure.identity import (
    DefaultAzureCredential,
    InteractiveBrowserCredential,
)
from azure.ai.ml.entities import AmlCompute
import time

# Prefer the default credential chain; if it cannot produce a management token,
# fall back to an interactive browser login.
try:
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception:
    credential = InteractiveBrowserCredential()

# Use the workspace configuration file when available; otherwise build the
# client from the SUBSCRIPTION_ID / RESOURCE_GROUP / WORKSPACE_NAME environment
# variables. `except Exception` (instead of a bare `except`) lets
# KeyboardInterrupt and SystemExit propagate.
try:
    workspace_ml_client = MLClient.from_config(credential=credential)
except Exception:
    workspace_ml_client = MLClient(
        credential,
        subscription_id=os.environ["SUBSCRIPTION_ID"],
        resource_group_name=os.environ["RESOURCE_GROUP"],
        workspace_name=os.environ["WORKSPACE_NAME"],
    )

# the models, fine tuning pipelines and environments are available in the AzureML system registry, "azureml"
registry_ml_client = MLClient(credential, registry_name="azureml")

# generating a unique timestamp that can be used for names and versions that need to be unique
timestamp = str(int(time.time()))

Found the config file in: /config.json
Overriding of current TracerProvider is not allowed
Overriding of current LoggerProvider is not allowed
Overriding of current MeterProvider is not allowed
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented
Attempting to instrument while already instrumented


## 1 - Quantization of a Model from Marketplace/Hugging Face 

When using a PEFT (Parameter-Efficient Fine-Tuning) model, it is essential to utilize the `convert_lora.py` script. This script is specifically designed to handle the conversion and quantization of PEFT models, ensuring optimal performance and efficiency.

- The first step is to clone the llama.cpp repository, because the conversion and quantization steps below rely on its tooling.
- More info about build: [llama.cpp build documentation](https://github.com/ggerganov/llama.cpp/blob/master/docs/build.md)

In [4]:
from src.core.quantize_model import (
    clone_repo,
    convert_model_to_gguf,
    quantize_model,
    build_project,
)

In [None]:
# Clone the llama.cpp repository (helper from src.core.quantize_model;
# its target location is not shown in this notebook).
clone_repo()

In [None]:
# Build the cloned project (helper from src.core.quantize_model; presumably
# builds llama.cpp — confirm against the helper's implementation).
build_project()

### Download the model - from your personal registry

In [5]:
# Local folder the model files are downloaded into
local_dir = "./model"

In [None]:
from azureml.core import Workspace, Model

# Connect to Azure ML workspace
workspace = Workspace.from_config()

# Specify the model name you want to download
# NOTE(review): "registered_model" and version "x" below look like placeholders —
# replace them with a real registered model name and version before running.
model_name = "registered_model"

# Get the model
model = Model(workspace, name=model_name, version="x")

# Download the model into local_dir; exist_ok=True allows an existing target
model.download(target_dir=local_dir, exist_ok=True)

print(f"Model {model_name} downloaded to {local_dir} directory.")

#### Using Hugging Face Library

- Alternatively, we can download the model directly using the Hugging Face library.

In [6]:
# Download target for the Hugging Face snapshot (same value as in the registry section)
local_dir = "./model"

In [7]:
from huggingface_hub import snapshot_download

# Hugging Face access token read from the environment; None when TOKEN is unset
TOKEN = os.environ.get("TOKEN")
base_model_id = "teknium/OpenHermes-2.5-Mistral-7B"

# Download the repository files into local_dir; the call's return value is
# displayed as the cell output.
snapshot_download(repo_id=base_model_id, local_dir=local_dir, token=TOKEN)
# or using component import_model = registry_ml_client.components.get(name="download_model", label="latest")

Fetching 15 files:   0%|                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        | 0/15 [00:00<?, ?it/s]

Fetching 15 files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 15/15 [02:58<00:00, 11.92s/it]


'/mnt/batch/tasks/shared/LS_root/mounts/clusters/sandboc-ci-v2/code/Users/karinaa/gbbai-quantization-llm-llamacpp/notebooks/model'

In [None]:
# Look up the latest "t5-large" entry in the "azureml" registry and print its
# name, version and id.
# NOTE(review): this rebinds model_name, which other cells also assign.
model_name = "t5-large"
foundation_model = registry_ml_client.models.get(model_name, label="latest")
print(
    "\n\nUsing model name: {0}, version: {1}, id: {2} for fine tuning".format(
        foundation_model.name, foundation_model.version, foundation_model.id
    )
)

## 2 - Convert the Hugging Face Model to GGUF

In [7]:
# Paths for the conversion step: the source model folder and the output folder
# that receives the converted files.
model_name = "original_model"
original_model_path = "./model/"
# Alternative source layout, kept for reference:
# original_model_path = "./model/mlflow_model_folder/data/model/"
quantized_model_path = "./model_quantized/"

In [11]:
# Convert the model in original_model_path to GGUF format; the second argument
# is the output location (helper from src.core.quantize_model).
convert_model_to_gguf(original_model_path, quantized_model_path)

2025-01-27 13:48:43,000 - INFO - Starting conversion of model to gguf format...
INFO:hf-to-gguf:Loading model: model
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'
INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00002.safetensors'
INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> F16, shape = {4096, 32002}
INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> F16, shape = {14336, 4096}
INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> F16, shape = {4096, 14336}
INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> F32, shape = {4096}
INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> F16, shape = {4096, 1024

In [5]:
# Absolute location of the folder holding the converted GGUF files.
# os.path.join inserts exactly one separator between components, so the result
# is correct whether or not project_root_directory ends with a separator
# (plain string concatenation produced "notebooks//model_quantized" and would
# glue the root and "notebooks" together when the separator was missing).
quantized_model_path = os.path.join(
    project_root_directory, "notebooks", "model_quantized"
)

In [6]:
# Quantize the converted model with the "q4_k_m" scheme.
# NOTE(review): the return value rebinds quantized_model_path, so re-running this
# cell passes the previous return value back in instead of the folder path.
quantized_model_path = quantize_model(quantized_model_path, "q4_k_m")

2025-01-27 14:09:00,498 - INFO - Quantizing the model...
main: build = 0 (unknown)
main: built with cc (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0 for x86_64-linux-gnu
main: quantizing '/mnt/batch/tasks/shared/LS_root/mounts/clusters/sandboc-ci-v2/code/Users/karinaa/gbbai-quantization-llm-llamacpp/notebooks//model_quantized/FP16.gguf' to '/mnt/batch/tasks/shared/LS_root/mounts/clusters/sandboc-ci-v2/code/Users/karinaa/gbbai-quantization-llm-llamacpp/notebooks//model_quantized/Q4_K_M.gguf' as Q4_K_M
llama_model_loader: loaded meta data with 42 key-value pairs and 291 tensors from /mnt/batch/tasks/shared/LS_root/mounts/clusters/sandboc-ci-v2/code/Users/karinaa/gbbai-quantization-llm-llamacpp/notebooks//model_quantized/FP16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                  


main: quantize time = 174915.79 ms
main:    total time = 174915.79 ms


## 3 - How Does the Base Model Perform?

In [7]:
import transformers
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, LlamaTokenizer
import time

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load the tokenizer and the base (unquantized) model from the local download folder
tokenizer = LlamaTokenizer.from_pretrained(local_dir)
model = AutoModelForCausalLM.from_pretrained(local_dir)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [01:29<00:00, 44.52s/it]


In [9]:
# local_dir or base_model_id
pipeline = transformers.pipeline(
    model=model, tokenizer=tokenizer, task="text-generation"
)

Device set to use cpu


In [10]:
# Display the loaded model's module structure
model

MistralForCausalLM(
  (model): MistralModel(
    (embed_tokens): Embedding(32002, 4096)
    (layers): ModuleList(
      (0-31): 32 x MistralDecoderLayer(
        (self_attn): MistralAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        )
        (mlp): MistralMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): MistralRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): MistralRMSNorm((4096,), eps=1e-05)
      )
    )
    (norm): MistralRMSNorm((4096,), eps=1e-0

In [11]:
# Inspect the model's properties
print("Model's device:", model.device)
print("Model's dtype:", model.dtype)
print("Model's max lenght:", tokenizer.model_max_length)
print("Model's parameters:")
# for name, param in model.named_parameters():
#     print(f"  {name}: {param.shape}, {param.dtype}")

Model's device: cpu
Model's dtype: torch.float32
Model's max lenght: 1000000000000000019884624838656
Model's parameters:


In [13]:
# Summarization prompt: an instruction line, the dialog to summarize, and a
# trailing "Summary:" marker that the model is expected to continue.
my_prompt = "Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"

In [13]:
# Measure the latency of one generation with the base (unquantized) model.

# Pad with the EOS token. The id is resolved once so the model's generation
# config and the pipeline call agree; passing tokenizer.pad_token_id instead
# could hand over None and undo the setting below.
eos_token_id = model.generation_config.eos_token_id
if isinstance(eos_token_id, (list, tuple)):
    # A generation config may list several EOS ids; padding needs a single integer.
    eos_token_id = eos_token_id[0]
model.generation_config.pad_token_id = eos_token_id

# perf_counter is the clock intended for measuring elapsed intervals.
start_time = time.perf_counter()

with torch.no_grad():
    response = pipeline(
        my_prompt,
        max_new_tokens=256,
        repetition_penalty=1.15,
        return_full_text=False,
        pad_token_id=eos_token_id,
    )

end_time = time.perf_counter()
latency = end_time - start_time

# Extract the completion and count its tokens by re-tokenizing the text
# (an approximation of the number of tokens actually generated).
generated_text = response[0]["generated_text"]
num_generated_tokens = len(tokenizer.encode(generated_text, add_special_tokens=False))

print(f"Latency: {latency:.2f} s for ~{num_generated_tokens} generated tokens")

In [14]:
# Display the base model's completion
generated_text

"In this conversation, Hannah asks Amanda if she has Betty's phone number. Amanda checks but cannot find it and suggests that Hannah should ask Larry instead since he had contacted Betty recently. However, Hannah is hesitant to contact someone she doesn't know well and prefers Amanda to send a text message on her behalf. After some reluctance, Hannah agrees and ends the conversation with Amanda."

## 4 - Quantization and Deployment of a Fine-Tuned Large Language Model (LLM)

In [4]:
from llama_cpp import Llama

# GLOBAL VARIABLES
# Path of the quantized GGUF file and the context size used when loading it
my_model_path = "./model_quantized/Q4_K_M.gguf"
CONTEXT_SIZE = 512

In [5]:
# LOAD THE MODEL
model_quantized = Llama(model_path=my_model_path, n_ctx=CONTEXT_SIZE)

llama_model_loader: loaded meta data with 42 key-value pairs and 291 tensors from ./model_quantized/Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Mistral 7B v0.1
llama_model_loader: - kv   3:                            general.version str              = v0.1
llama_model_loader: - kv   4:                       general.organization str              = Mistralai
llama_model_loader: - kv   5:                           general.basename str              = Mistral
llama_model_loader: - kv   6:                         general.size_label str              = 7B
llama_model_loader: - kv   7:                            general.

In [6]:
# Display the loaded llama-cpp model object
model_quantized

<llama_cpp.llama.Llama at 0x7f5ad40fe8a0>

In [7]:
def generate_text_from_prompt(
    user_prompt, max_tokens=256, temperature=0.3, top_p=0.1, echo=True, stop=("Q", "\n")
):
    """Run one completion on the notebook-level ``model_quantized`` model.

    Args:
        user_prompt: Prompt text passed to the model as its first argument.
        max_tokens: Forwarded to the model as ``max_tokens``.
        temperature: Forwarded to the model as ``temperature``.
        top_p: Forwarded to the model as ``top_p``.
        echo: Forwarded to the model as ``echo``.
        stop: Stop sequence(s) forwarded to the model as ``stop``. The default
            is an immutable tuple so it cannot be mutated between calls; it is
            handed to the model as a fresh list ``["Q", "\\n"]``. Any other
            value is passed through unchanged.

    Returns:
        Whatever ``model_quantized`` returns for the call.
    """
    # Give the model its own list each call so no state is shared across calls.
    stop_sequences = list(stop) if isinstance(stop, tuple) else stop

    return model_quantized(
        user_prompt,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        echo=echo,
        stop=stop_sequences,
    )

In [14]:
# Run the quantized model on the same prompt used for the base model and
# print the raw completion result.
model_response = generate_text_from_prompt(my_prompt)

print(model_response)


llama_print_timings:        load time =    4293.05 ms
llama_print_timings:      sample time =      10.39 ms /    64 runs   (    0.16 ms per token,  6157.99 tokens per second)
llama_print_timings: prompt eval time =    4292.89 ms /   173 tokens (   24.81 ms per token,    40.30 tokens per second)
llama_print_timings:        eval time =    4352.55 ms /    63 runs   (   69.09 ms per token,    14.47 tokens per second)
llama_print_timings:       total time =    8804.10 ms /   236 tokens


{'id': 'cmpl-a2ffd82a-ef6a-4cef-ba96-1862ddfd87ce', 'object': 'text_completion', 'created': 1738065827, 'model': './model_quantized/Q4_K_M.gguf', 'choices': [{'text': "Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\nHannah asked Amanda if she had Betty's number. Amanda couldn't find it and suggested that Hannah ask Larry instead. Hannah was hesitant because she didn't know him well but Amanda reassured her that he was nice. Hannah eventually agreed to let Amanda text him for the number.", 'index': 0, 'logprobs': None, 'finish_reason': 'stop'}], 'usage': {'prompt_tokens': 173, 'co

In [15]:
# Keep only the text that follows the "Summary:" marker in the first choice.
# Index [1] raises IndexError if the marker is absent from the returned text.
model_response["choices"][0]["text"].strip().split("Summary:")[1].strip()

"Hannah asked Amanda if she had Betty's number. Amanda couldn't find it and suggested that Hannah ask Larry instead. Hannah was hesitant because she didn't know him well but Amanda reassured her that he was nice. Hannah eventually agreed to let Amanda text him for the number."

## 5 - Convert Model to MLflow Template

We can convert this model into an MLflow template for easier deployment and reproducibility. Below are the steps to achieve this:

1. **Create a Custom Loader**: Provide the loader code that MLflow uses to load the quantized model.
2. **Prepare the Model**: Ensure the model is in the correct format and directory.
3. **Log the Model**: Use MLflow to log the model with the appropriate signature and parameters.


In [16]:
# Quantized GGUF file to log with MLflow (same path as used for inference above)
my_model_path = "./model_quantized/Q4_K_M.gguf"

In [20]:
from src.core.tracking_model import tracking_mlflow_model

# Inputs for logging: the custom loader code folder and the conda environment file.
# NOTE(review): both are relative paths — confirm they resolve from the
# notebook's working directory.
code_path = "./src/custom_loader"
conda_path = "./src/conda.yaml"
# Log the quantized model through the project helper (its internals are not shown here)
tracking_mlflow_model(code_path, my_model_path, conda_path)

## Testing Logged Model

In [17]:
import mlflow
import pandas as pd

In [10]:
# Set your run ID from MLflow

# Hard-coded MLflow run ID; replace with the ID of your own run
run_id = "bfb3dd3a-678c-4fb2-94c8-525e6eea0ca0"

In [11]:
# Load the logged model as an MLflow pyfunc from the run identified by run_id
mlflow_model = mlflow.pyfunc.load_model(f"runs:/{run_id}/model")
# Separately look up the most recently started run; this query does not use
# run_id, so the printed ID need not match it.
latest_run = mlflow.search_runs(order_by=["start_time desc"]).iloc[0]
print(f"Latest run ID: {latest_run.run_id}")

  from google.protobuf import service as _service
  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████

_load_pyfunc: Entered. data_path=/tmp/tmp9pwcwyaq/model/data/Q4_K_M.gguf


llama_model_loader: loaded meta data with 42 key-value pairs and 291 tensors from /tmp/tmp9pwcwyaq/model/data/Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Mistral 7B v0.1
llama_model_loader: - kv   3:                            general.version str              = v0.1
llama_model_loader: - kv   4:                       general.organization str              = Mistralai
llama_model_loader: - kv   5:                           general.basename str              = Mistral
llama_model_loader: - kv   6:                         general.size_label str              = 7B
llama_model_loader: - kv   7:                          

_load_pyfunc: llm=<llama_cpp.llama.Llama object at 0x7f5937467470>


AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 0 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | 
Model metadata: {'general.quantization_version': '2', 'tokenizer.chat_template': "{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}", 'general.file_type': '15', 'general.dataset.0.organization': 'Teknium', 'general.license': 'apache-2.0', 'tokenizer.ggml.add_bos_token': 'true', 'general.size_label': '7B', 'general.type': 'model', 'tokenizer.ggml.add_eos_token': 'false', 'general.organization': 'Mistralai', 'general.dataset.count': '1', 'general.dataset.0.name': 'OpenHermes 2.5', 'general.base_model.0.repo_url': 'https://huggingface.co/mistralai/Mistral-7B-v0.1', 'general.version': 'v0.1', 'general.base_model.0.name': '

Latest run ID: 11d159ad-1761-4e94-af4f-627227c64ff0


In [54]:
# Build the model input: the prompt under "message" and a one-element list
# ["system"] under "role"
data = {"message": my_prompt, "role": ["system"]}

In [48]:
# Turn the input dict into a DataFrame with "message" and "role" columns
df = pd.DataFrame.from_dict(data)

In [33]:
# Get the underlying Python model object wrapped by the MLflow pyfunc model
unwrapped_model = mlflow_model.unwrap_python_model()

In [51]:
# Call the model's predict directly with the input frame, passing
# {"max_tokens": 256} as the second argument
pred = unwrapped_model.predict(df, {"max_tokens": 256})

Llama.generate: prefix-match hit



llama_print_timings:        load time =    5326.90 ms
llama_print_timings:      sample time =      14.72 ms /    88 runs   (    0.17 ms per token,  5977.85 tokens per second)
llama_print_timings: prompt eval time =    3921.41 ms /   180 tokens (   21.79 ms per token,    45.90 tokens per second)
llama_print_timings:        eval time =    6088.45 ms /    87 runs   (   69.98 ms per token,    14.29 tokens per second)
llama_print_timings:       total time =   10234.42 ms /   267 tokens


In [53]:
# Display the generated text of the first choice
pred["choices"][0]["text"]

"In this conversation, Hannah asks Amanda if she has Betty's number. Amanda checks her phone but cannot find the number and suggests that Hannah ask Larry for it since he called her the last time they were at the park together. Hannah is hesitant because she does not know Larry well but agrees to let Amanda text him for the number instead of calling him herself. Hannah then ends the conversation with Amanda by saying goodbye."

## 6 - Endpoint

In [5]:
from azure.ai.ml.entities import (
    ManagedOnlineEndpoint,
    ManagedOnlineDeployment,
    ProbeSettings,
    OnlineRequestSettings,
)

import datetime

In [11]:
# Endpoint name: "endpt-" followed by the current month, day, hour, minute and
# microsecond, so each run of this cell yields a new name
endpoint_name = "endpt-" + datetime.datetime.now().strftime("%m%d%H%M%f")

# create an online endpoint
endpoint = ManagedOnlineEndpoint(
    name=endpoint_name,
    description="Online endpoint for fine tuned  quantized model",
    auth_mode="key",
)
# Submit the endpoint and block until the operation finishes
workspace_ml_client.begin_create_or_update(endpoint).wait()

You can find the list of SKUs supported for deployment here - [Managed online endpoints SKU list](https://learn.microsoft.com/en-us/azure/machine-learning/reference-managed-online-endpoints-vm-sku-list)

In [6]:
# Name and version of the registered model to deploy
finetuned_model_name = "quantized_model"
version = "2"

# Fetch that model entry from the workspace
registered_model = workspace_ml_client.models.get(
    name=finetuned_model_name, version=version
)

In [8]:
# Create a deployment
demo_deployment = ManagedOnlineDeployment(
    name="blue",
    endpoint_name="endpt-10291635182376",
    model=registered_model.id,
    instance_type="Standard_NC48ads_A100_v4",  # use GPU instance type for faster explanations
    instance_count=1,
    # environment=environment,
    request_settings=OnlineRequestSettings(
        max_concurrent_requests_per_instance=1,
        request_timeout_ms=90000,
        max_queue_wait_ms=500,
    ),
    liveness_probe=ProbeSettings(
        failure_threshold=49,
        success_threshold=1,
        timeout=299,
        period=180,
        initial_delay=180,
    ),
    readiness_probe=ProbeSettings(
        failure_threshold=10,
        success_threshold=1,
        timeout=10,
        period=10,
        initial_delay=2000,
    ),
)

In [None]:
# Submit the deployment and block until the operation finishes
workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()

In [None]:
# Route 100% of the endpoint's traffic to the "blue" deployment and wait for the update
endpoint.traffic = {"blue": 100}
workspace_ml_client.begin_create_or_update(endpoint).result()

### Test the endpoint with sample data

We will fetch some sample data from the test dataset and submit it to the online endpoint for inference. We will then display the scored labels alongside the ground truth labels.

In [18]:
import json
import pandas as pd

In [None]:
# Input data: a list with one summarization prompt
data = [
    "Summarize this dialog:\nHannah: Hey, do you have Betty's number?\nAmanda: Lemme check\nHannah: <file_gif>\nAmanda: Sorry, can't find it.\nAmanda: Ask Larry\nAmanda: He called her last time we were at the park together\nHannah: I don't know him well\nHannah: <file_gif>\nAmanda: Don't be shy, he's very nice\nHannah: If you say so..\nHannah: I'd rather you texted him\nAmanda: Just text him 🙂\nHannah: Urgh.. Alright\nHannah: Bye\nAmanda: Bye bye\n---\nSummary:\n"
]

# Construct the request payload: an "input_data" object with "index",
# "columns" and "data" keys, where each prompt becomes a one-element row
# under the "text" column
input_data = {
    "input_data": {
        "index": [0],
        "columns": ["text"],
        "data": [[dialog] for dialog in data],
    }
}

# Output the payload
print(input_data)

In [26]:
# save the json object to a file named sample_score.json in the current working directory
with open("sample_score.json", "w") as f:
    json.dump(input_data, f)

In [22]:
# Name of the endpoint to invoke and later delete.
# NOTE(review): hard-coded value — confirm it matches the endpoint that actually
# hosts the "blue" deployment before running the cells below.
online_endpoint_name = "endpt-10291635182376"

In [None]:
# Display the prompt defined earlier in the notebook
my_prompt

In [31]:
# score the sample_score.json file using the online endpoint with the azureml endpoint invoke method
response = workspace_ml_client.online_endpoints.invoke(
    endpoint_name=online_endpoint_name,
    deployment_name="blue",
    request_file="sample_score.json",
)

In [None]:
# Display the raw endpoint response
response

In [None]:
# convert the response to a pandas dataframe and rename the label column as scored_label
response_df = pd.read_json(response)
response_df = response_df.rename(columns={0: "scored_label"})

In [None]:
# Delete the online endpoint and block until the deletion finishes
workspace_ml_client.online_endpoints.begin_delete(name=online_endpoint_name).wait()

In [None]:
import shutil

# Remove the downloaded model folder and the llama.cpp checkout, in that
# order, skipping whichever one does not exist.
for leftover_dir in (original_model_path, "llama.cpp"):
    if os.path.exists(leftover_dir):
        shutil.rmtree(leftover_dir)