In [16]:

from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential, ClientSecretCredential
from azure.ai.ml.entities import AmlCompute
import time

# Authenticate: prefer the default credential chain and fall back to an
# interactive browser login when a management-plane token cannot be obtained.
try:
    credential = DefaultAzureCredential()
    # Request a token up front so an unusable credential triggers the fallback here.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    credential = InteractiveBrowserCredential()

# Client bound to the workspace used by the rest of this notebook.
# Replace the three values below with your own
# <SUBSCRIPTION_ID>, <RESOURCE_GROUP> and <WORKSPACE_NAME>.
workspace_ml_client = MLClient(
    credential,
    subscription_id="21d8f407-c4c4-452e-87a4-e609bfb86248",
    resource_group_name="rg-contoso-819prod",
    workspace_name="mlw-contoso-819prod",
)

# Client bound to the AzureML system registry "azureml-preview", which hosts the
# models, fine tuning pipelines and environments.
registry_ml_client = MLClient(credential, registry_name="azureml-preview")

# NOTE(review): this experiment name looks inherited from a summarization sample —
# confirm it is the intended name for this model-import run.
experiment_name = "summarization-news-summary"

# Name of the GPU cluster to use. Point this at an existing cluster if you have one;
# otherwise a cluster called 'gpu-cluster-big' is created below.
compute_cluster = "gpu-cluster-big"
try:
    workspace_ml_client.compute.get(compute_cluster)
except Exception:
    # The lookup failed, so create the cluster and wait on the returned poller.
    compute = AmlCompute(
        name=compute_cluster,
        size="Standard_ND40rs_v2",
        max_instances=2,  # For multi node training set this to an integer value more than 1
    )
    workspace_ml_client.compute.begin_create_or_update(compute).wait()

# Number of GPUs in a single node of the selected VM size.
# Setting this to less than the number of GPUs will result in underutilized GPUs, taking longer to train.
# Setting this to more than the number of GPUs will result in an error.
# The cluster created by this notebook uses Standard_ND40rs_v2, which has 8 GPUs per node;
# adjust this value if `compute_cluster` points at an existing cluster with a different VM size.
gpus_per_node = 8

# Generate a unique timestamp (whole seconds since the epoch, as a string) that can be
# used for names and versions that need to be unique.
timestamp = str(int(time.time()))


In [10]:


from azure.ai.ml.dsl import pipeline
from azure.ai.ml.entities import CommandComponent, PipelineComponent, Job, Component
from azure.ai.ml import PyTorchDistribution, Input

# Pipeline building blocks, both pulled from the system registry at a pinned version.

# Component that downloads the model.
download_model_func = registry_ml_client.components.get(
    name="download_model",
    version="0.0.1",
)

# Component that converts the downloaded model to MLflow format.
mlflow_converter_func = registry_ml_client.components.get(
    name="mlflow_converter",
    version="0.0.1",
)

# Define the pipeline job: download "unitary/toxic-bert" from Huggingface, then
# convert it with the MLflow converter component.
# The returned dict exposes the converter's `mlflow_model_output` as the pipeline
# output named "imported_model".
@pipeline()
def create_pipeline():
    # NOTE(review): the azure.ai.ml pipeline DSL is understood to derive node names
    # from the local variable names below, and may use a function docstring as the
    # pipeline description — confirm before renaming locals or adding a docstring.

    # Step 1: download the model identified by `model_id` from the given source.
    download_model_job = download_model_func(
        model_id = "unitary/toxic-bert",
        model_source = "Huggingface"
    )
    # Step 2: feed both outputs of the download step into the converter.
    mlflow_converter_job = mlflow_converter_func(
        model_info = download_model_job.outputs.model_info,
        model_path = download_model_job.outputs.model_output,
        mlflow_flavor = "hftransformers",
        task_type = "text-classification"
    )
    # Pipeline-level outputs, keyed by the name under which they are later looked up.
    return {
        "imported_model": mlflow_converter_job.outputs.mlflow_model_output
    }

# Build the pipeline job object from the definition.
pipeline_object = create_pipeline()

# Run every step on the GPU cluster unless a step specifies otherwise.
pipeline_object.settings.default_compute = compute_cluster
# Always execute the steps instead of reusing cached results from previous jobs.
pipeline_object.settings.force_rerun = True

In [11]:
# Submit the pipeline to the workspace under the configured experiment.
pipeline_job = workspace_ml_client.jobs.create_or_update(
    pipeline_object,
    experiment_name=experiment_name,
)

# Stream the job's logs; this is how the notebook waits for the pipeline job to complete.
workspace_ml_client.jobs.stream(pipeline_job.name)

RunId: maroon_seed_d4d52sft59
Web View: https://ml.azure.com/runs/maroon_seed_d4d52sft59?wsid=/subscriptions/21d8f407-c4c4-452e-87a4-e609bfb86248/resourcegroups/rg-contoso-819prod/workspaces/mlw-contoso-819prod

Streaming logs/azureml/executionlogs.txt

[2023-03-24 05:52:27Z] Submitting 1 runs, first five are: 168ccbe2:7ab353a1-4085-48e2-b9ba-7b9bd20a649d


JobException: The output streaming for the run interrupted.
But the run is still executing on the compute target. 
Details for canceling the run can be found here: https://aka.ms/aml-docs-cancel-run

In [18]:
from azure.ai.ml.entities import Model
from azure.ai.ml.constants import AssetTypes
# Show the outputs recorded on the pipeline job; `imported_model` is the one registered below.
print("pipeline job outputs: ", workspace_ml_client.jobs.get(pipeline_job.name).outputs)

imported_model_name = "unitary-toxic-bert"

# Location of the pipeline job's `imported_model` output, used as the model path.
model_path_from_job = f"azureml://jobs/{pipeline_job.name}/outputs/imported_model"
print("path to register model: ", model_path_from_job)

# Describe the model to register; the timestamp is used as the version to avoid version conflicts.
prepare_to_register_model = Model(
    name=imported_model_name,
    version=timestamp,
    type=AssetTypes.MLFLOW_MODEL,
    path=model_path_from_job,
    description=f"{imported_model_name} imported from Huggingface",
)
print("prepare to register model: \n", prepare_to_register_model)

# Register the model in the workspace from the pipeline job output.
registered_model = workspace_ml_client.models.create_or_update(prepare_to_register_model)
print("registered model: \n", registered_model)


pipeline job outputs:  {'imported_model': <azure.ai.ml.entities._job.pipeline._io.base.PipelineOutput object at 0x7f3684b8fa30>}
path to register model:  azureml://jobs/maroon_seed_d4d52sft59/outputs/imported_model
prepare to register model: 
 description: unitary-toxic-bert imported from Huggingface
name: unitary-toxic-bert
path: azureml://jobs/maroon_seed_d4d52sft59/outputs/imported_model
properties: {}
tags: {}
type: mlflow_model
version: '1679638111'

registered model: 
 creation_context:
  created_at: '2023-03-24T06:09:11.849047+00:00'
  created_by: Manoj Bableshwar
  created_by_type: User
  last_modified_at: '2023-03-24T06:09:11.849047+00:00'
  last_modified_by: Manoj Bableshwar
  last_modified_by_type: User
description: unitary-toxic-bert imported from Huggingface
flavors:
  hftransformers:
    code: ''
    hf_pretrained_class: AutoModelForSequenceClassification
    huggingface_id: unitary/toxic-bert
    model_data: data
    pytorch_version: 1.11.0
    task_type: text-classifica

In [19]:
import time, sys
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment

# Create the online endpoint. Endpoint names need to be unique in a region, hence the
# timestamp suffix.
# NOTE(review): the "emotion-" prefix looks inherited from another sample; it is kept
# as-is so that the resource name produced by this notebook does not change.
online_endpoint_name = "emotion-" + timestamp

# The description now reflects what is actually deployed: the model registered above,
# imported from Huggingface as a text-classification model (it is not fine tuned here).
endpoint = ManagedOnlineEndpoint(
    name=online_endpoint_name,
    description="Online endpoint for " + registered_model.name + ", imported from Huggingface for text classification",
    auth_mode="key",
)
# Start endpoint creation and wait on the returned poller.
workspace_ml_client.begin_create_or_update(endpoint).wait()

In [21]:
# Create a deployment of the registered model behind the online endpoint.
demo_deployment = ManagedOnlineDeployment(
    name="demo1",
    endpoint_name=online_endpoint_name,
    model=registered_model.id,
    instance_type="Standard_DS4_v2",
    instance_count=1,
)
workspace_ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()

# Route all traffic to the deployment that was just created. The key must be the
# deployment's own name, so it is taken from the object instead of being retyped
# (the previous literal "demo" did not match the deployment name "demo1").
endpoint.traffic = {demo_deployment.name: 100}
workspace_ml_client.begin_create_or_update(endpoint).result()

Check: endpoint emotion-1679638111 exists
data_collector is not a known attribute of class <class 'azure.ai.ml._restclient.v2022_02_01_preview.models._models_py3.ManagedOnlineDeployment'> and will be ignored


.......................