# How to create a Safety-Enabled LLaMa Online Endpoint
## This notebook will walk you through the steps to create a Safety-Enabled LLaMa Online Endpoint.
### The steps are:
1. Create an Azure AI Content Safety resource to moderate the requests from users and the responses from the LLaMa Online Endpoint
2. Create a new LLaMa Online Endpoint
3. Create a new Safety-Enabled LLaMa Online Endpoint with a custom score.py file which will be used to moderate the request and response

### 1. Prerequisites
#### 1.1 Install Dependencies

In [None]:
%pip install azure-identity==1.14.0b1
%pip install azure-mgmt-cognitiveservices==13.4.0
%pip install --pre azure-ai-ml 
%pip install --pre azure-mgmt-msi
%pip install --pre azure-mgmt-authorization

#### 1.2 Assign variables for the workspace and deployment

In [2]:
# Workspace coordinates. Replace the placeholders before running: they are
# written to a local config file when no existing workspace config can be loaded.
subscription_id = "<SUBSCRIPTION_ID>"
resource_group = "<RESOURCE_GROUP>"
workspace_name = "<AML_WORKSPACE_NAME>"

#### 1.3 Decide on a name for your Safety LLaMa Online Endpoint

In [None]:
# Optional alternative: uncomment the three lines below to generate a
# randomized endpoint name instead of using the fixed one.
# import random
# rand = random.randint(0, 10000)
# endpoint_name = f"safetyllama{rand}"
# The name is also the prefix of the derived resources created later
# ("<name>-aacs", "<name>-llama", "<name>-uai").
endpoint_name = "<ONLINE-ENDPOINT-NAME>" # the final endpoint name of the safety enabled llama endpoint
print(endpoint_name)

### 2. Connect to your AML Workspace

In [None]:
import json
import os
from azure.ai.ml import MLClient, Input, Output
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential

# Authenticate: prefer the default credential chain and prove it can really
# obtain a management-plane token; otherwise sign in through the browser.
try:
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception:
    credential = InteractiveBrowserCredential()

# Connect to the workspace. If no usable config is found, write one from the
# variables assigned earlier in the notebook and load the client from it.
try:
    ml_client = MLClient.from_config(credential=credential)
except Exception:
    config_path = "../.azureml/config.json"
    client_config = {
        "subscription_id": subscription_id,
        "resource_group": resource_group,
        "workspace_name": workspace_name,
    }
    os.makedirs(os.path.dirname(config_path), exist_ok=True)
    with open(config_path, "w") as fo:
        json.dump(client_config, fo)
    ml_client = MLClient.from_config(credential=credential, path=config_path)

print(ml_client)

# From here on, use the values the client actually resolved.
subscription_id = ml_client.subscription_id
resource_group_name = ml_client.resource_group_name
workspace_name = ml_client.workspace_name
workspace_location = ml_client.workspaces.get(workspace_name).location
print(
    workspace_location,
    subscription_id,
    resource_group_name,
    workspace_name,
    sep="\n",
)


### 4. Create Azure AI Content Safety

#### 4.1 Choose a region for your Content Safety
Currently, Azure AI Content Safety is only available in the following regions:
- East US
- West Europe
- Central US EUAP

In [5]:
# Region used as the `location` of the Content Safety account created in the next cell.
aacs_location = "east us" # please choose the nearest location to your workspace
print(f"will create aacs in {aacs_location}")

will create aacs in east us


In [None]:
from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient
from azure.identity import DefaultAzureCredential, InteractiveBrowserCredential
from azure.mgmt.cognitiveservices.models import Account, Sku, AccountProperties
import time

# Obtain a credential: try the default chain first and verify it can fetch a
# management-plane token, otherwise fall back to interactive browser sign-in.
try:
    credential = DefaultAzureCredential()
    # Check if given credential can get token successfully.
    credential.get_token("https://management.azure.com/.default")
except Exception:
    # Fall back to InteractiveBrowserCredential in case DefaultAzureCredential not work
    credential = InteractiveBrowserCredential()

client = CognitiveServicesManagementClient(credential, subscription_id)

# Create a new Cognitive Services account of kind "ContentSafety".
# The account name is derived from the endpoint name chosen earlier and is
# also used as the custom sub-domain name.
name_of_aacs = f"{endpoint_name}-aacs"
kind = "ContentSafety"
sku_name = "S0"
parameters = Account(
    sku=Sku(name=sku_name),
    kind=kind,
    location=aacs_location,
    properties=AccountProperties(
        custom_sub_domain_name=name_of_aacs, public_network_access="Enabled"
    ),
)
# How many seconds to wait between checking the status of an async operation.
wait_time = 10

poller = client.accounts.begin_create(resource_group_name, name_of_aacs, parameters)
while not poller.done():
    print("Waiting {wait_time} seconds for operation to finish.".format(wait_time=wait_time))
    time.sleep(wait_time)

# Fetch the result AFTER the polling loop so it always runs, even when the
# operation had already finished before the first check.
# This will raise an exception if the server responded with an error.
result = poller.result()

print("Resource created.")

# Read back the account to obtain the endpoint URL and resource id that the
# later cells use (environment variables and role-assignment scope).
aacs = client.accounts.get(resource_group_name, name_of_aacs)
aacs_endpoint = aacs.properties.endpoint
aacs_resource_id = aacs.id
print(aacs_endpoint)
print(aacs_resource_id)

### 5. Create LLaMa Online endpoint

#### 5.1 Decide on SKU and instance count for the LLaMa Online endpoint.

In [None]:
# Sizing and naming of the backing LLaMa endpoint; its name is derived from
# the endpoint name chosen earlier by appending "-llama".
compute_sku_for_llama="Standard_DS5_v2" # the sku of the compute instance for LLaMa endpoint
compute_instance_count_for_llama=1 # the number of compute instances
llama_endpoint_name=f"{endpoint_name}-llama"
print(f"Will create LLaMa endpoint {llama_endpoint_name} using {compute_instance_count_for_llama} {compute_sku_for_llama} compute instance(s)")

#### 5.2 Check if LLaMa model is available in the aml registry.

In [None]:
model_name = "gpt2" # TODO(mingtwan) change to LLaMa
auth_mode_for_llama = "aml_token" # we will use AML token for authentication

# Look the model up in the shared "azureml" registry.
registry_ml_client = MLClient(credential, registry_name="azureml")
version_list = list(registry_ml_client.models.list(model_name))

# foundation_model stays None when the registry returns no versions.
foundation_model = None
if version_list:
    # Use the first version the registry listing returned.
    model_version = version_list[0].version
    foundation_model = registry_ml_client.models.get(model_name, model_version)
    message = "\n\nUsing model name: {0}, version: {1}, id: {2} for inferencing"
    print(
        message.format(
            foundation_model.name, foundation_model.version, foundation_model.id
        )
    )
else:
    print("Model not found in registry")

#### 5.3 Create LLaMa Online endpoint
This step may take a few minutes.

In [None]:
from azure.ai.ml.entities import ManagedOnlineEndpoint, ManagedOnlineDeployment, OnlineRequestSettings

# Fail early with a clear message instead of creating an endpoint and then
# hitting an AttributeError on `foundation_model.id` further down.
if foundation_model is None:
    raise ValueError(
        f"Model '{model_name}' was not found in the registry; cannot create the LLaMa deployment."
    )

# Create the online endpoint and block until the operation completes.
llama_endpoint = ManagedOnlineEndpoint(
    name=llama_endpoint_name,
    description="Online endpoint for LLaMa",
    auth_mode=auth_mode_for_llama,
)
ml_client.begin_create_or_update(llama_endpoint).result()

# Create the deployment that serves the registry model on that endpoint.
deployment_name = "demo"
demo_deployment = ManagedOnlineDeployment(
    name=deployment_name,
    endpoint_name=llama_endpoint_name,
    model=foundation_model.id,
    instance_type=compute_sku_for_llama,
    instance_count=compute_instance_count_for_llama,
    request_settings=OnlineRequestSettings(
        request_timeout_ms=60000,
    ),
)
ml_client.online_deployments.begin_create_or_update(demo_deployment).wait()

# Route 100% of the traffic to the new deployment. Wait for the update to
# finish so the endpoint is not still updating when it is read back below or
# modified again by later cells.
llama_endpoint.traffic = {deployment_name: 100}
ml_client.online_endpoints.begin_create_or_update(llama_endpoint).result()

llama_endpoint = ml_client.online_endpoints.get(name=llama_endpoint_name)
print(llama_endpoint)

### 6. Create `score.py` for the Safety enabled LLaMa endpoint


#### 6.1 Create a folder to save the score.py and conda dependencies file.
First create a source folder for the score.py file and conda dependencies file:

In [None]:
import os

# Local folder that will hold the scoring script and the conda dependencies file.
scoring_src_dir = "./safety-llama"

# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(name=scoring_src_dir, exist_ok=True)

print(f"Scoring script directory: {scoring_src_dir}")

#### 6.2 Create the score.py

#### 6.3 Create the conda.yaml

In [None]:
%%writefile {scoring_src_dir}/conda.yaml
name: aacs-conda
channels:
  - defaults
dependencies:
  - python=3.9
  - pip:
    - azure-identity==1.14.0b1
    - azure-ai-ml==1.8.0
    - azureml-inference-server-http==0.8.4


### 7. Create a Managed Identity for the safety enabled LLaMa endpoint
As you can see in the steps above, we set `auth_mode` to `aml_token` for the LLaMa online endpoint, and Content Safety supports token-based authentication by default. This means we need to create a managed identity for the safety-enabled LLaMa endpoint so that it can access both Azure AI Content Safety and the LLaMa endpoint.

#### 7.1 Decide on the name of your user identity:

In [None]:
# Name of the user-assigned identity (UAI), derived from the endpoint name
# by appending "-uai".
uai_name = f"{endpoint_name}-uai"
print(f"Will create UAI {uai_name}")

#### 7.2 Get a handle to the ManagedServiceIdentityClient

In [19]:
from azure.mgmt.msi import ManagedServiceIdentityClient
from azure.mgmt.msi.models import Identity

# Same credential strategy as before: default chain first (verified by
# requesting a management-plane token), interactive browser login otherwise.
try:
    credential = DefaultAzureCredential()
    credential.get_token("https://management.azure.com/.default")
except Exception:
    credential = InteractiveBrowserCredential()

# Client used by the following cells to create and read the user-assigned identity.
msi_client = ManagedServiceIdentityClient(
    credential=credential,
    subscription_id=subscription_id,
)

#### 7.2 Create the user identity:

In [None]:
# Create (or update) the user-assigned identity in the workspace's region.
uai_parameters = Identity(location=workspace_location)
msi_client.user_assigned_identities.create_or_update(
    resource_group_name=resource_group_name,
    resource_name=uai_name,
    parameters=uai_parameters,
)

#### 7.3 Retrieve the identity object
We need to retrieve the identity object so that we can use it to configure the Safety-Enabled LLaMa Online Endpoint.

In [None]:
# Read the identity back and keep the three identifiers later cells need:
# the principal id (role assignments / allow-list tag), the client id
# (passed to the scoring script) and the ARM resource id (endpoint identity).
uai_identity = msi_client.user_assigned_identities.get(
    resource_group_name=resource_group_name, resource_name=uai_name
)
uai_principal_id, uai_client_id, uai_id = (
    uai_identity.principal_id,
    uai_identity.client_id,
    uai_identity.id,
)

print(f"UAI principal id: {uai_principal_id}")
print(f"UAI client id: {uai_client_id}")
print(f"UAI id: {uai_id}")

### 8. Give Access permission to the Managed Identity we created above.
Note: To run the scripts in this step successfully, you must have Owner permission on the AACS resource and on the LLaMa endpoint that we created in the previous steps.

#### 8.1 Grant the user identity access to the LLaMa endpoint by updating LLaMa online endpoint's tags

In [None]:
# Allow-list the user-assigned identity on the LLaMa endpoint through its tags.
llama_endpoint = ml_client.online_endpoints.get(name=llama_endpoint_name)
# Merge into the existing tags instead of replacing them, so tags already
# present on the endpoint are not silently dropped.
llama_endpoint.tags = {
    **(llama_endpoint.tags or {}),
    "AllowlistedObjectIds": uai_principal_id,
}
# Wait for the update to finish so the endpoint is not left mid-update when
# later cells read or modify it.
ml_client.online_endpoints.begin_create_or_update(llama_endpoint).result()

#### 8.2 Get an AuthorizationManagementClient to list Role Definitions

In [23]:
import uuid

from azure.mgmt.authorization import AuthorizationManagementClient
from azure.mgmt.authorization.v2018_01_01_preview.models import RoleDefinition
from azure.mgmt.authorization.v2020_10_01_preview.models import RoleAssignmentCreateParameters

# Two clients pinned to different API versions: one is used to list role
# definitions, the other to create role assignments.
role_definition_client = AuthorizationManagementClient(
    api_version="2018-01-01-preview",
    credential=credential,
    subscription_id=subscription_id,
)
role_assignment_client = AuthorizationManagementClient(
    api_version="2020-10-01-preview",
    credential=credential,
    subscription_id=subscription_id,
)

#### 8.3 Grant the user identity access to the Content Safety
The Cognitive Services User role is required to access Content Safety.

In [None]:
# Grant the user-assigned identity the "Cognitive Services User" role,
# scoped to the Content Safety account created earlier.
role_name = "Cognitive Services User"
scope = aacs_resource_id

# Look the role definition up by display name. A default is passed to next()
# so a missing role yields a clear error instead of a bare StopIteration.
role_defs = role_definition_client.role_definitions.list(scope=scope)
role_def = next((r for r in role_defs if r.role_name == role_name), None)
if role_def is None:
    raise ValueError(f"Role definition '{role_name}' was not found at scope '{scope}'.")

# Each run uses a fresh random GUID as the assignment name.
role_assignment_client.role_assignments.create(
    scope=scope,
    role_assignment_name=str(uuid.uuid4()),
    parameters=RoleAssignmentCreateParameters(
        role_definition_id=role_def.id,
        principal_id=uai_principal_id,
        principal_type="ServicePrincipal",
    ),
)

#### 8.4 Assign AcrPull at the workspace container registry scope
Since we will create the safety-enabled LLaMa endpoint with a User Assigned Identity, that managed identity must have the Storage Blob Data Reader role on the workspace's storage account and the AcrPull role on the workspace's Azure Container Registry (ACR). Make sure your User Assigned Identity has both permissions.

In [None]:
# Grant the user-assigned identity the "AcrPull" role on the container
# registry attached to the workspace.
workspace = ml_client.workspaces.get(workspace_name)
container_registry = workspace.container_registry
# NOTE(review): presumably a workspace can exist without an attached
# registry; stop with a clear message rather than passing an empty scope on.
if not container_registry:
    raise ValueError(
        f"Workspace '{workspace_name}' has no container registry to assign AcrPull on."
    )

role_name = "AcrPull"
acr_scope = container_registry

# Look the role definition up by display name. A default is passed to next()
# so a missing role yields a clear error instead of a bare StopIteration.
role_defs = role_definition_client.role_definitions.list(scope=acr_scope)
role_def = next((r for r in role_defs if r.role_name == role_name), None)
if role_def is None:
    raise ValueError(f"Role definition '{role_name}' was not found at scope '{acr_scope}'.")

role_assignment_client.role_assignments.create(
    scope=acr_scope,
    role_assignment_name=str(uuid.uuid4()),
    parameters=RoleAssignmentCreateParameters(
        role_definition_id=role_def.id,
        principal_id=uai_principal_id,
        principal_type="ServicePrincipal",
    ),
)
print("Role assignment for AcrPull at the workspace container registry completed.")

In [None]:
# Grant the user-assigned identity the "Storage Blob Data Reader" role on the
# workspace's storage account (`workspace` is fetched in the preceding cell).
role_name = "Storage Blob Data Reader"
blob_scope = workspace.storage_account

# Look the role definition up by display name. A default is passed to next()
# so a missing role yields a clear error instead of a bare StopIteration.
role_defs = role_definition_client.role_definitions.list(scope=blob_scope)
role_def = next((r for r in role_defs if r.role_name == role_name), None)
if role_def is None:
    raise ValueError(f"Role definition '{role_name}' was not found at scope '{blob_scope}'.")

role_assignment_client.role_assignments.create(
    scope=blob_scope,
    role_assignment_name=str(uuid.uuid4()),
    parameters=RoleAssignmentCreateParameters(
        role_definition_id=role_def.id,
        principal_id=uai_principal_id,
        principal_type="ServicePrincipal",
    ),
)
print("Role assignment for `Storage Blob Data Reader` at the workspace storage account completed.")

### 9. Create Safety-Enabled LLaMa Online Endpoint using above score.py

#### 9.1 Decide on SKU and instance count for the Safety-Enabled LLaMa Online Endpoint.

In [27]:
# Instance type and instance count used for the deployment of the
# safety-enabled endpoint created in the next cell.
compute_sku_for_safety_proxy = "Standard_DS5_v2"
compute_count = 1

#### 9.2 Create the Safety-Enabled LLaMa Online Endpoint
This step may take a few minutes.

In [None]:
# environment variables that will be used in the scoring script
env_key_of_aacs_endpoint = "AACS_ENDPOINT"
env_key_of_llama_score_uri = "LLAMA_SCORE_URI"
env_key_of_uai_id = "UAI_CLIENT_ID"
env_key_of_subscription_id = "SUBSCRIPTION_ID"
env_key_of_resource_group_name = "RESOURCE_GROUP_NAME"
env_key_of_workspace_name = "WORKSPACE_NAME"

from azure.ai.ml.entities import (
    ManagedOnlineDeployment,
    ManagedOnlineEndpoint,
    CodeConfiguration,
    Environment,
    ManagedIdentityConfiguration,
    IdentityConfiguration
)

llama_score_uri = llama_endpoint.scoring_uri
if not llama_score_uri:
    raise Exception("LLaMa Endpoint has no scoring uri.")
else:
    print(f"LLaMa Endpoint scoring uri: {llama_score_uri}")

if not aacs_endpoint:
    raise Exception("AACS Endpoint is not valid.")
else:
    print(f"AACS Endpoint: {aacs_endpoint}")
 
deployment = ManagedOnlineDeployment(
        name="blue",
        endpoint_name=endpoint_name,
        code_configuration=CodeConfiguration(
            code=f"{scoring_src_dir}", scoring_script="score.py"
        ),
        environment=Environment(
            conda_file=f"{scoring_src_dir}/conda.yaml",
            image="mcr.microsoft.com/azureml/openmpi4.1.0-ubuntu20.04:latest",
        ),
        instance_type=compute_sku_for_safety_proxy,
        instance_count=compute_count,
        environment_variables={
            env_key_of_uai_id: uai_client_id,
            env_key_of_aacs_endpoint: aacs_endpoint,
            env_key_of_llama_score_uri: llama_score_uri,
            env_key_of_subscription_id: subscription_id,
            env_key_of_resource_group_name: resource_group_name,
            env_key_of_workspace_name: workspace_name
        },
    )


endpoint = ManagedOnlineEndpoint(
        name=endpoint_name,
        auth_mode="key",
        identity=IdentityConfiguration(
            type="user_assigned",
            user_assigned_identities=[
                ManagedIdentityConfiguration(resource_id=uai_id)
            ],
        ),
    )
# create online endpoint
ml_client.online_endpoints.begin_create_or_update(endpoint).result()

endpoint = ml_client.online_endpoints.get(endpoint_name)
print(endpoint.identity.type)
print(endpoint.identity.user_assigned_identities)

# create deployment
ml_client.online_deployments.begin_create_or_update(deployment).result()
# check status
deployment = ml_client.online_deployments.get(
        endpoint_name=endpoint_name, name=deployment.name
    )
print(deployment)
# Set traffic to 100% for deployment
endpoint.traffic = {str(deployment.name): 100}
ml_client.begin_create_or_update(endpoint).result()


### 10. Test the Safety Enabled LLaMa online endpoint.

#### 10.1 Prepare sample request

In [31]:
import os

# Local folder that will hold the sample request used to test the endpoint.
test_src_dir = "./safety-llama-test"

# exist_ok=True keeps this cell re-runnable when the folder already exists.
os.makedirs(name=test_src_dir, exist_ok=True)

print(f"test script directory: {test_src_dir}")

test script directory: ./safety-llama-test


In [34]:
%%writefile {test_src_dir}/sample-request.json
{"data": "Hello World"}

Overwriting ./safety-llama-test/sample-request.json


In [None]:
# Send the sample request file to the safety-enabled endpoint; the response
# is the cell's output.
sample_data = f"{test_src_dir}/sample-request.json"
ml_client.online_endpoints.invoke(
    endpoint_name=endpoint_name,
    request_file=sample_data,
)