In [None]:
%%writefile deploymentconfig.json
{
    "computeType": "aks",
    "containerResourceRequirements": {"cpu": 1, "memoryInGB": 4},
    "gpuCores": 1
}

In [None]:
%%writefile bidaf_utils.py
"""bidaf_utils.py

Pre- and post-processing helpers for use with the Bi-directional Attention Flow model from the ONNX model zoo.
https://github.com/onnx/models/tree/master/text/machine_comprehension/bidirectional_attention_flow
"""

import json
import nltk
import numpy as np
import os

from nltk import word_tokenize
from utils import get_model_info, parse_model_http, triton_init, triton_infer
from tritonclientutils import triton_to_np_dtype


def preprocess(text, dtype):
    """Tokenizes text for use in the bidirectional attention flow model

    Parameters
    ---------
    text : str
        Text to be tokenized

    dtype : numpy datatype
        Datatype of the resulting arrays

    Returns
    ---------
    (np.array(), np.array())
        Tuple containing two numpy arrays: the lower-cased word tokens with
        shape (seq, 1), and the characters of each token (truncated or padded
        with "" to 16 entries) with shape (seq, 1, 1, 16).

    From: https://github.com/onnx/models/tree/master/text/machine_comprehension/bidirectional_attention_flow  # noqa
    """
    # Only fetch the tokenizer data when it is not installed yet, so that
    # repeated calls do not trigger a download attempt every time.
    try:
        nltk.data.find("tokenizers/punkt")
    except LookupError:
        nltk.download("punkt")

    tokens = word_tokenize(text)
    # split into lower-case word tokens, in numpy array with shape of (seq, 1)
    words = np.array([w.lower() for w in tokens], dtype=dtype).reshape(-1, 1)
    # split words into chars, in numpy array with shape of (seq, 1, 1, 16)
    chars = [list(t[:16]) for t in tokens]
    chars = [cs + [""] * (16 - len(cs)) for cs in chars]
    chars = np.array(chars, dtype=dtype).reshape(-1, 1, 1, 16)
    return words, chars


def postprocess(context_words, answer):
    """Pick the answer span out of the tokenized context.

    Parameters
    --------
    context_words : np.array
        Tokenized context. It is sliced by position and flattened with
        ``reshape(-1)``, so it must be array-like rather than a plain string.

    answer : InferResult
        Triton inference result whose "start_pos" and "end_pos" outputs
        hold the first and last (inclusive) position of the answer

    Returns
    --------
    list of bytes
        The context words from start to end (inclusive), each encoded
        with ``str.encode()``.
    """
    start = answer.as_numpy("start_pos")[0]
    end = answer.as_numpy("end_pos")[0]
    print(f"start is {start}, end is {end}")

    # end is inclusive, hence the + 1 on the slice bound
    answer_span = context_words[start : end + 1].reshape(-1)
    encoded_words = []
    for word in answer_span:
        encoded_words.append(word.encode())
    return encoded_words

In [None]:
%%writefile model_utils.py
"""download_models

Downloads models needed for Triton example notebooks.
"""
import os
import urllib
from azure.storage.blob import BlobClient


# Models handled by download_triton_models / delete_triton_models; each name
# is also used to build the download URL and the local folder name.
model_names = ["densenet_onnx", "bidaf-9"]


def download_triton_models(prefix):
    """Download every model listed in ``model_names`` below ``prefix``.

    Parameters
    ----------
    prefix : path object
        Root directory; it is passed to ``_generate_paths``, which calls
        ``joinpath`` on it (e.g. a ``pathlib.Path``).
    """
    for model in model_names:
        target_folder, target_file = _generate_paths(model, prefix)
        _download_model(
            target_file, target_folder, f"https://aka.ms/{model}-model"
        )
        print(f"successfully downloaded model: {model}")


def delete_triton_models(prefix):
    """Delete the downloaded model file of every model in ``model_names``.

    A model file that does not exist is reported and otherwise ignored.

    Parameters
    ----------
    prefix : path object
        Root directory that was used when the models were downloaded.
    """
    for model in model_names:
        target_file = _generate_paths(model, prefix)[1]
        try:
            os.remove(target_file)
        except FileNotFoundError:
            print(f"model: {model} was already deleted")
        else:
            print(f"successfully deleted model: {model}")


def _download_model(model_file_path, folder_path, url):
    """Download the model blob behind ``url`` unless it already exists locally.

    Parameters
    ----------
    model_file_path : path-like
        Destination file for the model.
    folder_path : path-like
        Directory to create (if needed) before writing the model file.
    url : str
        URL to open; the URL that is finally retrieved is handed to
        ``BlobClient.from_blob_url``.
    """
    # Imported here so that the ``urllib.request`` submodule is guaranteed
    # to be loaded; a bare ``import urllib`` does not import it.
    import urllib.request

    # Nothing to do (and no network traffic needed) when the model is cached.
    if os.path.exists(model_file_path):
        return

    # response.url is the URL actually retrieved, i.e. after any redirects.
    with urllib.request.urlopen(url) as response:
        blob_url = response.url

    blob_client = BlobClient.from_blob_url(blob_url)

    # Fetch the data before creating the file so that a failed download
    # does not leave an empty file that later looks like a cached model.
    model_bytes = blob_client.download_blob().readall()

    os.makedirs(folder_path, exist_ok=True)
    with open(model_file_path, "wb") as my_blob:
        my_blob.write(model_bytes)


def _generate_paths(model, prefix):
    """Return the folder and file path where a model is stored.

    Parameters
    ----------
    model : str
        Model name, used as a directory name.
    prefix : path object
        Root directory; must provide ``joinpath`` (e.g. ``pathlib.Path``).

    Returns
    -------
    (folder_path, model_file_path)
        ``<prefix>/models/triton/<model>/1`` and the ``model.onnx`` file
        inside that folder.
    """
    folder_path = prefix.joinpath("models", "triton", model, "1")
    # Join onto folder_path, not prefix: folder_path already contains the
    # prefix, so joining it onto a relative prefix would duplicate it.
    model_file_path = folder_path.joinpath("model.onnx")
    return folder_path, model_file_path

In [None]:
%%writefile onnxruntimetriton.py
"""
onnxruntimetriton

Offers the class InferenceSession which can be used as a drop-in replacement for the ONNX Runtime
session object.

"""

import tritonclient.http as tritonhttpclient
import numpy as np


class NodeArg:
    """Name and shape of one model input or output."""

    def __init__(self, name, shape):
        self.name, self.shape = name, shape


class InferenceSession:
    """Stand-in for the ONNX Runtime session object that runs models on Triton.

    The session talks to a Triton server at ``localhost:8000`` over HTTP and
    reads the model's input/output names, shapes and datatypes from the
    server's model metadata.

    Parameters
    ----------
    path_or_bytes : str
        Name of the model on the Triton server.
    sess_options
        Accepted for signature compatibility; not used.
    providers
        Accepted for signature compatibility; not used.
    """

    def __init__(self, path_or_bytes, sess_options=None, providers=None):
        self.client = tritonhttpclient.InferenceServerClient("localhost:8000")
        model_metadata = self.client.get_model_metadata(
            model_name=path_or_bytes
        )

        # Used as the id of the next request; incremented by run().
        self.request_count = 0
        self.model_name = path_or_bytes
        self.inputs = []
        self.outputs = []
        # Maps each input/output name to its Triton datatype string.
        self.dtype_mapping = {}

        for (src, dest) in (
            (model_metadata["inputs"], self.inputs),
            (model_metadata["outputs"], self.outputs),
        ):
            for element in src:
                dest.append(NodeArg(element["name"], element["shape"]))
                self.dtype_mapping[element["name"]] = element["datatype"]

        self.triton_enabled = True

    def get_inputs(self):
        """Return the list of NodeArg objects describing the model inputs."""
        return self.inputs

    def get_outputs(self):
        """Return the list of NodeArg objects describing the model outputs."""
        return self.outputs

    def run(self, output_names, input_feed, run_options=None):
        """Run one inference request on the Triton server.

        Parameters
        ----------
        output_names : list of str
            Names of the outputs to request.
        input_feed : dict
            Maps input names to numpy arrays. Each array gets a leading
            axis of length 1 before it is sent (presumably the batch
            dimension).
        run_options
            Accepted for signature compatibility; not used.

        Returns
        -------
        list
            One numpy array per entry of ``output_names``, in that order.
        """
        inputs = []
        for key, val in input_feed.items():
            val = np.expand_dims(val, axis=0)
            infer_input = tritonhttpclient.InferInput(
                key, val.shape, self.dtype_mapping[key]
            )
            infer_input.set_data_from_numpy(val)
            inputs.append(infer_input)

        outputs = [
            tritonhttpclient.InferRequestedOutput(output_name)
            for output_name in output_names
        ]

        # Give every request its own id: the first one is "0", then "1", ...
        request_id = str(self.request_count)
        self.request_count += 1

        res = self.client.infer(
            self.model_name,
            inputs,
            request_id=request_id,
            outputs=outputs,
        )
        return [res.as_numpy(output_name) for output_name in output_names]

In [None]:
%%writefile score_densenet.py
import io
import numpy as np
import os

from azureml.core import Model
from azureml.contrib.services.aml_request import rawhttp
from azureml.contrib.services.aml_response import AMLResponse
from PIL import Image
from utils import get_model_info, parse_model_http, triton_init, triton_infer
from onnxruntimetriton import InferenceSession


def preprocess(img, scaling):
    """Pre-process an image to meet the size, type and format
    requirements of the DenseNet model.

    The image is converted to RGB, resized to 224x224 with bilinear
    resampling, cast to float32, optionally scaled, and transposed from
    HWC to CHW.

    Parameters
    ----------
    img : PIL.Image.Image
        Image to pre-process.
    scaling : str
        "INCEPTION" maps the pixel values with ``x / 128 - 1``, "VGG"
        subtracts the per-channel means (123, 117, 104); any other value
        leaves the pixel values unscaled.

    Returns
    -------
    np.ndarray
        float32 array of shape (3, 224, 224).
    """
    c = 3
    h = 224
    w = 224
    layout = "FORMAT_NCHW"
    # dtype of the pixel data and of the VGG mean constants
    npdtype = np.float32

    if c == 1:
        sample_img = img.convert("L")
    else:
        sample_img = img.convert("RGB")

    resized_img = sample_img.resize((w, h), Image.BILINEAR)
    resized = np.array(resized_img)
    if resized.ndim == 2:
        resized = resized[:, :, np.newaxis]

    typed = resized.astype(npdtype)

    if scaling == "INCEPTION":
        scaled = (typed / 128) - 1
    elif scaling == "VGG":
        if c == 1:
            scaled = typed - np.asarray((128,), dtype=npdtype)
        else:
            scaled = typed - np.asarray((123, 117, 104), dtype=npdtype)
    else:
        scaled = typed

    # Swap to CHW if necessary
    if layout == "FORMAT_NCHW":
        ordered = np.transpose(scaled, (2, 0, 1))
    else:
        ordered = scaled

    # Channels are in RGB order. Currently model configuration data
    # doesn't provide any information as to other channel orderings
    # (like BGR) so we just assume RGB.
    return ordered


def postprocess(output_array):
    """Turn the model output into a "<index> : <label>" string.

    Only the first element of ``output_array`` is used. The label is looked
    up in the module-level ``label_dict``.
    """
    scores = output_array[0]
    max_label = np.argmax(scores)
    final_label = label_dict[max_label]
    return f"{max_label} : {final_label}"


def init():
    """Create the inference session and load the class labels.

    Sets the module-level ``session`` (an InferenceSession for the
    "densenet_onnx" model) and ``label_dict`` (line index -> label text,
    read from ``densenet_labels.txt`` below ``$AZUREML_MODEL_DIR``).
    """
    global session, label_dict
    session = InferenceSession(path_or_bytes="densenet_onnx")

    label_path = os.path.join(
        os.environ["AZUREML_MODEL_DIR"],
        "models",
        "triton",
        "densenet_onnx",
        "densenet_labels.txt",
    )
    # Context manager so the file handle is closed as soon as it is read.
    with open(label_path, "r") as label_file:
        labels = label_file.read().split("\n")
    label_dict = dict(enumerate(labels))


@rawhttp
def run(request):
    """Handle one scoring request of the webservice.

    A POST request body is read as image bytes, pre-processed with
    "INCEPTION" scaling, sent to the model through the module-level
    ``session`` and answered with the predicted label. Any other HTTP
    method is answered with "bad request" and status 500.

    The names of the model inputs and outputs are taken from the session,
    which reads them from the model metadata.
    """
    if request.method != "POST":
        return AMLResponse("bad request", 500)

    output_names = [node.name for node in session.get_outputs()]
    input_name = session.get_inputs()[0].name

    body_bytes = request.get_data(False)
    image = Image.open(io.BytesIO(body_bytes))
    image_data = preprocess(image, scaling="INCEPTION")

    raw_result = session.run(output_names, {input_name: image_data})
    return AMLResponse(postprocess(output_array=raw_result), 200)

In [None]:
%%writefile test_service.py
"""test_service.py

Sends a specified image from the data directory to a deployed ML model
and returns the result.
"""

import argparse
import os
import requests

from azureml.core.webservice import AksWebservice
from azureml.core.workspace import Workspace


if __name__ == "__main__":
    # Command-line entry point: post one data file to a deployed AKS
    # endpoint and print the response text.
    parser = argparse.ArgumentParser(description="Test a deployed endpoint.")
    parser.add_argument(
        "--endpoint_name",
        type=str,
        default="triton-densenet-onnx",
        help="name of the endpoint to test",
    )
    parser.add_argument(
        "--data_file",
        type=str,
        default="../../data/raw/triton/peacock.jpg",
        help="filename to run through the classifier",
    )
    args = parser.parse_args()

    ws = Workspace.from_config()
    aks_service = AksWebservice(ws, args.endpoint_name)

    # Fetch the service keys and send the first one as a bearer token
    # (this assumes key auth is enabled on the service).
    key1, _ = aks_service.get_keys()

    headers = {
        "Content-Type": "application/octet-stream",
        "Authorization": "Bearer " + key1,
    }

    # The data file is resolved relative to <script dir>/../data.
    file_name = os.path.join(
        os.path.abspath(os.path.dirname(__file__)),
        "..",
        "data",
        args.data_file,
    )
    # Read inside a context manager so the file handle is closed promptly.
    with open(file_name, "rb") as sample_file:
        test_sample = sample_file.read()
    resp = requests.post(aks_service.scoring_uri, test_sample, headers=headers)
    print(resp.text)

In [None]:
%%writefile tritonhttpclient.py
# Copyright (c) 2020, NVIDIA CORPORATION. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from geventhttpclient import HTTPClient
from geventhttpclient.url import URL

from urllib.parse import quote, quote_plus
import rapidjson as json
import numpy as np
import gevent
import gevent.pool
import struct

from tritonclientutils import *


def _get_error(response):
    """
    Build the InferenceServerException described by an error response.

    Returns None when the response status code is 200. Otherwise the
    response body is parsed as JSON and its "error" entry becomes the
    exception message; the exception is returned, not raised.
    """
    if response.status_code == 200:
        return None
    error_payload = json.loads(response.read())
    return InferenceServerException(msg=error_payload["error"])


def _raise_if_error(response):
    """
    Raise the exception that _get_error builds for the response, if any.

    Does nothing when _get_error returns None.
    """
    failure = _get_error(response)
    if failure is None:
        return
    raise failure


def _get_query_string(query_params):
    """
    Encode a dict of query parameters as a URL query string.

    Keys and values are escaped with quote_plus. A list value produces one
    "key=item" pair per item. An empty dict gives an empty string.
    """
    pairs = []
    for key, value in query_params.items():
        items = value if isinstance(value, list) else [value]
        pairs.extend(
            "%s=%s" % (quote_plus(key), quote_plus(str(item)))
            for item in items
        )
    return "&".join(pairs)


def _get_inference_request(
    inputs,
    request_id,
    outputs,
    sequence_id,
    sequence_start,
    sequence_end,
    priority,
    timeout,
):
    """
    Build the body of an inference request.

    The JSON part holds the request id (unless it is ""), the tensors of
    the inputs, the tensors of the outputs (when any are given) and the
    parameters: the sequence settings (only when sequence_id is not 0),
    the priority (when not 0) and the timeout (when not None).

    Returns
    -------
    (request_body, json_size)
        When no input provides binary data, request_body is the JSON
        string and json_size is None. Otherwise request_body is the
        encoded JSON followed by the concatenated binary data of the
        inputs, and json_size is the length of the JSON part.
    """
    parameters = {}
    if sequence_id != 0:
        parameters.update(
            sequence_id=sequence_id,
            sequence_start=sequence_start,
            sequence_end=sequence_end,
        )
    if priority != 0:
        parameters["priority"] = priority
    if timeout is not None:
        parameters["timeout"] = timeout

    infer_request = {}
    if request_id != "":
        infer_request["id"] = request_id
    infer_request["inputs"] = [tensor._get_tensor() for tensor in inputs]
    if outputs:
        infer_request["outputs"] = [
            tensor._get_tensor() for tensor in outputs
        ]
    if parameters:
        infer_request["parameters"] = parameters

    json_text = json.dumps(infer_request)
    json_size = len(json_text)

    # Append the raw data of all inputs that carry any, in input order.
    binary_data = None
    for tensor in inputs:
        chunk = tensor._get_binary_data()
        if chunk is None:
            continue
        if binary_data is None:
            binary_data = chunk
        else:
            binary_data += chunk

    if binary_data is None:
        return json_text, None

    layout = "{}s{}s".format(json_size, len(binary_data))
    return struct.pack(layout, json_text.encode(), binary_data), json_size


class InferenceServerClient:
    """An InferenceServerClient object is used to perform any kind of
    communication with the InferenceServer using http protocol.

    Parameters
    ----------
    url : str
        The inference server URL, e.g. 'localhost:8000'.
    connection_count : int
        The number of connections to create for this client.
        Default value is 1.
    connection_timeout : float
        The timeout value for the connection. Default value
        is 60.0 sec.
    network_timeout : float
        The timeout value for the network. Default value is
        60.0 sec
    verbose : bool
        If True generate verbose output. Default value is False.
    max_greenlets : int
        Determines the maximum allowed number of worker greenlets
        for handling asynchronous inference requests. Default value
        is None, which means there will be no restriction on the
        number of greenlets created.

    Raises
        ------
        Exception
            If unable to create a client.

    """

    def __init__(
        self,
        url,
        connection_count=1,
        connection_timeout=60.0,
        network_timeout=60.0,
        verbose=False,
        max_greenlets=None,
    ):
        if not url.startswith("http://") and not url.startswith("https://"):
            url = "http://" + url
        self._parsed_url = URL(url)
        self._base_uri = self._parsed_url.request_uri
        self._client_stub = HTTPClient.from_url(
            self._parsed_url,
            concurrency=connection_count,
            connection_timeout=connection_timeout,
            network_timeout=network_timeout,
        )
        self._pool = gevent.pool.Pool(max_greenlets)
        self._verbose = verbose

    def __enter__(self):
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, type, value, traceback):
        """Close the client when the ``with`` block is left.

        Returns None, so an exception raised inside the block is not
        suppressed.
        """
        self.close()

    def __del__(self):
        """Close the client when the object is finalized."""
        self.close()

    def close(self):
        """Close the client. Any future calls to server
        will result in an Error.

        """
        self._pool.join()
        self._client_stub.close()

    def _get(self, request_uri, headers, query_params):
        """Issues the GET request to the server

         Parameters
        ----------
        request_uri: str
            The request URI to be used in GET request.
        headers: dict
            Additional HTTP headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        geventhttpclient.response.HTTPSocketPoolResponse
            The response from server.
        """
        if self._base_uri is not None:
            request_uri = self._base_uri + "/" + request_uri
        if query_params is not None:
            request_uri = request_uri + "?" + _get_query_string(query_params)

        if self._verbose:
            print("GET {}, headers {}".format(request_uri, headers))

        if headers is not None:
            response = self._client_stub.get(request_uri, headers=headers)
        else:
            response = self._client_stub.get(request_uri)

        if self._verbose:
            print(response)

        return response

    def _post(self, request_uri, request_body, headers, query_params):
        """Issues the POST request to the server

        Parameters
        ----------
        request_uri: str
            The request URI to be used in POST request.
        request_body: str
            The body of the request
        headers: dict
            Additional HTTP headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        geventhttpclient.response.HTTPSocketPoolResponse
            The response from server.
        """
        if self._base_uri is not None:
            request_uri = self._base_uri + "/" + request_uri
        if query_params is not None:
            request_uri = request_uri + "?" + _get_query_string(query_params)

        if self._verbose:
            print(
                "POST {}, headers {}\n{}".format(
                    request_uri, headers, request_body
                )
            )

        if headers is not None:
            response = self._client_stub.post(
                request_uri=request_uri, body=request_body, headers=headers
            )
        else:
            response = self._client_stub.post(
                request_uri=request_uri, body=request_body
            )

        if self._verbose:
            print(response)

        return response

    def is_server_live(self, headers=None, query_params=None):
        """Contact the inference server and get liveness.

        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        bool
            True if server is live, False if server is not live.

        Raises
        ------
        Exception
            If unable to get liveness.

        """

        request_uri = "v2/health/live"
        response = self._get(
            request_uri=request_uri, headers=headers, query_params=query_params
        )

        return response.status_code == 200

    def is_server_ready(self, headers=None, query_params=None):
        """Contact the inference server and get readiness.

        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        bool
            True if server is ready, False if server is not ready.

        Raises
        ------
        Exception
            If unable to get readiness.

        """
        request_uri = "v2/health/ready"
        response = self._get(
            request_uri=request_uri, headers=headers, query_params=query_params
        )

        return response.status_code == 200

    def is_model_ready(
        self, model_name, model_version="", headers=None, query_params=None
    ):
        """Contact the inference server and get the readiness of specified model.

        Parameters
        ----------
        model_name: str
            The name of the model to check for readiness.
        model_version: str
            The version of the model to check for readiness. The default value
            is an empty string which means then the server will choose a version
            based on the model and internal policy.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        bool
            True if the model is ready, False if not ready.

        Raises
        ------
        Exception
            If unable to get model readiness.

        """
        if type(model_version) != str:
            raise_error("model version must be a string")
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}/ready".format(
                quote(model_name), model_version
            )
        else:
            request_uri = "v2/models/{}/ready".format(quote(model_name))

        response = self._get(
            request_uri=request_uri, headers=headers, query_params=query_params
        )

        return response.status_code == 200

    def get_server_metadata(self, headers=None, query_params=None):
        """Contact the inference server and get its metadata.

        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        dict
            The JSON dict holding the metadata.

        Raises
        ------
        InferenceServerException
            If unable to get server metadata.

        """
        request_uri = "v2"
        response = self._get(
            request_uri=request_uri, headers=headers, query_params=query_params
        )
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_model_metadata(
        self, model_name, model_version="", headers=None, query_params=None
    ):
        """Contact the inference server and get the metadata for specified model.

        Parameters
        ----------
        model_name: str
            The name of the model
        model_version: str
            The version of the model to get metadata. The default value
            is an empty string which means then the server will choose
            a version based on the model and internal policy.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the metadata.

        Raises
        ------
        InferenceServerException
            If unable to get model metadata.

        """
        if type(model_version) != str:
            raise_error("model version must be a string")
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}".format(
                quote(model_name), model_version
            )
        else:
            request_uri = "v2/models/{}".format(quote(model_name))

        response = self._get(
            request_uri=request_uri, headers=headers, query_params=query_params
        )
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_model_config(
        self, model_name, model_version="", headers=None, query_params=None
    ):
        """Contact the inference server and get the configuration for specified model.

        Parameters
        ----------
        model_name: str
            The name of the model
        model_version: str
            The version of the model to get configuration. The default value
            is an empty string which means then the server will choose
            a version based on the model and internal policy.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the model config.

        Raises
        ------
        InferenceServerException
            If unable to get model configuration.

        """
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}/config".format(
                quote(model_name), model_version
            )
        else:
            request_uri = "v2/models/{}/config".format(quote(model_name))

        response = self._get(
            request_uri=request_uri, headers=headers, query_params=query_params
        )
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_model_repository_index(self, headers=None, query_params=None):
        """Get the index of model repository contents

        Parameters
        ----------
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the model repository index.

        Raises
        ------
        InferenceServerException
            If unable to get the repository index.

        """
        request_uri = "v2/repository/index"
        response = self._post(
            request_uri=request_uri,
            request_body="",
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def load_model(self, model_name, headers=None, query_params=None):
        """Request the inference server to load or reload specified model.

        Parameters
        ----------
        model_name : str
            The name of the model to be loaded.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to load the model.

        """
        request_uri = "v2/repository/models/{}/load".format(quote(model_name))
        response = self._post(
            request_uri=request_uri,
            request_body="",
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(response)
        if self._verbose:
            print("Loaded model '{}'".format(model_name))

    def unload_model(self, model_name, headers=None, query_params=None):
        """Request the inference server to unload specified model.

        Parameters
        ----------
        model_name : str
            The name of the model to be unloaded.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to unload the model.

        """
        request_uri = "v2/repository/models/{}/unload".format(
            quote(model_name)
        )
        response = self._post(
            request_uri=request_uri,
            request_body="",
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(response)
        if self._verbose:
            print("Loaded model '{}'".format(model_name))

    def get_inference_statistics(
        self, model_name="", model_version="", headers=None, query_params=None
    ):
        """Get the inference statistics for the specified model name and
        version.

        Parameters
        ----------
        model_name : str
            The name of the model to get statistics. The default value is
            an empty string, which means statistics of all models will
            be returned.
        model_version: str
            The version of the model to get inference statistics. The
            default value is an empty string which means then the server
            will return the statistics of all available model versions.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request.
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Returns
        -------
        dict
            The JSON dict holding the model inference statistics.

        Raises
        ------
        InferenceServerException
            If unable to get the model inference statistics.

        """

        if model_name != "":
            if type(model_version) != str:
                raise_error("model version must be a string")
            if model_version != "":
                request_uri = "v2/models/{}/versions/{}/stats".format(
                    quote(model_name), model_version
                )
            else:
                request_uri = "v2/models/{}/stats".format(quote(model_name))
        else:
            request_uri = "v2/models/stats"

        response = self._get(
            request_uri=request_uri, headers=headers, query_params=query_params
        )
        _raise_if_error(response)

        content = response.read()
        if self._verbose:
            print(content)

        return json.loads(content)

    def get_system_shared_memory_status(
        self, region_name="", headers=None, query_params=None
    ):
        """Query the server for the status of system shared memory.

        Parameters
        ----------
        region_name : str
            Region whose status is wanted. With the default empty
            string the aggregate status endpoint is queried instead
            of a region-specific one.
        headers: dict
            Optional additional HTTP headers, forwarded to the GET
            request as-is.
        query_params: dict
            Optional url query parameters, forwarded to the GET
            request as-is.

        Returns
        -------
        dict
            The response body decoded from JSON.

        Raises
        ------
        InferenceServerException
            If unable to get the status of specified shared memory.

        """
        # An empty name selects the endpoint that is not tied to a region.
        if region_name == "":
            status_uri = "v2/systemsharedmemory/status"
        else:
            status_uri = "v2/systemsharedmemory/region/{}/status".format(
                quote(region_name)
            )

        reply = self._get(
            request_uri=status_uri,
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(reply)

        payload = reply.read()
        if self._verbose:
            print(payload)
        return json.loads(payload)

    def register_system_shared_memory(
        self, name, key, byte_size, offset=0, headers=None, query_params=None
    ):
        """Ask the server to register a system shared memory region.

        Parameters
        ----------
        name : str
            Name under which the region is registered; it is
            url-quoted into the request path.
        key : str
            The key of the underlying memory object that contains the
            system shared memory region.
        byte_size : int
            Size of the region, in bytes.
        offset : int
            Offset, in bytes, within the underlying memory object to
            the start of the region. Defaults to zero.
        headers: dict
            Optional additional HTTP headers, forwarded to the POST
            request as-is.
        query_params: dict
            Optional url query parameters, forwarded to the POST
            request as-is.

        Raises
        ------
        InferenceServerException
            If unable to register the specified system shared memory.

        """
        register_uri = "v2/systemsharedmemory/region/{}/register".format(
            quote(name)
        )

        # The region description travels as the JSON body of the POST.
        payload = json.dumps(
            {"key": key, "offset": offset, "byte_size": byte_size}
        )

        reply = self._post(
            request_uri=register_uri,
            request_body=payload,
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(reply)

        if not self._verbose:
            return
        print("Registered system shared memory with name '{}'".format(name))

    def unregister_system_shared_memory(
        self, name="", headers=None, query_params=None
    ):
        """Request the server to unregister a system shared memory with the
        specified name.

        Parameters
        ----------
        name : str
            The name of the region to unregister. The default value is empty
            string which means all the system shared memory regions will be
            unregistered.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to unregister the specified system shared memory region.

        """
        if name != "":
            request_uri = "v2/systemsharedmemory/region/{}/unregister".format(
                quote(name)
            )
        else:
            request_uri = "v2/systemsharedmemory/unregister"

        # The unregister endpoints take no payload, hence the empty body.
        response = self._post(
            request_uri=request_uri,
            request_body="",
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(response)
        if self._verbose:
            # Compare by value: an identity test against a string literal
            # depends on interning and emits a SyntaxWarning on Python 3.8+.
            if name != "":
                print(
                    "Unregistered system shared memory with name '{}'".format(
                        name
                    )
                )
            else:
                print("Unregistered all system shared memory regions")

    def get_cuda_shared_memory_status(
        self, region_name="", headers=None, query_params=None
    ):
        """Query the server for the status of cuda shared memory.

        Parameters
        ----------
        region_name : str
            Region whose status is wanted. With the default empty
            string the aggregate status endpoint is queried instead
            of a region-specific one.
        headers: dict
            Optional additional HTTP headers, forwarded to the GET
            request as-is.
        query_params: dict
            Optional url query parameters, forwarded to the GET
            request as-is.

        Returns
        -------
        dict
            The response body decoded from JSON.

        Raises
        ------
        InferenceServerException
            If unable to get the status of specified shared memory.

        """
        # An empty name selects the endpoint that is not tied to a region.
        if region_name == "":
            status_uri = "v2/cudasharedmemory/status"
        else:
            status_uri = "v2/cudasharedmemory/region/{}/status".format(
                quote(region_name)
            )

        reply = self._get(
            request_uri=status_uri,
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(reply)

        payload = reply.read()
        if self._verbose:
            print(payload)
        return json.loads(payload)

    def register_cuda_shared_memory(
        self,
        name,
        raw_handle,
        device_id,
        byte_size,
        headers=None,
        query_params=None,
    ):
        """Request the server to register a cuda shared memory with the
        following specification.

        Parameters
        ----------
        name : str
            The name of the region to register.
        raw_handle : bytes or str
            The raw serialized cudaIPC handle in base64 encoding. A bytes
            value is decoded to text before being placed in the JSON body.
        device_id : int
            The GPU device ID on which the cudaIPC handle was created.
        byte_size : int
            The size of the cuda shared memory region, in bytes.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to register the specified cuda shared memory.

        """
        request_uri = "v2/cudasharedmemory/region/{}/register".format(
            quote(name)
        )

        # The standard-library json module cannot serialize bytes, so a
        # bytes handle is decoded to text first; a str handle is passed
        # through unchanged.
        if isinstance(raw_handle, (bytes, bytearray)):
            raw_handle = bytes(raw_handle).decode("utf-8")

        register_request = {
            "raw_handle": {"b64": raw_handle},
            "device_id": device_id,
            "byte_size": byte_size,
        }
        request_body = json.dumps(register_request)

        response = self._post(
            request_uri=request_uri,
            request_body=request_body,
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(response)
        if self._verbose:
            print("Registered cuda shared memory with name '{}'".format(name))

    def unregister_cuda_shared_memory(
        self, name="", headers=None, query_params=None
    ):
        """Request the server to unregister a cuda shared memory with the
        specified name.

        Parameters
        ----------
        name : str
            The name of the region to unregister. The default value is empty
            string which means all the cuda shared memory regions will be
            unregistered.
        headers: dict
            Optional dictionary specifying additional
            HTTP headers to include in the request
        query_params: dict
            Optional url query parameters to use in network
            transaction

        Raises
        ------
        InferenceServerException
            If unable to unregister the specified cuda shared memory region.

        """
        if name != "":
            request_uri = "v2/cudasharedmemory/region/{}/unregister".format(
                quote(name)
            )
        else:
            request_uri = "v2/cudasharedmemory/unregister"

        # The unregister endpoints take no payload, hence the empty body.
        response = self._post(
            request_uri=request_uri,
            request_body="",
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(response)
        if self._verbose:
            # Compare by value: an identity test against a string literal
            # depends on interning and emits a SyntaxWarning on Python 3.8+.
            if name != "":
                print(
                    "Unregistered cuda shared memory with name '{}'".format(
                        name
                    )
                )
            else:
                print("Unregistered all cuda shared memory regions")

    def infer(
        self,
        model_name,
        inputs,
        model_version="",
        outputs=None,
        request_id="",
        sequence_id=0,
        sequence_start=False,
        sequence_end=False,
        priority=0,
        timeout=None,
        headers=None,
        query_params=None,
    ):
        """Run synchronous inference using the supplied 'inputs' requesting
        the outputs specified by 'outputs'.

        Parameters
        ----------
        model_name: str
            The name of the model to run inference.
        inputs : list
            A list of InferInput objects, each describing data for a input
            tensor required by the model.
        model_version: str
            The version of the model to run inference. The default value
            is an empty string which means then the server will choose
            a version based on the model and internal policy.
        outputs : list
            A list of InferRequestedOutput objects, each describing how the output
            data must be returned. If not specified all outputs produced
            by the model will be returned using default settings.
        request_id: str
            Optional identifier for the request. If specified will be returned
            in the response. Default value is an empty string which means no
            request_id will be used.
        sequence_id : int
            The unique identifier for the sequence being represented by the
            object. Default value is 0 which means that the request does not
            belong to a sequence.
        sequence_start: bool
            Indicates whether the request being added marks the start of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        sequence_end: bool
            Indicates whether the request being added marks the end of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        priority : int
            Indicates the priority of the request. Priority value zero
            indicates that the default priority level should be used
            (i.e. same behavior as not specifying the priority parameter).
            Lower value priorities indicate higher priority levels. Thus
            the highest priority level is indicated by setting the parameter
            to 1, the next highest is 2, etc. If not provided, the server
            will handle the request using default setting for the model.
        timeout : int
            The timeout value for the request, in microseconds. If the request
            cannot be completed within the time the server can take a
            model-specific action such as terminating the request. If not
            provided, the server will handle the request using default setting
            for the model.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request. The dictionary is not
            modified; a copy is made when an extra header has to be added.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        InferResult
            The object holding the result of the inference.

        Raises
        ------
        InferenceServerException
            If server fails to perform inference.
        """

        request_body, json_size = _get_inference_request(
            inputs=inputs,
            request_id=request_id,
            outputs=outputs,
            sequence_id=sequence_id,
            sequence_start=sequence_start,
            sequence_end=sequence_end,
            priority=priority,
            timeout=timeout,
        )

        if json_size is not None:
            # Add the header to a private copy. Writing into the caller's
            # dict would leak a stale Inference-Header-Content-Length into
            # any later request that reuses the same dict.
            headers = dict(headers) if headers is not None else {}
            headers["Inference-Header-Content-Length"] = json_size

        if not isinstance(model_version, str):
            raise_error("model version must be a string")
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}/infer".format(
                quote(model_name), model_version
            )
        else:
            request_uri = "v2/models/{}/infer".format(quote(model_name))

        response = self._post(
            request_uri=request_uri,
            request_body=request_body,
            headers=headers,
            query_params=query_params,
        )
        _raise_if_error(response)

        return InferResult(response, self._verbose)

    def async_infer(
        self,
        model_name,
        inputs,
        model_version="",
        outputs=None,
        request_id="",
        sequence_id=0,
        sequence_start=False,
        sequence_end=False,
        priority=0,
        timeout=None,
        headers=None,
        query_params=None,
    ):
        """Run asynchronous inference using the supplied 'inputs' requesting
        the outputs specified by 'outputs'.

        Parameters
        ----------
        model_name: str
            The name of the model to run inference.
        inputs : list
            A list of InferInput objects, each describing data for a input
            tensor required by the model.
        model_version: str
            The version of the model to run inference. The default value
            is an empty string which means then the server will choose
            a version based on the model and internal policy.
        outputs : list
            A list of InferRequestedOutput objects, each describing how the output
            data must be returned. If not specified all outputs produced
            by the model will be returned using default settings.
        request_id: str
            Optional identifier for the request. If specified will be returned
            in the response. Default value is an empty string which means no
            request_id will be used.
        sequence_id : int
            The unique identifier for the sequence being represented by the
            object. Default value is 0 which means that the request does not
            belong to a sequence.
        sequence_start: bool
            Indicates whether the request being added marks the start of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        sequence_end: bool
            Indicates whether the request being added marks the end of the
            sequence. Default value is False. This argument is ignored if
            'sequence_id' is 0.
        priority : int
            Indicates the priority of the request. Priority value zero
            indicates that the default priority level should be used
            (i.e. same behavior as not specifying the priority parameter).
            Lower value priorities indicate higher priority levels. Thus
            the highest priority level is indicated by setting the parameter
            to 1, the next highest is 2, etc. If not provided, the server
            will handle the request using default setting for the model.
        timeout : int
            The timeout value for the request, in microseconds. If the request
            cannot be completed within the time the server can take a
            model-specific action such as terminating the request. If not
            provided, the server will handle the request using default setting
            for the model.
        headers: dict
            Optional dictionary specifying additional HTTP
            headers to include in the request. The dictionary is not
            modified; a copy is made when an extra header has to be added.
        query_params: dict
            Optional url query parameters to use in network
            transaction.

        Returns
        -------
        InferAsyncRequest object
            The handle to the asynchronous inference request.

        Raises
        ------
        InferenceServerException
            If server fails to issue inference.
        """

        def wrapped_post(request_uri, request_body, headers, query_params):
            return self._post(request_uri, request_body, headers, query_params)

        request_body, json_size = _get_inference_request(
            inputs=inputs,
            request_id=request_id,
            outputs=outputs,
            sequence_id=sequence_id,
            sequence_start=sequence_start,
            sequence_end=sequence_end,
            priority=priority,
            timeout=timeout,
        )

        if json_size is not None:
            # Add the header to a private copy. Writing into the caller's
            # dict would leak a stale Inference-Header-Content-Length into
            # any later request that reuses the same dict.
            headers = dict(headers) if headers is not None else {}
            headers["Inference-Header-Content-Length"] = json_size

        if not isinstance(model_version, str):
            raise_error("model version must be a string")
        if model_version != "":
            request_uri = "v2/models/{}/versions/{}/infer".format(
                quote(model_name), model_version
            )
        else:
            request_uri = "v2/models/{}/infer".format(quote(model_name))

        # Hand the POST to the pool; the returned handle is what
        # InferAsyncRequest later waits on.
        g = self._pool.apply_async(
            wrapped_post, (request_uri, request_body, headers, query_params)
        )

        g.start()

        if self._verbose:
            verbose_message = "Sent request"
            # Compare by value: an identity test against a string literal
            # depends on interning and emits a SyntaxWarning on Python 3.8+.
            if request_id != "":
                verbose_message = verbose_message + " '{}'".format(request_id)
            print(verbose_message)

        return InferAsyncRequest(g, self._verbose)


class InferAsyncRequest:
    """Handle to an in-flight asynchronous inference request.

    Parameters
    ----------
    greenlet : gevent.Greenlet
        The greenlet that will deliver the server response. See
        http://www.gevent.org/api/gevent.greenlet.html for details
        on greenlets.

    verbose : bool
        When True, verbose output is generated while decoding the
        result. Default value is False.
    """

    def __init__(self, greenlet, verbose=False):
        self._greenlet, self._verbose = greenlet, verbose

    def get_result(self, block=True, timeout=None):
        """Fetch the result of the asynchronous inference.

        Parameters
        ----------
        block : bool
            When True (the default) wait until the response has been
            received from the server.
        timeout : int
            Upper bound on the wait. Ignored when block is False.
            The default None means wait indefinitely for the
            response.

        Returns
        -------
        InferResult
            The object holding the result of the async inference.

        Raises
        ------
        InferenceServerException
            If server fails to perform inference or failed to respond
            within specified timeout.
        """
        try:
            reply = self._greenlet.get(block=block, timeout=timeout)
        except gevent.Timeout:
            # Surface a timeout through the client's own error helper.
            raise_error("failed to obtain inference response")

        _raise_if_error(reply)
        return InferResult(reply, self._verbose)


class InferInput:
    """An object of InferInput class is used to describe
    input tensor for an inference request.

    Parameters
    ----------
    name : str
        The name of input whose data will be described by this object
    shape : list
        The shape of the associated input.
    datatype : str
        The datatype of the associated input.
    """

    def __init__(self, name, shape, datatype):
        self._name = name
        self._shape = shape
        self._datatype = datatype
        # Per-tensor request parameters (binary size / shared memory info).
        self._parameters = {}
        # At most one of _data (JSON-embedded values) and _raw_data
        # (binary payload) is populated at a time.
        self._data = None
        self._raw_data = None

    def name(self):
        """Get the name of input associated with this object.

        Returns
        -------
        str
            The name of input
        """
        return self._name

    def datatype(self):
        """Get the datatype of input associated with this object.

        Returns
        -------
        str
            The datatype of input
        """
        return self._datatype

    def shape(self):
        """Get the shape of input associated with this object.

        Returns
        -------
        list
            The shape of input
        """
        return self._shape

    def set_shape(self, shape):
        """Set the shape of input.

        Parameters
        ----------
        shape : list
            The shape of the associated input.
        """
        self._shape = shape

    def set_data_from_numpy(self, input_tensor, binary_data=True):
        """Set the tensor data from the specified numpy array for
        input associated with this object.

        Any shared memory settings previously applied to this input are
        cleared.

        Parameters
        ----------
        input_tensor : numpy array
            The tensor data in numpy array format
        binary_data : bool
            Indicates whether to set data for the input in binary format
            or explicit tensor within JSON. The default value is True,
            which means the data will be delivered as binary data in the
            HTTP body after the JSON object.

        Raises
        ------
        InferenceServerException
            If failed to set data for the tensor.
        """
        if not isinstance(input_tensor, (np.ndarray,)):
            raise_error("input_tensor must be a numpy array")
        dtype = np_to_triton_dtype(input_tensor.dtype)
        if self._datatype != dtype:
            raise_error(
                "got unexpected datatype {} from numpy array, expected {}".format(
                    dtype, self._datatype
                )
            )
        # Rank and every dimension must match the declared shape.
        if list(self._shape) != list(input_tensor.shape):
            raise_error(
                "got unexpected numpy array shape [{}], expected [{}]".format(
                    str(input_tensor.shape)[1:-1], str(self._shape)[1:-1]
                )
            )

        # Data supplied directly replaces any shared memory reference.
        self._parameters.pop("shared_memory_region", None)
        self._parameters.pop("shared_memory_byte_size", None)
        self._parameters.pop("shared_memory_offset", None)

        if not binary_data:
            self._parameters.pop("binary_data_size", None)
            self._raw_data = None
            if self._datatype == "BYTES":
                self._data = list(input_tensor.flatten())
            else:
                # .item() converts numpy scalars to plain Python values.
                self._data = [val.item() for val in input_tensor.flatten()]
        else:
            self._data = None
            if self._datatype == "BYTES":
                self._raw_data = serialize_byte_tensor(input_tensor).tobytes()
            else:
                self._raw_data = input_tensor.tobytes()
            self._parameters["binary_data_size"] = len(self._raw_data)

    def set_shared_memory(self, region_name, byte_size, offset=0):
        """Set the tensor data from the specified shared memory region.

        Any data previously set with set_data_from_numpy is discarded.

        Parameters
        ----------
        region_name : str
            The name of the shared memory region holding tensor data.
        byte_size : int
            The size of the shared memory region holding tensor data.
        offset : int
            The offset, in bytes, into the region where the data for
            the tensor starts. The default value is 0.

        """
        self._data = None
        self._raw_data = None
        self._parameters.pop("binary_data_size", None)

        self._parameters["shared_memory_region"] = region_name
        self._parameters["shared_memory_byte_size"] = byte_size
        if offset != 0:
            # _parameters is a plain dict, so the offset is stored directly
            # as its value.
            self._parameters["shared_memory_offset"] = offset
        else:
            # Drop an offset left over from an earlier call so it cannot be
            # sent along with the new region.
            self._parameters.pop("shared_memory_offset", None)

    def _get_binary_data(self):
        """Returns the raw binary data if available

        Returns
        -------
        bytes
            The raw data for the input tensor
        """
        return self._raw_data

    def _get_tensor(self):
        """Retrieve the underlying input as json dict.

        The "data" entry is only included when the tensor values are
        carried inside the JSON, i.e. when neither shared memory nor a
        binary payload is in use.

        Returns
        -------
        dict
            The underlying tensor specification as dict
        """
        tensor = {
            "name": self._name,
            "shape": self._shape,
            "datatype": self._datatype,
            "parameters": self._parameters,
        }
        uses_shared_memory = (
            self._parameters.get("shared_memory_region") is not None
        )
        if not uses_shared_memory and self._raw_data is None:
            tensor["data"] = self._data
        return tensor


class InferRequestedOutput:
    """An object of InferRequestedOutput class is used to describe a
    requested output tensor for an inference request.

    Parameters
    ----------
    name : str
        The name of output tensor to associate with this object.
    binary_data : bool
        Indicates whether to return result data for the output in
        binary format or explicit tensor within JSON. The default
        value is True, which means the data will be delivered as
        binary data in the HTTP body after JSON object. While shared
        memory is set for the output, a True value is reported as
        False in the request parameters.
    class_count : int
        The number of classifications to be requested. The default
        value is 0 which means the classification results are not
        requested.
    """

    def __init__(self, name, binary_data=True, class_count=0):
        self._name = name
        self._parameters = {}
        if class_count != 0:
            self._parameters["classification"] = class_count
        # Remember the caller's choice so unset_shared_memory can restore it.
        self._binary = binary_data
        self._parameters["binary_data"] = binary_data

    def name(self):
        """Get the name of output associated with this object.

        Returns
        -------
        str
            The name of output
        """
        return self._name

    def set_shared_memory(self, region_name, byte_size, offset=0):
        """Marks the output to return the inference result in
        specified shared memory region.

        Parameters
        ----------
        region_name : str
            The name of the shared memory region to hold tensor data.
        byte_size : int
            The size of the shared memory region to hold tensor data.
        offset : int
            The offset, in bytes, into the region where the data for
            the tensor starts. The default value is 0.

        Raises
        ------
        InferenceServerException
            If classification results were requested for this output.

        """
        if "classification" in self._parameters:
            raise_error("shared memory can't be set on classification output")
        if self._binary:
            self._parameters["binary_data"] = False

        self._parameters["shared_memory_region"] = region_name
        self._parameters["shared_memory_byte_size"] = byte_size
        if offset != 0:
            self._parameters["shared_memory_offset"] = offset
        else:
            # Drop an offset left over from an earlier call so it cannot be
            # sent along with the new region.
            self._parameters.pop("shared_memory_offset", None)

    def unset_shared_memory(self):
        """Clears the shared memory option set by the last call to
        InferRequestedOutput.set_shared_memory(). After call to this
        function requested output will no longer be returned in a
        shared memory region.
        """

        # Restore the binary_data choice made at construction time.
        self._parameters["binary_data"] = self._binary
        self._parameters.pop("shared_memory_region", None)
        self._parameters.pop("shared_memory_byte_size", None)
        self._parameters.pop("shared_memory_offset", None)

    def _get_tensor(self):
        """Retrieve the underlying requested output as json dict.

        Returns
        -------
        dict
            The underlying tensor as a dict
        """
        return {"name": self._name, "parameters": self._parameters}


class InferResult:
    """An object of InferResult class holds the response of
    an inference request and provide methods to retrieve
    inference results.

    Parameters
    ----------
    response : object
        The inference response from the server. It must provide
        ``get(header_name)`` for header lookup and ``read(length=...)``
        for reading the body.
    verbose : bool
        If True, print the JSON part of the response as it is read.
    """

    def __init__(self, response, verbose):
        header_length = response.get("Inference-Header-Content-Length")
        if header_length is None:
            # No header: the whole body is parsed as JSON.
            content = response.read()
            if verbose:
                print(content)
            self._result = json.loads(content)
        else:
            # The header gives the size of the leading JSON part; whatever
            # follows it in the body is kept as the binary buffer.
            header_length = int(header_length)
            content = response.read(length=header_length)
            if verbose:
                print(content)
            self._result = json.loads(content)

            # Maps the output name to the index in buffer for quick retrieval
            self._output_name_to_buffer_map = {}
            # Read the remaining data off the response body.
            self._buffer = response.read()
            buffer_index = 0
            for output in self._result["outputs"]:
                parameters = output.get("parameters")
                if parameters is not None:
                    this_data_size = parameters.get("binary_data_size")
                    if this_data_size is not None:
                        self._output_name_to_buffer_map[
                            output["name"]
                        ] = buffer_index
                        buffer_index = buffer_index + this_data_size

    def as_numpy(self, name):
        """Get the tensor data for output associated with this object
        in numpy format

        Parameters
        ----------
        name : str
            The name of the output tensor whose result is to be retrieved.

        Returns
        -------
        numpy array
            The numpy array containing the response data for the tensor or
            None if the data for specified tensor name is not found.
        """
        outputs = self._result.get("outputs")
        if outputs is None:
            return None

        for output in outputs:
            if output["name"] != name:
                continue

            datatype = output["datatype"]
            parameters = output.get("parameters")
            this_data_size = None
            if parameters is not None:
                this_data_size = parameters.get("binary_data_size")

            if this_data_size is None:
                # No binary payload: the values are embedded in the JSON.
                np_array = np.array(
                    output["data"], dtype=triton_to_np_dtype(datatype)
                )
            elif this_data_size != 0:
                start_index = self._output_name_to_buffer_map[name]
                end_index = start_index + this_data_size
                raw = self._buffer[start_index:end_index]
                if datatype == "BYTES":
                    # String results contain a 4-byte string length
                    # followed by the actual string characters. Hence,
                    # need to decode the raw bytes to convert into
                    # array elements.
                    np_array = deserialize_bytes_tensor(raw)
                else:
                    np_array = np.frombuffer(
                        raw, dtype=triton_to_np_dtype(datatype)
                    )
            else:
                # A binary output that carries zero bytes.
                np_array = np.empty(0)

            return np.resize(np_array, output["shape"])
        return None

    def get_output(self, name):
        """Retrieves the output tensor corresponding to the named output.

        Parameters
        ----------
        name : str
            The name of the tensor for which Output is to be
            retrieved.

        Returns
        -------
        Dict
            If an output tensor with specified name is present in
            the infer response then returns it as a json dict,
            otherwise returns None.
        """
        # A response without an "outputs" entry yields None, matching
        # as_numpy, instead of raising KeyError.
        for output in self._result.get("outputs") or []:
            if output["name"] == name:
                return output

        return None

    def get_response(self):
        """Retrieves the complete response

        Returns
        -------
        dict
            The underlying response dict.
        """
        return self._result

In [None]:
%%writefile utils.py
import tritonhttpclient


def triton_init(url="localhost:8000"):
    """Create the module-wide Triton HTTP client for the given URL.

    The client is stored in the module-level ``triton_client`` global,
    which the other helpers in this module read.

    Parameters
    ----------
    url : str
        Address of the Triton server, defaults to localhost:8000

    Returns
    ----------
    The newly created client (the same object stored in
    ``triton_client``).
    """
    global triton_client
    client = tritonhttpclient.InferenceServerClient(url)
    triton_client = client
    return client


def get_model_info():
    """Print metadata and configuration for every model behind the endpoint.

    Useful for confirming that your models were loaded into memory.
    Reads the module-level ``triton_client``, so ``triton_init`` must
    have been called first.

    For each entry of the model repository index, the model's input and
    output metadata/config are fetched through ``parse_model_http`` and
    one summary line is printed to STDOUT. Nothing is returned.
    """
    for model in triton_client.get_model_repository_index():
        model_name = model["name"]
        # "version" is an optional field of a repository index entry;
        # fall back to the empty string, which is also the default
        # version accepted by parse_model_http.
        model_version = model.get("version", "")
        input_meta, input_config, output_meta, output_config = (
            parse_model_http(
                model_name=model_name, model_version=model_version
            )
        )
        # Adjacent f-string literals are joined at compile time. Unlike a
        # backslash continuation inside one literal, this does not leak
        # the source indentation into the printed line.
        print(
            f"Found model: {model_name}, version: {model_version}, "
            f"input meta: {input_meta}, input config: {input_config}, "
            f"output_meta: {output_meta}, output config: {output_config}"
        )


def parse_model_http(model_name, model_version=""):
    """Fetch a model's metadata and config and pick out its I/O sections.

    Reads the module-level ``triton_client``, so ``triton_init`` must
    have been called first.

    Arguments
    --------
    model_name : str
        Name of the model whose metadata you want to fetch

    model_version : str
        Optional, the version of the model, defaults to empty string.

    Returns
    --------
    tuple
        ``(metadata["inputs"], config["input"], metadata["outputs"],
        config["output"])``, in that order.

    From https://github.com/triton-inference-server/server/blob/master/src/clients/python/examples/image_client.py  # noqa
    """
    # Both lookups address the same model/version pair.
    target = {"model_name": model_name, "model_version": model_version}
    metadata = triton_client.get_model_metadata(**target)
    config = triton_client.get_model_config(**target)

    input_meta = metadata["inputs"]
    input_config = config["input"]
    output_meta = metadata["outputs"]
    output_config = config["output"]
    return input_meta, input_config, output_meta, output_config


def triton_infer(
    input_mapping,
    model_name,
    binary_data=False,
    binary_output=False,
    class_count=0,
):
    """Build a Triton request from named arrays and run inference.

    The model's input and output names are looked up through
    ``parse_model_http``; the request is sent with the module-level
    ``triton_client``, so ``triton_init`` must have been called first.

    Arguments
    ----------
    input_mapping : dict
        A dictionary mapping strings to numpy arrays. Every input name
        reported by the model metadata is looked up in this mapping,
        and the matching value is sent as that input.

    model_name : str
        The name of the model on which you are running inference.

    binary_data : bool
        Forwarded as ``binary_data`` when attaching each input array.
        Defaults to False

    binary_output : bool
        Forwarded as ``binary_data`` for each requested output.
        Defaults to False

    class_count : int
        If the model is a classification model, the number of output classes.
        Defaults to 0, indicating this is not a classification model.

    Returns
    ----------
    res : InferResult
        Triton inference result containing output from running prediction
    """
    input_meta, _, output_meta, _ = parse_model_http(model_name)

    # One InferInput per input declared by the model metadata.
    request_inputs = []
    for spec in input_meta:
        tensor_name = spec["name"]
        tensor = input_mapping[tensor_name]
        infer_input = tritonhttpclient.InferInput(
            tensor_name, tensor.shape, spec["datatype"]
        )
        infer_input.set_data_from_numpy(tensor, binary_data=binary_data)
        request_inputs.append(infer_input)

    # Request every output the model metadata lists.
    requested_outputs = [
        tritonhttpclient.InferRequestedOutput(
            spec["name"], binary_data=binary_output, class_count=class_count
        )
        for spec in output_meta
    ]

    return triton_client.infer(
        model_name, request_inputs, request_id="0", outputs=requested_outputs
    )

In [None]:
%%writefile triton.dockerfile
# Image definition: starts from the aml-triton base image and layers the
# Python packages below on top of it, one pip invocation per layer.
# No tag is given on the base image, so the registry's default tag is used.
FROM mcr.microsoft.com/azureml/aml-triton

RUN pip install azureml-defaults
# RUN pip install numpy inference-schema[numpy-support]
RUN pip install pillow
# NOTE(review): nvidia-pyindex presumably has to be installed in its own
# step before tritonclient so that pip can resolve tritonclient from
# NVIDIA's package index - confirm before reordering or merging these.
RUN pip install nvidia-pyindex
RUN pip install tritonclient[http]
# RUN apt-get update && apt-get install -y libcurl4-openssl-dev

In [None]:
%%writefile densenet_labels.txt
TENCH
GOLDFISH
WHITE SHARK
TIGER SHARK
HAMMERHEAD SHARK
ELECTRIC RAY
STINGRAY
ROOSTER
HEN
OSTRICH
BRAMBLING
GOLDFINCH
HOUSE FINCH
SNOWBIRD
INDIGO FINCH
ROBIN
BULBUL
JAY
MAGPIE
CHICKADEE
WATER OUZEL
KITE
BALD EAGLE
VULTURE
GREAT GREY OWL
FIRE SALAMANDER
NEWT
EFT
SPOTTED SALAMANDER
AXOLOTL
BULL FROG
TREE FROG
TAILED FROG
LOGGERHEAD
LEATHERBACK TURTLE
MUD TURTLE
TERRAPIN
BOX TURTLE
BANDED GECKO
COMMON IGUANA
AMERICAN CHAMELEON
WHIPTAIL
AGAMA
FRILLED LIZARD
ALLIGATOR LIZARD
GILA MONSTER
GREEN LIZARD
AFRICAN CHAMELEON
KOMODO DRAGON
AFRICAN CROCODILE
AMERICAN ALLIGATOR
TRICERATOPS
THUNDER SNAKE
RINGNECK SNAKE
HOGNOSE SNAKE
GREEN SNAKE
KING SNAKE
GARTER SNAKE
WATER SNAKE
VINE SNAKE
NIGHT SNAKE
BOA
ROCK PYTHON
COBRA
GREEN MAMBA
SEA SNAKE
HORNED VIPER
DIAMONDBACK
SIDEWINDER
TRILOBITE
HARVESTMAN
SCORPION
GARDEN SPIDER
BARN SPIDER
GARDEN SPIDER
BLACK WIDOW
TARANTULA
WOLF SPIDER
TICK
CENTIPEDE
GROUSE
PTARMIGAN
RUFFED GROUSE
PRAIRIE CHICKEN
PEACOCK
QUAIL
PARTRIDGE
AFRICAN GREY
MACAW
COCKATOO
LORIKEET
COUCAL
BEE EATER
HORNBILL
HUMMINGBIRD
JACAMAR
TOUCAN
DRAKE
MERGANSER
GOOSE
BLACK SWAN
TUSKER
ECHIDNA
PLATYPUS
WALLABY
KOALA
WOMBAT
JELLYFISH
SEA ANEMONE
BRAIN CORAL
FLATWORM
NEMATODE
CONCH
SNAIL
SLUG
SEA SLUG
CHITON
CHAMBERED NAUTILUS
DUNGENESS CRAB
ROCK CRAB
FIDDLER CRAB
KING CRAB
AMERICAN LOBSTER
SPINY LOBSTER
CRAYFISH
HERMIT CRAB
ISOPOD
WHITE STORK
BLACK STORK
SPOONBILL
FLAMINGO
LITTLE BLUE HERON
AMERICAN EGRET
BITTERN
CRANE
LIMPKIN
EUROPEAN GALLINULE
AMERICAN COOT
BUSTARD
RUDDY TURNSTONE
RED-BACKED SANDPIPER
REDSHANK
DOWITCHER
OYSTERCATCHER
PELICAN
KING PENGUIN
ALBATROSS
GREY WHALE
KILLER WHALE
DUGONG
SEA LION
CHIHUAHUA
JAPANESE SPANIEL
MALTESE DOG
PEKINESE
SHIH-TZU
BLENHEIM SPANIEL
PAPILLON
TOY TERRIER
RHODESIAN RIDGEBACK
AFGHAN HOUND
BASSET
BEAGLE
BLOODHOUND
BLUETICK
COONHOUND
WALKER HOUND
ENGLISH FOXHOUND
REDBONE
BORZOI
IRISH WOLFHOUND
ITALIAN GREYHOUND
WHIPPET
IBIZAN HOUND
NORWEGIAN ELKHOUND
OTTERHOUND
SALUKI
SCOTTISH DEERHOUND
WEIMARANER
STAFFORDSHIRE BULLTERRIER
STAFFORDSHIRE TERRIER
BEDLINGTON TERRIER
BORDER TERRIER
KERRY BLUE TERRIER
IRISH TERRIER
NORFOLK TERRIER
NORWICH TERRIER
YORKSHIRE TERRIER
WIRE-HAIRED FOX TERRIER
LAKELAND TERRIER
SEALYHAM TERRIER
AIREDALE
CAIRN
AUSTRALIAN TERRIER
DANDIE DINMONT
BOSTON BULL
MINIATURE SCHNAUZER
GIANT SCHNAUZER
STANDARD SCHNAUZER
SCOTCH TERRIER
TIBETAN TERRIER
SILKY TERRIER
WHEATEN TERRIER
WHITE TERRIER
LHASA
RETRIEVER
CURLY-COATED RETRIEVER
GOLDEN RETRIEVER
LABRADOR RETRIEVER
CHESAPEAKE BAY RETRIEVER
SHORT-HAIRED POINTER
VISLA
ENGLISH SETTER
IRISH SETTER
GORDON SETTER
BRITTANY SPANIEL
CLUMBER
ENGLISH SPRINGER
WELSH SPRINGER SPANIEL
COCKER SPANIEL
SUSSEX SPANIEL
IRISH WATERSPANIEL
KUVASZ
SCHIPPERKE
GROENENDAEL
MALINOIS
BRIARD
KELPIE
KOMONDOR
OLD ENGLISH SHEEPDOG
SHETLAND SHEEPDOG
COLLIE
BORDER COLLIE
BOUVIER DES FLANDRES
ROTTWEILER
GERMAN SHEPHERD
DOBERMAN
MINIATURE PINSCHER
GREATER SWISS MOUNTAIN DOG
BERNESE MOUNTAIN DOG
APPENZELLER
ENTLEBUCHER
BOXER
BULL MASTIFF
TIBETAN MASTIFF
FRENCH BULLDOG
GREAT DANE
SAINT BERNARD
ESKIMO DOG
MALAMUTE
SIBERIAN HUSKY
DALMATIAN
AFFENPINSCHER
BASENJI
PUG
LEONBERG
NEWFOUNDLAND
GREAT PYRENEES
SAMOYED
POMERANIAN
CHOW
KEESHOND
BRABANCON GRIFFON
PEMBROKE
CARDIGAN
TOY POODLE
MINIATURE POODLE
STANDARD POODLE
MEXICAN HAIRLESS
TIMBER WOLF
WHITE WOLF
RED WOLF
COYOTE
DINGO
DHOLE
AFRICAN HUNTING DOG
HYENA
RED FOX
KIT FOX
ARCTIC FOX
GREY FOX
TABBY
TIGER CAT
PERSIAN CAT
SIAMESE CAT
EGYPTIAN CAT
COUGAR
LYNX
LEOPARD
SNOW LEOPARD
JAGUAR
LION
TIGER
CHEETAH
BROWN BEAR
AMERICAN BLACK BEAR
ICE BEAR
SLOTH BEAR
MONGOOSE
MEERKAT
TIGER BEETLE
LADYBUG
GROUND BEETLE
LONG-HORNED BEETLE
LEAF BEETLE
DUNG BEETLE
RHINOCEROS BEETLE
WEEVIL
FLY
BEE
ANT
GRASSHOPPER
CRICKET
WALKING STICK
COCKROACH
MANTIS
CICADA
LEAFHOPPER
LACEWING
DRAGONFLY
DAMSELFLY
ADMIRAL
RINGLET
MONARCH
CABBAGE BUTTERFLY
SULPHUR BUTTERFLY
LYCAENID
STARFISH
SEA URCHIN
SEA CUCUMBER
WOOD RABBIT
HARE
ANGORA
HAMSTER
PORCUPINE
FOX SQUIRREL
MARMOT
BEAVER
GUINEA PIG
SORREL
ZEBRA
HOG
WILD BOAR
WARTHOG
HIPPOPOTAMUS
OX
WATER BUFFALO
BISON
RAM
BIGHORN
IBEX
HARTEBEEST
IMPALA
GAZELLE
ARABIAN CAMEL
LLAMA
WEASEL
MINK
POLECAT
BLACK-FOOTED FERRET
OTTER
SKUNK
BADGER
ARMADILLO
THREE-TOED SLOTH
ORANGUTAN
GORILLA
CHIMPANZEE
GIBBON
SIAMANG
GUENON
PATAS
BABOON
MACAQUE
LANGUR
COLOBUS
PROBOSCIS MONKEY
MARMOSET
CAPUCHIN
HOWLER MONKEY
TITI
SPIDER MONKEY
SQUIRREL MONKEY
MADAGASCAR CAT
INDRI
INDIAN ELEPHANT
AFRICAN ELEPHANT
LESSER PANDA
GIANT PANDA
BARRACOUTA
EEL
COHO
ROCK BEAUTY
ANEMONE FISH
STURGEON
GAR
LIONFISH
PUFFER
ABACUS
ABAYA
ACADEMIC GOWN
ACCORDION
ACOUSTIC GUITAR
AIRCRAFT CARRIER
AIRLINER
AIRSHIP
ALTAR
AMBULANCE
AMPHIBIAN
ANALOG CLOCK
APIARY
APRON
ASHCAN
ASSAULT RIFLE
BACKPACK
BAKERY
BALANCE BEAM
BALLOON
BALLPOINT
BAND AID
BANJO
BANNISTER
BARBELL
BARBER CHAIR
BARBERSHOP
BARN
BAROMETER
BARREL
BARROW
BASEBALL
BASKETBALL
BASSINET
BASSOON
BATHING CAP
BATH TOWEL
BATHTUB
BEACH WAGON
BEACON
BEAKER
BEARSKIN
BEER BOTTLE
BEER GLASS
BELL COTE
BIB
BICYCLE-BUILT-FOR-TWO
BIKINI
BINDER
BINOCULARS
BIRDHOUSE
BOATHOUSE
BOBSLED
BOLO TIE
BONNET
BOOKCASE
BOOKSHOP
BOTTLECAP
BOW
BOW TIE
BRASS
BRASSIERE
BREAKWATER
BREASTPLATE
BROOM
BUCKET
BUCKLE
BULLETPROOF VEST
BULLET TRAIN
BUTCHER SHOP
CAB
CALDRON
CANDLE
CANNON
CANOE
CAN OPENER
CARDIGAN
CAR MIRROR
CAROUSEL
CARPENTERS KIT
CARTON
CAR WHEEL
CASH MACHINE
CASSETTE
CASSETTE PLAYER
CASTLE
CATAMARAN
CD PLAYER
CELLO
CELLULAR TELEPHONE
CHAIN
CHAINLINK FENCE
CHAIN MAIL
CHAIN SAW
CHEST
CHIFFONIER
CHIME
CHINA CABINET
CHRISTMAS STOCKING
CHURCH
CINEMA
CLEAVER
CLIFF DWELLING
CLOAK
CLOG
COCKTAIL SHAKER
COFFEE MUG
COFFEEPOT
COIL
COMBINATION LOCK
COMPUTER KEYBOARD
CONFECTIONERY
CONTAINER SHIP
CONVERTIBLE
CORKSCREW
CORNET
COWBOY BOOT
COWBOY HAT
CRADLE
CRANE
CRASH HELMET
CRATE
CRIB
CROCK POT
CROQUET BALL
CRUTCH
CUIRASS
DAM
DESK
DESKTOP COMPUTER
DIAL TELEPHONE
DIAPER
DIGITAL CLOCK
DIGITAL WATCH
DINING TABLE
DISHRAG
DISHWASHER
DISK BRAKE
DOCK
DOGSLED
DOME
DOORMAT
DRILLING PLATFORM
DRUM
DRUMSTICK
DUMBBELL
DUTCH OVEN
ELECTRIC FAN
ELECTRIC GUITAR
ELECTRIC LOCOMOTIVE
ENTERTAINMENT CENTER
ENVELOPE
ESPRESSO MAKER
FACE POWDER
FEATHER BOA
FILE
FIREBOAT
FIRE ENGINE
FIRE SCREEN
FLAGPOLE
FLUTE
FOLDING CHAIR
FOOTBALL HELMET
FORKLIFT
FOUNTAIN
FOUNTAIN PEN
FOUR-POSTER
FREIGHT CAR
FRENCH HORN
FRYING PAN
FUR COAT
GARBAGE TRUCK
GASMASK
GAS PUMP
GOBLET
GO-KART
GOLF BALL
GOLFCART
GONDOLA
GONG
GOWN
GRAND PIANO
GREENHOUSE
GRILLE
GROCERY STORE
GUILLOTINE
HAIR SLIDE
HAIR SPRAY
HALF TRACK
HAMMER
HAMPER
HAND BLOWER
HAND-HELD COMPUTER
HANDKERCHIEF
HARD DISC
HARMONICA
HARP
HARVESTER
HATCHET
HOLSTER
HOME THEATER
HONEYCOMB
HOOK
HOOPSKIRT
HORIZONTAL BAR
HORSE CART
HOURGLASS
IPOD
IRON
JACK-O-LANTERN
JEAN
JEEP
JERSEY
JIGSAW PUZZLE
JINRIKISHA
JOYSTICK
KIMONO
KNEE PAD
KNOT
LAB COAT
LADLE
LAMPSHADE
LAPTOP
LAWN MOWER
LENS CAP
LETTER OPENER
LIBRARY
LIFEBOAT
LIGHTER
LIMOUSINE
LINER
LIPSTICK
LOAFER
LOTION
LOUDSPEAKER
LOUPE
LUMBERMILL
MAGNETIC COMPASS
MAILBAG
MAILBOX
MAILLOT
MAILLOT
MANHOLE COVER
MARACA
MARIMBA
MASK
MATCHSTICK
MAYPOLE
MAZE
MEASURING CUP
MEDICINE CHEST
MEGALITH
MICROPHONE
MICROWAVE
MILITARY UNIFORM
MILK CAN
MINIBUS
MINISKIRT
MINIVAN
MISSILE
MITTEN
MIXING BOWL
MOBILE HOME
MODEL T
MODEM
MONASTERY
MONITOR
MOPED
MORTAR
MORTARBOARD
MOSQUE
MOSQUITO NET
MOTOR SCOOTER
MOUNTAIN BIKE
MOUNTAIN TENT
MOUSE
MOUSETRAP
MOVING VAN
MUZZLE
NAIL
NECK BRACE
NECKLACE
NIPPLE
NOTEBOOK
OBELISK
OBOE
OCARINA
ODOMETER
OIL FILTER
ORGAN
OSCILLOSCOPE
OVERSKIRT
OXCART
OXYGEN MASK
PACKET
PADDLE
PADDLEWHEEL
PADLOCK
PAINTBRUSH
PAJAMA
PALACE
PANPIPE
PAPER TOWEL
PARACHUTE
PARALLEL BARS
PARK BENCH
PARKING METER
PASSENGER CAR
PATIO
PAY-PHONE
PEDESTAL
PENCIL BOX
PENCIL SHARPENER
PERFUME
PETRI DISH
PHOTOCOPIER
PICK
PICKELHAUBE
PICKET FENCE
PICKUP
PIER
PIGGY BANK
PILL BOTTLE
PILLOW
PING-PONG BALL
PINWHEEL
PIRATE
PITCHER
PLANE
PLANETARIUM
PLASTIC BAG
PLATE RACK
PLOW
PLUNGER
POLAROID CAMERA
POLE
POLICE VAN
PONCHO
POOL TABLE
POP BOTTLE
POT
POTTERS WHEEL
POWER DRILL
PRAYER RUG
PRINTER
PRISON
PROJECTILE
PROJECTOR
PUCK
PUNCHING BAG
PURSE
QUILL
QUILT
RACER
RACKET
RADIATOR
RADIO
RADIO TELESCOPE
RAIN BARREL
RECREATIONAL VEHICLE
REEL
REFLEX CAMERA
REFRIGERATOR
REMOTE CONTROL
RESTAURANT
REVOLVER
RIFLE
ROCKING CHAIR
ROTISSERIE
RUBBER ERASER
RUGBY BALL
RULE
RUNNING SHOE
SAFE
SAFETY PIN
SALTSHAKER
SANDAL
SARONG
SAX
SCABBARD
SCALE
SCHOOL BUS
SCHOONER
SCOREBOARD
SCREEN
SCREW
SCREWDRIVER
SEAT BELT
SEWING MACHINE
SHIELD
SHOE SHOP
SHOJI
SHOPPING BASKET
SHOPPING CART
SHOVEL
SHOWER CAP
SHOWER CURTAIN
SKI
SKI MASK
SLEEPING BAG
SLIDE RULE
SLIDING DOOR
SLOT
SNORKEL
SNOWMOBILE
SNOWPLOW
SOAP DISPENSER
SOCCER BALL
SOCK
SOLAR DISH
SOMBRERO
SOUP BOWL
SPACE BAR
SPACE HEATER
SPACE SHUTTLE
SPATULA
SPEEDBOAT
SPIDER WEB
SPINDLE
SPORTS CAR
SPOTLIGHT
STAGE
STEAM LOCOMOTIVE
STEEL ARCH BRIDGE
STEEL DRUM
STETHOSCOPE
STOLE
STONE WALL
STOPWATCH
STOVE
STRAINER
STREETCAR
STRETCHER
STUDIO COUCH
STUPA
SUBMARINE
SUIT
SUNDIAL
SUNGLASS
SUNGLASSES
SUNSCREEN
SUSPENSION BRIDGE
SWAB
SWEATSHIRT
SWIMMING TRUNKS
SWING
SWITCH
SYRINGE
TABLE LAMP
TANK
TAPE PLAYER
TEAPOT
TEDDY
TELEVISION
TENNIS BALL
THATCH
THEATER CURTAIN
THIMBLE
THRESHER
THRONE
TILE ROOF
TOASTER
TOBACCO SHOP
TOILET SEAT
TORCH
TOTEM POLE
TOW TRUCK
TOYSHOP
TRACTOR
TRAILER TRUCK
TRAY
TRENCH COAT
TRICYCLE
TRIMARAN
TRIPOD
TRIUMPHAL ARCH
TROLLEYBUS
TROMBONE
TUB
TURNSTILE
TYPEWRITER KEYBOARD
UMBRELLA
UNICYCLE
UPRIGHT
VACUUM
VASE
VAULT
VELVET
VENDING MACHINE
VESTMENT
VIADUCT
VIOLIN
VOLLEYBALL
WAFFLE IRON
WALL CLOCK
WALLET
WARDROBE
WARPLANE
WASHBASIN
WASHER
WATER BOTTLE
WATER JUG
WATER TOWER
WHISKEY JUG
WHISTLE
WIG
WINDOW SCREEN
WINDOW SHADE
WINDSOR TIE
WINE BOTTLE
WING
WOK
WOODEN SPOON
WOOL
WORM FENCE
WRECK
YAWL
YURT
WEB SITE
COMIC BOOK
CROSSWORD PUZZLE
STREET SIGN
TRAFFIC LIGHT
BOOK JACKET
MENU
PLATE
GUACAMOLE
CONSOMME
HOT POT
TRIFLE
ICE CREAM
ICE LOLLY
FRENCH LOAF
BAGEL
PRETZEL
CHEESEBURGER
HOTDOG
MASHED POTATO
HEAD CABBAGE
BROCCOLI
CAULIFLOWER
ZUCCHINI
SPAGHETTI SQUASH
ACORN SQUASH
BUTTERNUT SQUASH
CUCUMBER
ARTICHOKE
BELL PEPPER
CARDOON
MUSHROOM
GRANNY SMITH
STRAWBERRY
ORANGE
LEMON
FIG
PINEAPPLE
BANANA
JACKFRUIT
CUSTARD APPLE
POMEGRANATE
HAY
CARBONARA
CHOCOLATE SAUCE
DOUGH
MEAT LOAF
PIZZA
POTPIE
BURRITO
RED WINE
ESPRESSO
CUP
EGGNOG
ALP
BUBBLE
CLIFF
CORAL REEF
GEYSER
LAKESIDE
PROMONTORY
SANDBAR
SEASHORE
VALLEY
VOLCANO
BALLPLAYER
GROOM
SCUBA DIVER
RAPESEED
DAISY
LADY SLIPPER
CORN
ACORN
HIP
BUCKEYE
CORAL FUNGUS
AGARIC
GYROMITRA
STINKHORN
EARTHSTAR
HEN-OF-THE-WOODS
BOLETE
EAR
TOILET TISSUE