## Imports

In [1]:
import numpy as np
import torch

import time
from time import strftime,gmtime

import json
import requests
import boto3

import os
import io

import sagemaker
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.predictor import Predictor

In [4]:
from sagemaker import get_execution_role, Session, image_uris

# SageMaker session plus the values derived from it that later cells reuse.
sess = Session()
role = get_execution_role()  # IAM role the notebook runs under
bucket = sess.default_bucket()
region = sess.boto_region_name

# Control-plane SageMaker client pinned to the session's region.
sm_client = boto3.client("sagemaker", region_name=region)

In [5]:
# Separate boto3 session used to build the "sagemaker-runtime" client,
# which is the client that exposes the endpoint invocation calls.
boto_session = boto3.session.Session()
sm_runtime = boto_session.client(service_name="sagemaker-runtime")

### Create model archive

In [8]:
# File name of the gzipped tarball that the next cell creates and a later cell uploads.
model_archive_name = "yolo5smodel-video.tar.gz"

In [9]:
# Pack model.pth into a gzip-compressed tar archive named by model_archive_name (verbose output).
!tar -cvzf {model_archive_name} model.pth 

model.pth


In [None]:
# Upload the model archive through the SageMaker session under the 'model' key prefix,
# then report the location returned by the upload call.
model_url = sess.upload_data(
    path=model_archive_name,
    key_prefix='model',
)
print(f'model uploaded to: {model_url}')

### Deploy Realtime Endpoint

In [11]:
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.predictor import Predictor

# Container selection: PyTorch framework version and Python version.
framework_version = '1.7.1'
py_version = 'py36'

# Environment variables handed to the serving container.
# Per the original author's note, TorchServe's default max request size is about
# 6 MB, which is too small for the ~70 MB input payload, so the limits are raised.
# NOTE(review): the timeout unit is presumably seconds; confirm against TorchServe docs.
env = dict(
    TS_MAX_REQUEST_SIZE='1000000000',
    TS_MAX_RESPONSE_SIZE='1000000000',
    TS_DEFAULT_RESPONSE_TIMEOUT='1000',
)

# Model definition: uploaded artifact plus the inference entry point found in ./code.
sm_model = PyTorchModel(
    model_data=model_url,
    role=role,
    sagemaker_session=sess,
    entry_point='inference.py',
    source_dir='code',
    framework_version=framework_version,
    py_version=py_version,
    env=env,
)

In [12]:
# Deploy the model to a real-time endpoint backed by a single instance of this type.
instance_type = 'ml.g4dn.xlarge'
uncompiled_predictor = sm_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
)

--------!

# Invoke model

In [None]:
# Read the test image fully into memory as bytes.
# The original kept an open file object: it was never closed, and once the
# stream had been consumed by one request, re-running the invoke cell could
# send an empty body. Bytes are accepted as the request body and are reusable.
with io.open('test_images/bus.jpg', 'rb') as image_file:
    feed_data = image_file.read()

In [None]:
# Quick sanity check: display the Python type of the request payload object.
type(feed_data)

In [None]:
# MIME type sent with the request. It was previously undefined (NameError).
# NOTE(review): the value must match what code/inference.py's input handler
# expects; 'application/x-image' is an assumption, so confirm before relying on it.
content_type = 'application/x-image'

# Endpoint invocation lives on the "sagemaker-runtime" client (sm_runtime);
# the control-plane "sagemaker" client (sm_client) has no invoke_endpoint method.
t0 = time.perf_counter()  # monotonic clock, suited to measuring elapsed time
rv = sm_runtime.invoke_endpoint(
    EndpointName=uncompiled_predictor.endpoint_name,
    Body=feed_data,
    ContentType=content_type,
)
t1 = time.perf_counter()

# Round-trip latency in milliseconds.
time_elapsed = (t1 - t0) * 1000
print(time_elapsed)

In [None]:
# Drain the streaming response body, decode it to text and parse it as JSON.
raw_body = rv['Body'].read()
predictions = json.loads(raw_body.decode())

In [None]:
# Convert the parsed predictions to a NumPy array and show its dimensions.
predictions_arr = np.asarray(predictions)
predictions_arr.shape

# Deploy Async Endpoint

In [None]:
# Output bucket for the async inference results.
# An empty placeholder here would produce the invalid output path "s3:///output",
# so default to the session's bucket. Assign your own bucket name to override.
bucket = sess.default_bucket()

In [13]:
from sagemaker.async_inference.async_inference_config import AsyncInferenceConfig

# Async inference settings: results are written under the "output" prefix of
# `bucket`, with at most two concurrent invocations per instance.
async_output_path = f"s3://{bucket}/output"
async_config = AsyncInferenceConfig(
    output_path=async_output_path,
    max_concurrent_invocations_per_instance=2,
)

In [16]:
# Fresh model object for the async endpoint, configured like the real-time one.
framework_version, py_version = '1.7.1', 'py36'

# Serving-container environment. Per the original author's note, TorchServe's
# default max request size (about 6 MB) cannot take the ~70 MB input payload.
# NOTE(review): the timeout unit is presumably seconds; confirm against TorchServe docs.
payload_limit = '1000000000'
env = {
    'TS_MAX_REQUEST_SIZE': payload_limit,
    'TS_MAX_RESPONSE_SIZE': payload_limit,
    'TS_DEFAULT_RESPONSE_TIMEOUT': '1000',
}

model_kwargs = {
    'model_data': model_url,
    'framework_version': framework_version,
    'py_version': py_version,
    'role': role,
    'sagemaker_session': sess,
    'entry_point': 'inference.py',
    'source_dir': 'code',
    'env': env,
}
sm_model = PyTorchModel(**model_kwargs)

In [17]:
# Deploy the model as an async endpoint (single instance) using the async config.
instance_type = 'ml.g4dn.xlarge'

async_uncompiled_predictor = sm_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
    async_inference_config=async_config,
)

--------!

# Test endpoint

In [18]:
# Upload input to S3
def upload_file(input_location, bucket=None, prefix="input"):
    """Upload a local file through the SageMaker session and return the upload result.

    Args:
        input_location: Local path of the file to upload.
        bucket: Name of the destination (input) bucket. When None or empty,
            no bucket is passed on, so the session falls back to its default bucket.
        prefix: S3 key prefix to upload under (defaults to "input").

    Returns:
        The value returned by ``sess.upload_data`` (the S3 URI of the upload).
    """
    return sess.upload_data(
        input_location,
        bucket=bucket or None,  # normalise '' to None so the default bucket applies
        key_prefix=prefix,
    )

In [19]:
# Local path of the video to run inference on. Fill this in before running.
input_1_location = ""

# Push the video to S3; the returned location is what the async endpoint reads from.
input_1_s3_location = upload_file(input_location=input_1_location)

### Invoke async endpoint

In [21]:
# Use the name of the async endpoint deployed above instead of a hard-coded
# name from an earlier run, which will not exist after a fresh deploy.
# To target an already-running endpoint, assign its name here instead.
endpoint_name = async_uncompiled_predictor.endpoint_name

In [None]:
# Queue an async inference request that points at the uploaded input in S3.
# The response carries the S3 location where the result will be written.
response = sm_runtime.invoke_endpoint_async(
    InputLocation=input_1_s3_location,
    EndpointName=endpoint_name,
)
output_location = response['OutputLocation']
print(f"OutputLocation: {output_location}")