## Setup

In [1]:
import numpy as np
import torch

import time
import json
import requests

import os
import io

import sagemaker
import boto3



In [2]:
from time import strftime,gmtime

In [4]:
from sagemaker import get_execution_role, Session, image_uris

# Default SageMaker session; the region and default bucket below are read from it.
sess = Session()
role = get_execution_role()

region = sess.boto_region_name
bucket = sess.default_bucket()

# SageMaker API client pinned to the same region as the session.
sm_client = boto3.client(service_name="sagemaker", region_name=region)

In [5]:
# Plain boto3 session plus a "sagemaker-runtime" client built from it.
# NOTE(review): unlike sm_client, no region_name is passed here, so the client
# relies on the session's default region -- confirm that matches `region`.
boto_session = boto3.session.Session()
sm_runtime = boto_session.client("sagemaker-runtime")

### Create model archive

In [9]:
# File name of the tarball that is built and uploaded in the next cells.
model_archive_name = 'yolo5smodel.tar.gz'

In [10]:
!tar -cvzf {model_archive_name} model.pth code/

model.pth
code/
code/requirements.txt
code/inference.py
code/.ipynb_checkpoints/


In [11]:
# Upload the packaged tarball (model artifact + inference code) under the
# "model" key prefix and report the resulting location.
model_url = sess.upload_data(
    key_prefix='model',
    path=model_archive_name,
)
print('model uploaded to: {}'.format(model_url))

model uploaded to: s3://sagemaker-ca-central-1-333752261573/model/yolo5smodel.tar.gz


### Create model and test inference

In [12]:
from sagemaker.pytorch.model import PyTorchModel
from sagemaker.predictor import Predictor

# Framework and Python versions requested for the PyTorch serving container.
framework_version = '1.7.1'
py_version = 'py36'

# TorchServe settings handed to the container as environment variables.
# The request/response size limits are raised to 100,000,000 bytes because the
# ~70 MB input payload exceeds TorchServe's default request limit (about 6 MB).
env = {
    'TS_MAX_REQUEST_SIZE': '100000000',
    'TS_MAX_RESPONSE_SIZE': '100000000',
    'TS_DEFAULT_RESPONSE_TIMEOUT': '1000',
}

# Model definition: uploaded archive + inference.py entry point from ./code.
sm_model = PyTorchModel(
    model_data=model_url,
    role=role,
    entry_point='inference.py',
    source_dir='code',
    framework_version=framework_version,
    py_version=py_version,
    env=env,
    sagemaker_session=sess,
)

In [13]:
# Deploy the model to a real-time endpoint backed by one ml.g4dn.xlarge instance.
instance_type = 'ml.g4dn.xlarge'
uncompiled_predictor = sm_model.deploy(
    instance_type=instance_type,
    initial_instance_count=1,
)

--------!

### Invoke model

In [23]:
# Open the sample image in binary mode; the open handle itself is what gets
# passed as the request body further down.
# NOTE(review): the handle is never closed in the visible cells.
feed_data = io.open('test_images/bus.jpg', 'rb')

In [12]:
# Sanity check: the payload is a binary file handle (not raw bytes).
type(feed_data)

_io.BufferedReader

In [24]:
# Content type sent with the request; it has to match what the input handler
# in code/inference.py accepts.
# NOTE(review): 'application/x-image' is an assumption -- confirm against inference.py.
content_type = 'application/x-image'

# Rewind the file handle so that re-running this cell resends the whole image
# instead of an already-consumed (empty) stream.
feed_data.seek(0)

# Time one round trip to the endpoint through the sagemaker-runtime client.
t0 = time.time()
rv = sm_runtime.invoke_endpoint(
    EndpointName=uncompiled_predictor.endpoint_name,
    Body=feed_data,
    ContentType=content_type,
)
t1 = time.time()

# Elapsed wall-clock time in milliseconds.
time_elapsed = (t1-t0)*1000
print(time_elapsed)

3147.6593017578125


In [25]:
# Read the streamed response body, decode it to text and parse it as JSON.
response_text = rv['Body'].read().decode()
predictions = json.loads(response_text)

In [26]:
# Convert the parsed JSON (nested lists) to an ndarray and display its shape.
# NOTE(review): the recorded shape is (1, 25200, 85); presumably
# (batch, candidate boxes, box + objectness + class scores) -- confirm.
predictions_arr = np.asarray(predictions)
np.shape(predictions_arr)

(1, 25200, 85)