# Standard instructions for using the LMI container on SageMaker
In this tutorial, you will deploy an LMI (Large Model Inference) container from the AWS Deep Learning Containers (DLC) to SageMaker and run inference with it.

Please make sure the following permissions are granted before running the notebook:

- S3 bucket push access
- SageMaker access

## Step 1: Upgrade the SageMaker SDK and import dependencies

In [None]:
# Upgrade the SageMaker SDK, boto3 and the AWS CLI (the CLI is used later for the `aws s3 cp` uploads).
%pip install sagemaker boto3 awscli --upgrade  --quiet
# huggingface_hub provides snapshot_download, which is used to fetch the model checkpoints.
%pip install huggingface_hub

In [3]:
import boto3
import sagemaker
from sagemaker import Model, image_uris, serializers, deserializers

# Execution role that the endpoint will assume
role = sagemaker.get_execution_role()
# SageMaker session used for interacting with the different AWS APIs
sess = sagemaker.session.Session()
# Region of the current environment; read through the public boto session
# (same accessor used when retrieving the container image) instead of the
# private `_region_name` attribute.
region = sess.boto_session.region_name
# Account id of the current environment
account_id = sess.account_id()
# Default S3 bucket; the model checkpoints and serving code are uploaded here
bucket = sess.default_bucket()

  from pandas.core.computation.check import NUMEXPR_INSTALLED


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


## Step 2: Start preparing model artifacts
In the LMI container, we expect some artifacts to help set up the model:
- serving.properties (required): Defines the model server settings
- model.py (optional): A Python file that defines the core inference logic
- requirements.txt (optional): Any additional pip packages that need to be installed

In [4]:
import os

# function to download model and upload to S3
def download_model(bucket, model_id, commit_hash=None):
    """Download a Hugging Face model snapshot and copy it to S3.

    The snapshot is cached locally under ``LLM_<model_id with '/' -> '_'>_model``
    and uploaded to ``s3://<bucket>/LLM/<that folder name>``.

    Args:
        bucket: Name of the destination S3 bucket.
        model_id: Hugging Face repo id, e.g. ``"facebook/opt-350m"``.
        commit_hash: Optional revision, passed to ``snapshot_download`` as
            ``revision``.

    Returns:
        The S3 URI the snapshot was uploaded to.

    Raises:
        subprocess.CalledProcessError: If the ``aws s3 cp`` upload fails.
    """
    import subprocess
    from pathlib import Path

    from huggingface_hub import snapshot_download

    local_model_folder_name = f"LLM_{model_id.replace('/', '_')}_model"
    s3_model_prefix = f"LLM/{local_model_folder_name}"  # folder where model checkpoint will go

    local_model_path = Path(local_model_folder_name)
    local_model_path.mkdir(exist_ok=True)

    # snapshot_download returns the folder of the downloaded revision, so use it
    # directly instead of globbing the cache (which fails when nothing matches
    # and may pick a different revision when several are cached).
    model_snapshot_path = Path(
        snapshot_download(
            repo_id=model_id,
            revision=commit_hash,
            cache_dir=local_model_path,
            allow_patterns=["*.md", "*.json", "*.bin", "*.txt"],
        )
    )

    print(f"model_snapshot_path: {model_snapshot_path}")

    s3_uri = f"s3://{bucket}/{s3_model_prefix}"
    # Argument list (no shell) avoids quoting issues; check=True surfaces upload failures.
    subprocess.run(
        ["aws", "s3", "cp", "--recursive", str(model_snapshot_path), s3_uri],
        check=True,
    )
    return s3_uri


In [None]:
# Hugging Face repos to fetch and stage in S3
hf_model_ids = ("facebook/opt-350m", "bigscience/bloomz-560m")

for hf_model_id in hf_model_ids:
    download_model(bucket, model_id=hf_model_id)


In [40]:
%%writefile model.py
from djl_python import Input, Output
import os
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import logging

# Module-level placeholder; not referenced by the functions in this file.
predictor = None
# Maps a model folder name to its text-generation pipeline; filled by init_models().
models = {}

def my_predict(data):
    """Run the prompt through both loaded pipelines with sampling enabled.

    Returns a dict with the OPT output under 'opt' and the BLOOMZ output
    under 'bloom'.
    """
    # Resolve both pipelines up front so a missing model fails before any generation runs.
    opt_generator, bloom_generator = (
        models['LLM_facebook_opt-350m_model'],
        models['LLM_bigscience_bloomz-560m_model'],
    )

    outputs = {}
    outputs['opt'] = opt_generator(data, do_sample=True)
    outputs['bloom'] = bloom_generator(data, do_sample=True)
    return outputs

def init_models(properties):
    """Load all models and register a text-generation pipeline for each.

    Args:
        properties: Mapping read with ``.get``; ``device_id`` selects the CUDA
            device and ``model_id`` is the base directory that contains one
            sub-folder per model.

    Side effects:
        Stores one pipeline per model folder name in the module-level
        ``models`` dict.
    """
    device_id = properties.get('device_id')
    model_base_id = properties.get('model_id')

    for model_id in ["LLM_facebook_opt-350m_model", "LLM_bigscience_bloomz-560m_model"]:
        local_model_dir = f'{model_base_id}/{model_id}'

        model = AutoModelForCausalLM.from_pretrained(local_model_dir, torch_dtype=torch.float16)
        tokenizer = AutoTokenizer.from_pretrained(local_model_dir)
        # Hand the already-loaded fp16 model to the pipeline. Passing the
        # directory instead would load the weights a second time and ignore
        # the requested dtype.
        generator = pipeline(
            task='text-generation',
            model=model,
            tokenizer=tokenizer,
            device=f'cuda:{device_id}',  # pin this worker's copy to its GPU
        )

        models[model_id] = generator

def handle(inputs: Input) -> "Output | None":
    """Entry point invoked by the model server for every request.

    An empty request is treated as the warmup call: the models are loaded
    from the request properties and ``None`` is returned. Otherwise the JSON
    body must contain a ``'prompt'`` key; the prompt is run through
    ``my_predict`` and the result is returned together with the request
    properties.

    Args:
        inputs: The incoming request.

    Returns:
        ``None`` for the warmup call, otherwise an ``Output`` holding
        ``{'ipt_properties': <request properties>, 'r': <prediction>}``.

    Raises:
        Exception: Any error raised during prediction is logged and re-raised.
    """
    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        properties = inputs.get_properties()
        # %s placeholder is required: extra args without one make logging
        # report a formatting error instead of the message.
        logging.info('init models with properties: %s', properties)
        init_models(properties)
        return None

    data = inputs.get_as_json()['prompt']
    try:
        result = my_predict(data)
    except Exception:
        # Log with traceback, then re-raise unchanged so the server sees the failure.
        logging.exception('prediction failed')
        raise

    result = {'ipt_properties': inputs.get_properties(), 'r': result}

    return Output().add(result)


Overwriting model.py


In [6]:
# Write serving.properties. This config loads a model copy on each GPU card,
# i.e. model.py runs with a different device_id per copy.
serving_properties = f"engine=Python\noption.model_id=s3://{bucket}/LLM/\n"

with open('serving.properties', 'w') as properties_file:
    properties_file.write(serving_properties)

## Step 3: Start building SageMaker endpoint
In this step, we will build a SageMaker endpoint from scratch.

### Getting the container image URI

In [38]:
# Look up the DJL DeepSpeed inference container image for the session's region.
container_region = sess.boto_session.region_name
image_uri = image_uris.retrieve(framework="djl-deepspeed", region=container_region, version="0.26.0")
image_uri

'763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.26.0-deepspeed0.12.6-cu121'

### Upload artifact on S3 and create SageMaker model

In [41]:
! tar -czvf model.tar.gz model.py requirements.txt serving.properties

model.py
requirements.txt
serving.properties


In [None]:
# upload model data to S3
model_s3_path = f"s3://{bucket}/LLM-code/multi-models/0406/model.tar.gz"
!aws s3 cp model.tar.gz $model_s3_path

In [54]:
# SageMaker model that pairs the container image with the uploaded serving code archive
model = Model(
    role=role,
    image_uri=image_uri,
    model_data=model_s3_path,
)

## Step 4: Create SageMaker endpoint

You need to specify the instance type to use and the endpoint name.

In [55]:
# g5.12xlarge instance has 4 GPUs
instance_type = "ml.g5.12xlarge"
endpoint_name = sagemaker.utils.name_from_base("lmi-multi-model")

# Deployment settings; container_startup_health_check_timeout=3600 can optionally be added here.
deploy_settings = dict(
    initial_instance_count=1,
    instance_type=instance_type,
    endpoint_name=endpoint_name,
)
model.deploy(**deploy_settings)

# Requests and responses are JSON, so the predictor gets a JSON serializer and deserializer.
json_serializer = serializers.JSONSerializer()
json_deserializer = deserializers.JSONDeserializer()
predictor = sagemaker.Predictor(
    endpoint_name=endpoint_name,
    sagemaker_session=sess,
    serializer=json_serializer,
    deserializer=json_deserializer,
)

----------------!

## Step 5: Test and benchmark the inference

Note the `device_id` in `ipt_properties`: it shows which GPU's model copy served each request, so it can differ between calls.

In [57]:
# Send the same prompt four times so the responses can be compared across calls.
payload = {"prompt": "Large model inference is"}
for _ in range(4):
    print(predictor.predict(payload))

{'ipt_properties': {'Accept': 'application/json', 'Content-Length': '38', 'Content-Type': 'application/json', 'handler': 'handle', 'Host': '169.254.180.2:8080', 'User-Agent': 'AHC/2.0', 'device_id': '0', 'model_dir': '/opt/ml/model'}, 'r': {'opt': [{'generated_text': "Large model inference is supported to help understand the model's complexity. Large models are widely used in scientific"}], 'bloom': [{'generated_text': 'Large model inference is a method that yields large estimators for the parameters of a given model; with'}]}}
{'ipt_properties': {'Accept': 'application/json', 'Content-Length': '38', 'Content-Type': 'application/json', 'handler': 'handle', 'Host': '169.254.180.2:8080', 'User-Agent': 'AHC/2.0', 'device_id': '1', 'model_dir': '/opt/ml/model'}, 'r': {'opt': [{'generated_text': 'Large model inference is a complex and confusing problem, and our work shows that large models may be the'}], 'bloom': [{'generated_text': 'Large model inference is often used by scientists, who ar

## Clean up the environment

In [58]:
# Tear down in order: endpoint, then its endpoint config (same name), then the model.
for delete_resource in (sess.delete_endpoint, sess.delete_endpoint_config):
    delete_resource(endpoint_name)
model.delete_model()