In [1]:
!pip install sagemaker boto3 --upgrade  --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
awscli 1.27.150 requires botocore==1.29.150, but you have botocore 1.31.1 which is incompatible.
awscli 1.27.150 requires PyYAML<5.5,>=3.10, but you have pyyaml 6.0 which is incompatible.[0m[31m
[0m

In [2]:
import sagemaker
import jinja2
from sagemaker import image_uris
import boto3
import os
import time
import json
from pathlib import Path
from sagemaker.utils import name_from_base

In [3]:
# Shared configuration: IAM role, SageMaker session, S3 locations and AWS clients
# used by the cells below.
role = sagemaker.get_execution_role()  # execution role for the endpoint
sess = sagemaker.session.Session()  # sagemaker session for interacting with different AWS APIs

# Both names refer to the session's default bucket; look it up once and reuse it.
bucket = sess.default_bucket()  # bucket to house artifacts
model_bucket = bucket
s3_code_prefix_deepspeed = "hf-large-model-djl-/code_falcon7b/deepspeed"  # folder within bucket where code artifact will go

# Read the region through the session's public attribute instead of the private
# `_region_name` field, which is an implementation detail of the SDK.
region = sess.boto_region_name
account_id = sess.account_id()

s3_client = boto3.client("s3")
sm_client = boto3.client("sagemaker")
smr_client = boto3.client("sagemaker-runtime")

jinja_env = jinja2.Environment()

In [4]:
# Create the local directory whose contents are later packed into model.tar.gz
# (-p: no error if the directory already exists).
!mkdir -p code_falcon7b_deepspeed

In [5]:
# S3 URL of the fine-tuned model archive.
# NOTE(review): this value is only printed here; none of the later cells visible in this
# notebook read it (the `option.s3url` line in serving.properties is commented out).
# Plain string literal: there is nothing to interpolate, so no f-prefix is needed.
pretrained_model_location = "s3://sagemaker-us-east-1-303179550733/huggingface-peft-2023-07-06-08-45-07-2023-07-06-08-45-14-350/output/model.tar.gz"
print(f"Pretrained model will be downloaded from ---- > {pretrained_model_location}")

Pretrained model will be downloaded from ---- > s3://sagemaker-us-east-1-303179550733/huggingface-peft-2023-07-06-08-45-07-2023-07-06-08-45-14-350/output/model.tar.gz


In [8]:
%%writefile ./code_falcon7b_deepspeed/serving.properties
# Serving configuration packaged next to model.py in model.tar.gz.
engine=DeepSpeed
# model.py reads properties["model_id"] and passes it to from_pretrained;
# presumably that key is populated from this option -- TODO confirm.
option.model_id=tiiuae/falcon-7b
# model.py reads properties["tensor_parallel_degree"] and forwards it to
# deepspeed.init_inference as mp_size; presumably populated from this option -- TODO confirm.
option.tensor_parallel_degree=1
# Disabled: unrendered template placeholder for an S3 model location.
#option.s3url = {{s3url}}

Overwriting ./code_falcon7b_deepspeed/serving.properties


In [9]:
%%writefile ./code_falcon7b_deepspeed/requirements.txt
# Extra Python packages shipped with the model code in model.tar.gz.
einops
torch==2.0.1
# NOTE(review): DeepSpeed is installed from the "falcon" ref of this GitHub repository.
# The ref looks like a branch name rather than a pinned commit, so the installed code
# may change between builds -- TODO confirm and pin to a commit hash if so.
git+https://github.com/lanking520/DeepSpeed.git@falcon

Overwriting ./code_falcon7b_deepspeed/requirements.txt


In [10]:
%%writefile ./code_falcon7b_deepspeed/model.py
import os
import warnings
from typing import Any, Dict, Optional, Tuple

from djl_python import Input, Output
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
import deepspeed

predictor = None


def get_model(properties):
    """Load the causal LM named in ``properties`` and wrap it in a text-generation pipeline.

    Args:
        properties: Mapping that must contain the keys ``"model_id"`` (forwarded to
            ``from_pretrained`` for both model and tokenizer) and
            ``"tensor_parallel_degree"`` (forwarded to ``deepspeed.init_inference``).

    Returns:
        A ``transformers`` text-generation pipeline built on the DeepSpeed-wrapped
        model, using the device index taken from the ``LOCAL_RANK`` environment
        variable (``0`` when unset).

    Raises:
        KeyError: If a required key is missing from ``properties``.
        ValueError: If ``tensor_parallel_degree`` cannot be converted to ``int``.
    """
    model_name = properties["model_id"]
    # The value may arrive as a string (e.g. "1"); int() accepts both str and int, so
    # the parallel degree is always handed to DeepSpeed as a real integer.
    tensor_parallel_degree = int(properties["tensor_parallel_degree"])
    local_rank = int(os.getenv("LOCAL_RANK", "0"))

    model = AutoModelForCausalLM.from_pretrained(
        model_name, low_cpu_mem_usage=True, trust_remote_code=True, torch_dtype=torch.bfloat16
    )
    model = deepspeed.init_inference(model, mp_size=tensor_parallel_degree)
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    generator = pipeline(
        task="text-generation", model=model, tokenizer=tokenizer, device=local_rank
    )
    return generator


def handle(inputs: Input) -> Optional[Output]:
    """Entry point called for each request: generate text for the given prompt.

    The module-level ``predictor`` is created on the first call and reused afterwards.

    Args:
        inputs: Request object. Its JSON body must contain ``"text"`` (the prompt)
            and ``"text_length"`` (used as both ``min_length`` and ``max_length``).

    Returns:
        ``None`` for an empty (warm-up) request, otherwise an ``Output`` holding the
        pipeline result.

    Raises:
        KeyError: If ``"text"`` or ``"text_length"`` is missing from the JSON body.
    """
    global predictor
    # Explicit None check: only the "not yet loaded" sentinel should trigger a load,
    # independent of how the pipeline object defines truthiness.
    if predictor is None:
        predictor = get_model(inputs.get_properties())

    if inputs.is_empty():
        # Model server makes an empty call to warmup the model on startup
        return None

    data = inputs.get_as_json()
    text = data["text"]
    text_length = data["text_length"]
    # min_length == max_length: the pipeline is asked for one fixed length.
    result = predictor(text, do_sample=True, min_length=text_length, max_length=text_length)
    return Output().add(result)

Writing ./code_falcon7b_deepspeed/model.py


In [11]:
# Start from a clean slate: drop any previous archive and the Jupyter checkpoint
# folder so neither ends up inside the new tarball.
!rm -f model.tar.gz
!rm -rf code_falcon7b_deepspeed/.ipynb_checkpoints
# Archive the *contents* of the code directory (-C changes into it first), so the
# files sit at the root of model.tar.gz.
!tar czvf model.tar.gz -C code_falcon7b_deepspeed .
# Upload the archive under the code prefix; the returned S3 URI is used as ModelDataUrl later.
s3_code_artifact_deepspeed = sess.upload_data("model.tar.gz", bucket, s3_code_prefix_deepspeed)
print(f"S3 Code or Model tar for deepspeed uploaded to --- > {s3_code_artifact_deepspeed}")

./
./model.py
./serving.properties
./requirements.txt
S3 Code or Model tar for deepspeed uploaded to --- > s3://sagemaker-us-east-1-303179550733/hf-large-model-djl-/code_falcon7b/deepspeed/model.tar.gz


In [12]:
# Resolve the DJL DeepSpeed inference container through the SageMaker SDK instead of
# hard-coding the registry account ID, so the lookup also works in regions whose
# container registry lives in a different account.
inference_image_uri = image_uris.retrieve(
    framework="djl-deepspeed", region=region, version="0.22.1"
)
print(f"Image going to be used is ---- > {inference_image_uri}")

Image going to be used is ---- > 763104351884.dkr.ecr.us-east-1.amazonaws.com/djl-inference:0.22.1-deepspeed0.9.2-cu118


In [13]:
# Derive a unique model name from the base name (plain literal: nothing to interpolate).
model_name_ds = name_from_base("falcon7b-model-ds")
print(model_name_ds)

falcon7b-model-ds-2023-07-08-11-03-30-763


In [14]:
# Container definition: inference image plus the uploaded code/model archive.
primary_container = {
    "Image": inference_image_uri,
    "ModelDataUrl": s3_code_artifact_deepspeed,
}

# Register the SageMaker model and keep its ARN for reference.
create_model_response = sm_client.create_model(
    ModelName=model_name_ds,
    ExecutionRoleArn=role,
    PrimaryContainer=primary_container,
)
model_arn = create_model_response["ModelArn"]

print(f"Created Model: {model_arn}")

Created Model: arn:aws:sagemaker:us-east-1:303179550733:model/falcon7b-model-ds-2023-07-08-11-03-30-763


In [15]:
# Alias for the model created above; the endpoint-config and endpoint names are derived from it.
model_name = model_name_ds
print(f"Building EndpointConfig and Endpoint for: {model_name}")

Building EndpointConfig and Endpoint for: falcon7b-model-ds-2023-07-08-11-03-30-763


In [16]:
# Names for the endpoint configuration and the endpoint, both derived from the model name.
endpoint_config_name = f"{model_name}-config"
endpoint_name = f"{model_name}-endpoint"

# Single production variant: one ml.g5.2xlarge instance with one-hour limits for
# model download and container start-up health checks.
production_variant = {
    "VariantName": "variant1",
    "ModelName": model_name,
    "InstanceType": "ml.g5.2xlarge",
    "InitialInstanceCount": 1,
    "ModelDataDownloadTimeoutInSeconds": 3600,
    "ContainerStartupHealthCheckTimeoutInSeconds": 3600,
}

endpoint_config_response = sm_client.create_endpoint_config(
    EndpointConfigName=endpoint_config_name,
    ProductionVariants=[production_variant],
)
endpoint_config_response

{'EndpointConfigArn': 'arn:aws:sagemaker:us-east-1:303179550733:endpoint-config/falcon7b-model-ds-2023-07-08-11-03-30-763-config',
 'ResponseMetadata': {'RequestId': '51da8451-7200-407e-a74e-a0d5cae30175',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '51da8451-7200-407e-a74e-a0d5cae30175',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '129',
   'date': 'Sat, 08 Jul 2023 11:03:56 GMT'},
  'RetryAttempts': 0}}

In [17]:
# Start endpoint creation; endpoint_name is already a string, so it is passed directly
# instead of being wrapped in a redundant f-string.
create_endpoint_response = sm_client.create_endpoint(
    EndpointName=endpoint_name, EndpointConfigName=endpoint_config_name
)
print(f"Created Endpoint: {create_endpoint_response['EndpointArn']}")

Created Endpoint: arn:aws:sagemaker:us-east-1:303179550733:endpoint/falcon7b-model-ds-2023-07-08-11-03-30-763-endpoint


In [18]:
# Poll once a minute until the endpoint leaves the "Creating" state.
# (`time` is imported in the notebook's import cell, so it is not re-imported here.)
resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
status = resp["EndpointStatus"]
print("Status: " + status)

while status == "Creating":
    time.sleep(60)
    resp = sm_client.describe_endpoint(EndpointName=endpoint_name)
    status = resp["EndpointStatus"]
    print("Status: " + status)

print("Arn: " + resp["EndpointArn"])
print("Status: " + status)

# Fail loudly if creation did not succeed, instead of letting the invocation cell
# further down hit an endpoint that is not in service.
if status != "InService":
    failure_reason = resp.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Endpoint {endpoint_name} ended in status {status}: {failure_reason}"
    )

Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: Creating
Status: InService
Arn: arn:aws:sagemaker:us-east-1:303179550733:endpoint/falcon7b-model-ds-2023-07-08-11-03-30-763-endpoint
Status: InService


In [19]:
%%time

response_model = smr_client.invoke_endpoint(
    EndpointName=endpoint_name,
    Body=json.dumps({"text": "What is the purpose of life?", "text_length": 150}),
    ContentType="application/json",
)

response_model["Body"].read().decode("utf8")

CPU times: user 20 ms, sys: 785 µs, total: 20.7 ms
Wall time: 9.3 s


'[\n  {\n    "generated_text":"What is the purpose of life?\\nIn the next few posts we are going to deal with the question: “what is the purpose of life?” I believe as we search for the answer to this question we are looking for an eternal, eternal purpose that will be an expression of God’s glory, that is, we want God’s purpose to be our purpose. But, what is an eternal purpose? Is this a contradiction?\\nThe word eternal in English means, “without beginning or ending in time.” So we see that in the phrase eternal purpose there is no beginning or ending. So if we want to go to the beginning we want to know what God was doing before there was anything in the universe. “What"\n  }\n]'

In [20]:
# Display the name of the endpoint created above.
endpoint_name

'falcon7b-model-ds-2023-07-08-11-03-30-763-endpoint'