In [1]:
from sagemaker.huggingface import HuggingFaceModel
import sagemaker
role = sagemaker.get_execution_role()

ENV = {
    "HF_MODEL_ID": "gpt2",
    "HF_TASK": "text-generation",
}

model = HuggingFaceModel(
    env=ENV, role=role,
    transformers_version="4.49.0", pytorch_version="2.6.0", py_version="py312",
)

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/ec2-user/.config/sagemaker/config.yaml


In [2]:
print("starting to deploy the model")
ENDPOINT_NAME = "genai-gpt2-api"
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name=ENDPOINT_NAME)
print("model has been deployed")

starting to deploy the model
------!model has been deployed


In [3]:
predictor.predict({
  "inputs": "The future of artificial intelligence is",
  "parameters": {
    "max_new_tokens": 80,
    "temperature": 0.7,
    "top_p": 0.9,
    "do_sample": True,
    "repetition_penalty": 1.1
  }
})


[{'generated_text': 'The future of artificial intelligence is very bright and we are looking at a time when it\'s more likely that the next generation will be able to tackle some major problems such as climate change, diseases like AIDS and cancer.\n"It would seem quite an extraordinary idea for AI," he said. "If you can\'t get people involved in science then maybe there isn\'n going any sense."'}]

In [4]:
import time, json, statistics, boto3
smrt = boto3.client("sagemaker-runtime")
PROMPT = "Hello, today I want to talk about"
PARAMS = {"max_new_tokens": 50, "temperature": 0.7, "top_p": 0.9, "do_sample": True}

def once():
    t0 = time.time()
    r = smrt.invoke_endpoint(
        EndpointName=ENDPOINT_NAME,
        ContentType="application/json",
        Body=json.dumps({"inputs": PROMPT, "parameters": PARAMS})
    )
    _ = r["Body"].read()
    return (time.time() - t0) * 1000.0

# warmup
for _ in range(3): once()

N = 10
times = sorted(once() for _ in range(N))
print(json.dumps({
    "count": N,
    "p50_ms": times[N//2],
    "p95_ms": times[int(0.95*N)-1],
    "avg_ms": sum(times)/N
}, indent=2))


{
  "count": 10,
  "p50_ms": 2643.0187225341797,
  "p95_ms": 2687.7570152282715,
  "avg_ms": 2648.7853288650513
}


In [6]:
import boto3
sm = boto3.client("sagemaker")
ep = "genai-gpt2-api"
cfg = sm.describe_endpoint(EndpointName=ep)["EndpointConfigName"]
sm.delete_endpoint(EndpointName=ep)
sm.delete_endpoint_config(EndpointConfigName=cfg)
print("Deleted:", ep, cfg)


Deleted: genai-gpt2-api genai-gpt2-api
