## Running a model with optimized inference by just providing the model id
Get started by reading the [LMI starting guide](https://docs.djl.ai/docs/serving/serving/docs/lmi/user_guides/starting-guide.html).

In [8]:
# Assumes SageMaker Python SDK is installed. For example: "pip install sagemaker"
import sagemaker
from sagemaker import image_uris, Model, Predictor
from sagemaker.serializers import JSONSerializer
from sagemaker.deserializers import JSONDeserializer

# Set up the IAM role and SageMaker session shared by the cells below
iam_role = sagemaker.get_execution_role()
sagemaker_session = sagemaker.session.Session()
# Read the region through the session's public `boto_region_name` property
# instead of the private `_region_name` attribute, which is an SDK
# implementation detail and not part of the supported interface.
region = sagemaker_session.boto_region_name

In [None]:
# Look up the LMI container image for this region. The same image serves the
# vLLM, LMI-Dist and HuggingFace Accelerate backends.
lmi_image_uri = image_uris.retrieve(
    framework="djl-lmi",
    version="0.28.0",
    region=region,
)

# Only the model id is supplied; LMI is left to choose the remaining
# deployment settings from the model architecture.
lmi_env = {"HF_MODEL_ID": "TheBloke/Llama-2-7B-fp16"}
model = Model(image_uri=lmi_image_uri, role=iam_role, env=lmi_env)

# Deploy the model to a uniquely named SageMaker endpoint. The Predictor used
# for inference requests is constructed separately from `endpoint_name`.
endpoint_name = sagemaker.utils.name_from_base("llama-7b-endpoint")
model.deploy(
    instance_type="ml.g5.2xlarge",
    initial_instance_count=1,
    endpoint_name=endpoint_name,
)

----

In [None]:
# Build a Predictor that sends and receives JSON against `endpoint_name`.
#
# To target an already-running deployment instead, set `endpoint_name` by hand
# before running this cell, e.g.
#   endpoint_name = "lmi-llama2-7b-2024-07-05-15-34-11-247-endpoint"
# and, when the endpoint hosts inference components, also add
#   component_name="lmi-llama2-7b-1720193833-0851-inference-component"
# to the keyword arguments below.
predictor_kwargs = {
    "endpoint_name": endpoint_name,
    "sagemaker_session": sagemaker_session,
    "serializer": JSONSerializer(),
    "deserializer": JSONDeserializer(),
}
predictor = Predictor(**predictor_kwargs)

In [None]:
# Send a text-generation request to the llama2-7b endpoint.
# Sampling is enabled, so the completion can differ between runs.
generation_parameters = {
    "do_sample": True,
    "max_new_tokens": 256,
}
request_payload = {
    "inputs": "The diamondback terrapin was the first reptile to be",
    "parameters": generation_parameters,
}
outputs = predictor.predict(request_payload)

# Display the deserialized JSON response as the cell output
outputs