# Create a custom SageMaker endpoint from a Hugging Face repository

### This notebook deploys the vision model [PaliGemma](https://huggingface.co/blog/Paligemma) to a custom real-time inference endpoint on SageMaker. Then it tests the model with a simple object-detection task on a test image.

## Deploy the model

In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel,
# so the upgraded packages are the ones the following cells import.
%pip install --upgrade huggingface_hub
%pip install --upgrade sagemaker

In [None]:
import sagemaker
import boto3
# Start from a default session so the default bucket can be looked up.
sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
# (leave as None to fall back to the session's default bucket)
sagemaker_session_bucket = None
if sess is not None and sagemaker_session_bucket is None:
    sagemaker_session_bucket = sess.default_bucket()

# Use the execution role reported by SageMaker; if that lookup raises
# ValueError, resolve the role ARN by name through IAM instead.
try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

# Recreate the session bound to the selected bucket.
sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(
    f"sagemaker role arn: {role}",
    f"sagemaker bucket: {sess.default_bucket()}",
    f"sagemaker session region: {sess.boto_region_name}",
    sep="\n",
)


In [None]:
# -p makes the cell re-runnable: no error when the directory already exists
!mkdir -p code

In [None]:
%%writefile code/requirements.txt
accelerate
bitsandbytes
# PaliGemma support ships in released transformers since 4.41.0; a release
# avoids building an unpinned development snapshot from git.
transformers>=4.41.0
Pillow

In [None]:
%%writefile code/inference.py
from transformers import AutoTokenizer, PaliGemmaForConditionalGeneration, PaliGemmaProcessor
import torch
import base64
from io import BytesIO
from PIL import Image

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_RGB_image_io(image_bytes):
    """Open raw encoded image bytes with PIL and return the image converted to RGB.

    Args:
        image_bytes: The encoded image file content (bytes).

    Returns:
        The decoded image after ``convert("RGB")``; no resizing is performed.
    """
    return Image.open(BytesIO(image_bytes)).convert("RGB")

def model_fn(model_dir):
    """Load the PaliGemma model and its processor from ``model_dir``.

    Presumably called once by the SageMaker inference toolkit at container
    start-up — confirm against the toolkit documentation.

    Args:
        model_dir: Directory containing the model and processor files.

    Returns:
        list: ``[model, processor]``; the model is loaded in bfloat16 and
        moved to ``device``.
    """
    paligemma = PaliGemmaForConditionalGeneration.from_pretrained(
        model_dir,
        torch_dtype=torch.bfloat16,
    )
    paligemma = paligemma.to(device)
    # The processor is loaded from the same directory as the weights.
    paligemma_processor = PaliGemmaProcessor.from_pretrained(model_dir)
    return [paligemma, paligemma_processor]


def predict_fn(data, model_process):
    """Run PaliGemma on one prompt/image pair and return the decoded text.

    Args:
        data: Mapping with the request fields:
            "prompt" - text prompt (defaults to an empty string);
            "image"  - the image file content encoded as base64.
        model_process: The ``[model, processor]`` pair returned by ``model_fn``.

    Returns:
        dict: ``{"response": <decoded generation>}``.

    Raises:
        ValueError: If "image" is missing or empty, so the caller gets a clear
            message instead of an image-decoding failure further down.
    """
    model, processor = model_process[0], model_process[1]

    # Read the prompt and the base64-encoded image from the request.
    prompt = data.get("prompt", "")
    image_b64 = data.get("image", "")
    if not image_b64:
        raise ValueError("Missing 'image' in request payload: expected a base64-encoded image")

    # Decode the image from base64 and normalise it to RGB.
    image_data = base64.b64decode(image_b64)
    input_image = get_RGB_image_io(image_data)

    inputs = processor(text=prompt, images=input_image, padding="longest", do_convert_rgb=True, return_tensors="pt").to(device)
    # Match the model's dtype (bfloat16 when loaded by model_fn).
    inputs = inputs.to(dtype=model.dtype)

    # Inference only: no gradients are needed.
    # NOTE(review): max_length presumably bounds prompt + generated tokens together — confirm against the generate() docs.
    with torch.no_grad():
        output = model.generate(**inputs, max_length=496)
    str_out = processor.decode(output[0], skip_special_tokens=True)
    return {"response": str_out}

In [None]:
from distutils.dir_util import copy_tree
from pathlib import Path
from huggingface_hub import snapshot_download
import random
import os
import shutil

HF_MODEL_ID="google/paligemma-3b-mix-224"
# you need to accept the Gemma terms and conditions at: https://huggingface.co/google/paligemma-3b-mix-224
# Prefer the HF_TOKEN environment variable so the secret is never saved in the notebook;
# replacing the placeholder below still works as a fallback.
HF_TOKEN = os.environ.get("HF_TOKEN", "YOUR_HF_TOKEN") # your hugging face token: https://huggingface.co/settings/tokens
# Reject both an empty token and the untouched placeholder.
assert HF_TOKEN and HF_TOKEN != "YOUR_HF_TOKEN", "Please set HF_TOKEN to your huggingface token. You can find it here: https://huggingface.co/settings/tokens"

# download snapshot (`token` replaces the deprecated `use_auth_token` argument)
snapshot_dir = snapshot_download(repo_id=HF_MODEL_ID, token=HF_TOKEN)

# create model dir
model_folder_name=f"model-{random.getrandbits(16)}"
model_tar = Path(model_folder_name)
model_tar.mkdir(exist_ok=True)

# copy snapshot to model dir; shutil.copytree follows the snapshot's symlinks by
# default, so real file contents are copied. It replaces distutils' copy_tree,
# which is no longer available from Python 3.12 on.
shutil.copytree(snapshot_dir, str(model_tar), dirs_exist_ok=True)

In [None]:
# Use this cell when the model was already downloaded in a previous session and the
# download cell above was skipped: it points model_tar at the existing folder
# (edit the folder name to match yours).
# If the download cell already ran in this kernel, model_tar is set and this cell
# leaves it untouched, so "Run All" does not redirect to a folder that does not exist.
from pathlib import Path

if "model_tar" not in globals():
    model_folder_name="model-45095"
    model_tar = Path(model_folder_name)

In [None]:
# copy code/ to model dir
# shutil is imported here so the cell also works on a fresh kernel where the
# download cell was skipped; shutil.copytree replaces distutils' copy_tree,
# which is no longer available from Python 3.12 on.
import shutil
from pathlib import Path
shutil.copytree("code/", str(model_tar.joinpath("code")), dirs_exist_ok=True)

In [None]:
# NOTE: the helper in this cell compresses on a single thread, which is too slow for a model the size of an LLM.
# Prefer the multi-threaded pigz-based tar command in the cells below, on a large instance (e.g. an m5.16xlarge).

import tarfile
import os

# helper to create the model.tar.gz
def compress(tar_dir=None, output_file="model.tar.gz"):
    """Pack the contents of ``tar_dir`` into a gzip-compressed tar archive.

    Each top-level entry of ``tar_dir`` is stored at the archive root, without
    the ``tar_dir`` prefix. The working directory is never changed, so a
    failure part-way through cannot leave the kernel in another directory.

    Args:
        tar_dir: Directory whose contents are archived.
        output_file: Archive path; a relative path is resolved against the
            current working directory.

    Raises:
        TypeError: If ``tar_dir`` is None.
        OSError: If ``tar_dir`` cannot be listed or the archive cannot be written.
    """
    if tar_dir is None:
        raise TypeError("tar_dir must be the path of the directory to archive, not None")

    archive_path = os.path.join(os.getcwd(), output_file)
    # List before opening the archive so a bad tar_dir does not leave an empty
    # archive behind; sorting makes the member order deterministic.
    entries = sorted(os.listdir(tar_dir))
    with tarfile.open(archive_path, "w:gz") as tar:
        for item in entries:
            print(item)
            tar.add(os.path.join(tar_dir, item), arcname=item)

# Archive the contents of the model folder into model.tar.gz in the current working directory.
compress(str(model_tar))


In [None]:
# pigz is required for the multi-threaded compression in the next cell.
# This command installs it into the notebook's conda environment.
!conda install -y pigz

In [None]:
# multi-threading compression with pigz
# `&&` ensures tar only runs if the cd succeeded, so a wrong folder name
# cannot archive the contents of the current directory by mistake.

!cd {model_folder_name} && tar cvf model.tar.gz --use-compress-program=pigz ./*

In [None]:
# Move the archive created by the pigz command out of the model folder into the current directory.

!mv ./{model_folder_name}/model.tar.gz ./

In [None]:
from sagemaker.s3 import S3Uploader

# Upload model.tar.gz under the "paligemma" prefix of the session's default bucket.
s3_destination = f"s3://{sess.default_bucket()}/paligemma"
s3_model_uri = S3Uploader.upload(
    local_path="model.tar.gz",
    desired_s3_uri=s3_destination,
)

print(f"model uploaded to: {s3_model_uri}")

In [None]:
from sagemaker.huggingface.model import HuggingFaceModel
import datetime

# Timestamp suffix (spaces, dots and colons replaced by dashes) so each
# deployment gets a distinct endpoint name.
timestamp = str(datetime.datetime.now())
for separator in (" ", ".", ":"):
    timestamp = timestamp.replace(separator, "-")

# Describe the model: uploaded archive plus the framework versions to use.
huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,        # path to your model and script
    role=role,                      # iam role with permissions to create an Endpoint
    transformers_version="4.37.0",  # transformers version used
    pytorch_version="2.1.0",        # pytorch version used
    py_version='py310',             # python version used
)

# Deploy a single-instance real-time endpoint.
endpoint_name = f"paligemma-{timestamp}"
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.g4dn.xlarge",
    endpoint_name=endpoint_name,
)


## Run the model

In [None]:
import base64
from PIL import Image
from io import BytesIO

# helper decoder
def decode_base64_image(image_string):
    """Decode a base64-encoded image and open it with PIL.

    Args:
        image_string: Base64 text (or bytes) of an encoded image file.

    Returns:
        The image object returned by ``Image.open``.
    """
    raw_bytes = base64.b64decode(image_string)
    return Image.open(BytesIO(raw_bytes))

In [None]:
# download a dog picture from picsum as test
# TLS certificate verification stays enabled (no --no-check-certificate) so the
# download cannot be silently intercepted or tampered with.
!wget https://picsum.photos/id/237/1000 -O test.jpg

In [None]:
# The request sent later is a JSON document carrying the image file's bytes as base64 text.
# NOTE(review): content_type is not referenced by any other visible cell — confirm whether it is still needed.
content_type = "application/json;jpeg"


input_img_path = './test.jpg'
# Read the raw file content; the image is not decoded here.
with open(input_img_path, "rb") as image_file:
    input_img_image_bytes = image_file.read()

# base64-encode the bytes and turn the result into a str.
encoded_input_image = base64.b64encode(input_img_image_bytes).decode()

In [None]:
from IPython.display import Image as DisplayImage

# Render the downloaded test image inline as a visual check.
DisplayImage(filename=input_img_path)

In [59]:
# Request body: the text prompt plus the base64-encoded test image.
payload = {
    "prompt": "detect dog",
    "image": encoded_input_image,
}

# Send the request to the deployed endpoint and keep the response.
query_response = predictor.predict(payload)

In [None]:
# Show the endpoint's answer; presumably the {"response": ...} dict built by predict_fn — confirm.
query_response