In [None]:
import sagemaker
import boto3
from sagemaker.huggingface.model import HuggingFaceModel
from sagemaker.serverless import ServerlessInferenceConfig
from sagemaker.image_uris import retrieve

# --- Setup ---
# Local folder that holds inference.py and requirements.txt.
source_dir = "code"

# One SageMaker session is shared by every step below.
sagemaker_session = sagemaker.Session()

# Default S3 bucket of the session; model artifacts are uploaded there.
bucket = sagemaker_session.default_bucket()

# IAM execution role used when creating the model.
role = sagemaker.get_execution_role()

# --- 1. Model Packaging (Requires model weights to be downloaded first) ---
# NOTE: This step assumes you have already downloaded the Stable Diffusion 1.5,
# ControlNet, and IP-Adapter weights and placed them alongside the 'code' folder
# in the structure required by inference.py.

# Local import keeps this step self-contained.
import subprocess

# Create the model.tar.gz artifact.
# `subprocess.run(..., check=True)` replaces the IPython-only `!tar` escape so
# that (a) the step also works outside a notebook and (b) a failing tar
# (e.g. a missing weights directory) raises CalledProcessError instead of
# silently continuing and uploading a broken or stale archive.
subprocess.run(
    [
        'tar', '-czvf', 'model.tar.gz',
        './code',
        './models/stable-diffusion-v1-5',
        './models/controlnet',
        './models/ip_adapter_models',
    ],
    check=True,
)

# Upload the artifact to S3 (no bucket is passed, so the session's default
# bucket is used) and keep the resulting S3 URI for the model definition.
model_data_uri = sagemaker_session.upload_data(
    path='model.tar.gz',
    key_prefix='stable-diffusion-inference/model_artifacts'
)
print(f"Model artifact uploaded to: {model_data_uri}")

# --- 2. Define Container Image ---
# Retrieve the HuggingFace PyTorch CPU inference image (Serverless doesn't use GPU).
# IMPORTANT: Use versions compatible with your downloaded models / Diffusers.
#
# For framework='huggingface', `version` is the *transformers* version and the
# PyTorch version goes into `base_framework_version`. Passing the PyTorch
# version as `version` (and omitting `base_framework_version`) is rejected by
# `retrieve` as an unsupported huggingface version.
region = boto3.Session().region_name
inference_image_uri = retrieve(
    framework='huggingface',
    region=region,
    version='4.26.0',                        # transformers version of the container
    base_framework_version='pytorch1.13.1',  # PyTorch version (must be compatible with Diffusers)
    py_version='py39',
    instance_type='ml.c5.xlarge',  # CPU instance type, used only to select the CPU image variant
    accelerator_type=None,
    image_scope='inference'
)

# --- 3. Create Serverless Configuration (Addressing Limitations) ---
# memory_size_in_mb=6144: chosen by the original author so the full
#   SD + ControlNet + IP-Adapter pipeline can load without OOM errors.
#   NOTE(review): 6144 MB appears to be the serverless memory ceiling — confirm
#   against the current AWS limits.
# max_concurrency=2: kept low for initial cost control.
# NOTE(review): to reduce cold-start latency in a later version, the SDK option
#   looks to be `provisioned_concurrency` (not `min_concurrency`) — confirm
#   against the installed SDK version before enabling it.
serverless_config = ServerlessInferenceConfig(memory_size_in_mb=6144, max_concurrency=2)

# --- 4. Create HuggingFace Model Object ---
# Binds the uploaded artifact, the execution role and the container image
# into a deployable model definition.
huggingface_model = HuggingFaceModel(
    model_data=model_data_uri,
    role=role,
    image_uri=inference_image_uri,
    # `entry_point` is the custom inference script and `source_dir` is the
    # LOCAL folder that contains it (not a path inside the S3 tarball).
    # Reuse the `source_dir` variable defined in the setup section so the
    # folder name is configured in exactly one place.
    # NOTE(review): the SDK presumably repacks this folder into the model
    # artifact at deploy time — confirm, since the tarball already ships code/.
    entry_point='inference.py',
    source_dir=source_dir,
    env={
        'SAGEMAKER_PROGRAM': 'inference.py',
        'MMS_MAX_WORKERS': '1'  # Control the number of workers to manage memory
    }
)

# --- 5. Deploy Endpoint ---
# Build a unique endpoint name first, then deploy the model with the
# serverless configuration and block until the deployment call returns.
endpoint_suffix = sagemaker.utils.unique_name_from_base("sd")
predictor = huggingface_model.deploy(
    endpoint_name=f'sd-serverless-endpoint-{endpoint_suffix}',
    serverless_inference_config=serverless_config,
    wait=True,
)

print(f"Endpoint Name: {predictor.endpoint_name}")
# Optional: expose the endpoint name through an environment variable for the
# FastAPI Lambda function (requires `import os`).
# os.environ['SAGEMAKER_ENDPOINT_NAME'] = predictor.endpoint_name

# --- 6. Cleanup (Crucial for cost control) ---
# Uncomment to tear the endpoint and model down once they are no longer needed.
# predictor.delete_endpoint()
# predictor.delete_model()