In [None]:
import os
import boto3
import sagemaker
import logging
from sagemaker.pytorch import PyTorch, PyTorchModel
from sagemaker.tuner import IntegerParameter, CategoricalParameter, ContinuousParameter, HyperparameterTuner
from sagemaker.debugger import (
    Rule,
    DebuggerHookConfig,
    rule_configs,
    ProfilerRule,
    ProfilerConfig,
    FrameworkProfile,
)
from smdebug.trials import create_trial
from smdebug.core.modes import ModeKeys
from glob import glob
import numpy as np
import requests
import json
from PIL import Image
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot

#### Configure Logging

In [None]:
# Root logging: timestamp, level, then the message; INFO and above are emitted.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Every cell below logs through this one named logger.
logger = logging.getLogger("notebook")

#### Environment Variables and SageMaker Session Setup

In [None]:
# Deployment settings, each overridable from the process environment.
# The literals on the right are the fallbacks used when a variable is unset.
BUCKET = os.environ.get("S3_BUCKET", "default-bucket-name")  # bucket used for the dataset and training output
ROLE = os.environ.get("IAM_ROLE", "default-role")  # passed as `role=` to the estimators and the model
DEFAULT_REGION = os.environ.get("AWS_DEFAULT_REGION", "us-east-1")  # region handed to boto3

In [None]:
# SageMaker Session
# Build the boto3 session first and hand it to SageMaker, so the region that is
# logged below is the region the SageMaker session actually uses. Previously the
# boto3 session was created but never used, and `sagemaker.Session()` resolved
# its region independently of DEFAULT_REGION.
boto3_session = boto3.Session(region_name=DEFAULT_REGION)
sagemaker_session = sagemaker.Session(boto_session=boto3_session)
logger.info(f"Using S3 Bucket: {BUCKET}, IAM Role: {ROLE}, Region: {DEFAULT_REGION}")

#### Step 1: Download and Prepare Dataset

In [None]:
def download_and_prepare_data():
    """Download the dog-image archive, unpack it locally, and sync it to S3.

    The archive is fetched only when ``data/dogImages.zip`` is not already
    present. It is downloaded to a temporary ``.part`` file and renamed into
    place only after ``wget`` succeeds, so an interrupted or failed download
    can never be mistaken for a complete archive on the next run.

    Returns:
        str: The S3 prefix (``s3://<BUCKET>/data/``) the images were synced to.

    Raises:
        subprocess.CalledProcessError: If ``wget``, ``unzip`` or ``aws`` exits
            with a non-zero status.
        FileNotFoundError: If one of those command-line tools is not installed.
    """
    import subprocess  # local import keeps this cell runnable on its own

    dataset_url = "https://s3-us-west-1.amazonaws.com/udacity-aind/dog-project/dogImages.zip"
    archive_path = "data/dogImages.zip"

    logger.info("Downloading and preparing dataset...")
    os.makedirs("data", exist_ok=True)

    if not os.path.exists(archive_path):
        partial_path = archive_path + ".part"
        try:
            # `wget -O` creates the output file even when the transfer fails,
            # so write to a scratch name and promote it only on success.
            subprocess.run(["wget", "-O", partial_path, dataset_url], check=True)
            os.replace(partial_path, archive_path)
        finally:
            # No-op after a successful rename; removes the leftover otherwise.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    # check=True turns a failed extraction or upload into an exception instead
    # of letting the function report success with no data behind it.
    subprocess.run(["unzip", "-o", archive_path, "-d", "data"], check=True)

    s3_data_path = f"s3://{BUCKET}/data/"
    subprocess.run(["aws", "s3", "sync", "data/dogImages/", s3_data_path], check=True)
    logger.info(f"Data uploaded to {s3_data_path}")
    return s3_data_path

s3_data_path = download_and_prepare_data()

#### Step 2: Hyperparameter Tuning

In [None]:
def hyperparameter_tuning(s3_data_path):
    """Run a SageMaker hyperparameter search and return the best estimator.

    Args:
        s3_data_path: S3 URI handed to the tuning job as the ``training`` channel.

    Returns:
        The estimator returned by ``HyperparameterTuner.best_estimator()``
        once the tuning job has finished.
    """
    logger.info("Starting hyperparameter tuning...")

    # Objective: the tuner minimizes the value captured by the regex's first group.
    objective_metric_name = "Validation Loss"
    validation_loss_metric = {
        "Name": "Validation Loss",
        "Regex": "valid loss: ([0-9\\.]+), acc: [0-9\\.]+.*",
    }

    # Search space explored by the tuner.
    search_space = {
        "learning_rate": ContinuousParameter(0.001, 0.1),
        "batch_size": CategoricalParameter([32, 64, 128, 256]),
    }

    # Template training job; the tuner launches copies of it with sampled values.
    base_estimator = PyTorch(
        role=ROLE,
        source_dir="code",
        entry_point="hpo.py",
        base_job_name="pytorch-dog-hpo",
        framework_version="1.9",
        py_version="py38",
        instance_type="ml.m5.large",
        instance_count=1,
    )

    tuner = HyperparameterTuner(
        estimator=base_estimator,
        objective_metric_name=objective_metric_name,
        hyperparameter_ranges=search_space,
        metric_definitions=[validation_loss_metric],
        objective_type="Minimize",
        max_jobs=10,
        max_parallel_jobs=2,
    )

    # Blocks until every tuning job has finished.
    tuner.fit(inputs={"training": s3_data_path}, wait=True)
    winner = tuner.best_estimator()
    logger.info("Hyperparameter tuning complete.")
    return winner

best_estimator = hyperparameter_tuning(s3_data_path)

#### Step 3: Model Training and Profiling

In [None]:
def model_training_and_profiling(s3_data_path, hyperparameters):
    """Launch a PyTorch training job with Debugger and Profiler rules attached.

    Args:
        s3_data_path: S3 URI supplied to the job as the ``training`` channel.
        hyperparameters: Mapping forwarded as-is to the estimator's
            ``hyperparameters`` argument.

    Returns:
        The ``PyTorch`` estimator, after ``fit`` has returned.
    """
    logger.info("Starting model training and profiling...")

    # One Debugger rule (vanishing gradients) and the Profiler report rule.
    vanishing_gradient_rule = Rule.sagemaker(rule_configs.vanishing_gradient())
    profiler_report_rule = ProfilerRule.sagemaker(rule_configs.ProfilerReport())

    # System metrics every 500 ms; framework profiling limited to 10 steps.
    framework_profile = FrameworkProfile(num_steps=10)
    profiler_config = ProfilerConfig(
        system_monitor_interval_millis=500,
        framework_profile_params=framework_profile,
    )

    training_estimator = PyTorch(
        role=ROLE,
        source_dir="code",
        entry_point="train_model.py",
        base_job_name="pytorch-dog-training",
        framework_version="1.9",
        py_version="py38",
        instance_type="ml.m5.xlarge",
        instance_count=1,
        hyperparameters=hyperparameters,
        output_path=f"s3://{BUCKET}/output/",
        rules=[vanishing_gradient_rule, profiler_report_rule],
        profiler_config=profiler_config,
    )

    # Blocks until the training job has finished.
    training_estimator.fit(inputs={"training": s3_data_path}, wait=True)
    logger.info("Model training complete.")
    return training_estimator

def _decode_hyperparameter(value):
    """Return ``value`` JSON-decoded when possible, otherwise unchanged."""
    try:
        return json.loads(value)
    except (TypeError, ValueError):
        # Not a string, or not valid JSON: keep whatever the estimator gave us.
        return value


# `best_estimator.hyperparameters()` returns the values in the JSON-encoded form
# the tuning job used (e.g. batch_size comes back as '"64"') and mixes in
# bookkeeping entries (`_tuning_objective_metric`, `sagemaker_*`). Handing that
# dict straight to a new estimator would encode every value a second time and
# forward the tuner's internal keys to the training script. Decode the values
# and keep only the user-defined hyperparameters.
hyperparameters = {
    name: _decode_hyperparameter(value)
    for name, value in best_estimator.hyperparameters().items()
    if not name.startswith(("_", "sagemaker_"))
}
logger.info(f"Best hyperparameters: {hyperparameters}")
estimator = model_training_and_profiling(s3_data_path, hyperparameters)

#### Step 4: Model Deployment

In [None]:
def deploy_model(estimator):
    """Host the trained model behind a SageMaker real-time endpoint.

    Args:
        estimator: Trained estimator; only its ``model_data`` attribute is read.

    Returns:
        An ``ImagePredictor`` bound to the newly created endpoint.
    """
    logger.info("Deploying the model...")

    class ImagePredictor(sagemaker.predictor.Predictor):
        """Predictor that sends bytes as ``image/jpeg`` and decodes JSON replies."""

        def __init__(self, endpoint_name, sagemaker_session):
            # Request bodies are passed through untouched with a JPEG content type.
            jpeg_serializer = sagemaker.serializers.IdentitySerializer("image/jpeg")
            json_deserializer = sagemaker.deserializers.JSONDeserializer()
            super().__init__(
                endpoint_name,
                sagemaker_session=sagemaker_session,
                serializer=jpeg_serializer,
                deserializer=json_deserializer,
            )

    # Wrap the training artifact with the inference script from `code/`.
    pytorch_model = PyTorchModel(
        model_data=estimator.model_data,
        role=ROLE,
        source_dir="code",
        entry_point="inference.py",
        framework_version="1.9",
        py_version="py38",
        predictor_cls=ImagePredictor,
    )

    endpoint_predictor = pytorch_model.deploy(
        instance_type="ml.t2.medium",
        initial_instance_count=1,
    )
    logger.info("Model deployed.")
    return endpoint_predictor

In [None]:
# Deploy the trained estimator and keep the returned predictor; the prediction
# and clean-up cells below both go through this `predictor` object.
predictor = deploy_model(estimator)
logger.info(f"Endpoint deployed at: {predictor.endpoint_name}")

#### Additional: Interact with the Deployed Model

In [None]:
# Placeholder URL: point this at a real JPEG before running the cell.
image_url = "https://example.com/sample_image.jpg"

# Bound the request so a stalled server cannot hang the notebook, and fail here
# on an HTTP error rather than sending an error page to the endpoint as if it
# were an image.
response = requests.get(image_url, timeout=30)
response.raise_for_status()
image_bytes = response.content

#### Run a Prediction and Clean Up Resources

In [None]:
# Send the downloaded bytes to the endpoint and log whatever comes back.
# NOTE(review): the structure of `result` is decided by code/inference.py, which
# is not visible from this notebook — confirm before post-processing it.
logger.info("Sending image to the endpoint for prediction...")
result = predictor.predict(image_bytes)
logger.info(f"Prediction result: {result}")

In [None]:
# Delete the endpoint behind `predictor`. Run this cell last: the prediction
# cell above needs the endpoint to still exist.
logger.info("Deleting the endpoint...")
predictor.delete_endpoint()
logger.info("Endpoint deleted.")