In [42]:
# =====================================================================
# STEP 1 — Imports and Configuration
# =====================================================================

import json

import boto3
import sagemaker
from sagemaker.inputs import TrainingInput
from sagemaker.model_metrics import ModelMetrics, MetricsSource
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.functions import Join
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.xgboost import XGBoost

# Initialize sessions
region = boto3.Session().region_name
pipeline_session = PipelineSession()

sm_client = boto3.client("sagemaker")

# IAM role (automatically retrieved inside SageMaker Studio)
role = sagemaker.get_execution_role()

# Common configuration
project_name = "titanic"
model_package_group_name = "TitanicModel"

# S3 locations
bucket = "ml-pipeline-project-aniolmg"
train_s3 = f"s3://{bucket}/data/titanic_data.csv"
output_s3 = f"s3://{bucket}/models/"

print(f"✅ Using region: {region}")
print(f"✅ Execution role: {role}")
print(f"✅ Output S3: {output_s3}")

✅ Using region: eu-west-3
✅ Execution role: arn:aws:iam::344809604964:role/sage-maker-full-acess-role
✅ Output S3: s3://ml-pipeline-project-aniolmg/models/


In [43]:
# =====================================================================
# STEP 2 — Define and Configure the XGBoost Estimator
# =====================================================================

xgb_estimator = XGBoost(
    entry_point="train_model.py",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version="1.7-1",
    py_version="py3",
    output_path=f"s3://{bucket}/{project_name}/output/",
    base_job_name=f"{project_name}-train",
    hyperparameters={
        "max_depth": 8,
        "eta": 0.3,
        "objective": "binary:logistic",
        "num_round": 200,
        "bucket": bucket,   # for metric storage in your training script
    },
    sagemaker_session=pipeline_session,
)

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.


In [45]:
# =====================================================================
# STEP 3 — Create Training, Metrics, and Registration Steps
# =====================================================================

from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.model_metrics import ModelMetrics, MetricsSource
from sagemaker.workflow.step_collections import RegisterModel

# --- Training step (already defined) ---
train_step = TrainingStep(
    name="TrainTitanicModel",
    estimator=xgb_estimator,
    inputs={"train": TrainingInput(train_s3, content_type="csv")},
)

# --- Define a ScriptProcessor for computing metrics ---
script_processor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.7-1",
        py_version="py3",
    ),
    command=["python3"],
    instance_type="ml.t3.medium",  # cheapest option
    instance_count=1,
    role=role,
)

# --- ProcessingStep to compute metrics ---
metrics_step = ProcessingStep(
    name="ComputeTitanicMetrics",
    processor=script_processor,
    inputs=[
        ProcessingInput(
            source=train_step.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model",
        ),
        ProcessingInput(
            source=f"s3://{bucket}/data/",
            destination="/opt/ml/processing/data",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name="metrics",
            source="/opt/ml/processing/metrics",
            destination=f"s3://{bucket}/metrics/",
        )
    ],
    code="compute_metrics.py",
    job_arguments=[
        "--input-model", "/opt/ml/processing/model",
        "--input-data", "/opt/ml/processing/data",
        "--output-metrics", "/opt/ml/processing/metrics"
    ],
    depends_on=[train_step],
)

# --- Define model metrics using the ProcessingStep output ---
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        # Reference the metrics output dynamically from the ProcessingStep
        s3_uri=metrics_step.properties.ProcessingOutputConfig.Outputs["metrics"].S3Output.S3Uri,
        content_type="application/json",
    )
)

# --- Register the model using the metrics from ProcessingStep ---
register_step = RegisterModel(
    name="RegisterTitanicModel",
    estimator=xgb_estimator,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium"],
    transform_instances=["ml.m5.large"],
    model_package_group_name=model_package_group_name,
    approval_status="PendingManualApproval",
    model_metrics=model_metrics,
    depends_on=[metrics_step],  # ensure registration runs after metrics
)

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [None]:
# =====================================================================
# STEP 4 — Define and Execute SageMaker Pipeline
# =====================================================================

from sagemaker.workflow.pipeline import Pipeline

pipeline = Pipeline(
    name="TitanicPipeline",
    steps=[train_step, metrics_step, register_step],
    sagemaker_session=pipeline_session,
)

pipeline.upsert(role_arn=role)
execution = pipeline.start()
execution.wait()


print("✅ Pipeline executed successfully!")




In [36]:
import boto3

sm_client = boto3.client("sagemaker")

# Retrieve the latest model package in the group
response = sm_client.list_model_packages(
    ModelPackageGroupName="TitanicModel",
    SortBy="CreationTime",
    SortOrder="Descending",
    MaxResults=1
)

latest_package = response["ModelPackageSummaryList"][0]
print(f"✅ Latest registered model version: {latest_package['ModelPackageVersion']}")
print(f"Model Package ARN: {latest_package['ModelPackageArn']}")
print(f"Approval status: {latest_package['ModelApprovalStatus']}")


✅ Latest registered model version: 17
Model Package ARN: arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/17
Approval status: PendingManualApproval


In [29]:
execution_desc = sm_client.describe_pipeline_execution(
    PipelineExecutionArn=execution.arn
)
print(execution_desc["PipelineExecutionStatus"])


Failed


In [30]:
for step in sm_client.list_pipeline_execution_steps(PipelineExecutionArn=execution.arn)["PipelineExecutionSteps"]:
    print(step["StepName"], step["StepStatus"])


ComputeTitanicMetrics Failed
TrainTitanicModel Succeeded


In [31]:
import boto3

sm_client = boto3.client("sagemaker")

# List all steps in this pipeline execution
steps = sm_client.list_pipeline_execution_steps(
    PipelineExecutionArn=execution.arn
)["PipelineExecutionSteps"]

# Print the status of each step
for step in steps:
    print(f"Step: {step['StepName']}, Status: {step['StepStatus']}")
    if "FailureReason" in step:
        print(f"  Failure reason: {step['FailureReason']}")


Step: ComputeTitanicMetrics, Status: Failed
  Failure reason: ClientError: AlgorithmError: , exit code: 1
Step: TrainTitanicModel, Status: Succeeded


In [27]:
# =====================================================================
# STEP 5 — Register Model in the Model Registry (with metrics)
# =====================================================================

# Create a pipeline session (even if you’re running interactively)
pipeline_session = PipelineSession()

# --- Define model metrics object from your S3 JSON ---
model_metrics = ModelMetrics(
    model_statistics=MetricsSource(
        s3_uri=f"s3://{bucket}/metrics/titanic_metrics.json",
        content_type="application/json"
    )
)

# --- Register the model ---
register_step = RegisterModel(
    name="RegisterTitanicModel",
    estimator=train_step.estimator,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.large", "ml.m5.xlarge"],
    transform_instances=["ml.m5.large"],
    model_package_group_name=model_package_group_name,
    approval_status="PendingManualApproval",
    model_metrics=model_metrics,
    role=role,
    sagemaker_session=pipeline_session
)

# Execute the registration step manually
register_step_args = register_step.to_request()
response = sm_client.create_model_package(**register_step_args)

print("✅ Model registered successfully with metrics!")
print("Model Package ARN:", response["ModelPackageArn"])


In [33]:
# --- List registered versions in the Model Registry ---
packages = sm_client.list_model_packages(ModelPackageGroupName=model_package_group_name)
for pkg in packages["ModelPackageSummaryList"]:
    print(pkg["ModelPackageArn"], "|", pkg["ModelApprovalStatus"])


arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/15 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/14 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/13 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/12 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/11 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/10 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/9 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/8 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/7 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package/TitanicModel/6 | PendingManualApproval
arn:aws:sagemaker:eu-west-3:344809604964:model-package