In [11]:
# --- Imports and Configuration ---

import sagemaker
import boto3
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost import XGBoost
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import TrainingStep

In [12]:
# Initialize basic SageMaker clients
region = boto3.Session().region_name
sagemaker_session = sagemaker.session.Session()
role = sagemaker.get_execution_role()

In [13]:
# Common project configuration
project_name = "titanic"
model_package_group_name = "TitanicModel"

In [14]:
# S3 paths
train_s3 = "s3://ml-pipeline-project-aniolmg/data/titanic_data.csv"
output_s3 = "s3://ml-pipeline-project-aniolmg/models/"

In [15]:
print(f"Using region: {region}")
print(f"Execution role: {role}")

Using region: eu-west-3
Execution role: arn:aws:iam::344809604964:role/service-role/AmazonSageMaker-ExecutionRole-20251103T114943


In [16]:
# --- Define and configure the XGBoost estimator ---

xgb_estimator = XGBoost(
    entry_point="train_model.py",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version="1.7-1",
    py_version="py3",
    output_path=output_s3,
    hyperparameters={
        "max_depth": 8,
        "eta": 0.3,
        "objective": "binary:logistic",
        "num_round": 200,
    },
    sagemaker_session=sagemaker_session,
)


INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.


In [17]:
# --- Define a pipeline session for CI/CD integration ---
pipeline_session = PipelineSession()

In [18]:
# --- Define a training step for the pipeline ---
train_step = TrainingStep(
    name="TrainTitanicModel",
    estimator=xgb_estimator,
    inputs={"train": TrainingInput(train_s3, content_type="csv")},
)

In [20]:
# --- Define model registration step ---
register_step = RegisterModel(
    name="RegisterTitanicModel",
    estimator=xgb_estimator,
    model_data=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.m5.large", "ml.t2.medium"],
    transform_instances=["ml.m5.large"],
    model_package_group_name=model_package_group_name,
    approval_status="PendingManualApproval",  # Or "Approved" to allow auto-promotion
)


In [21]:
# --- Execute the training job manually ---
train_step.estimator.fit({"train": TrainingInput(train_s3, content_type="csv")})

# After completion, register model directly using the same RegisterModel logic
import sagemaker.image_uris

inference_image = sagemaker.image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.7-1",
    py_version="py3",
    image_scope="inference",
)

sm_client = boto3.client("sagemaker")

response = sm_client.create_model_package(
    ModelPackageGroupName=model_package_group_name,
    ModelPackageDescription="Auto-registration of Titanic model",
    InferenceSpecification={
        "Containers": [
            {
                "Image": inference_image,
                "ModelDataUrl": train_step.estimator.model_data,
            }
        ],
        "SupportedContentTypes": ["text/csv"],
        "SupportedResponseMIMETypes": ["text/csv"],
    },
    ModelApprovalStatus="PendingManualApproval",
)

print("âœ… Model registered successfully!")
print("Model Package ARN:", response["ModelPackageArn"])


INFO:sagemaker.telemetry.telemetry_logging:SageMaker Python SDK will collect telemetry to help us better understand our user's needs, diagnose issues, and deliver additional features.
To opt out of telemetry, please disable via TelemetryOptOut parameter in SDK defaults config. For more information, refer to https://sagemaker.readthedocs.io/en/stable/overview.html#configuring-and-using-defaults-with-the-sagemaker-python-sdk.
INFO:sagemaker:Creating training-job with name: sagemaker-xgboost-2025-11-04-10-37-01-443


2025-11-04 10:37:06 Starting - Starting the training job...
2025-11-04 10:37:21 Starting - Preparing the instances for training...
2025-11-04 10:37:43 Downloading - Downloading input data...
2025-11-04 10:38:30 Failed - Training job failed
..

UnexpectedStatusException: Error for Training job sagemaker-xgboost-2025-11-04-10-37-01-443: Failed. Reason: ClientError: Data download failed:Failed to download data. AccessDenied (403): User: arn:aws:sts::344809604964:assumed-role/AmazonSageMaker-ExecutionRole-20251103T114943/SageMaker is not authorized to perform: s3:GetObject on resource: "arn:aws:s3:::ml-pipeline-project-aniolmg/data/titanic_data.csv" because no identity-based policy allows the s3:GetObject action. Check troubleshooting guide for common errors: https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-python-sdk-troubleshooting.html

In [22]:
# --- List registered versions in the Model Registry ---
packages = sm_client.list_model_packages(ModelPackageGroupName=model_package_group_name)
for pkg in packages["ModelPackageSummaryList"]:
    print(pkg["ModelPackageArn"], "|", pkg["ModelApprovalStatus"])


NameError: name 'sm_client' is not defined