In [1]:
# =====================================================================
# STEP 1 — Imports and Configuration
# =====================================================================

import sagemaker
import boto3
from sagemaker.inputs import TrainingInput
from sagemaker.xgboost import XGBoost
from sagemaker.workflow.pipeline_context import PipelineSession
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.steps import TrainingStep
from sagemaker.model_metrics import ModelMetrics, MetricsSource
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.workflow.functions import Join
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.execution_variables import ExecutionVariables

from sagemaker.workflow.parameters import ParameterInteger, ParameterString, ParameterFloat

# Single source of truth for the project bucket: it is the default value of the
# "Bucket" pipeline parameter and also the bucket used by the PipelineSession.
DEFAULT_BUCKET = "ml-pipeline-project-aniolmg"

# Region configured for the default boto3 session (None when no region is configured).
region = boto3.Session().region_name

# --- Pipeline parameters: each default can be overridden per execution ---
bucket_param = ParameterString(name="Bucket", default_value=DEFAULT_BUCKET)
project_name_param = ParameterString(name="ProjectName", default_value="titanic")
train_file_param = ParameterString(name="TrainFile", default_value="titanic_train.csv")
test_file_param = ParameterString(name="TestFile", default_value="titanic_test.csv")

# Joining on "/" with a leading "s3:/" element renders as
# "s3://<Bucket>/data/<TrainFile>" once the parameters are resolved.
train_s3 = Join(on="/", values=["s3:/", bucket_param, "data", train_file_param])

# NOTE: the session bucket is a plain string fixed when this cell runs, so
# overriding the "Bucket" parameter for an execution does not change it.
pipeline_session = PipelineSession(default_bucket=DEFAULT_BUCKET)

sm_client = boto3.client("sagemaker")
role = sagemaker.get_execution_role()

# Renders as "s3://<Bucket>/<ProjectName>/output".
output_s3 = Join(on="/", values=["s3:/", bucket_param, project_name_param, "output"])

print(f"✅ Using region: {region}")

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml
✅ Using region: eu-west-3


In [2]:
# =====================================================================
# STEP 2 — Define and Configure the XGBoost Estimator
# =====================================================================

# --- Hyperparameters exposed as pipeline parameters ---
max_depth_param = ParameterInteger(name="MaxDepth", default_value=8)
eta_param = ParameterFloat(name="Eta", default_value=0.3)
num_round_param = ParameterInteger(name="NumRound", default_value=200)
objective_param = ParameterString(name="Objective", default_value="binary:logistic")

# --- Dataset description ---
# These three parameters have no default value, so a value has to be provided
# for each of them when an execution is started.
target_param = ParameterString(name="Target")
feature_columns_param = ParameterString(name="FeatureColumns")
categorical_columns_param = ParameterString(name="CategoricalColumns")

# Framework-mode XGBoost estimator that runs train_model.py on one ml.m5.large instance.
xgb_estimator = XGBoost(
    entry_point="train_model.py",
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version="1.7-1",
    py_version="py3",
    # Reuse the output location defined in the configuration cell instead of
    # rebuilding the same Join here, so both always point at the same prefix.
    output_path=output_s3,
    base_job_name="xgboost-train",
    # Forwarded to train_model.py; the values are resolved per pipeline execution.
    hyperparameters={
        "max_depth": max_depth_param,
        "eta": eta_param,
        "objective": objective_param,
        "num_round": num_round_param,
        "train_file": train_file_param,
        "target_column": target_param,
        "feature_columns": feature_columns_param,
        "categorical_columns": categorical_columns_param,
    },
    sagemaker_session=pipeline_session,
)

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: ml.m5.large.


In [3]:
# =====================================================================
# STEP 3 — Create Training, Metrics, and Registration Steps
# =====================================================================

# --- Training step ---
# The "train" channel points at the training CSV under s3://<Bucket>/data/.
train_channel = TrainingInput(train_s3, content_type="csv")

train_step = TrainingStep(
    name="TrainTitanicModel",
    estimator=xgb_estimator,
    inputs={"train": train_channel},
)

# --- Define a ScriptProcessor for computing metrics ---
# Runs the script with python3 inside the XGBoost 1.7-1 image, i.e. the same
# framework version that the estimator is configured with.
script_processor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve(
        framework="xgboost",
        region=region,
        version="1.7-1",
        py_version="py3",
    ),
    command=["python3"],
    instance_type="ml.t3.medium",
    instance_count=1,
    role=role,
    # Use the same PipelineSession as the estimator so every step of the
    # pipeline shares one session (and therefore one default bucket) instead
    # of this processor silently creating a separate default session.
    sagemaker_session=pipeline_session,
)

# --- ProcessingStep to compute metrics ---

# Metrics are written under a per-execution prefix:
# s3://<Bucket>/<ProjectName>/metrics/<pipeline execution id>
metrics_output_path = Join(
    on="/",
    values=["s3:/", bucket_param, project_name_param, "metrics", ExecutionVariables.PIPELINE_EXECUTION_ID],
)

# Test CSV in S3, and the path where the script is told to find it inside the container.
test_s3 = Join(on="/", values=["s3:/", bucket_param, "data", test_file_param])
test_file_in_container = Join(on="/", values=["/opt/ml/processing/data", test_file_param])

# Trained model artifact produced by the training step.
model_input = ProcessingInput(
    source=train_step.properties.ModelArtifacts.S3ModelArtifacts,
    destination="/opt/ml/processing/model",
)
test_data_input = ProcessingInput(
    source=test_s3,
    destination="/opt/ml/processing/data",
)
# Whatever the script writes to the metrics directory is uploaded to metrics_output_path.
metrics_output = ProcessingOutput(
    output_name="metrics",
    source="/opt/ml/processing/metrics",
    destination=metrics_output_path,
)

metrics_step = ProcessingStep(
    name="ComputeTitanicMetrics",
    processor=script_processor,
    code="compute_metrics.py",
    inputs=[model_input, test_data_input],
    outputs=[metrics_output],
    job_arguments=[
        "--input-model", "/opt/ml/processing/model",
        "--input-data", test_file_in_container,
        "--output-metrics", "/opt/ml/processing/metrics",
        "--target_column", target_param,
        "--feature_columns", feature_columns_param,
        "--categorical_columns", categorical_columns_param,
    ],
    depends_on=[train_step],
)

# --- Define model metrics using the ProcessingStep output ---
# Location of metrics.json inside the "metrics" output of the processing step.
metrics_json_uri = Join(
    on="/",
    values=[
        metrics_step.properties.ProcessingOutputConfig.Outputs["metrics"].S3Output.S3Uri,
        "metrics.json",
    ],
)

model_statistics_source = MetricsSource(
    s3_uri=metrics_json_uri,
    content_type="application/json",
)

model_metrics = ModelMetrics(model_statistics=model_statistics_source)

# --- Register the model using the metrics from ProcessingStep ---
model_package_group_name = "TitanicModel"

# Model artifact produced by the training step.
trained_model_artifacts = train_step.properties.ModelArtifacts.S3ModelArtifacts

# The package is created as "PendingManualApproval" and is registered only
# after the metrics step, whose output it references through model_metrics.
register_step = RegisterModel(
    name="RegisterTitanicModel",
    model_package_group_name=model_package_group_name,
    approval_status="PendingManualApproval",
    estimator=xgb_estimator,
    model_data=trained_model_artifacts,
    model_metrics=model_metrics,
    content_types=["text/csv"],
    response_types=["text/csv"],
    inference_instances=["ml.t2.medium"],
    transform_instances=["ml.m5.large"],
    depends_on=[metrics_step],
)

INFO:sagemaker.image_uris:Ignoring unnecessary Python version: py3.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [4]:
# =====================================================================
# STEP 4 — Define and Execute SageMaker Pipeline
# =====================================================================

# Every parameter object referenced by a step is declared on the pipeline.
pipeline_parameters = [
    bucket_param,
    project_name_param,
    max_depth_param,
    eta_param,
    num_round_param,
    objective_param,
    train_file_param,
    test_file_param,
    target_param,
    feature_columns_param,
    categorical_columns_param,
]

# Steps are listed in the order train -> metrics -> register.
pipeline_steps = [train_step, metrics_step, register_step]

pipeline = Pipeline(
    name="TitanicPipeline",
    steps=pipeline_steps,
    sagemaker_session=pipeline_session,
    parameters=pipeline_parameters,
)

print("⏳ Starting pipeline...")

# Create the pipeline, or update its definition if it already exists.
pipeline.upsert(role_arn=role)

# Parameters not listed here fall back to their defaults. Target, FeatureColumns
# and CategoricalColumns are declared without a default, so they are set here.
execution = pipeline.start(
    parameters={
        "MaxDepth": 6,
        "TrainFile": "titanic_train_2.csv",
        "Target": "Survived",
        "FeatureColumns": "Age,Sex,Pclass",
        "CategoricalColumns": "Sex",
    }
)

# Block until the execution stops being polled by the SDK waiter.
execution.wait()

# Confirm the terminal status explicitly instead of assuming success, so the
# success message below is only ever printed for a "Succeeded" execution.
execution_details = execution.describe()
execution_status = execution_details["PipelineExecutionStatus"]
if execution_status != "Succeeded":
    failure_reason = execution_details.get("FailureReason", "no failure reason reported")
    raise RuntimeError(
        f"Pipeline execution ended with status {execution_status}: {failure_reason}"
    )

print("✅ Pipeline executed successfully!")



⏳ Starting pipeline...




✅ Pipeline executed successfully!
