In [1]:
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.processing import FrameworkProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.steps import CacheConfig
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter
from sagemaker.inputs import TrainingInput
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TuningStep
from sagemaker.xgboost import XGBoost
from sagemaker.estimator import InstanceGroup
from sagemaker.workflow.functions import Join
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThan
from sagemaker.sklearn.model import SKLearnModel

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name
raw_data_s3_uri = "s3://sagemakerantdata/smallchurndataset/raw/"

In [3]:
sklearn_processor = SKLearnProcessor(
    framework_version="1.2-1",
    role=role,
    instance_type="ml.t3.medium",
    instance_count=1,
)


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [4]:
cache_config = CacheConfig(enable_caching=True, expire_after="30d")

data_prep = ProcessingStep(
    name="ChurnDataPrep",
    processor=sklearn_processor,
    code="scripts/data_ingestion.py",
    inputs=[
        ProcessingInput(source=raw_data_s3_uri, destination="/opt/ml/processing/input")
    ],
    outputs=[
        ProcessingOutput(
            output_name="train",
            source="/opt/ml/processing/output/train",
            destination="s3://sagemakerantdata/smallchurndataset/processed/train"
        ),
        ProcessingOutput(
            output_name="test",
            source="/opt/ml/processing/output/test",
            destination="s3://sagemakerantdata/smallchurndataset/processed/test"
        ),
    ],
)

In [48]:
# pipeline = Pipeline(
#     name="ChurnPredictionPipeline",
#     parameters=[], # You can define pipeline-level parameters here
#     steps=[data_prep],
#     sagemaker_session=sagemaker_session,
# # )


In [22]:
# In your pipeline.ipynb file, after the data_prep step definition

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter
from sagemaker.workflow.steps import TuningStep
from sagemaker.inputs import TrainingInput

# --- Random Forest HPO ---
# Use the SKLearn estimator for scikit-learn models
output_bucket = "sagemakerantdata"
output_prefix = "smallchurndataset/artifacts"
output_path = f"s3://{output_bucket}/{output_prefix}"

sklearn_estimator_rf = SKLearn(
    entry_point="scripts/train.py",
    framework_version="1.2-1",
    instance_type="ml.m5.large",
    instance_count=1,  # <--- ADD THIS LINE
    role=role,
    hyperparameters={"model_type": "randomforest"},
    output_path=output_path
)

tuner_rf = HyperparameterTuner(
    estimator=sklearn_estimator_rf,
    objective_metric_name="Validation F1 Score",
    hyperparameter_ranges={
        "n_estimators": IntegerParameter(100, 300),
        "max_depth": IntegerParameter(5, 20),
    },
    metric_definitions=[
        {"Name": "Validation F1 Score", "Regex": "Validation F1 Score: ([0-9\\.]+)"}
    ],
    max_jobs=6,
    max_parallel_jobs=2,
    objective_type="Maximize",
)

tune_step_rf = TuningStep(
    name="TuneRandomForest",
    tuner=tuner_rf,
    inputs={
        "train": TrainingInput(
            s3_data=data_prep.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri
        ),
        "test": TrainingInput(
            s3_data=data_prep.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri
        ),
    },
    cache_config=cache_config
)

### Logistic Regression
logistic_estimator = SKLearn(
    entry_point="scripts/train.py",
    framework_version="1.2-1",
    instance_type="ml.m5.large",
    instance_count=1,
    role=role,
    hyperparameters={"model_type": "logistic"},
    output_path=output_path
)

tuner_logistic = HyperparameterTuner(
    estimator=logistic_estimator,
    objective_metric_name="Validation F1 Score",
    hyperparameter_ranges={
        "C": ContinuousParameter(0.001, 10.0),
        # penalty can only take ["l1", "l2"], so not tunable like numeric params
    },
    metric_definitions=[
        {"Name": "Validation F1 Score", "Regex": "Validation F1 Score: ([0-9\\.]+)"}
    ],
    max_jobs=4,
    max_parallel_jobs=2,
    objective_type="Maximize",
)

tune_step_logistic = TuningStep(
    name="TuneLogisticRegression",
    tuner=tuner_logistic,
    inputs={
        "train": TrainingInput(
            s3_data=data_prep.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri
        ),
        "test": TrainingInput(
            s3_data=data_prep.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri
        ),
    },
    cache_config=cache_config
)


In [23]:

# Best F1 from RandomForest
best_rf_f1 = tune_step_rf.properties.BestTrainingJob.FinalHyperParameterTuningJobObjectiveMetric.Value

# Best F1 from Logistic Regression
best_logistic_f1 = tune_step_logistic.properties.BestTrainingJob.FinalHyperParameterTuningJobObjectiveMetric.Value


In [30]:
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionGreaterThan
from sagemaker.workflow.model_step import CreateModelStep

best_rf_f1 = tune_step_rf.properties.BestTrainingJob.FinalHyperParameterTuningJobObjectiveMetric.Value
best_logistic_f1 = tune_step_logistic.properties.BestTrainingJob.FinalHyperParameterTuningJobObjectiveMetric.Value

# Define S3 bucket for top model artifacts
s3_bucket = "sagemakerantdata/smallchurndataset/artifacts"

# Create SKLearnModel objects pointing to top models
rf_model = SKLearnModel(
    model_data=tune_step_rf.get_top_model_s3_uri(s3_bucket=s3_bucket, top_k=1),
    role=role,
    entry_point="scripts/train.py",
    framework_version="1.2-1"
)

logistic_model = SKLearnModel(
    model_data=tune_step_logistic.get_top_model_s3_uri(s3_bucket=s3_bucket, top_k=1),
    role=role,
    entry_point="scripts/train.py",
    framework_version="1.2-1"
)

# ===============================
# Create CreateModelStep for each model
# ===============================
step_rf_model = CreateModelStep(
    name="CreateRFModel",
    model=rf_model  # Only the SKLearnModel object
)
step_logistic_model = CreateModelStep(
    name="CreateLogisticModel",
    model=logistic_model
)
# ===============================
# ConditionStep to choose best model dynamically
# ===============================
choose_best_model_step = ConditionStep(
    name="ChooseBestModel",
    conditions=[ConditionGreaterThan(left=best_rf_f1, right=best_logistic_f1)],
    if_steps=[step_rf_model],
    else_steps=[step_logistic_model],
)


In [31]:
# Submit the pipeline definition to SageMaker
pipeline = Pipeline(
    name="ChurnPredictionPipeline",
    steps=[data_prep, tune_step_rf, tune_step_logistic,choose_best_model_step],
    sagemaker_session=sagemaker_session,
)

pipeline.upsert(role_arn=role)

# Start a pipeline execution
execution = pipeline.start()
execution.describe()

INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


{'PipelineArn': 'arn:aws:sagemaker:ap-south-1:891377324517:pipeline/ChurnPredictionPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-south-1:891377324517:pipeline/ChurnPredictionPipeline/execution/j9eqmm7zg7ry',
 'PipelineExecutionDisplayName': 'execution-1758282653554',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2025, 9, 19, 11, 50, 53, 501000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 9, 19, 11, 50, 53, 501000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-south-1:891377324517:user-profile/d-51dx373wnss0/default-20250903T125584',
  'UserProfileName': 'default-20250903T125584',
  'DomainId': 'd-51dx373wnss0',
  'IamIdentity': {'Arn': 'arn:aws:sts::891377324517:assumed-role/AmazonSageMaker-ExecutionRole-20250903T125584/SageMaker',
   'PrincipalId': 'AROA47CR2SHSWJ6BV5G2F:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-south-1:891377324517:user-profile/d-51dx373wnss0/defa

In [41]:
steps = execution.list_steps()
for step in steps:
    print(f"{step['StepName']} → {step['StepStatus']}")
    if 'FailureReason' in step:
        print(f"   Reason: {step ['FailureReason']}")


CreateLogisticModel → Failed
   Reason: ClientError: Failed to invoke sagemaker:CreateModel. Error Details: Could not find model data at s3://sagemakerantdata/smallchurndataset/artifacts//j9eqmm7zg7ry-TuneLogi-jDG8twbv3y-004-dc3081a8/output/model.tar.gz.
ChooseBestModel → Succeeded
TuneLogisticRegression → Succeeded
TuneRandomForest → Succeeded
ChurnDataPrep → Succeeded


In [None]:
ExecutionRole-20250903T125584