In [1]:
import sagemaker
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep
from sagemaker.processing import FrameworkProcessor, ProcessingInput, ProcessingOutput
from sagemaker.sklearn.processing import SKLearnProcessor
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.properties import PropertyFile
from sagemaker.workflow.steps import CacheConfig
from sagemaker.tuner import HyperparameterTuner, ContinuousParameter, IntegerParameter
from sagemaker.inputs import TrainingInput
from sagemaker.inputs import TrainingInput
from sagemaker.workflow.steps import TuningStep
from sagemaker.xgboost import XGBoost
from sagemaker.estimator import InstanceGroup


sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [32]:
# Shared SageMaker context for the whole notebook.
sagemaker_session = sagemaker.Session()
# Resolve the execution role through the session created above instead of
# letting get_execution_role() construct a second, throwaway Session.
role = sagemaker.get_execution_role(sagemaker_session=sagemaker_session)
bucket = sagemaker_session.default_bucket()
region = sagemaker_session.boto_region_name

# S3 prefix of the raw churn dataset; fed to the data-prep step as its input.
raw_data_s3_uri = "s3://sagemakerantdata/smallchurndataset/raw/"

In [46]:
# scikit-learn processor (framework version "1.2-1") on a single
# ml.t3.medium instance, running under the notebook's execution role.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version="1.2-1",
    instance_count=1,
    instance_type="ml.t3.medium",
)


INFO:sagemaker.image_uris:Defaulting to only available Python version: py3


In [27]:
# Step-cache policy (caching on, expire_after="30d"); passed to the tuning steps.
cache_config = CacheConfig(expire_after="30d", enable_caching=True)

# Data-preparation step: runs scripts/data_ingestion.py on the raw data and
# publishes one named output per split ("train" and "test") to fixed S3 prefixes.
# NOTE(review): unlike the tuning steps, this step is not given cache_config —
# confirm that re-running data prep on every execution is intended.
data_prep = ProcessingStep(
    name="ChurnDataPrep",
    processor=sklearn_processor,
    code="scripts/data_ingestion.py",
    inputs=[
        ProcessingInput(
            source=raw_data_s3_uri,
            destination="/opt/ml/processing/input",
        ),
    ],
    outputs=[
        ProcessingOutput(
            output_name=split,
            source=f"/opt/ml/processing/output/{split}",
            destination=f"s3://sagemakerantdata/smallchurndataset/processed/{split}",
        )
        for split in ("train", "test")
    ],
)

In [48]:
# pipeline = Pipeline(
#     name="ChurnPredictionPipeline",
#     parameters=[], # You can define pipeline-level parameters here
#     steps=[data_prep],
#     sagemaker_session=sagemaker_session,
# )


In [58]:
# Hyperparameter-tuning steps; they consume the outputs of the data_prep step defined above.

from sagemaker.sklearn.estimator import SKLearn
from sagemaker.xgboost.estimator import XGBoost
from sagemaker.tuner import HyperparameterTuner, IntegerParameter, ContinuousParameter
from sagemaker.workflow.steps import TuningStep
from sagemaker.inputs import TrainingInput

# --- Hyperparameter tuning: Random Forest and Logistic Regression ---
# Both models share the same training script, container, objective metric and
# input wiring; only the model type, search space and job budget differ, so the
# shared construction lives in one helper instead of two copy-pasted blocks.


def _build_tuning_step(step_name, model_type, hyperparameter_ranges, max_jobs,
                       max_parallel_jobs=2):
    """Build the estimator, tuner and pipeline step for one model family.

    Args:
        step_name: Name of the TuningStep inside the pipeline.
        model_type: Value passed to scripts/train.py as the ``model_type``
            hyperparameter.
        hyperparameter_ranges: Mapping of hyperparameter name to the
            IntegerParameter / ContinuousParameter range to search.
        max_jobs: Total number of training jobs the tuner may launch.
        max_parallel_jobs: Number of training jobs run concurrently.

    Returns:
        A ``(estimator, tuner, tuning_step)`` tuple.
    """
    # Objective metric: the regex pulls the numeric value out of the training
    # job's log output, and the tuner maximizes it.
    metric_name = "Validation F1 Score"
    metric_regex = "Validation F1 Score: ([0-9\\.]+)"

    estimator = SKLearn(
        entry_point="scripts/train.py",
        framework_version="1.2-1",
        instance_type="ml.m5.large",
        instance_count=1,
        role=role,
        hyperparameters={"model_type": model_type},
    )

    tuner = HyperparameterTuner(
        estimator=estimator,
        objective_metric_name=metric_name,
        hyperparameter_ranges=hyperparameter_ranges,
        metric_definitions=[{"Name": metric_name, "Regex": metric_regex}],
        max_jobs=max_jobs,
        max_parallel_jobs=max_parallel_jobs,
        objective_type="Maximize",
    )

    # Feed the "train" and "test" outputs of the data-prep step in as
    # same-named training channels.
    processed_outputs = data_prep.properties.ProcessingOutputConfig.Outputs
    tuning_step = TuningStep(
        name=step_name,
        tuner=tuner,
        inputs={
            channel: TrainingInput(s3_data=processed_outputs[channel].S3Output.S3Uri)
            for channel in ("train", "test")
        },
        cache_config=cache_config,
    )
    return estimator, tuner, tuning_step


# Random Forest: search over forest size and tree depth.
sklearn_estimator_rf, tuner_rf, tune_step_rf = _build_tuning_step(
    step_name="TuneRandomForest",
    model_type="randomforest",
    hyperparameter_ranges={
        "n_estimators": IntegerParameter(100, 300),
        "max_depth": IntegerParameter(5, 20),
    },
    max_jobs=6,
)

# Logistic Regression: only C is searched; penalty ("l1"/"l2") is categorical
# and is left out of this numeric search space.
logistic_estimator, tuner_logistic, tune_step_logistic = _build_tuning_step(
    step_name="TuneLogisticRegression",
    model_type="logistic",
    hyperparameter_ranges={
        "C": ContinuousParameter(0.001, 10.0),
    },
    max_jobs=4,
)


# # --- XGBoost HPO ---
# # Draft XGBoost variant. Note: it reuses the SKLearn estimator with model_type="xgboost", not the dedicated XGBoost estimator.
# xgboost_estimator = SKLearn(
#     entry_point="scripts/train.py",
#     framework_version="1.2-1",  # version passed to the SKLearn estimator (same value as the other estimators)
#     instance_type="ml.m5.large",
#     instance_count=1,
#     role=role,
#     hyperparameters={"model_type": "xgboost"},
# )

# tuner_xgb = HyperparameterTuner(
#     estimator=xgboost_estimator,
#     objective_metric_name="Validation F1 Score",  # custom metric, extracted by metric_definitions below
#     hyperparameter_ranges={
#         "num_round": IntegerParameter(100, 400),
#         "max_depth": IntegerParameter(3, 10),
#         "eta": ContinuousParameter(0.01, 0.3),
#     },
#     metric_definitions=[ 
#         {"Name": "Validation F1 Score", "Regex": "Validation F1 Score: ([0-9\\.]+)"}
#     ],
#     # metric_definitions above supplies the regex for the custom objective metric
#     max_jobs=2,
#     max_parallel_jobs=1,
#     objective_type="Maximize",
# )

# tune_step_xgb = TuningStep(
#     name="TuneXGBoost",
#     tuner=tuner_xgb,
#     inputs={
#         "train": TrainingInput(
#             s3_data=data_prep.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri
#         ),
#         "test": TrainingInput(
#             s3_data=data_prep.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri
#         ),
#     },
#     cache_config=cache_config
# )

In [59]:
# Submit the pipeline definition to SageMaker
pipeline = Pipeline(
    name="ChurnPredictionPipeline",
    steps=[data_prep, tune_step_rf, tune_step_logistic],
    sagemaker_session=sagemaker_session,
)

pipeline.upsert(role_arn=role)

# Start a pipeline execution
execution = pipeline.start()
execution.describe()



{'PipelineArn': 'arn:aws:sagemaker:ap-south-1:891377324517:pipeline/ChurnPredictionPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:ap-south-1:891377324517:pipeline/ChurnPredictionPipeline/execution/qv0zika1yq9f',
 'PipelineExecutionDisplayName': 'execution-1758267349891',
 'PipelineExecutionStatus': 'Executing',
 'CreationTime': datetime.datetime(2025, 9, 19, 7, 35, 49, 846000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 9, 19, 7, 35, 49, 846000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-south-1:891377324517:user-profile/d-51dx373wnss0/default-20250903T125584',
  'UserProfileName': 'default-20250903T125584',
  'DomainId': 'd-51dx373wnss0',
  'IamIdentity': {'Arn': 'arn:aws:sts::891377324517:assumed-role/AmazonSageMaker-ExecutionRole-20250903T125584/SageMaker',
   'PrincipalId': 'AROA47CR2SHSWJ6BV5G2F:SageMaker'}},
 'LastModifiedBy': {'UserProfileArn': 'arn:aws:sagemaker:ap-south-1:891377324517:user-profile/d-51dx373wnss0/defaul

In [63]:
# Print one "<name> → <status>" line per pipeline step, followed by the
# failure reason for any step that reports one.
steps = execution.list_steps()
for step in steps:
    print(f"{step['StepName']} → {step['StepStatus']}")
    if 'FailureReason' not in step:
        continue
    print(f"   Reason: {step['FailureReason']}")


TuneLogisticRegression → Succeeded
TuneRandomForest → Executing
ChurnDataPrep → Succeeded
