In [None]:
# The SageMaker Studio environment comes with most of these pre-installed.
# This cell ensures all dependencies are present.
!pip install -q boto3 sagemaker mlflow "scikit-learn>=1.0" "pandas>=1.2"

In [2]:
import sys
import subprocess

# Ensure MLflow and the sagemaker_mlflow plugin are importable.
# If either import fails, install a pinned package set with the pip of the
# current interpreter (sys.executable) and retry both imports.
try:
    import mlflow
    import sagemaker_mlflow
except ImportError:
    print("Installing MLflow...")
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, "-m", "pip", "install",  "boto3==1.37.1", "botocore==1.37.1", "s3transfer", "mlflow==2.22.0", "sagemaker-mlflow==0.1.0"])
    import mlflow
    import sagemaker_mlflow

In [3]:
pip show sagemaker_mlflow

Name: sagemaker-mlflow
Version: 0.1.0
Summary: AWS Plugin for MLFlow with SageMaker
Home-page: https://github.com/aws/sagemaker-mlflow
Author: Amazon Web Services
Author-email: 
License: Apache License 2.0
Location: /opt/conda/lib/python3.12/site-packages
Requires: boto3, mlflow
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [4]:
import sagemaker
import boto3
import mlflow
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification
import os

# Setup SageMaker session and a low-level SageMaker client (default region/credentials)
sagemaker_session = sagemaker.Session()
sagemaker_client = boto3.client("sagemaker")

# --- IMPORTANT: CONFIGURE THESE VARIABLES ---
# Alternative (disabled): use the session's default bucket instead of a named one.
# s3_bucket = sagemaker_session.default_bucket()
# ----------------------
# UPDATE THESE VARIABLES
bucket_name = 'iti113-team2-bucket'
base_folder = 'Team2'
# ----------------------

# Create the local folder that the %%writefile cells below write their scripts into
folder_path = "source"
os.makedirs(folder_path, exist_ok=True)
print(f"Folder created (or already exists): {folder_path}")

s3_client = boto3.client('s3')

# Base S3 prefix for this project's data
data_path = f"s3://{bucket_name}/{base_folder}"

# Name of the SageMaker-managed MLflow tracking server to look up
tracking_server_name = "mlflow-ITI113-Team2"

# Resolve the tracking server ARN from its name.
# NOTE: on any failure the error is printed and the ARN falls back to None;
# that None is later passed as the "tracking_server_arn" training hyperparameter.
try:
    response = sagemaker_client.describe_mlflow_tracking_server(
        TrackingServerName=tracking_server_name
    )
    tracking_server_arn = response['TrackingServerArn']
    print(f"Found MLflow Tracking Server ARN: {tracking_server_arn}")
except Exception as e:
    print(f"Could not find tracking server: {e}")
    tracking_server_arn = None

# ARN of your MLflow Tracking Server (None if the lookup above failed)
mlflow_tracking_server_arn = tracking_server_arn

# IAM role for SageMaker execution
role = sagemaker.get_execution_role()

# Summary of the resolved configuration (the "S3 Bucket" line prints the full data prefix)
print(f"S3 Bucket: {data_path}")
print(f"SageMaker Role ARN: {role}")
print(f"MLflow Tracking Server ARN: {mlflow_tracking_server_arn}")

Folder created (or already exists): source
Found MLflow Tracking Server ARN: arn:aws:sagemaker:ap-southeast-1:837028399719:mlflow-tracking-server/mlflow-ITI113-Team2
S3 Bucket: s3://iti113-team2-bucket/Team2
SageMaker Role ARN: arn:aws:iam::837028399719:role/iti113-team2-sagemaker-iti113-team2-domain-iti113-team2-Role
MLflow Tracking Server ARN: arn:aws:sagemaker:ap-southeast-1:837028399719:mlflow-tracking-server/mlflow-ITI113-Team2


-----
### Upload dataset to S3

In [5]:
# Upload the local CSV to S3 and keep the URI of the folder that contains it;
# the preprocessing step takes that folder (not the file) as its input.
dataset_key = f'{base_folder}/data/Team2Dataset.csv'
s3_client.upload_file('Team2Dataset.csv', bucket_name, dataset_key)

s3_path = f"s3://{bucket_name}/{dataset_key}"
data_s3_uri = os.path.dirname(s3_path)  # directory part of the object URI

print(data_s3_uri)

s3://iti113-team2-bucket/Team2/data


-----

### Creating the SageMaker Pipeline

Create the pipeline scripts that will be executed as steps in our SageMaker Pipeline.

#### Preprocessing Script

This script takes the raw data, preprocesses it, splits it into training and testing sets, and saves them back to S3.

In [6]:
%%writefile source/preprocess.py

import argparse
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import os

def preprocess(df):
    """Clean and encode the raw dataset and return the result as a new DataFrame.

    Steps, in order:
      1. Drop the ``patientid`` identifier column.
      2. Replace ``oldpeak`` with ``oldpeak_log = log1p(max(oldpeak, 0))``.
      3. For each numeric column: treat zeros in ``restingBP`` and
         ``serumcholestrol`` as missing, fill missing values with the column
         median, then cap values to ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``.
      4. One-hot encode ``chestpain``, ``restingrelectro``, ``slope`` and
         ``noofmajorvessels``, keeping every dummy column.

    The input frame is not modified.

    Args:
        df: DataFrame containing at least the columns named above.

    Returns:
        A new DataFrame with the cleaned numeric columns and dummy columns.

    Raises:
        KeyError: if an expected column is missing.
    """
    print('preprocess-start')
    print(f"Dataset shape: {df.shape}")

    # drop() returns a new frame, so every later assignment leaves the
    # caller's DataFrame untouched.
    df = df.drop(columns=['patientid'])

    # Negative oldpeak values are floored at 0 before the log transform.
    df['oldpeak_log'] = np.log1p(df['oldpeak'].clip(lower=0))
    df = df.drop(columns=['oldpeak'])
    print('transform oldpeak-end')

    # NOTE(review): the median and IQR bounds below are computed from whatever
    # frame is passed in. The script entry point calls this on the full dataset
    # before the train/test split, so test rows influence these statistics.
    num_cols = ['age', 'restingBP', 'serumcholestrol', 'maxheartrate', 'oldpeak_log']
    zero_is_missing = ('restingBP', 'serumcholestrol')  # zero Cholesterol/RestingBP is invalid
    for col in num_cols:
        if col in zero_is_missing:
            df[col] = df[col].replace(0, np.nan)

        # Impute NaNs with the median
        df[col] = df[col].fillna(df[col].median())

        # IQR-based outlier capping (winsorization)
        q1 = df[col].quantile(0.25)
        q3 = df[col].quantile(0.75)
        iqr = q3 - q1
        df[col] = df[col].clip(lower=q1 - 1.5 * iqr, upper=q3 + 1.5 * iqr)
    print('impute and remove outlier-end')

    # One-hot encode the categorical columns in a single call; all dummy
    # columns are kept (drop_first=False) and appended in the listed order.
    df = pd.get_dummies(
        df,
        columns=['chestpain', 'restingrelectro', 'slope', 'noofmajorvessels'],
        drop_first=False,
    )
    print('encoding-end')

    print(f"Dataset: {df.head(2)}")
    print('preprocess-end')

    return df
    
if __name__ == "__main__":
    # All three path flags are optional; a missing (or empty) value falls back
    # to the corresponding SageMaker Processing container directory.
    cli = argparse.ArgumentParser()
    for flag, description in (
        ("--input-path", "Directory containing Team2Dataset.csv"),
        ("--output-train-path", "Output directory for train.csv"),
        ("--output-test-path", "Output directory for test.csv"),
    ):
        cli.add_argument(flag, type=str, help=description)
    opts = cli.parse_args()

    source_dir = opts.input_path or "/opt/ml/processing/input"
    train_dir = opts.output_train_path or "/opt/ml/processing/train"
    test_dir = opts.output_test_path or "/opt/ml/processing/test"

    # Load and clean the raw dataset.
    raw_csv = os.path.join(source_dir, "Team2Dataset.csv")
    print(f"Reading input file from {raw_csv}...")
    cleaned = preprocess(pd.read_csv(raw_csv))

    # Fixed 80/20 split with a fixed seed so reruns produce the same partition.
    print("Splitting into train/test...")
    train_part, test_part = train_test_split(cleaned, test_size=0.2, random_state=42)

    for out_dir in (train_dir, test_dir):
        os.makedirs(out_dir, exist_ok=True)

    train_csv = os.path.join(train_dir, "train.csv")
    test_csv = os.path.join(test_dir, "test.csv")

    print(f"Saving train to {train_csv}")
    train_part.to_csv(train_csv, index=False)

    print(f"Saving test to {test_csv}")
    test_part.to_csv(test_csv, index=False)

    print("Preprocessing complete.")


Overwriting source/preprocess.py


#### Training Script

This script will train a model on the preprocessed data and log the results to MLflow.

In [11]:
%%writefile source/train.py

import sys
import subprocess

# Ensure MLflow and the sagemaker_mlflow plugin are importable inside the
# training container. If either import fails, install a pinned package set
# with the current interpreter's pip and retry both imports.
try:
    import mlflow
    import sagemaker_mlflow
except ImportError:
    print("Installing MLflow...")
    # check_call raises CalledProcessError if pip exits with a non-zero status.
    subprocess.check_call([sys.executable, "-m", "pip", "install",  "boto3==1.37.1", "botocore==1.37.1", "s3transfer", "mlflow==2.22.0", "sagemaker-mlflow==0.1.0"])
    import mlflow
    import sagemaker_mlflow
    
import mlflow.sklearn
import os
import argparse
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import joblib
import glob

# --- Hyperparameters / CLI ---
# parse_known_args is used so that any extra arguments passed to the script
# are ignored instead of aborting the job.
parser = argparse.ArgumentParser()
parser.add_argument("--tracking_server_arn", type=str, required=True)
parser.add_argument("--experiment_name", type=str, default="Default")
parser.add_argument("--model_output_path", type=str, default="/opt/ml/model")
parser.add_argument("-C", "--C", type=float, default=0.5)
parser.add_argument("--run_name", type=str, default="Experiment-LR")
args, _ = parser.parse_known_args()

print('Start-Train')
# Load training data. Fail with an explicit message when the channel holds no
# CSV, and sort the matches so the chosen file is deterministic.
train_files = sorted(glob.glob("/opt/ml/input/data/train/*.csv"))
if not train_files:
    raise FileNotFoundError(
        "No CSV file found in the 'train' channel directory: /opt/ml/input/data/train"
    )
train_path = train_files[0]
df = pd.read_csv(train_path)
print(df.head())

# Features are every column except the "target" label.
X = df.drop("target", axis=1)
y = df["target"]

# Set up MLflow: the tracking URI is the tracking server ARN passed in as a hyperparameter.
mlflow.set_tracking_uri(args.tracking_server_arn)
mlflow.set_experiment(args.experiment_name)

with mlflow.start_run(run_name=args.run_name) as run:
    mlflow.log_param("C", args.C)
    model = LogisticRegression(C=args.C)
    model.fit(X, y)
    # NOTE: this accuracy is measured on the training data itself; the
    # held-out evaluation happens in the separate evaluation script.
    acc = accuracy_score(y, model.predict(X))
    mlflow.log_metric("accuracy", acc)

    mlflow.sklearn.log_model(sk_model=model, artifact_path="model")

    # Persist the model (and the MLflow run id) to the model output directory.
    os.makedirs(args.model_output_path, exist_ok=True)
    joblib.dump(model, os.path.join(args.model_output_path, "model.joblib"))
    with open(os.path.join(args.model_output_path, "run_id.txt"), "w") as f:
        f.write(run.info.run_id)

    print(f"Training complete. Accuracy: {acc:.4f}")
    print(f"MLflow Run ID: {run.info.run_id}")


Overwriting source/train.py


#### Evaluation Script

This script evaluates the model and creates an evaluation report.

In [8]:
%%writefile source/evaluate.py
import argparse
import pandas as pd
from sklearn.metrics import accuracy_score
import joblib
import os
import json
import boto3
import tarfile

if __name__ == "__main__":
    # --- Command-line interface (every argument is mandatory) ---
    cli = argparse.ArgumentParser()
    cli.add_argument("--model-path", type=str, required=True, help="Path to the directory containing the model.tar.gz file.")
    cli.add_argument("--test-path", type=str, required=True, help="Path to the directory containing test.csv.")
    cli.add_argument("--output-path", type=str, required=True, help="Path to save the evaluation.json report.")
    cli.add_argument("--model-package-group-name", type=str, required=True, help="Name of the SageMaker Model Package Group.")
    cli.add_argument("--region", type=str, required=True, help="The AWS region for creating the boto3 client.")
    opts = cli.parse_args()

    # --- Unpack the model archive next to itself and load model.joblib ---
    archive_path = os.path.join(opts.model_path, 'model.tar.gz')
    print(f"Extracting model from archive: {archive_path}")
    with tarfile.open(archive_path, "r:gz") as bundle:
        bundle.extractall(path=opts.model_path)

    joblib_path = os.path.join(opts.model_path, "model.joblib")
    if not os.path.exists(joblib_path):
        raise FileNotFoundError(f"Model file 'model.joblib' not found after extraction in: {opts.model_path}")

    print(f"Loading model from: {joblib_path}")
    classifier = joblib.load(joblib_path)

    # --- Score the held-out test set ---
    test_csv = os.path.join(opts.test_path, "test.csv")
    if not os.path.exists(test_csv):
        raise FileNotFoundError(f"Test data not found: {test_csv}")

    holdout = pd.read_csv(test_csv)
    features = holdout.drop("target", axis=1)
    labels = holdout["target"]

    print("Running predictions on the test dataset.")
    accuracy = accuracy_score(labels, classifier.predict(features))
    report = {"accuracy": accuracy}
    print(f"Calculated accuracy: {accuracy:.4f}")

    # --- Look for an already-approved model in the Model Registry ---
    print(f"Checking for baseline model in region: {opts.region}")
    registry = boto3.client("sagemaker", region_name=opts.region)
    group_name = opts.model_package_group_name
    try:
        approved = registry.list_model_packages(
            ModelPackageGroupName=group_name,
            ModelApprovalStatus="Approved",
            SortBy="CreationTime",
            SortOrder="Descending",
            MaxResults=1,
        )["ModelPackageSummaryList"]
    except registry.exceptions.ClientError as err:
        # Only a "ResourceNotFound" error is treated as "no baseline";
        # every other client error is re-raised.
        if "ResourceNotFound" not in str(err):
            raise
        report["baseline_exists"] = False
        print(f"Model Package Group '{group_name}' not found. Assuming no baseline exists.")
    else:
        # A non-empty list means at least one approved model already exists.
        report["baseline_exists"] = len(approved) > 0
        if report["baseline_exists"]:
            print(f"An approved baseline model was found in '{group_name}'.")
        else:
            print(f"No approved baseline model was found in '{group_name}'.")

    # --- Write evaluation.json into the output directory ---
    os.makedirs(opts.output_path, exist_ok=True)
    report_file = os.path.join(opts.output_path, "evaluation.json")
    with open(report_file, "w") as handle:
        json.dump(report, handle, indent=4)

    print(f"Evaluation complete. Report written to: {report_file}")
    print("Evaluation Report:")
    print(json.dumps(report, indent=4))

Overwriting source/evaluate.py


### Pipeline Definition

Define the SageMaker Pipeline using the scripts we just created.

In [12]:
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.steps import ProcessingStep, TrainingStep, TrainingInput
from sagemaker.processing import ScriptProcessor, ProcessingInput, ProcessingOutput
from sagemaker.workflow.properties import PropertyFile
from sagemaker.sklearn.estimator import SKLearn
from sagemaker.workflow.step_collections import RegisterModel
from sagemaker.workflow.conditions import ConditionGreaterThanOrEqualTo
from sagemaker.workflow.conditions import ConditionNot
from sagemaker.workflow.condition_step import ConditionStep
from sagemaker.workflow.conditions import ConditionEquals
from sagemaker.workflow.functions import JsonGet
from sagemaker.workflow.functions import Join
from sagemaker.workflow.parameters import ParameterFloat, ParameterString
from sagemaker.model_metrics import ModelMetrics, FileSource

# Parameters
# Plain Python values are fixed when the pipeline definition is built;
# Parameter* objects can be overridden per execution via pipeline.start(parameters=...).
model_package_group_name = "Team2PredictorModels"
processing_instance_type = "ml.t3.medium"
training_instance_type = "ml.m5.large"
experiment_name_param = ParameterString(name="ExperimentName", default_value="Team2-Prediction")
accuracy_threshold_param = ParameterFloat(name="AccuracyThreshold", default_value=0.80)
model_c_param = ParameterFloat(name="C", default_value=0.5)
run_name_param = ParameterString(name="RunName", default_value="Experiment-LR")

# S3 destinations for the train/test CSVs written by the preprocessing step
s3_process_train_path = f"s3://{bucket_name}/{base_folder}/processing/train"
s3_process_test_path = f"s3://{bucket_name}/{base_folder}/processing/test"
# print(s3_process_train_path)
# print(s3_process_test_path)

# Processor that runs preprocess.py in the scikit-learn 1.2-1 container
preprocessor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve("sklearn", sagemaker_session.boto_region_name, "1.2-1"),
    command=[
        "python3",
    ],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name="preprocess-data",
    role=role,
)

# Preprocessing step: the S3 data folder is mounted at /opt/ml/processing/input and
# the container's train/ and test/ directories are uploaded to the S3 paths above.
# No job_arguments are passed, so preprocess.py uses its built-in default paths.
step_preprocess = ProcessingStep(
    name="PreprocessData",
    processor=preprocessor,
    inputs=[ProcessingInput(source=data_s3_uri, destination="/opt/ml/processing/input")],
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/train", destination=s3_process_train_path),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/test", destination=s3_process_test_path),
    ],
    code="source/preprocess.py",
)

# Training Step
# The hyperparameter keys match the command-line flags parsed by train.py.
# NOTE(review): no cell in this notebook writes source/requirements.txt, and
# train.py installs its own MLflow dependencies on ImportError — confirm that the
# `requirements` argument below is accepted by the estimator and actually needed.
sklearn_estimator = SKLearn(
    entry_point="train.py",
    source_dir="source",
    framework_version="1.2-1",
    instance_type=training_instance_type,
    role=role,
    hyperparameters={
        "tracking_server_arn": mlflow_tracking_server_arn,
        "experiment_name": experiment_name_param,
        "C": model_c_param,
        "model_output_path": "/opt/ml/model",
        "run_name": run_name_param,
    },
    py_version="py3",
    requirements="source/requirements.txt"
)

# The "train" channel is fed from the preprocessing step's "train" output.
step_train = TrainingStep(
    name="TrainModel",
    estimator=sklearn_estimator,
    inputs={
        "train": TrainingInput(
            s3_data=step_preprocess.properties.ProcessingOutputConfig.Outputs["train"].S3Output.S3Uri,
            content_type="text/csv",
        )
    },
)

# Evaluation Step
# Runs evaluate.py in the same scikit-learn container used for preprocessing.
evaluation_processor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve("sklearn", sagemaker_session.boto_region_name, "1.2-1"),
    command=['python3'],
    instance_type=processing_instance_type,
    instance_count=1,
    base_job_name="evaluate-model",
    role=role,
)

# Exposes evaluation.json (from the "evaluation" output) to JsonGet conditions.
evaluation_report = PropertyFile(
    name="EvaluationReport", output_name="evaluation", path="evaluation.json"
)

step_eval = ProcessingStep(
    name="EvaluateModel",
    processor=evaluation_processor,
    inputs=[
        # model.tar.gz produced by the training step
        ProcessingInput(
            source=step_train.properties.ModelArtifacts.S3ModelArtifacts,
            destination="/opt/ml/processing/model",
        ),
        # test.csv produced by the preprocessing step
        ProcessingInput(
            source=step_preprocess.properties.ProcessingOutputConfig.Outputs["test"].S3Output.S3Uri,
            destination="/opt/ml/processing/test",
        ),
    ],
    outputs=[ProcessingOutput(output_name="evaluation", source="/opt/ml/processing/evaluation")],
    code="source/evaluate.py",  # SageMaker will handle uploading and running this script
    job_arguments=[  # Pass arguments here instead of in command
        "--model-path", "/opt/ml/processing/model",
        "--test-path", "/opt/ml/processing/test",
        "--output-path", "/opt/ml/processing/evaluation",
        "--model-package-group-name", model_package_group_name,
        # Use the session's region (as the image lookup above does) rather than
        # a hard-coded region string.
        "--region", sagemaker_session.boto_region_name,
    ],
    property_files=[evaluation_report],
)


# Model quality metrics attached to the registered model package.
# The processing output's S3Uri is the output *prefix*; the metrics source must
# point at the evaluation.json object inside it. Because the prefix is only
# resolved at execution time, the path is composed with Join.
model_metrics_report = ModelMetrics(
    model_statistics=FileSource(
        s3_uri=Join(
            on="/",
            values=[
                step_eval.properties.ProcessingOutputConfig.Outputs["evaluation"].S3Output.S3Uri,
                "evaluation.json",
            ],
        ),
        content_type="application/json",
    )
)


def _build_register_step(step_name):
    """Return a RegisterModel step that registers the trained model.

    Both registration branches of the pipeline use identical settings and differ
    only in step name, so they are built from this single definition.

    Args:
        step_name: Unique name of the step inside the pipeline.

    Returns:
        A RegisterModel step collection targeting `model_package_group_name`,
        with approval status "PendingManualApproval".
    """
    return RegisterModel(
        name=step_name,
        estimator=sklearn_estimator,
        model_data=step_train.properties.ModelArtifacts.S3ModelArtifacts,
        content_types=["application/json"],
        response_types=["application/json"],
        inference_instances=["ml.t2.medium"],
        transform_instances=["ml.m5.large"],
        # Same variable that is passed to evaluate.py for the baseline lookup,
        # so the registry group cannot drift between the two.
        model_package_group_name=model_package_group_name,
        model_metrics=model_metrics_report,
        approval_status="PendingManualApproval",
    )


# RegisterModel steps (always defined, but executed conditionally)
step_register_new = _build_register_step("RegisterNewModel")
step_register_better_model = _build_register_step("RegisterBetterModel")


# Conditions: accuracy >= threshold, OR no approved baseline model exists.
# Both values are read from evaluation.json written by the evaluation step.
cond_accuracy = ConditionGreaterThanOrEqualTo(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="accuracy"
    ),
    right=accuracy_threshold_param
)

cond_no_registered = ConditionEquals(
    left=JsonGet(
        step_name=step_eval.name,
        property_file=evaluation_report,
        json_path="baseline_exists" # Check the key added to the report
    ),
    right=False # Condition is TRUE if baseline_exists is False
)

# Inner step (runs only when a baseline exists): register the model when its
# accuracy meets the AccuracyThreshold parameter; otherwise do nothing.
# The new accuracy is compared with the threshold, not with the baseline model.
step_cond_accuracy = ConditionStep(
    name="CheckAccuracy",
    conditions=[cond_accuracy],
    if_steps=[step_register_better_model], # Register model if accuracy is high
    else_steps=[],
)

# Outer step: checks for existence of an approved registered model first
step_cond_no_registered = ConditionStep(
    name="CheckIfModelExists",
    conditions=[cond_no_registered],
    if_steps=[step_register_new],  # Register model if no baseline exists
    else_steps=[step_cond_accuracy],
)


# Define Pipeline
pipeline = Pipeline(
    name="Team2PredictionPipeline",
    parameters=[experiment_name_param, accuracy_threshold_param, model_c_param, run_name_param],
    steps=[step_preprocess, step_train, step_eval, step_cond_no_registered] # Use the 'no registered model' check as the primary condition step
)

# Create the pipeline, or update its definition if it already exists
pipeline.upsert(role_arn=role)
print("Team2PredictionPipeline is defined and ready to be executed.")

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.
INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


Team2PredictionPipeline is defined and ready to be executed.


### Execute Pipeline

In [None]:
# Start one execution, overriding only "C" and "RunName"; the other pipeline
# parameters (ExperimentName, AccuracyThreshold) keep their default values.
execution = pipeline.start(
    parameters={
        "C": 0.5,
        "RunName": "Experiment-v1"
    }
)

print("Pipeline execution.")
print(execution.arn)
# Block until the execution finishes, then show its description as the cell output
execution.wait()
execution.describe()

Pipeline execution.
arn:aws:sagemaker:ap-southeast-1:837028399719:pipeline/Team2PredictionPipeline/execution/24pbe7v9t9ha


### Team2PredictionPipeline - Graph

### Experiment Tracking

### Experiment 1

### Experiment 2

Both models, from the 'C-1.0' and 'C-0.5' runs, achieved an accuracy of 0.8555.
The model from the 'C-0.5' run was selected.

### Create Deployment Pipeline

Create a separate pipeline that is triggered by a new model registration. This pipeline will deploy the model to a SageMaker endpoint.

In [None]:
%%writefile source/inference.py
import os
import joblib
import json
import pandas as pd

def model_fn(model_dir):
    """Load the trained model from ``model_dir`` for the inference container.

    Args:
        model_dir: Directory containing ``model.joblib`` (the file name the
            training script saves the model under).

    Returns:
        The object deserialized from ``model.joblib``.

    Raises:
        Exception: whatever the load raises (for example ``FileNotFoundError``).
            The error is logged and re-raised so a broken artifact fails at
            model load time instead of returning ``None`` and failing later
            inside ``predict_fn`` with an unrelated-looking error.
    """
    print("Loading model from a .joblib file.")
    # The model is saved as 'model.joblib' in your training script.
    model_path = os.path.join(model_dir, "model.joblib")
    try:
        model = joblib.load(model_path)
    except Exception as e:
        print(f"Error loading model: {e}")
        raise
    print("Model loaded successfully.")
    return model

def input_fn(request_body, request_content_type):
    """Deserialize a JSON request body into a DataFrame.

    Expected body: ``{"data": [row, row, ...]}`` where the rows are passed
    straight to ``pd.DataFrame`` (for example a list of row dictionaries).

    Args:
        request_body: JSON document as ``str`` or ``bytes``.
        request_content_type: Request content type. Media-type parameters such
            as ``; charset=utf-8`` and letter case are ignored.

    Returns:
        A DataFrame built from the ``"data"`` list.

    Raises:
        ValueError: if the content type is not JSON, the body is not valid
            JSON, or it has no ``"data"`` list.
    """
    print(f"Received request of type: {request_content_type}")
    # Compare only the media type, so "application/json; charset=utf-8" is accepted.
    media_type = (request_content_type or "").split(";", 1)[0].strip().lower()
    if media_type != 'application/json':
        raise ValueError(f"Unsupported content type: {request_content_type}")

    try:
        data = json.loads(request_body)

        if not isinstance(data, dict) or not isinstance(data.get("data"), list):
            raise ValueError("JSON must contain a 'data' field with a list of row dictionaries.")

        return pd.DataFrame(data['data'])

    except Exception as e:
        # Chain the original exception so its traceback is kept in the logs.
        raise ValueError(f"Error parsing JSON: {e}") from e

def predict_fn(input_data, model):
    """Run ``model.predict`` on ``input_data`` and return its result.

    Any exception raised by the model is reported as a ``ValueError`` whose
    message starts with "Error during prediction:".
    """
    print("Making predictions on the input data.")
    try:
        return model.predict(input_data)
    except Exception as e:
        raise ValueError(f"Error during prediction: {e}")

def output_fn(prediction, response_content_type):
    """Serialize predictions as the JSON document ``{"predictions": [...]}``.

    ``prediction`` must provide ``tolist()`` (e.g. a numpy array). Only the
    'application/json' response type is supported; anything else, or a
    serialization failure, raises ``ValueError``.
    """
    print(f"Serializing prediction to: {response_content_type}")
    if response_content_type != 'application/json':
        raise ValueError(f"Unsupported response content type: {response_content_type}")

    try:
        # Convert the array to a plain list and wrap it in a JSON object.
        return json.dumps({"predictions": prediction.tolist()})
    except Exception as e:
        raise ValueError(f"Error serializing prediction to JSON: {e}")

#### Deployment Script

This script will take the registered model and deploy it.

In [None]:
%%writefile source/deploy.py
import subprocess
import sys
import os


# --- Install required packages ---
def install(package):
    """Install the pinned runtime dependencies plus ``package`` with pip.

    The pinned set (boto3, botocore, numpy, sagemaker) is always installed in
    the same pip invocation so the versions are resolved together. ``package``
    is appended when it is not already part of that set, so
    ``install("sagemaker")`` runs exactly the pinned command.

    Args:
        package: Requirement specifier to install, e.g. ``"sagemaker"``.

    Raises:
        subprocess.CalledProcessError: if pip exits with a non-zero status.
    """
    requirements = ["boto3==1.28.57", "botocore==1.31.57", "numpy==1.24.1", "sagemaker"]
    if package and package not in requirements:
        requirements.append(package)
    subprocess.check_call([sys.executable, "-m", "pip", "install", *requirements])

# Ensure sagemaker SDK is installed before importing.
# On ImportError the SDK is installed via install() and the import is retried;
# a failure of that second import is not caught.
try:
    import sagemaker
except ImportError:
    print("sagemaker SDK not found. Installing now...")
    install("sagemaker")
    import sagemaker

import argparse
import sagemaker
import boto3
from sagemaker.model import Model
import shutil

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    # Accept the registered model's ARN instead of the S3 data path
    parser.add_argument("--model-package-arn", type=str, required=True)
    parser.add_argument("--role", type=str, required=True)
    parser.add_argument("--endpoint-name", type=str, required=True)
    parser.add_argument("--region", type=str, required=True)
    args = parser.parse_args()

    boto_session = boto3.Session(region_name=args.region)
    sagemaker_session = sagemaker.Session(boto_session=boto_session)
    sm_client = boto3.client("sagemaker", region_name=args.region)

    # --- Step 1: Get Model Artifacts from the Model Package ---
    print(f"Describing model package: {args.model_package_arn}")
    model_package_description = sm_client.describe_model_package(ModelPackageName=args.model_package_arn)

    # Only the first container of the inference specification is used.
    container = model_package_description["InferenceSpecification"]["Containers"][0]

    # Extract the S3 path to the model artifacts (model.tar.gz)
    model_artifacts = container["ModelDataUrl"]

    # Extract the container image URI
    image_uri = container["Image"]

    print(f"Found model artifacts at: {model_artifacts}")
    print(f"Using container image: {image_uri}")

    # --- Step 2: Prepare a clean directory for the inference code ---
    original_code_location = "/opt/ml/processing/input/scripts"
    inference_script_path = os.path.join(original_code_location, "inference.py")
    clean_code_dir = "/tmp/code"

    if not os.path.exists(inference_script_path):
        raise FileNotFoundError(f"inference.py not found at {inference_script_path}. Did you include it via ProcessingInput?")

    # Create the clean directory, removing it first if it exists
    if os.path.exists(clean_code_dir):
        shutil.rmtree(clean_code_dir)
    os.makedirs(clean_code_dir)

    # Copy only the inference script to the clean directory
    shutil.copy(inference_script_path, clean_code_dir)
    print(f"Copied inference.py to clean dir: {clean_code_dir}")

    # --- Step 3: Create a SageMaker Model object using the local inference.py ---
    # This explicitly tells SageMaker to use your provided inference script.
    model = Model(
        image_uri=image_uri,
        model_data=model_artifacts, # Use artifacts from the registered model
        role=args.role,
        sagemaker_session=sagemaker_session,
        entry_point="inference.py",  # Explicitly use your inference script
        source_dir=clean_code_dir         # Directory containing inference.py
    )

    # --- Step 4: Remove any existing endpoint / endpoint config of the same name ---
    # The two deletions are attempted independently, so a leftover endpoint
    # config is still removed when the endpoint itself is already gone.
    try:
        sm_client.delete_endpoint(EndpointName=args.endpoint_name)
        # Endpoint deletion is asynchronous: wait until it has actually
        # disappeared before an endpoint with the same name is created again.
        sm_client.get_waiter("endpoint_deleted").wait(EndpointName=args.endpoint_name)
        print(f"Deleted existing endpoint: {args.endpoint_name}")
    except sm_client.exceptions.ClientError as e:
        # If the endpoint doesn't exist, that's fine.
        if "Could not find" not in str(e):
            raise

    try:
        sm_client.delete_endpoint_config(EndpointConfigName=args.endpoint_name)
        print(f"Deleted existing endpoint config: {args.endpoint_name}")
    except sm_client.exceptions.ClientError as e:
        # If the endpoint config doesn't exist, that's fine.
        if "Could not find" not in str(e):
            raise

    # --- Step 5: Deploy the model to a fresh endpoint ---
    # The old endpoint was deleted above, so this is a plain create rather than
    # an in-place update of an existing endpoint.
    print(f"Deploying registered model from ARN to endpoint: {args.endpoint_name}")
    model.deploy(
        initial_instance_count=1,
        instance_type="ml.t2.medium",
        endpoint_name=args.endpoint_name,
    )
    print("Deployment complete.")


#### Deployment Pipeline Definition

This pipeline will be triggered when a new model is registered.

In [None]:
from sagemaker.workflow.steps import ProcessingStep
from sagemaker.processing import ScriptProcessor, ProcessingInput
from sagemaker.workflow.pipeline import Pipeline
from sagemaker.workflow.parameters import ParameterString
import sagemaker

# Define Parameters for the deployment pipeline
# ModelPackageArn will be provided by the EventBridge trigger (or manually at start)
model_package_arn_param = ParameterString(name="ModelPackageArn", default_value="")
role_param = ParameterString(name="ExecutionRole", default_value=role)
endpoint_name_param = ParameterString(name="EndpointName", default_value="heartdisease-predictor-endpoint")

# Create a ScriptProcessor for deployment; it runs deploy.py in the
# scikit-learn 1.2-1 container
deploy_processor = ScriptProcessor(
    image_uri=sagemaker.image_uris.retrieve("sklearn", sagemaker_session.boto_region_name, version="1.2-1"),
    command=["python3"],
    instance_type="ml.t3.medium",
    instance_count=1,
    role=role_param,
    base_job_name="deploy-registered-model"
)

# Define the deployment step that takes the model ARN as an argument.
# The whole local source/ folder is mounted at /opt/ml/processing/input/scripts,
# which is where deploy.py looks for inference.py.
step_deploy = ProcessingStep(
    name="DeployRegisteredModel",
    processor=deploy_processor,
    inputs=[ProcessingInput(source="source/", destination="/opt/ml/processing/input/scripts")],
    code="source/deploy.py",
    job_arguments=[
        "--model-package-arn", model_package_arn_param,
        "--role", role_param,
        "--endpoint-name", endpoint_name_param,
        # Use the session's region (as the image lookup above does) rather than
        # a hard-coded region string.
        "--region", sagemaker_session.boto_region_name,
    ]
)

# Define the independent deployment pipeline
deploy_pipeline = Pipeline(
    name="HeartDiseaseDeployPipeline",
    parameters=[model_package_arn_param, role_param, endpoint_name_param],
    steps=[step_deploy],
)

# Create or update the pipeline definition
# Capture the response which contains the ARN
response = deploy_pipeline.upsert(role_arn=role)

# Extract the ARN from the response dictionary
pipeline_arn = response['PipelineArn']

print(f"Deployment pipeline ARN: {pipeline_arn}")

### Deploy endpoint manually

In [None]:
# Use the group the training pipeline registers into (model_package_group_name);
# a different literal here would look up a group that pipeline never writes to.
response = sagemaker_client.list_model_packages(ModelPackageGroupName=model_package_group_name)

# Approve every model package that is still waiting for manual approval
for package in response["ModelPackageSummaryList"]:
    if package["ModelApprovalStatus"] == "PendingManualApproval":
        print("Approving:", package["ModelPackageArn"])
        sagemaker_client.update_model_package(
            ModelPackageArn=package["ModelPackageArn"],
            ModelApprovalStatus="Approved",
            ApprovalDescription="Auto-approved after validation."
        )

# Fetch the most recently created approved model; the sort order is requested
# explicitly instead of relying on the API default.
response = sagemaker_client.list_model_packages(
    ModelPackageGroupName=model_package_group_name,
    ModelApprovalStatus="Approved",
    SortBy="CreationTime",
    SortOrder="Descending",
    MaxResults=1,
)
approved_packages = response["ModelPackageSummaryList"]
if not approved_packages:
    # Fail with a clear message instead of a NameError further down.
    raise RuntimeError(
        f"No approved model package found in group '{model_package_group_name}'."
    )
model_package_arn = approved_packages[0]["ModelPackageArn"]
print("Approved ModelPackageArn:", model_package_arn)

# Start the deployment pipeline for that model package and wait for it to finish
execution = deploy_pipeline.start(
    parameters={
        "ModelPackageArn": model_package_arn
    }
)
print("Pipeline execution.")
print(execution.arn)
execution.wait()
execution.describe()

### DeployPipeline Graph

### Invoking the Endpoint

In [None]:
import boto3
import json
import numpy as np
import pandas as pd
import sagemaker

# Define endpoint name using the same name deployed
endpoint_name = "heartdisease-predictor-endpoint"

aws_region = "ap-southeast-1"

# 1. Create a client to interact with the SageMaker endpoint
sagemaker_runtime_client = boto3.client("sagemaker-runtime", region_name=aws_region)

# 2. Prepare your test data (payload)
s3 = boto3.client("s3")
sagemaker_session = sagemaker.Session()
bucket_name = 'iti113-team2-bucket'
base_folder = 'Team2'

# Test with the already-preprocessed test.csv written by the preprocessing step
s3_process_test_path = f"s3://{bucket_name}/{base_folder}/processing/test/test.csv"
df = pd.read_csv(s3_process_test_path)
# The label column in this dataset is "target" (the same column the training
# and evaluation scripts drop); it must not be sent to the model as a feature.
df = df.drop("target", axis=1)

# Send the first five rows as a list of row dictionaries under the "data" key
sample_data_point = df.head(5).to_dict(orient="records")
payload = {"data": sample_data_point}
print(f"Sending payload: {json.dumps(payload)}")

# 3. Invoke the endpoint and get the prediction
try:
    response = sagemaker_client.describe_endpoint(EndpointName=endpoint_name)
    endpoint_arn = response["EndpointArn"]
    print("\nEndpoint ARN:", endpoint_arn)

    response = sagemaker_runtime_client.invoke_endpoint(
        EndpointName=endpoint_name,
        ContentType="application/json",
        Body=json.dumps(payload) # Serialize the payload to a JSON string
    )

    # The response body is a streaming object, so we need to read and decode it
    response_body = response['Body'].read()
    result = json.loads(response_body.decode('utf-8'))

    print("\nSuccess!")
    print(f"Prediction result: {result}")

except Exception as e:
    print(f"Error invoking endpoint: {e}")


### Endpoint

### View Cloudwatch Logs

You can view the endpoint's CloudWatch logs. The cell below prints the most recent log events for the endpoint deployed above.

In [None]:
import boto3

# Name of the SageMaker endpoint whose logs should be shown
endpoint_name = "heartdisease-predictor-endpoint"

# Log group name derived from the endpoint name
log_group_name = f"/aws/sagemaker/Endpoints/{endpoint_name}"

# CloudWatch Logs client (default region/credentials)
logs_client = boto3.client("logs")

print(f"Searching for logs in: {log_group_name}\n")

try:
    # List the group's log streams, most recently active first
    response = logs_client.describe_log_streams(
        logGroupName=log_group_name,
        orderBy='LastEventTime',
        descending=True
    )
    log_streams = response.get("logStreams", [])

    if not log_streams:
        print("No log streams found. The endpoint might not have processed any requests yet.")

    # For every stream, print up to 50 of its latest events
    for stream_name in (entry['logStreamName'] for entry in log_streams):
        print(f"--- Logs from stream: {stream_name} ---")

        log_events = logs_client.get_log_events(
            logGroupName=log_group_name,
            logStreamName=stream_name,
            startFromHead=False,  # False requests the latest events
            limit=50  # Get up to 50 recent log events
        )

        # NOTE(review): reversed() flips the order in which the API returned the
        # events — confirm whether that yields chronological or newest-first output.
        for message in (item['message'].strip() for item in reversed(log_events.get("events", []))):
            print(message)

        print("-" * (len(stream_name) + 24), "\n")

except logs_client.exceptions.ResourceNotFoundException:
    print(f"Error: Log group '{log_group_name}' was not found.")
    print("Please check the endpoint name and ensure it has been invoked.")
except Exception as e:
    print(f"An error occurred: {e}")