# FastScan AWS SageMaker Pipeline Implementation

This Jupyter notebook is intended to run only in the AWS SageMaker Studio Code Editor.

https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-pipelines/tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform.html#Define-a-Processing-Step-for-Feature-Engineering

# 1. Import Packages

In [1]:

import os
import numpy
import sagemaker
import sys




sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


# 2. Create SageMaker Pipeline Session

In [32]:
from sagemaker.workflow.pipeline_context import PipelineSession

# Regular SDK session, used here for account-level lookups (region and default bucket).
sagemaker_session = sagemaker.session.Session()
region = sagemaker_session.boto_region_name
default_bucket = sagemaker_session.default_bucket()

# Pipeline-aware session handed to the processors defined later in this notebook.
pipeline_session = PipelineSession()

# IAM execution role attached to this SageMaker environment.
role = sagemaker.get_execution_role()

# Name of the model package group for this project.
model_package_group_name = "FastScanModelPackageGroupName"

# 3. Define Parameters in the Pipeline for Pipeline Execution

In [85]:
# Default S3 locations of the project's datasets, composed from shared prefixes.
_DATASETS_S3_PREFIX: str = "s3://angkokleong-bucket/datasets"
_SPLIT_IMAGES_S3_PREFIX: str = f"{_DATASETS_S3_PREFIX}/fastscandataset/images"

# Raw (unsplit) image and label data.
image_input_data_uri: str = f"{_DATASETS_S3_PREFIX}/raw_custom_image_dataset/"
label_input_data_uri: str = f"{_DATASETS_S3_PREFIX}/raw_label_data/"

# Image folders for each dataset split.
train_image_dataset_input_uri: str = f"{_SPLIT_IMAGES_S3_PREFIX}/train/"
test_image_dataset_input_uri: str = f"{_SPLIT_IMAGES_S3_PREFIX}/test/"
val_image_dataset_input_uri: str = f"{_SPLIT_IMAGES_S3_PREFIX}/val/"

In [88]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

# Pipeline parameters. Several input locations are exposed (raw images, raw labels and the
# per-split image folders); each parameter falls back to its default_value when an
# execution does not override it.

# Compute and model-approval settings.
processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")

# Raw (unsplit) image and label data.
input_image_data = ParameterString(
    name="InputImageData",
    default_value=image_input_data_uri
)

input_label_data = ParameterString(
    name="InputLabelData",
    default_value=label_input_data_uri
)

# Per-split image datasets. The defaults reference the `*_dataset_input_uri` variables that
# this notebook actually defines; the shorter `*_dataset_uri` names used previously are never
# assigned, so the cell raised NameError on a fresh kernel.
input_train_image_dataset_s3_uri = ParameterString(
    name="InputTrainImageDataset_S3_URI",
    default_value=train_image_dataset_input_uri
)

input_test_image_dataset_s3_uri = ParameterString(
    name="InputTestImageDataset_S3_URI",
    default_value=test_image_dataset_input_uri
)

input_val_image_dataset_s3_uri = ParameterString(
    name="InputValImageDataset_S3_URI",
    default_value=val_image_dataset_input_uri
)


# Metric thresholds (mAP50 and mAP50-95).
# NOTE(review): no step in this notebook consumes them yet — presumably for a later
# evaluation/condition step; confirm.
mAP50_threshold = ParameterFloat(name="mAP50Threshold", default_value=0.9)
mAP50to95_threshold = ParameterFloat(name="mAP50to95threshold", default_value=0.8)

# 4. Video Data Processing to extract video frame and convert to images

This step extracts frames from the video file and converts each extracted frame to an image.

Each image is resized from 4K resolution to 640 by 640.

In [None]:
from sagemaker.processing import FrameworkProcessor
from sagemaker.pytorch.estimator import PyTorch

# Refer to https://github.com/aws/deep-learning-containers/blob/master/available_images.md
# for the list of available framework images.
# NOTE(review): the image below is the GPU (cu124) build pinned to us-east-1, while the
# instance type is the CPU-only ml.m5.xlarge — confirm this pairing is intended.
pytorch_processor = FrameworkProcessor(
    estimator_cls=PyTorch,
    framework_version="2.5.1",
    image_uri="763104351884.dkr.ecr.us-east-1.amazonaws.com/pytorch-training:2.5.1-gpu-py311-cu124-ubuntu22.04-sagemaker",
    role=role,
    py_version="py3",
    instance_count=processing_instance_count,
    instance_type="ml.m5.xlarge",
    sagemaker_session=pipeline_session,
    # Hyphens instead of underscores: SageMaker job names accept only alphanumerics and
    # hyphens, and the other processors in this notebook already use hyphenated names.
    base_job_name="stratified-splitting-image-data-processing"
)


In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
import project_library.file_manager

# Short alias for the project helper that resolves a folder location.
_resolve_folder = project_library.file_manager.FileInformation.get_absolute_folder_location

# Project library folder on the notebook host.
project_library_folder_path: str = str(
    _resolve_folder("/home/sagemaker-user/user-default-efs/FastScan/project_library")
)

# Folders inside the processing container (Docker context).
ROOT_INPUT_FOLDER_PATH: str = str(_resolve_folder("/opt/ml/processing/input"))
ROOT_OUTPUT_FOLDER_PATH: str = str(_resolve_folder("/opt/ml/processing/output"))

aws_dataset_folder_path: str = str(_resolve_folder("/opt/ml/processing/input/aws_datasets"))
fastscan_dataset_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset")
)
raw_image_dataset_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset/raw_custom_image_dataset")
)
raw_label_dataset_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset/raw_custom_label_dataset")
)

# https://docs.aws.amazon.com/sagemaker/latest/dg/byoc-input-and-output.html (How Amazon Sagemaker processing configures input and output for your processing container)
# https://github.com/aws/amazon-sagemaker-examples/blob/main/sagemaker_processing/scikit_learn_data_processing_and_model_evaluation/scikit_learn_data_processing_and_model_evaluation.ipynb (visual guide for ProcessingInput and ProcessingOutput)

# Raw image and label data are mounted into the container at the folders resolved above.
_video_step_inputs = [
    ProcessingInput(source=input_image_data, destination=raw_image_dataset_folder_path),
    ProcessingInput(source=input_label_data, destination=raw_label_dataset_folder_path),
]

# Each container folder listed here is uploaded to its S3 destination.
_video_step_outputs = [
    ProcessingOutput(
        output_name="train",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/train",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/images/train",
    ),
    ProcessingOutput(
        output_name="test",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/test",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/images/test",
    ),
    ProcessingOutput(
        output_name="val",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/val",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/images/val",
    ),
    ProcessingOutput(
        output_name="train_label",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/train",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/train",
    ),
    ProcessingOutput(
        output_name="test_label",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/test",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/test",
    ),
    ProcessingOutput(
        output_name="val_label",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/val",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/val",
    ),
]

# Build the step arguments: the entry script plus the project library and requirements file.
video_data_processor_args = pytorch_processor.run(
    code="/home/sagemaker-user/user-default-efs/FastScan/aws_sagemaker_pipeline/preprocessing/image_preprocessing_script.py",
    dependencies=[
        "/home/sagemaker-user/user-default-efs/FastScan/project_library",
        "/home/sagemaker-user/user-default-efs/FastScan/aws_sagemaker_pipeline/preprocessing/requirements.txt",
    ],
    inputs=_video_step_inputs,
    outputs=_video_step_outputs,
)

video_data_process_step = ProcessingStep(
    name="video_data_processing",
    step_args=video_data_processor_args,
)

In [42]:
from sagemaker import image_uris

# Look up the image URI for the scikit-learn 1.2-1 container; the value is only displayed.
image_uris.retrieve(
    framework="sklearn",
    region="us-east-1",
    version="1.2-1",
    image_scope="training",
)

INFO:sagemaker.image_uris:Defaulting to only available Python version: py3
INFO:sagemaker.image_uris:Defaulting to only supported image scope: cpu.


'683313688378.dkr.ecr.us-east-1.amazonaws.com/sagemaker-scikit-learn:1.2-1-cpu-py3'

# 5. Prepare Train, Val and Test dataset using train-test-split (done)

Split the image files into train, test and val datasets for YOLO model training.

In [74]:
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn

# Refer to https://github.com/aws/deep-learning-containers/blob/master/available_images.md

# Settings for the scikit-learn processor that runs the train/test/val image split.
_train_test_split_processor_settings = dict(
    estimator_cls=SKLearn,
    framework_version="1.2-1",
    py_version="py3",
    role=role,
    sagemaker_session=pipeline_session,
    instance_type="ml.m5.xlarge",
    instance_count=processing_instance_count,
    base_job_name="train-test-split-image-data-processing-job",
)

sklearn_train_test_split_processor = FrameworkProcessor(**_train_test_split_processor_settings)


In [75]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
import project_library.file_manager

# Short alias for the project helper that resolves a folder location.
_resolve_folder = project_library.file_manager.FileInformation.get_absolute_folder_location

# Project library folder on the notebook host.
project_library_folder_path: str = str(
    _resolve_folder("/home/sagemaker-user/user-default-efs/FastScan/project_library")
)

# Folders inside the processing container (Docker context).
ROOT_INPUT_FOLDER_PATH: str = str(_resolve_folder("/opt/ml/processing/input"))
ROOT_OUTPUT_FOLDER_PATH: str = str(_resolve_folder("/opt/ml/processing/output"))

aws_dataset_folder_path: str = str(_resolve_folder("/opt/ml/processing/input/aws_datasets"))
fastscan_dataset_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset")
)
raw_image_dataset_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset/raw_custom_image_dataset")
)
raw_label_dataset_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset/raw_custom_label_dataset")
)

# https://docs.aws.amazon.com/sagemaker/latest/dg/byoc-input-and-output.html (How Amazon Sagemaker processing configures input and output for your processing container)

# Only the raw image data is needed for the split.
_split_step_inputs = [
    ProcessingInput(source=input_image_data, destination=raw_image_dataset_folder_path),
]

# One output per dataset split, uploaded to the matching S3 image folder.
_split_step_outputs = [
    ProcessingOutput(
        output_name="train",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/train",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/images/train",
    ),
    ProcessingOutput(
        output_name="test",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/test",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/images/test",
    ),
    ProcessingOutput(
        output_name="val",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/val",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/images/val",
    ),
]

# Build the step arguments: the entry script plus the project library and requirements file.
sklearn_train_test_split_processor_args = sklearn_train_test_split_processor.run(
    code="/home/sagemaker-user/user-default-efs/FastScan/aws_sagemaker_pipeline/train_test_split_image_data_processing/processing_script.py",
    dependencies=[
        "/home/sagemaker-user/user-default-efs/FastScan/project_library",
        "/home/sagemaker-user/user-default-efs/FastScan/aws_sagemaker_pipeline/train_test_split_image_data_processing/requirements.txt",
    ],
    inputs=_split_step_inputs,
    outputs=_split_step_outputs,
)

train_test_split_image_data_process_step = ProcessingStep(
    name="train-test-split-image-data-processing",
    step_args=sklearn_train_test_split_processor_args,
)

# 6. Prepare Label data for each image based on the dataset created by train-test-split

This processing step uses the image files in the train, val and test dataset folders as a reference and places each image's label data file in the matching split.

In [95]:
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn

# Refer to https://github.com/aws/deep-learning-containers/blob/master/available_images.md

# Settings for the scikit-learn processor that sorts label files into the dataset splits.
_label_sorting_processor_settings = dict(
    estimator_cls=SKLearn,
    framework_version="1.2-1",
    py_version="py3",
    role=role,
    sagemaker_session=pipeline_session,
    instance_type="ml.m5.xlarge",
    instance_count=processing_instance_count,
    base_job_name="label-data-sorting-processing-job",
)

sklearn_label_data_sorting_processor = FrameworkProcessor(**_label_sorting_processor_settings)


In [96]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
import project_library.file_manager

# Short alias for the project helper that resolves a folder location.
_resolve_folder = project_library.file_manager.FileInformation.get_absolute_folder_location

# Project library folder on the notebook host.
project_library_folder_path: str = str(
    _resolve_folder("/home/sagemaker-user/user-default-efs/FastScan/project_library")
)

# Folders inside the processing container (Docker context).
ROOT_INPUT_FOLDER_PATH: str = str(_resolve_folder("/opt/ml/processing/input"))
ROOT_OUTPUT_FOLDER_PATH: str = str(_resolve_folder("/opt/ml/processing/output"))

aws_dataset_folder_path: str = str(_resolve_folder("/opt/ml/processing/input/aws_datasets"))
fastscan_dataset_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset")
)
raw_label_dataset_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset/raw_label_data")
)

# Container folders that receive the already-split image datasets.
fastscan_dataset_train_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset/images/train")
)
fastscan_dataset_test_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset/images/test")
)
fastscan_dataset_val_folder_path: str = str(
    _resolve_folder("/opt/ml/processing/input/aws_datasets/fastscandataset/images/val")
)

# The three image splits plus the raw label data are mounted into the container.
_label_step_inputs = [
    ProcessingInput(source=input_train_image_dataset_s3_uri, destination=fastscan_dataset_train_folder_path),
    ProcessingInput(source=input_test_image_dataset_s3_uri, destination=fastscan_dataset_test_folder_path),
    ProcessingInput(source=input_val_image_dataset_s3_uri, destination=fastscan_dataset_val_folder_path),
    ProcessingInput(source=input_label_data, destination=raw_label_dataset_folder_path),
]

# One label output per dataset split, uploaded to the matching S3 label folder.
_label_step_outputs = [
    ProcessingOutput(
        output_name="train_label",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/train",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/train",
    ),
    ProcessingOutput(
        output_name="test_label",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/test",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/test",
    ),
    ProcessingOutput(
        output_name="val_label",
        source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/val",
        destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/val",
    ),
]

# Build the step arguments: the entry script plus the project library and requirements file.
label_data_sorting_processor_args = sklearn_label_data_sorting_processor.run(
    code="/home/sagemaker-user/user-default-efs/FastScan/aws_sagemaker_pipeline/label_data_processing/processing_script.py",
    dependencies=[
        "/home/sagemaker-user/user-default-efs/FastScan/project_library",
        "/home/sagemaker-user/user-default-efs/FastScan/aws_sagemaker_pipeline/label_data_processing/requirements.txt",
    ],
    inputs=_label_step_inputs,
    outputs=_label_step_outputs,
)

label_data_sorting_process_step = ProcessingStep(
    name="label-data-sorting-processing",
    step_args=label_data_sorting_processor_args,
)

# 7. Prepare YOLO Model Training

# 5. 

# 12. Define the Pipeline

In [97]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = "FastScanPipeline"

# Every parameter object declared earlier is registered with the pipeline.
_pipeline_parameters = [
    processing_instance_count,
    instance_type,
    model_approval_status,
    input_image_data,
    input_label_data,
    input_train_image_dataset_s3_uri,
    input_test_image_dataset_s3_uri,
    input_val_image_dataset_s3_uri,
    mAP50_threshold,
    mAP50to95_threshold,
]

# NOTE(review): only the label-data-sorting step is included; the video-processing and
# train-test-split steps defined earlier are not part of this pipeline — confirm intended.
pipeline = Pipeline(
    name=pipeline_name,
    parameters=_pipeline_parameters,
    steps=[label_data_sorting_process_step],
)

In [98]:
# Create or update the pipeline definition using the execution role; the service response
# (including the pipeline ARN) is displayed as the cell output.
pipeline.upsert(role_arn=role)

INFO:sagemaker.processing:Uploaded None to s3://sagemaker-us-east-1-396913742348/FastScanPipeline/code/f6ccfee3220fcc37f69d16d1e9d672ee/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-396913742348/FastScanPipeline/code/b8249c70fd4046bced02d085bf7bcbf8/runproc.sh
INFO:sagemaker.processing:Uploaded None to s3://sagemaker-us-east-1-396913742348/FastScanPipeline/code/f6ccfee3220fcc37f69d16d1e9d672ee/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-396913742348/FastScanPipeline/code/b8249c70fd4046bced02d085bf7bcbf8/runproc.sh


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:396913742348:pipeline/FastScanPipeline',
 'ResponseMetadata': {'RequestId': '4ad7fab1-42ec-4020-aeb3-8537311eb927',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '4ad7fab1-42ec-4020-aeb3-8537311eb927',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '84',
   'date': 'Thu, 27 Feb 2025 11:23:55 GMT'},
  'RetryAttempts': 0}}

# Start the pipeline

In [99]:
# Start a pipeline execution; with no arguments the parameters use their default values.
# The returned handle is used by the cells below.
execution = pipeline.start()

In [100]:
# Wait for the execution started above to finish, then display its description
# (status, timestamps, creator).
execution.wait()
execution.describe()

{'PipelineArn': 'arn:aws:sagemaker:us-east-1:396913742348:pipeline/FastScanPipeline',
 'PipelineExecutionArn': 'arn:aws:sagemaker:us-east-1:396913742348:pipeline/FastScanPipeline/execution/giemklan3xb7',
 'PipelineExecutionDisplayName': 'execution-1740655439241',
 'PipelineExecutionStatus': 'Succeeded',
 'PipelineExperimentConfig': {'ExperimentName': 'fastscanpipeline',
  'TrialName': 'giemklan3xb7'},
 'CreationTime': datetime.datetime(2025, 2, 27, 11, 23, 59, 180000, tzinfo=tzlocal()),
 'LastModifiedTime': datetime.datetime(2025, 2, 27, 11, 26, 32, 917000, tzinfo=tzlocal()),
 'CreatedBy': {'UserProfileArn': 'arn:aws:sagemaker:us-east-1:396913742348:user-profile/d-3gd8xgeqnewi/kokleong-1739789074256',
  'UserProfileName': 'kokleong-1739789074256',
  'DomainId': 'd-3gd8xgeqnewi',
  'IamIdentity': {'Arn': 'arn:aws:sts::396913742348:assumed-role/AmazonSageMaker-ExecutionRole-20250217T183323/SageMaker',
   'PrincipalId': 'AROAVY2PHCIGFAVRENP5G:SageMaker'}},
 'LastModifiedBy': {'UserProfile

In [1]:
# Depends on `execution` created by the `pipeline.start()` cell. On a fresh kernel that cell
# must be run first; otherwise this raises NameError, as in the recorded output.
execution.list_steps()

NameError: name 'execution' is not defined