# FastScan AWS SageMaker Pipeline Implementation

The implementation in this jupyter notebook can only be used in AWS SageMaker Studio Code Editor.

https://sagemaker-examples.readthedocs.io/en/latest/sagemaker-pipelines/tabular/abalone_build_train_deploy/sagemaker-pipelines-preprocess-train-evaluate-batch-transform.html#Define-a-Processing-Step-for-Feature-Engineering

# 1. Install required dependencies

In [8]:
%pip install sagemaker-training
%pip install opencv-python-headless
%pip install numpy
%pip install sagemaker --upgrade
%pip install scikit-learn

Collecting sagemaker-training
  Using cached sagemaker_training-4.9.0-cp311-cp311-linux_x86_64.whl
Collecting gevent (from sagemaker-training)
  Using cached gevent-24.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting inotify_simple==1.2.1 (from sagemaker-training)
  Using cached inotify_simple-1.2.1-py3-none-any.whl
Collecting protobuf<=3.20.3,>=3.9.2 (from sagemaker-training)
  Using cached protobuf-3.20.3-py2.py3-none-any.whl.metadata (720 bytes)
Collecting zope.event (from gevent->sagemaker-training)
  Using cached zope.event-5.0-py3-none-any.whl.metadata (4.4 kB)
Collecting zope.interface (from gevent->sagemaker-training)
  Using cached zope.interface-7.2-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (44 kB)
Using cached protobuf-3.20.3-py2.py3-none-any.whl (162 kB)
Using cached gevent-24.11.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.8 MB)
Using cached zope.e

## 1.1 Install required packages for OpenCV

In [2]:
!sudo apt-get update -y
!sudo apt-get install ffmpeg libsm6 libxext6 -y

Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease 
Get:3 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:4 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [3664 kB]
Get:6 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [2639 kB]
Get:7 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1531 kB]
Get:8 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2941 kB]
Fetched 11.2 MB in 1s (8583 kB/s)                       
Reading package lists... Done
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
libsm6 is already the newest version (2:1.2.3-1build2).
libxext6 is already the newest version (2:1.3.4-1build1).
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 n

## 1.2 Import Packages

In [9]:

import os
import numpy
import sagemaker
import sys




# 2. Create SageMaker Pipeline Session

In [10]:
from sagemaker.workflow.pipeline_context import PipelineSession

sagemaker_session = sagemaker.session.Session()

region = sagemaker_session.boto_region_name
role = sagemaker.get_execution_role()

pipeline_session = PipelineSession()

default_bucket = sagemaker_session.default_bucket()

model_package_group_name = f"FastScanModelPackageGroupName"

# 3. Define Parameters in the Pipeline for Pipeline Execution

In [11]:
image_input_data_uri: str = "s3://angkokleong-bucket/datasets/raw_custom_image_dataset/"
label_input_data_uri: str = "s3://angkokleong-bucket/label-data/"

In [12]:
from sagemaker.workflow.parameters import (
    ParameterInteger,
    ParameterString,
    ParameterFloat,
)

#There can be multiple input data


processing_instance_count = ParameterInteger(name="ProcessingInstanceCount", default_value=1)
instance_type = ParameterString(name="TrainingInstanceType", default_value="ml.m5.xlarge")
model_approval_status = ParameterString(name="ModelApprovalStatus", default_value="PendingManualApproval")
input_image_data = ParameterString(
    name="InputImageData",
    default_value=image_input_data_uri
)

input_label_data = ParameterString(
    name="InputLabelData",
    default_value=label_input_data_uri
)

mAP50_threshold = ParameterFloat(name="mAP50Threshold", default_value=0.9)
mAP50to95_threshold = ParameterFloat(name="mAP50to95threshold", default_value=0.8)

# 4. Prepare Train, Val and Test dataset and Label Data

Image file split to create train, test and val dataset for YOLO model training

In [13]:
from sagemaker.processing import FrameworkProcessor
from sagemaker.sklearn.estimator import SKLearn


sklearn_processor = FrameworkProcessor(
    estimator_cls=SKLearn,
    framework_version="1.2-1",
    role=role,
    py_version="py3",
    instance_count=processing_instance_count,
    instance_type="ml.m5.xlarge",
    sagemaker_session=pipeline_session,
    base_job_name="image_and_label_data_train_test_split_process"
)


In [14]:
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.workflow.steps import ProcessingStep
import project_library.file_manager

# Project Directory in this project
project_library_folder_path: str = str(project_library.file_manager.FileInformation.get_absolute_folder_location("/home/sagemaker-user/user-default-efs/FastScan/project_library"))


# Docker context
ROOT_INPUT_FOLDER_PATH: str = str(project_library.file_manager.FileInformation.get_absolute_folder_location("/opt/ml/processing/input"))
ROOT_OUTPUT_FOLDER_PATH: str = str(project_library.file_manager.FileInformation.get_absolute_folder_location("/opt/ml/processing/output"))

aws_dataset_folder_path: str = str(project_library.file_manager.FileInformation.get_absolute_folder_location("/opt/ml/processing/input/aws_datasets"))
fastscan_dataset_folder_path: str = str(project_library.file_manager.FileInformation.get_absolute_folder_location("/opt/ml/processing/input/aws_datasets/fastscandataset"))
raw_image_dataset_folder_path: str = str(project_library.file_manager.FileInformation.get_absolute_folder_location("/opt/ml/processing/input/aws_datasets/fastscandataset/raw_custom_image_dataset"))
raw_label_dataset_folder_path: str = str(project_library.file_manager.FileInformation.get_absolute_folder_location("/opt/ml/processing/input/aws_datasets/fastscandataset/raw_custom_label_dataset"))



processor_args = sklearn_processor.run(
    inputs=[
        ProcessingInput(source=input_image_data, destination=raw_image_dataset_folder_path),
        ProcessingInput(source=input_label_data, destination=raw_label_dataset_folder_path)
    ],
    dependencies=[str("/home/sagemaker-user/user-default-efs/FastScan/project_library"), str("/home/sagemaker-user/user-default-efs/FastScan/aws_sagemaker_pipeline/preprocessing/requirements.txt")],
    code=str("/home/sagemaker-user/user-default-efs/FastScan/aws_sagemaker_pipeline/preprocessing/image_preprocessing_script.py"),
    outputs=[
        ProcessingOutput(output_name="train", source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/train", destination="s3://angkokleong-bucket/datasets/fastscandataset/images/train"),
        ProcessingOutput(output_name="test", source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/test", destination="s3://angkokleong-bucket/datasets/fastscandataset/images/test"),
        ProcessingOutput(output_name="val", source="/opt/ml/processing/input/aws_datasets/fastscandataset/raw/val", destination="s3://angkokleong-bucket/datasets/fastscandataset/images/val"),
        ProcessingOutput(output_name="train_label", source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/train", destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/train"),
        ProcessingOutput(output_name="test_label", source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/test", destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/test"),
        ProcessingOutput(output_name="val_label", source="/opt/ml/processing/input/aws_datasets/fastscandataset/labels/val", destination="s3://angkokleong-bucket/datasets/fastscandataset/labels/val")
    ]   
)


image_and_label_data_process_step = ProcessingStep(name="image_and_label_data_processing", step_args=processor_args)



## 4. Prepare YOLO Model Training

# 5. 

# 12. Define the Pipeline

In [15]:
from sagemaker.workflow.pipeline import Pipeline

pipeline_name = f"FastScanPipeline"

pipeline = Pipeline(
    name=pipeline_name,
    parameters=[
        processing_instance_count,
        instance_type,
        model_approval_status,
        input_image_data,
        input_label_data,
        mAP50_threshold,
        mAP50to95_threshold
    ],
    steps=[image_and_label_data_process_step]
)




In [16]:
pipeline.upsert(role_arn=role)

INFO:sagemaker.processing:Uploaded None to s3://sagemaker-us-east-1-396913742348/FastScanPipeline/code/ced773bc155540c4d390f223f7daf79b/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-396913742348/FastScanPipeline/code/b90eb363cc0b8ed0e75b7d842ce92d88/runproc.sh
INFO:sagemaker.processing:Uploaded None to s3://sagemaker-us-east-1-396913742348/FastScanPipeline/code/ced773bc155540c4d390f223f7daf79b/sourcedir.tar.gz
INFO:sagemaker.processing:runproc.sh uploaded to s3://sagemaker-us-east-1-396913742348/FastScanPipeline/code/b90eb363cc0b8ed0e75b7d842ce92d88/runproc.sh


{'PipelineArn': 'arn:aws:sagemaker:us-east-1:396913742348:pipeline/FastScanPipeline',
 'ResponseMetadata': {'RequestId': '1f1e8be6-be27-4497-af69-4ecabd1db04d',
  'HTTPStatusCode': 200,
  'HTTPHeaders': {'x-amzn-requestid': '1f1e8be6-be27-4497-af69-4ecabd1db04d',
   'content-type': 'application/x-amz-json-1.1',
   'content-length': '84',
   'date': 'Wed, 26 Feb 2025 14:29:16 GMT'},
  'RetryAttempts': 0}}

# Start the pipeline

In [17]:
execution = pipeline.start()

In [18]:
execution.wait()
execution.describe()

WaiterError: Waiter PipelineExecutionComplete failed: Waiter encountered a terminal failure state: For expression "PipelineExecutionStatus" we matched expected path: "Failed"

In [None]:
execution.list_steps()