In [None]:
import time

import sagemaker
from sagemaker import get_execution_role

# One SageMaker session for the whole notebook: it supplies the default bucket
# and the boto session used to look up the account id and region.
sess = sagemaker.Session()
role = get_execution_role()
sagemaker_default_bucket = sess.default_bucket()

# Account id and region are the two variable parts of the ECR image URI built later.
account = sess.boto_session.client("sts").get_caller_identity()["Account"]
region = sess.boto_session.region_name

# Reuse the bucket resolved above rather than constructing a second Session
# just to ask for the same default bucket again.
s3_bkt = sagemaker_default_bucket
s3_bkt

## Data Processing - Bring your own container
-----

In [None]:
%%writefile docker/Dockerfile
# Base image providing the SRA tools; the `apk` call below relies on it being Alpine-based.
FROM ncbi/sra-tools

# Install python/pip and the sagemaker-training toolkit.
# PYTHONUNBUFFERED=1 makes Python write stdout/stderr without buffering.
ENV PYTHONUNBUFFERED=1
# Python plus the compiler toolchain/headers, and a `python` alias for `python3`.
RUN apk add --update --no-cache python3 python3-dev gcc musl-dev linux-headers && ln -sf python3 /usr/bin/python
RUN python3 -m ensurepip
# --no-cache-dir is pip's documented flag for skipping the download/wheel cache.
RUN pip3 install --no-cache-dir --upgrade pip setuptools sagemaker-training

In [None]:
tag='data-processing-v0'
!docker build ./docker -t $tag

In [None]:
# ECR repository name and image tag, consumed by the build-and-push script and
# by the image URI passed to the estimators.
# NOTE(review): the original note says the repo name "should contain *sagemaker*",
# but this value does not -- confirm the role in use can access this ECR repository.
repo_name, tag = "sra-project", "data-processing-v0"

In [None]:
%%script env repo_name=$repo_name tag=$tag bash

#!/usr/bin/env bash

# Build the Docker image from ./docker and push it to Amazon ECR so that it is
# ready for use by SageMaker.
#
# Inputs (environment variables, set on the %%script line from notebook variables):
#   repo_name - ECR repository name; also used as the local image name
#   tag       - image tag, used both locally and in ECR
algorithm_name=${repo_name}

account=$(aws sts get-caller-identity --query Account --output text)

# Get the region defined in the current configuration (default to us-west-2 if none defined)
region=$(aws configure get region)
region=${region:-us-west-2}

registry="${account}.dkr.ecr.${region}.amazonaws.com"
fullname="${registry}/${algorithm_name}:${tag}"

# If the repository doesn't exist in ECR, create it.
# --region is passed explicitly so the repository lives in the same region that
# ${fullname} and the docker login below point at.
if ! aws ecr describe-repositories --region "${region}" --repository-names "${algorithm_name}" > /dev/null 2>&1
then
    aws ecr create-repository --region "${region}" --repository-name "${algorithm_name}" > /dev/null
fi

# Authenticate Docker against the ECR registry host (not the full image reference).
aws ecr get-login-password --region "${region}" | docker login --username AWS --password-stdin "${registry}"

# Build the image locally as <name>:<tag>, then give that exact image its ECR name.
# The source of `docker tag` must include ${tag}: a bare image name resolves to
# <name>:latest, which this script never builds.
docker build -t "${algorithm_name}:${tag}" ./docker
docker tag "${algorithm_name}:${tag}" "${fullname}"

docker push "${fullname}"

### Process data from FSx for Lustre (FSxL)

In [None]:
from sagemaker.estimator import Estimator
from sagemaker.inputs import FileSystemInput

# URI of the image in ECR, assembled from the account, region, repo name and tag
# defined in earlier cells.
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:{}".format(account, region, repo_name, tag)

# FSx for Lustre file system handed to the job as an input channel.
train_fs = FileSystemInput(
    file_system_id="fs-0f8a3b8eef47b6ff8",  # put the file system ID here, find it in the FSxL console
    file_system_type="FSxLustre",
    directory_path="/yobzhbmv",  # put the mount name here, find it in the FSxL console
    file_system_access_mode="rw",  # rw: read-write, ro: read-only
)

estimator = Estimator(
    entry_point='1-process-data.sh',  # any kind of script
    source_dir='./src',  # this folder is packed and sent to S3; don't put large data under it
    role=role,
    image_uri=image_uri,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    hyperparameters={  # passed to the entry_point script in the form "--key1 value1 --key2 value2"
        'data_id': 'ERR036591'
    },
    subnets=[  # subnet where SageMaker places the NIC(s) used to reach FSx for Lustre inside the VPC
        'subnet-07ce0ab63b4cfeb25',  # private subnet that can reach the Internet;
                                     # otherwise set up S3 and CloudWatch private links
    ],
    security_group_ids=[  # security group bound to the instance so it can access FSx
        'sg-04acfc98f6929ee4e'
    ],
    max_run=2 * 24 * 3600,  # maximum allowed run time, in seconds
    # keep_alive_period_in_seconds=60*10,  # in seconds, if you want to reuse these instances
    # enable_remote_debug=True,  # set to True for remote debugging, or enable it later
)

wait = False  # whether fit() should wait for the job to finish
estimator.fit({
    'training': train_fs,  # exposed as env SM_CHANNEL_TRAINING, with value /opt/ml/input/data/training
    # 'whatever': train_fs,  # exposed as env SM_CHANNEL_WHATEVER, with value /opt/ml/input/data/whatever
}, wait=wait)

# Read the job name through the public handle to the job that fit() just
# started, rather than the estimator's private `_current_job_name` attribute.
job_name = estimator.latest_training_job.name

In [None]:
# Remote debugging can only be enabled while the job's SecondaryStatus is
# Downloading or Training. fit(wait=False) returns right away, so poll the job
# until it reaches one of those states before calling enable_remote_debug().
while True:
    job_info = sess.describe_training_job(job_name)
    if job_info['SecondaryStatus'] in ('Downloading', 'Training'):
        break
    # A job that is no longer InProgress will never reach Downloading/Training,
    # so fail loudly instead of polling forever.
    if job_info['TrainingJobStatus'] != 'InProgress':
        raise RuntimeError(
            f"Training job {job_name} is {job_info['TrainingJobStatus']} "
            f"({job_info['SecondaryStatus']}); remote debugging cannot be enabled."
        )
    time.sleep(5)

estimator.enable_remote_debug()

In [None]:
# Show the SSM command that opens a session on the job's first instance (algo-1).
print(
    "Access first training instance:",
    f"aws ssm start-session --target sagemaker-training-job:{job_name}_algo-1",
    sep="\n",
)

### SageMaker Remote Debugging
-------------

For more information, see https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-debugging.html

In [None]:
import sagemaker
import time

session = sagemaker.Session()

# Poll the job every 5 seconds, printing its SecondaryStatus, until it reaches
# Downloading or Training.
while True:
    # Describe the job status
    training_job_info = session.describe_training_job(job_name)
    secondary_status = training_job_info['SecondaryStatus']
    print(secondary_status)

    if secondary_status in ('Downloading', 'Training'):
        break

    # Once the job is no longer InProgress it can never reach Downloading or
    # Training, so stop here instead of looping forever.
    if training_job_info['TrainingJobStatus'] != 'InProgress':
        raise RuntimeError(
            f"Training job {job_name} ended with status "
            f"{training_job_info['TrainingJobStatus']} ({secondary_status}) "
            "before reaching Downloading/Training."
        )

    time.sleep(5)

## Process data from S3

You can also access public S3 data by passing an S3 URI to `.fit()`. Make sure the training job runs in the same region as the S3 bucket.

In [None]:
from sagemaker.estimator import Estimator

# URI of the image in ECR, assembled from the account, region, repo name and tag
# defined in earlier cells.
image_uri = "{}.dkr.ecr.{}.amazonaws.com/{}:{}".format(account, region, repo_name, tag)

estimator = Estimator(
    entry_point='1-process-data.sh',  # any kind of script
    source_dir='./src',  # this folder is packed and sent to S3; don't put large data under it
    role=role,
    image_uri=image_uri,
    instance_count=1,
    instance_type='ml.m5.xlarge',
    max_run=2 * 24 * 3600,  # maximum allowed run time, in seconds
    # keep_alive_period_in_seconds=60*10,  # in seconds, if you want to reuse these instances
    # enable_remote_debug=True,  # set to True for remote debugging, or enable it later
)

wait = False  # whether fit() should wait for the job to finish
estimator.fit({
    'training': 's3://sra-pub-src-1/ERR11491426/',  # S3 prefix passed as the "training" channel
}, wait=wait)

# Read the job name through the public handle to the job that fit() just
# started, rather than the estimator's private `_current_job_name` attribute.
job_name = estimator.latest_training_job.name

### SageMaker Remote Debugging
-------------

For more information, see https://docs.aws.amazon.com/sagemaker/latest/dg/train-remote-debugging.html

In [None]:
import sagemaker
import time

session = sagemaker.Session()

# Poll the job every 5 seconds, printing its SecondaryStatus, until it reaches
# Downloading or Training.
while True:
    # Describe the job status
    training_job_info = session.describe_training_job(job_name)
    secondary_status = training_job_info['SecondaryStatus']
    print(secondary_status)

    if secondary_status in ('Downloading', 'Training'):
        break

    # Once the job is no longer InProgress it can never reach Downloading or
    # Training, so stop here instead of looping forever.
    if training_job_info['TrainingJobStatus'] != 'InProgress':
        raise RuntimeError(
            f"Training job {job_name} ended with status "
            f"{training_job_info['TrainingJobStatus']} ({secondary_status}) "
            "before reaching Downloading/Training."
        )

    time.sleep(5)

In [None]:
# Enable remote debugging on the job most recently started by `estimator`.
estimator.enable_remote_debug()

In [None]:
# Show the SSM command that opens a session on the job's first instance (algo-1).
print(
    "Access first training instance:",
    f"aws ssm start-session --target sagemaker-training-job:{job_name}_algo-1",
    sep="\n",
)