In [8]:
import boto3
import sagemaker
from sagemaker import get_execution_role
# from sagemaker.image_uris import retrieve
from sagemaker.s3 import *
import sys

if int(sagemaker.__version__.split('.')[0]) == 2:
    !{sys.executable} -m pip install sagemaker==1.72.0
    print("Installing previous SageMaker Version. Please restart the kernel")
else:
    print("Version is good")

role = get_execution_role()


sess = sagemaker.Session(default_bucket=None)
region = boto3.session.Session().region_name
print("Region = {}".format(region))

sm = boto3.Session().client('sagemaker')


Version is good
Region = eu-central-1


In [9]:

# see https://sagemaker.readthedocs.io/en/stable/api/utility/image_uris.html for which inputs to use
# see https://github.com/aws/deep-learning-containers/blob/master/available_images.md for registry paths with custom algorithms
prefix = "763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training"
PREPROCESSING_IMAGE = "{}:{}".format(prefix, "1.6.0-cpu-py36-ubuntu16.04")
PREPROCESS_INSTANCE = "ml.m5.xlarge"


TRAINING_IMAGE = "{}:{}".format(prefix, "1.6.0-gpu-py36-cu110-ubuntu16.04")
TRAINING_INSTANCE = "ml.g4dn.xlarge" 

print ("Preprocessing image: ", PREPROCESSING_IMAGE)
print ("Training image: ", TRAINING_IMAGE)



Preprocessing image:  763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training:1.6.0-cpu-py36-ubuntu16.04
Training image:  763104351884.dkr.ecr.eu-central-1.amazonaws.com/pytorch-training:1.6.0-gpu-py36-cu110-ubuntu16.04


In [10]:
!pip install sagemaker-experiments 
from sagemaker.analytics import ExperimentAnalytics
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

You should consider upgrading via the '/home/ec2-user/anaconda3/envs/pytorch_p36/bin/python -m pip install --upgrade pip' command.[0m


In [11]:
sagemaker_local_bucket = sess.default_bucket() # Alternatively you can use our custom bucket here. 
original_data_bucket = "treetracker-training-images"
prefix = 'sagemaker-modelmonitor' # use this prefix to store all files pertaining to this workshop.

dataprefix = prefix + '/data'
traindataprefix = prefix + '/train_data'
testdataprefix = prefix + '/test_data'
testdatanolabelprefix = prefix + '/test_data_no_label'
trainheaderprefix = prefix + '/train_headers'

dataset_key = "imnet" # use this to restrict to a particular directory
train_key = "train"
validation_key = "validation"
test_key = "test"
s3_raw = 's3://{}/{}/'.format(original_data_bucket, dataset_key)
sagemaker_train = 's3://{}/{}/'.format(sagemaker_local_bucket, train_key)
sagemaker_validation = 's3://{}/{}/'.format(sagemaker_local_bucket, validation_key)
sagemaker_test = 's3://{}/{}/'.format(sagemaker_local_bucket, test_key)


In [12]:
train_data_location = original_data_bucket + '/' + traindataprefix
test_data_location = original_data_bucket +'/'+testdataprefix
print("Training data location = {}".format(train_data_location))
print("Test data location = {}".format(test_data_location))

Training data location = treetracker-training-images/sagemaker-modelmonitor/train_data
Test data location = treetracker-training-images/sagemaker-modelmonitor/test_data


In [13]:
# TODO: Figure out preprocessing instance jobs
from sagemaker.processing import ProcessingInput, ProcessingOutput
from sagemaker.processing import ScriptProcessor

pre_input = [ProcessingInput(source=s3_raw, destination="/opt/ml/processing/raw/", input_name="raw")]
pre_output = [ProcessingOutput(source="/opt/ml/processing/train/", destination=sagemaker_train),
              ProcessingOutput(source="/opt/ml/processing/validation/", destination=sagemaker_validation),
              ProcessingOutput(source="/opt/ml/processing/test/", destination=sagemaker_test)]
            
              
script_processor = ScriptProcessor(command= ["python"], 
                                    image_uri=PREPROCESSING_IMAGE,
                                    role=role,
                                    instance_count=1,
                                    instance_type=PREPROCESS_INSTANCE,
                                    base_job_name="preprocessing-test", 
                                    max_runtime_in_seconds=7200)

preprocessing_script = "preprocessing_p1.py" # Put path to preprocessing script here

In [14]:
script_processor.run(preprocessing_script, 
                    inputs=pre_input,
                    outputs=pre_output,
                    arguments=None)

# default arguments in script should work for now

INFO:sagemaker:Creating processing-job with name preprocessing-test-2021-01-04-02-07-35-015



Job Name:  preprocessing-test-2021-01-04-02-07-35-015
Inputs:  [{'InputName': 'raw', 'S3Input': {'S3Uri': 's3://treetracker-training-images/imnet/', 'LocalPath': '/opt/ml/processing/raw/', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}, {'InputName': 'code', 'S3Input': {'S3Uri': 's3://sagemaker-eu-central-1-053061259712/preprocessing-test-2021-01-04-02-07-35-015/input/code/preprocessing_p1.py', 'LocalPath': '/opt/ml/processing/input/code', 'S3DataType': 'S3Prefix', 'S3InputMode': 'File', 'S3DataDistributionType': 'FullyReplicated', 'S3CompressionType': 'None'}}]
Outputs:  [{'OutputName': 'output-1', 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-053061259712/train/', 'LocalPath': '/opt/ml/processing/train/', 'S3UploadMode': 'EndOfJob'}}, {'OutputName': 'output-2', 'S3Output': {'S3Uri': 's3://sagemaker-eu-central-1-053061259712/validation/', 'LocalPath': '/opt/ml/processing/validation/', 'S3UploadMode': 'E

In [25]:
## TODO: Add step for RecordIO format 

In [18]:
from sagemaker.pytorch import PyTorch

# This is where you can add hyperparameters, framework used, point to the script, and define instances you want to train on. 
# ALl of this information is represented as environment variables passed to the instance. In your script, you can refer to these variables or 
# the argument. 

# TODO: add metric monitoring via CloudWatch 
# https://docs.aws.amazon.com/sagemaker/latest/dg/training-metrics.html
estimator = PyTorch(entry_point='mobilenet_v2.py',
                    role=role,
                    framework_version='1.6.0',
                    train_instance_count=1,
                    train_instance_type=TRAINING_INSTANCE,
                    hyperparameters={
                        'epochs': 5,
                        'backend': 'gloo',
                        'train_split': 0.7, 
                        'log_interval': 200
                    },                   
                   )

In [19]:
estimator.fit({"training": sagemaker_train, "validation": sagemaker_validation, "test": sagemaker_test})

INFO:sagemaker:Creating training-job with name: pytorch-training-2021-01-04-02-42-20-226


2021-01-04 02:42:20 Starting - Starting the training job...
2021-01-04 02:42:22 Starting - Launching requested ML instances......
2021-01-04 02:43:45 Starting - Preparing the instances for training......
2021-01-04 02:44:30 Downloading - Downloading input data.....................................................................
2021-01-04 02:56:23 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-01-04 02:56:23,902 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-01-04 02:56:23,921 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-01-04 02:56:26,936 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-01-04 02:56:27,392 sagemaker-training-toolkit INFO     Invoking user script
[0m
[34m

UnexpectedStatusException: Error for Training job pytorch-training-2021-01-04-02-42-20-226: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python mobilenet_v2.py --backend gloo --epochs 5 --log_interval 200 --train_split 0.7"
Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
  0%|          | 0.00/13.6M [00:00<?, ?B/s]100%|ââââââââââ| 13.6M/13.6M [00:00<00:00, 258MB/s]
/opt/conda/lib/python3.6/site-packages/torch/cuda/__init__.py:125: UserWarning: 
Tesla T4 with CUDA capability sm_75 is not compatible with the current PyTorch installation.
The current PyTorch install supports CUDA capabilities sm_35 sm_52 sm_60 sm_61 sm_70 compute_70.
If you want to use the Tesla T4 GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/

  warnings.warn(incompatible_device_warn.format(device_name, capability, " ".join(arch_list), device_name))
Traceback (most recent call last):
  File "mobilenet_v2.py", line 429, in <module>
    trainer.train(args)
  Fi