In [8]:
%%time
import boto3
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

role = get_execution_role()

bucket = 'sxm-ecommerce-p66-location-data'

training_image = get_image_uri(boto3.Session().region_name, "image-classification")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


CPU times: user 32 ms, sys: 7.63 ms, total: 39.6 ms
Wall time: 91.5 ms


In [62]:
# Four channels: train, validation, train_lst, and validation_lst
s3training = "s3://{}/image-classification/training/".format(bucket)
s3validation = "s3://{}/image-classification/validation/".format(bucket)
s3training_lst = "s3://{}/image-classification/training_lst/".format(bucket)
s3validation_lst = "s3://{}/image-classification/validation_lst/".format(bucket)

In [68]:
# The algorithm supports multiple network depth (number of layers). They are 18, 34, 50, 101, 152 and 200
# For this training, we will use 18 layers
num_layers = 18
# we need to specify the input image shape for the training data
image_shape = "3,600,600"
# we also need to specify the number of training samples in the training set
num_training_samples = 320
# specify the number of output classes
num_classes = 2
# batch size for training
mini_batch_size = 32
# number of epochs
epochs = 6
# learning rate
learning_rate = 0.01
# report top_5 accuracy
top_k = 5
# resize image before training
resize = 600
# period to store model parameters (in number of epochs), in this case, we will save parameters from epoch 2, 4, and 6
checkpoint_frequency = 2
# Since we are using transfer learning, we set use_pretrained_model to 1 so that weights can be
# initialized with pre-trained weights
use_pretrained_model = 1

In [69]:
import time
from time import gmtime, strftime


s3 = boto3.client("s3")
# create unique job name
job_name_prefix = "sagemaker-roadclassification"
timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name = job_name_prefix + "_" + str(num_layers) + "_" + str(mini_batch_size) + "_" + str(epochs) + "_" + str(learning_rate) + "_" + timestamp
training_params = {
    # specify the training docker image
    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
    "RoleArn": role,
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/output".format(bucket, job_name_prefix)},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.p2.xlarge", "VolumeSizeInGB": 50},
    "TrainingJobName": job_name,
    "HyperParameters": {
        "image_shape": image_shape,
        "num_layers": str(num_layers),
        "num_training_samples": str(num_training_samples),
        "num_classes": str(num_classes),
        "mini_batch_size": str(mini_batch_size),
        "epochs": str(epochs),
        "learning_rate": str(learning_rate),
        "top_k": str(top_k),
        "resize": str(resize),
        "checkpoint_frequency": str(checkpoint_frequency),
        "use_pretrained_model": str(use_pretrained_model),
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 360000},
    # Training data should be inside a subdirectory called "train"
    # Validation data should be inside a subdirectory called "validation"
    # The algorithm currently only supports fullyreplicated model (where data is copied onto each machine)
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3training,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3validation,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None",
        },
        {
            "ChannelName": "train_lst",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3training_lst,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation_lst",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3validation_lst,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None",
        },
    ],
}
print("Training job name: {}".format(job_name))
print(
    "\nInput Data Location: {}".format(
        training_params["InputDataConfig"][0]["DataSource"]["S3DataSource"]
    )
)

Training job name: sagemaker-roadclassification_18_32_6_0.01_-2021-07-14-23-31-45

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sxm-ecommerce-p66-location-data/image-classification/training/', 'S3DataDistributionType': 'FullyReplicated'}


In [None]:
# OVERNIGHT TRAINING LOOP
layerCounts = [18, 34, 152]
batchSizes = [4, 16, 64]
epochNums = [6, 10, 20]
learningRates = [0.01, 0.02, 0.05]

import time
from time import gmtime, strftime

for lc in layerCounts:
    for bs in batchSizes:
        for en in epochNums:
            for lrs in learningRates:
#                 setParams(lc, bs, en, lrs)
                num_layers = lc
                mini_batch_size = bs
                epochs = en
                learning_rate = lrs
                
#                 genModel()
                s3 = boto3.client("s3")
                # create unique job name
                job_name_prefix = "sagemaker-roadclassification"
                timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
                job_name = job_name_prefix + "-" + str(num_layers) + "-" + str(mini_batch_size) + "-" + str(epochs) + "-" + str(int(100*learning_rate)) + "--" + timestamp
                training_params = {
                    # specify the training docker image
                    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
                    "RoleArn": role,
                    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/output".format(bucket, job_name_prefix)},
                    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.p2.xlarge", "VolumeSizeInGB": 50},
                    "TrainingJobName": job_name,
                    "HyperParameters": {
                        "image_shape": image_shape,
                        "num_layers": str(num_layers),
                        "num_training_samples": str(num_training_samples),
                        "num_classes": str(num_classes),
                        "mini_batch_size": str(mini_batch_size),
                        "epochs": str(epochs),
                        "learning_rate": str(learning_rate),
                        "top_k": str(top_k),
                        "resize": str(resize),
                        "checkpoint_frequency": str(checkpoint_frequency),
                        "use_pretrained_model": str(use_pretrained_model),
                    },
                    "StoppingCondition": {"MaxRuntimeInSeconds": 360000},
                    # Training data should be inside a subdirectory called "train"
                    # Validation data should be inside a subdirectory called "validation"
                    # The algorithm currently only supports fullyreplicated model (where data is copied onto each machine)
                    "InputDataConfig": [
                        {
                            "ChannelName": "train",
                            "DataSource": {
                                "S3DataSource": {
                                    "S3DataType": "S3Prefix",
                                    "S3Uri": s3training,
                                    "S3DataDistributionType": "FullyReplicated",
                                }
                            },
                            "ContentType": "application/x-image",
                            "CompressionType": "None",
                        },
                        {
                            "ChannelName": "validation",
                            "DataSource": {
                                "S3DataSource": {
                                    "S3DataType": "S3Prefix",
                                    "S3Uri": s3validation,
                                    "S3DataDistributionType": "FullyReplicated",
                                }
                            },
                            "ContentType": "application/x-image",
                            "CompressionType": "None",
                        },
                        {
                            "ChannelName": "train_lst",
                            "DataSource": {
                                "S3DataSource": {
                                    "S3DataType": "S3Prefix",
                                    "S3Uri": s3training_lst,
                                    "S3DataDistributionType": "FullyReplicated",
                                }
                            },
                            "ContentType": "application/x-image",
                            "CompressionType": "None",
                        },
                        {
                            "ChannelName": "validation_lst",
                            "DataSource": {
                                "S3DataSource": {
                                    "S3DataType": "S3Prefix",
                                    "S3Uri": s3validation_lst,
                                    "S3DataDistributionType": "FullyReplicated",
                                }
                            },
                            "ContentType": "application/x-image",
                            "CompressionType": "None",
                        },
                    ],
                }
                print("Training job name: {}".format(job_name))
                print(
                    "\nInput Data Location: {}".format(
                        training_params["InputDataConfig"][0]["DataSource"]["S3DataSource"]
                    )
                )
                
#                 runModel()
                # create the Amazon SageMaker training job
                sagemaker = boto3.client(service_name="sagemaker")
                sagemaker.create_training_job(**training_params)

                # confirm that the training job has started
                status = sagemaker.describe_training_job(TrainingJobName=job_name)["TrainingJobStatus"]
                print("Training job current status: {}".format(status))

                try:
                    # wait for the job to finish and report the ending status
                    sagemaker.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=job_name)
                    training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
                    status = training_info["TrainingJobStatus"]
                    print("Training job ended with status: " + status)
                except:
                    print("Training failed to start")
                    # if exception is raised, that means it has failed
                    message = sagemaker.describe_training_job(TrainingJobName=job_name)["FailureReason"]
                    print("Training failed with the following error: {}".format(message))

Training job name: sagemaker-roadclassification-18-4-6-1---2021-07-14-23-31-51

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sxm-ecommerce-p66-location-data/image-classification/training/', 'S3DataDistributionType': 'FullyReplicated'}
Training job current status: InProgress
Training job ended with status: Completed
Training job name: sagemaker-roadclassification-18-4-6-2---2021-07-14-23-43-52

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sxm-ecommerce-p66-location-data/image-classification/training/', 'S3DataDistributionType': 'FullyReplicated'}
Training job current status: InProgress
Training job ended with status: Completed
Training job name: sagemaker-roadclassification-18-4-6-5---2021-07-14-23-51-53

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sxm-ecommerce-p66-location-data/image-classification/training/', 'S3DataDistributionType': 'FullyReplicated'}
Training job current status: InProgress
Training job ended with status: Compl

In [13]:
training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
status = training_info["TrainingJobStatus"]
print("Training job ended with status: " + status)
print(training_info)

Training job ended with status: Completed
{'TrainingJobName': 'sagemaker-roadclassification-2021-07-14-20-25-00', 'TrainingJobArn': 'arn:aws:sagemaker:us-east-1:969580840594:training-job/sagemaker-roadclassification-2021-07-14-20-25-00', 'ModelArtifacts': {'S3ModelArtifacts': 's3://sxm-ecommerce-p66-location-data/sagemaker-roadclassification/output/sagemaker-roadclassification-2021-07-14-20-25-00/output/model.tar.gz'}, 'TrainingJobStatus': 'Completed', 'SecondaryStatus': 'Completed', 'HyperParameters': {'checkpoint_frequency': '2', 'epochs': '6', 'image_shape': '3,600,600', 'learning_rate': '0.01', 'mini_batch_size': '32', 'num_classes': '2', 'num_layers': '18', 'num_training_samples': '320', 'resize': '600', 'top_k': '5', 'use_pretrained_model': '0'}, 'AlgorithmSpecification': {'TrainingImage': '811284229777.dkr.ecr.us-east-1.amazonaws.com/image-classification:1', 'TrainingInputMode': 'File', 'MetricDefinitions': [{'Name': 'train:accuracy', 'Regex': 'Epoch\\S* Train-accuracy=(\\S*)'},

In [14]:
%%time
import boto3
from time import gmtime, strftime

sage = boto3.Session().client(service_name="sagemaker")

timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
model_name = "image-classification-model" + timestamp
print(model_name)
info = sage.describe_training_job(TrainingJobName=job_name)
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]
print(model_data)

hosting_image = get_image_uri(boto3.Session().region_name, "image-classification")

primary_container = {
    "Image": hosting_image,
    "ModelDataUrl": model_data,
}

create_model_response = sage.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response["ModelArn"])

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


image-classification-model-2021-07-14-20-37-22
s3://sxm-ecommerce-p66-location-data/sagemaker-roadclassification/output/sagemaker-roadclassification-2021-07-14-20-25-00/output/model.tar.gz
arn:aws:sagemaker:us-east-1:969580840594:model/image-classification-model-2021-07-14-20-37-22
CPU times: user 123 ms, sys: 8.03 ms, total: 131 ms
Wall time: 623 ms


In [60]:
public myvariable = 0

def editMyVar(newArg):
    myvariable = newArg

def readMyVar():
    print(myvariable)

SyntaxError: invalid syntax (<ipython-input-60-9987fab157ad>, line 1)

In [61]:
editMyVar(10)
readMyVar()

0
