In [1]:
%%time
import boto3
from sagemaker import get_execution_role
from sagemaker.amazon.amazon_estimator import get_image_uri

role = get_execution_role()

bucket = 'sxm-ecommerce-p66-location-data'

training_image = get_image_uri(boto3.Session().region_name, "image-classification")

The method get_image_uri has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


CPU times: user 603 ms, sys: 236 ms, total: 839 ms
Wall time: 579 ms


In [2]:
# Four channels: train, validation, train_lst, and validation_lst
s3training = "s3://{}/version2/image-classification/training/".format(bucket)
s3validation = "s3://{}/version2/image-classification/validation/".format(bucket)
s3training_lst = "s3://{}/version2/image-classification/training_lst/".format(bucket)
s3validation_lst = "s3://{}/version2/image-classification/validation_lst/".format(bucket)
print(s3training)

s3://sxm-ecommerce-p66-location-data/version2/image-classification/training/


In [3]:
# The algorithm supports multiple network depth (number of layers). They are 18, 34, 50, 101, 152 and 200
# For this training, we will use 18 layers
num_layers = 18
# we need to specify the input image shape for the training data
image_shape = "3,600,600"
# we also need to specify the number of training samples in the training set
num_training_samples = 320
# specify the number of output classes
num_classes = 2
# batch size for training
mini_batch_size = 16
# number of epochs
epochs = 20
# learning rate
learning_rate = 0.02
# report top_5 accuracy
top_k = 5
# resize image before training
resize = 600
# period to store model parameters (in number of epochs), in this case, we will save parameters from epoch 2, 4, and 6
checkpoint_frequency = 2
# Since we are using transfer learning, we set use_pretrained_model to 1 so that weights can be
# initialized with pre-trained weights
use_pretrained_model = 1

In [4]:
%%time
import time
import boto3
from time import gmtime, strftime


s3 = boto3.client("s3")
# create unique job name
job_name_prefix = "sagemaker-imageclassification-notebook"
timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
job_name = job_name_prefix + timestamp
training_params = {
    # specify the training docker image
    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
    "RoleArn": role,
    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/output".format(bucket, job_name_prefix)},
    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.p2.xlarge", "VolumeSizeInGB": 50},
    "TrainingJobName": job_name,
    "HyperParameters": {
        "image_shape": image_shape,
        "num_layers": str(num_layers),
        "num_training_samples": str(num_training_samples),
        "num_classes": str(num_classes),
        "mini_batch_size": str(mini_batch_size),
        "epochs": str(epochs),
        "learning_rate": str(learning_rate),
        "top_k": str(top_k),
        "resize": str(resize),
        "checkpoint_frequency": str(checkpoint_frequency),
        "use_pretrained_model": str(use_pretrained_model),
    },
    "StoppingCondition": {"MaxRuntimeInSeconds": 360000},
    # Training data should be inside a subdirectory called "train"
    # Validation data should be inside a subdirectory called "validation"
    # The algorithm currently only supports fullyreplicated model (where data is copied onto each machine)
    "InputDataConfig": [
        {
            "ChannelName": "train",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3training,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3validation,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None",
        },
        {
            "ChannelName": "train_lst",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3training_lst,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None",
        },
        {
            "ChannelName": "validation_lst",
            "DataSource": {
                "S3DataSource": {
                    "S3DataType": "S3Prefix",
                    "S3Uri": s3validation_lst,
                    "S3DataDistributionType": "FullyReplicated",
                }
            },
            "ContentType": "application/x-image",
            "CompressionType": "None",
        },
    ],
}
print("Training job name: {}".format(job_name))
print(
    "\nInput Data Location: {}".format(
        training_params["InputDataConfig"][0]["DataSource"]["S3DataSource"]
    )
)

Training job name: sagemaker-imageclassification-notebook-2021-07-16-21-00-06

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sxm-ecommerce-p66-location-data/version2/image-classification/training/', 'S3DataDistributionType': 'FullyReplicated'}
CPU times: user 87.6 ms, sys: 3.62 ms, total: 91.3 ms
Wall time: 90.5 ms


In [5]:
# create the Amazon SageMaker training job
sagemaker = boto3.client(service_name="sagemaker")
sagemaker.create_training_job(**training_params)

# confirm that the training job has started
status = sagemaker.describe_training_job(TrainingJobName=job_name)["TrainingJobStatus"]
print("Training job current status: {}".format(status))

try:
    # wait for the job to finish and report the ending status
    sagemaker.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=job_name)
    training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
    status = training_info["TrainingJobStatus"]
    print("Training job ended with status: " + status)
except:
    print("Training failed to start")
    # if exception is raised, that means it has failed
    message = sagemaker.describe_training_job(TrainingJobName=job_name)["FailureReason"]
    print("Training failed with the following error: {}".format(message))

Training job current status: InProgress
Training job ended with status: Completed


In [None]:
training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
status = training_info["TrainingJobStatus"]
print("Training job ended with status: " + status)
print(training_info)

In [None]:
%%time
import boto3
from time import gmtime, strftime

sage = boto3.Session().client(service_name="sagemaker")

timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
model_name = 'image-classification-model' + '-18-16-20-2---2021-07-15-02-28-07'
print(model_name)
info = sage.describe_training_job(TrainingJobName='sagemaker-roadclassification-18-16-20-2---2021-07-15-02-28-07')
model_data = info["ModelArtifacts"]["S3ModelArtifacts"]
print(model_data)

hosting_image = get_image_uri(boto3.Session().region_name, "image-classification")

primary_container = {
    "Image": hosting_image,
    "ModelDataUrl": model_data,
}

create_model_response = sage.create_model(
    ModelName=model_name, ExecutionRoleArn=role, PrimaryContainer=primary_container
)

print(create_model_response["ModelArn"])

In [None]:
timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
batch_job_name = "image-classification-model" + timestamp
batch_input = s3all_freqs
request = {
    "TransformJobName": batch_job_name,
    "ModelName": model_name,
    "MaxConcurrentTransforms": 16,
    "MaxPayloadInMB": 6,
    "BatchStrategy": "SingleRecord",
    "TransformOutput": {"S3OutputPath": "s3://{}/{}/output".format(bucket, batch_job_name)},
    "TransformInput": {
        "DataSource": {"S3DataSource": {"S3DataType": "S3Prefix", "S3Uri": batch_input}},
        "ContentType": "application/x-image",
        "SplitType": "None",
        "CompressionType": "None",
    },
    "TransformResources": {"InstanceType": "ml.p2.xlarge", "InstanceCount": 1},
}

print("Transform job name: {}".format(batch_job_name))
print("\nInput Data Location: {}".format(batch_input))

In [None]:
sagemaker = boto3.client("sagemaker")
sagemaker.create_transform_job(**request)

print("Created Transform job with name: ", batch_job_name)

while True:
    response = sagemaker.describe_transform_job(TransformJobName=batch_job_name)
    status = response["TransformJobStatus"]
    if status == "Completed":
        print("Transform job ended with status: " + status)
        break
    if status == "Failed":
        message = response["FailureReason"]
        print("Transform failed with the following error: {}".format(message))
        raise Exception("Transform job failed")
    time.sleep(30)

In [None]:
from urllib.parse import urlparse
import json
import numpy as np

s3_client = boto3.client("s3")
object_categories = ["Not road", "Road"]


def list_objects(s3_client, bucket, prefix):
    response = s3_client.list_objects(Bucket=bucket, Prefix=prefix)
    objects = [content["Key"] for content in response["Contents"]]
    return objects


def get_label(s3_client, bucket, prefix):
    filename = prefix.split("/")[-1]
    s3_client.download_file(bucket, prefix, filename)
    with open(filename) as f:
        data = json.load(f)
        index = np.argmax(data["prediction"])
        probability = data["prediction"][index]
    print("Result: label - " + object_categories[index] + ", probability - " + str(probability))
    return object_categories[index], probability


inputs = list_objects(s3_client, bucket, urlparse(batch_input).path.lstrip("/"))
print("Sample inputs: " + str(inputs[0:399]))

outputs = list_objects(s3_client, bucket, batch_job_name + "/output")
print("Sample output: " + str(outputs[0:399]))

# Check prediction result of the first 2 images
[get_label(s3_client, bucket, prefix) for prefix in outputs[0:399]]

In [None]:
# OVERNIGHT TRAINING LOOP
layerCounts = [18, 34, 152]
batchSizes = [4, 16, 64]
epochNums = [6, 10, 20]
learningRates = [0.01, 0.02, 0.05]

import time
from time import gmtime, strftime

for lc in layerCounts:
    for bs in batchSizes:
        for en in epochNums:
            for lrs in learningRates:
#                 setParams(lc, bs, en, lrs)
                num_layers = lc
                mini_batch_size = bs
                epochs = en
                learning_rate = lrs
                
#                 genModel()
                s3 = boto3.client("s3")
                # create unique job name
                job_name_prefix = "sagemaker-roadclassification"
                timestamp = time.strftime("-%Y-%m-%d-%H-%M-%S", time.gmtime())
                job_name = job_name_prefix + "-" + str(num_layers) + "-" + str(mini_batch_size) + "-" + str(epochs) + "-" + str(int(100*learning_rate)) + "--" + timestamp
                training_params = {
                    # specify the training docker image
                    "AlgorithmSpecification": {"TrainingImage": training_image, "TrainingInputMode": "File"},
                    "RoleArn": role,
                    "OutputDataConfig": {"S3OutputPath": "s3://{}/{}/output".format(bucket, job_name_prefix)},
                    "ResourceConfig": {"InstanceCount": 1, "InstanceType": "ml.p2.xlarge", "VolumeSizeInGB": 50},
                    "TrainingJobName": job_name,
                    "HyperParameters": {
                        "image_shape": image_shape,
                        "num_layers": str(num_layers),
                        "num_training_samples": str(num_training_samples),
                        "num_classes": str(num_classes),
                        "mini_batch_size": str(mini_batch_size),
                        "epochs": str(epochs),
                        "learning_rate": str(learning_rate),
                        "top_k": str(top_k),
                        "resize": str(resize),
                        "checkpoint_frequency": str(checkpoint_frequency),
                        "use_pretrained_model": str(use_pretrained_model),
                    },
                    "StoppingCondition": {"MaxRuntimeInSeconds": 360000},
                    # Training data should be inside a subdirectory called "train"
                    # Validation data should be inside a subdirectory called "validation"
                    # The algorithm currently only supports fullyreplicated model (where data is copied onto each machine)
                    "InputDataConfig": [
                        {
                            "ChannelName": "train",
                            "DataSource": {
                                "S3DataSource": {
                                    "S3DataType": "S3Prefix",
                                    "S3Uri": s3training,
                                    "S3DataDistributionType": "FullyReplicated",
                                }
                            },
                            "ContentType": "application/x-image",
                            "CompressionType": "None",
                        },
                        {
                            "ChannelName": "validation",
                            "DataSource": {
                                "S3DataSource": {
                                    "S3DataType": "S3Prefix",
                                    "S3Uri": s3validation,
                                    "S3DataDistributionType": "FullyReplicated",
                                }
                            },
                            "ContentType": "application/x-image",
                            "CompressionType": "None",
                        },
                        {
                            "ChannelName": "train_lst",
                            "DataSource": {
                                "S3DataSource": {
                                    "S3DataType": "S3Prefix",
                                    "S3Uri": s3training_lst,
                                    "S3DataDistributionType": "FullyReplicated",
                                }
                            },
                            "ContentType": "application/x-image",
                            "CompressionType": "None",
                        },
                        {
                            "ChannelName": "validation_lst",
                            "DataSource": {
                                "S3DataSource": {
                                    "S3DataType": "S3Prefix",
                                    "S3Uri": s3validation_lst,
                                    "S3DataDistributionType": "FullyReplicated",
                                }
                            },
                            "ContentType": "application/x-image",
                            "CompressionType": "None",
                        },
                    ],
                }
                print("Training job name: {}".format(job_name))
                print(
                    "\nInput Data Location: {}".format(
                        training_params["InputDataConfig"][0]["DataSource"]["S3DataSource"]
                    )
                )
                
#                 runModel()
                # create the Amazon SageMaker training job
                sagemaker = boto3.client(service_name="sagemaker")
                sagemaker.create_training_job(**training_params)

                # confirm that the training job has started
                status = sagemaker.describe_training_job(TrainingJobName=job_name)["TrainingJobStatus"]
                print("Training job current status: {}".format(status))

                try:
                    # wait for the job to finish and report the ending status
                    sagemaker.get_waiter("training_job_completed_or_stopped").wait(TrainingJobName=job_name)
                    training_info = sagemaker.describe_training_job(TrainingJobName=job_name)
                    status = training_info["TrainingJobStatus"]
                    print("Training job ended with status: " + status)
                except:
                    print("Training failed to start")
                    # if exception is raised, that means it has failed
                    message = sagemaker.describe_training_job(TrainingJobName=job_name)["FailureReason"]
                    print("Training failed with the following error: {}".format(message))

Training job name: sagemaker-roadclassification-18-4-6-1---2021-07-16-21-16-31

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sxm-ecommerce-p66-location-data/version2/image-classification/training/', 'S3DataDistributionType': 'FullyReplicated'}
Training job current status: InProgress
Training job ended with status: Completed
Training job name: sagemaker-roadclassification-18-4-6-2---2021-07-16-21-26-32

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sxm-ecommerce-p66-location-data/version2/image-classification/training/', 'S3DataDistributionType': 'FullyReplicated'}
Training job current status: InProgress
Training job ended with status: Completed
Training job name: sagemaker-roadclassification-18-4-6-5---2021-07-16-21-36-33

Input Data Location: {'S3DataType': 'S3Prefix', 'S3Uri': 's3://sxm-ecommerce-p66-location-data/version2/image-classification/training/', 'S3DataDistributionType': 'FullyReplicated'}
Training job current status: InProgress
Training j

In [None]:
#sagemaker-roadclassification-18-16-20-1---2021-07-17-00-04-47
#sagemaker-roadclassification-18-16-20-2---2021-07-17-00-18-49