# VAE Anomaly Detection Experiments on SageMaker

In [1]:
import datetime
import os
import time

import boto3
import sagemaker
from sagemaker import get_execution_role

print(f'sagemaker version: {sagemaker.__version__}')

# One SageMaker session for the whole notebook; the low-level 'sagemaker'
# client is derived from the session's own boto session.
sagemaker_session = sagemaker.Session()
boto_session = sagemaker_session.boto_session
sagemaker_client = boto_session.client('sagemaker')

BUCKET = 'novelty-detection-gan-grad-sagemaker'
PREFIX = 'MNIST'

# Machine-specific location: set the LOCAL_DATA_DIRECTORY environment variable
# to point somewhere else; the original path is kept as the default.
LOCAL_DATA_DIRECTORY = os.environ.get(
    'LOCAL_DATA_DIRECTORY',
    f'/Users/ccaloian/Temp/dl-pytorch/data/{PREFIX}/novelty-detection',
)

print(f"Artifacts will be written to s3://{BUCKET}/{PREFIX}")

# SageMaker Studio
# role = get_execution_role()

# Local: the role ARN is account-specific, so it can be overridden with the
# SAGEMAKER_ROLE_ARN environment variable; the original ARN is the default.
role = os.environ.get(
    'SAGEMAKER_ROLE_ARN',
    "arn:aws:iam::501545181352:role/service-role/AmazonSageMaker-ExecutionRole-20210120T103803",
)

sagemaker version: 2.23.5
Artifacts will be written to s3://novelty-detection-gan-grad-sagemaker/MNIST


## Data Sources

In [2]:
# MNIST scenario number used throughout the notebook
scenario = 3

# S3 data: the training, validation and testing paths all share one S3 prefix
s3_training_input_path = s3_validation_input_path = s3_testing_input_path = (
    's3://novelty-detection-gan-grad-data/MNIST'
)

# Local data: same arrangement, expressed as a file:// URI
training_input_path = validation_input_path = testing_input_path = (
    'file:///Users/ccaloian/Temp/dl-pytorch/data/MNIST/novelty-detection'
)

## Set up Experiment

In [3]:
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent

### Create an Experiment

In [4]:
# The experiment name combines the dataset prefix, the scenario number and a
# minute-resolution timestamp.
description = f"Novelty detection of MNIST hand-written digits scenario {scenario} using PyTorch VAE."
experiment_name = f"vae-{PREFIX}-scenario-{scenario}-{datetime.datetime.now().strftime('%Y%m%d%H%M')}"

vae_experiment = Experiment.create(
    sagemaker_boto_client=sagemaker_client,
    description=description,
    experiment_name=experiment_name,
)

### Create a Trial

In [5]:
# The trial name carries a minute-resolution timestamp.
trial_name = f"vae-trial-{datetime.datetime.now().strftime('%Y%m%d%H%M')}"

# Register the trial under the experiment referenced by `vae_experiment`.
vae_trial = Trial.create(
    sagemaker_boto_client=sagemaker_client,
    experiment_name=vae_experiment.experiment_name,
    trial_name=trial_name,
)

### Define Estimator

In [6]:
from sagemaker.pytorch import PyTorch

# Hyperparameters handed to the estimator for the training job.
hyperparameters = dict(
    scenario=scenario,
    epochs=30,
    batch_size=128,
    latent_dims=4,
    use_gpu=None,
)

# output_path and code_location both point at the same S3 prefix.
estimator = PyTorch(
    entry_point='run.py',
    source_dir='../sm',
    framework_version='1.6',
    py_version='py3',
    role=role,
    sagemaker_session=sagemaker_session,
    instance_type='ml.p2.xlarge',
    instance_count=1,
    base_job_name=f'vae-{PREFIX}-scenario-{scenario}',
    hyperparameters=hyperparameters,
    output_path=f's3://{BUCKET}/{PREFIX}',
    code_location=f's3://{BUCKET}/{PREFIX}',
)

### Associate the Estimator with the Trial and fit the model

In [7]:
# Launch the training job with three S3 input channels and attach it to the
# trial as a component displayed as "Training".
estimator.fit(
    inputs=dict(
        training=s3_training_input_path,
        validation=s3_validation_input_path,
        testing=s3_testing_input_path,
    ),
    experiment_config=dict(
        TrialName=vae_trial.trial_name,
        TrialComponentDisplayName="Training",
    ),
)

INFO:sagemaker.image_uris:Defaulting to the only supported framework/algorithm version: latest.
INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.
INFO:sagemaker:Creating training-job with name: vae-MNIST-scenario-3-2021-02-25-13-41-27-228


2021-02-25 13:41:28 Starting - Starting the training job...
2021-02-25 13:41:52 Starting - Launching requested ML instancesProfilerReport-1614260487: InProgress
......
2021-02-25 13:42:54 Starting - Preparing the instances for training............
2021-02-25 13:44:54 Downloading - Downloading input data...
2021-02-25 13:45:35 Training - Downloading the training image.........
2021-02-25 13:46:56 Training - Training image download completed. Training in progress.[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-02-25 13:46:53,638 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-02-25 13:46:53,676 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-02-25 13:46:55,119 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[34m2021-02-25 13:46:55,597 sagemaker-training-

[34mEpoch [13 / 30] Train Loss: 15218.003957; Validation Loss: 15074.756831;[0m
[34mEpoch [14 / 30] Train Loss: 15152.438309; Validation Loss: 15101.855767;[0m
[34mEpoch [15 / 30] Train Loss: 15101.317074; Validation Loss: 15043.985523;[0m
[34mEpoch [16 / 30] Train Loss: 15064.637461; Validation Loss: 14991.875524;[0m
[34mEpoch [17 / 30] Train Loss: 15011.407730; Validation Loss: 14986.823957;[0m
[34mEpoch [18 / 30] Train Loss: 14988.769590; Validation Loss: 14981.354121;[0m
[34mEpoch [19 / 30] Train Loss: 14942.037717; Validation Loss: 14913.819797;[0m
[34mEpoch [20 / 30] Train Loss: 14897.159758; Validation Loss: 14912.680854;[0m
[34mEpoch [21 / 30] Train Loss: 14874.336900; Validation Loss: 14810.637795;[0m
[34mEpoch [22 / 30] Train Loss: 14844.583602; Validation Loss: 14870.785337;[0m
[34mEpoch [23 / 30] Train Loss: 14813.768125; Validation Loss: 14932.414117;[0m
[34mEpoch [24 / 30] Train Loss: 14786.612145; Validation Loss: 14800.592448;[0m
[34mEpoch [25 /

## Tracking Distributed Hyperparameter Search with Multiple Trials

In [8]:
from smexperiments.tracker import Tracker

# Create a "Preprocessing" tracker and log the normalization constants and the
# S3 location of the training data against it.
with Tracker.create(sagemaker_boto_client=sagemaker_client, display_name="Preprocessing") as tracker:
    tracker.log_parameters(dict(
        normalization_mean=0.1307,
        normalization_std=0.3081,
    ))

    tracker.log_input(
        name=f"mnist-scenario-{scenario}",
        media_type="s3/uri",
        value=s3_training_input_path,
    )

ClientError: An error occurred (ValidationException) when calling the CreateTrialComponent operation: Trial Component creation is currently restricted to the SageMaker runtime. Try supplying an experiment config when creating a job instead.

In [None]:
# Experiment that groups the latent-dimension trials; its name carries the
# scenario number and a minute-resolution timestamp.
description = f"Novelty detection of MNIST hand-written digits scenario {scenario} using PyTorch VAE."
experiment_name = f"mnist-scenario-{scenario}-{datetime.datetime.now().strftime('%Y%m%d%H%M')}"

mnist_experiment = Experiment.create(
    sagemaker_boto_client=sagemaker_client,
    description=description,
    experiment_name=experiment_name,
)

In [None]:
# Maps each latent dimensionality to the name of the Trial that trained with it.
latent_dims_trial_name_map = {}

# Each job is started with wait=True, so the sweep runs one job at a time.
# If you want to run the training jobs asynchronously instead, you may need to
# increase your resource limit.
for latent_dims in [2, 4, 8]:

    # Create one Trial per latent dimensionality under the shared Experiment.
    trial_name = f"torch-{latent_dims}-latent-dims-{datetime.datetime.now().strftime('%Y%m%d%H%M')}"
    mnist_trial = Trial.create(
        trial_name=trial_name,
        experiment_name=mnist_experiment.experiment_name,
        sagemaker_boto_client=sagemaker_client,
    )
    latent_dims_trial_name_map[latent_dims] = trial_name

    # Associate the preprocessing trial component with the current trial.
    # NOTE: `tracker` is bound by the "Preprocessing" Tracker cell, which must
    # have completed successfully before this cell is run.
    mnist_trial.add_trial_component(tracker.trial_component)

    # The hyperparameters are built per iteration so that `latent_dims` follows
    # the loop variable; `hyperparameters=` is passed exactly once (passing it
    # twice is a SyntaxError).
    estimator = PyTorch(
        entry_point='run.py',
        source_dir='../sm',
        framework_version='1.6',
        py_version='py3',
        instance_count=1,
        instance_type='ml.p2.xlarge',
        output_path=f's3://{BUCKET}/{PREFIX}',
        code_location=f's3://{BUCKET}/{PREFIX}',
        base_job_name=f'vae-{PREFIX}-scenario-{scenario}',
        role=role,
        sagemaker_session=sagemaker_session,
        hyperparameters={
            'scenario': scenario,
            'epochs': 30,
            'batch_size': 128,
            'latent_dims': latent_dims,
            'use_gpu': None
        },
        # Both regexes stop at the ';' that terminates each value in the
        # training log line ("Train Loss: <x>; Validation Loss: <y>;").
        metric_definitions=[
            {'Name':'train:loss', 'Regex':'Train Loss: (.*?);'},
            {'Name':'val:loss', 'Regex':'Validation Loss: (.*?);'},
        ],
        enable_sagemaker_metrics=True,
    )

    # Associate the estimator with the Experiment and Trial, using the same
    # three S3 input channels as the single-trial run earlier in the notebook.
    estimator.fit(
        inputs={
            'training': s3_training_input_path,
            'validation': s3_validation_input_path,
            'testing': s3_testing_input_path,
        },
        experiment_config={
            "TrialName": mnist_trial.trial_name,
            "TrialComponentDisplayName": "Training",
        },
        wait=True,
    )

    # give it a while before dispatching the next training job
    time.sleep(5)