# VAE Anomaly Detection on SageMaker

In [15]:
import os

import boto3
import sagemaker
from sagemaker import get_execution_role

print(f'sagemaker version: {sagemaker.__version__}')

# One SageMaker session for the whole notebook; the boto session and the
# low-level SageMaker client are both derived from it.
sagemaker_session = sagemaker.Session()
boto_session = sagemaker_session.boto_session
sagemaker_client = boto_session.client('sagemaker')

# Artifacts are written under s3://{BUCKET}/{PREFIX}.
BUCKET = 'novelty-detection-gan-grad-sagemaker'
PREFIX = 'MNIST'

# Machine-specific location of the local data copy. Set the
# LOCAL_DATA_DIRECTORY environment variable to point somewhere else; when it
# is unset (or empty) the original hard-coded path is used.
LOCAL_DATA_DIRECTORY = (
    os.environ.get('LOCAL_DATA_DIRECTORY')
    or f'/Users/ccaloian/Temp/dl-pytorch/data/{PREFIX}/novelty-detection'
)

print(f"Artifacts will be written to s3://{BUCKET}/{PREFIX}")

# IAM role passed to the estimator. Set the SAGEMAKER_ROLE_ARN environment
# variable to use a different account/role; when it is unset (or empty) the
# original hard-coded ARN is used.
#
# Alternative noted for SageMaker Studio:
# role = get_execution_role()
role = (
    os.environ.get('SAGEMAKER_ROLE_ARN')
    or "arn:aws:iam::501545181352:role/service-role/AmazonSageMaker-ExecutionRole-20210120T103803"
)

sagemaker version: 2.23.5
Artifacts will be written to s3://novelty-detection-gan-grad-sagemaker/MNIST


## Data Sources

In [17]:
# Each data location is defined once so that moving the dataset is a
# one-line change instead of three.

# S3 data
S3_DATA_URI = 's3://novelty-detection-gan-grad-data/MNIST'

# Local data (file:// URI)
LOCAL_DATA_URI = 'file:///Users/ccaloian/Temp/dl-pytorch/data/MNIST/novelty-detection'

# The training, validation and testing inputs all point at the same location.
s3_training_input_path = S3_DATA_URI
s3_validation_input_path = S3_DATA_URI
s3_testing_input_path = S3_DATA_URI

training_input_path = LOCAL_DATA_URI
validation_input_path = LOCAL_DATA_URI
testing_input_path = LOCAL_DATA_URI

## Define Estimator

In [21]:
from sagemaker.pytorch import PyTorch

# Values passed to the estimator as hyperparameters; the scenario number is
# also embedded in the job name below.
hyperparameters = dict(epochs=30, scenario=3)

# PyTorch estimator using run.py from ../sm as its entry point, on a single
# ml.c5.xlarge instance, with output going to s3://{BUCKET}/{PREFIX}.
estimator = PyTorch(
    # Identity and session
    role=role,
    sagemaker_session=sagemaker_session,
    base_job_name=f'vae-{PREFIX}-scenario-{hyperparameters["scenario"]}',
    # Training code
    source_dir='../sm',
    entry_point='run.py',
    hyperparameters=hyperparameters,
    # Framework
    py_version='py3',
    framework_version='1.6',
    # Compute
    instance_type='ml.c5.xlarge',
    instance_count=1,
    # Output
    output_path=f's3://{BUCKET}/{PREFIX}',
)

## Fit the Model

In [19]:
# Start the training job with the three S3 inputs, keyed by channel name.
estimator.fit(
    inputs=dict(
        training=s3_training_input_path,
        validation=s3_validation_input_path,
        testing=s3_testing_input_path,
    )
)

2021-02-25 08:59:33 Starting - Starting the training job...
2021-02-25 08:59:56 Starting - Launching requested ML instancesProfilerReport-1614243572: InProgress
......
2021-02-25 09:00:59 Starting - Preparing the instances for training.........
2021-02-25 09:02:38 Downloading - Downloading input data......
2021-02-25 09:03:39 Training - Training image download completed. Training in progress..[34mbash: cannot set terminal process group (-1): Inappropriate ioctl for device[0m
[34mbash: no job control in this shell[0m
[34m2021-02-25 09:03:34,446 sagemaker-training-toolkit INFO     Imported framework sagemaker_pytorch_container.training[0m
[34m2021-02-25 09:03:34,458 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2021-02-25 09:03:34,468 sagemaker_pytorch_container.training INFO     Block until all host DNS lookups succeed.[0m
[34m2021-02-25 09:03:35,907 sagemaker_pytorch_container.training INFO     Invoking user training script.[0m
[

[34mEpoch [1 / 30] Train Loss: 11814.801836; Validation Loss: 10051.803287;[0m
[34mEpoch [2 / 30] Train Loss: 9755.161517; Validation Loss: 9542.380800;[0m
[34mEpoch [3 / 30] Train Loss: 9435.457445; Validation Loss: 9362.246600;[0m
[34mEpoch [4 / 30] Train Loss: 9274.321289; Validation Loss: 9227.730674;[0m
[34mEpoch [5 / 30] Train Loss: 9166.459061; Validation Loss: 9145.441452;[0m
[34mEpoch [6 / 30] Train Loss: 9086.817565; Validation Loss: 9046.029475;[0m
[34mEpoch [7 / 30] Train Loss: 9011.966160; Validation Loss: 9025.641059;[0m
[34mEpoch [8 / 30] Train Loss: 8947.567957; Validation Loss: 9044.148342;[0m
[34mEpoch [9 / 30] Train Loss: 8901.191396; Validation Loss: 8914.065434;[0m
[34mEpoch [10 / 30] Train Loss: 8868.193924; Validation Loss: 8938.612770;[0m
[34mEpoch [11 / 30] Train Loss: 8833.810410; Validation Loss: 8873.745902;[0m
[34mEpoch [12 / 30] Train Loss: 8804.012397; Validation Loss: 8961.161165;[0m
[34mEpoch [13 / 30] Train Loss: 8770.060792; V