In [1]:
import pandas as pd
import boto3
import sagemaker
import os

In [2]:
# SageMaker session for this notebook and the name of its default S3 bucket
# (the SDK creates the bucket on first use if it does not exist yet).
sagemaker_session = sagemaker.Session()
bucket = sagemaker_session.default_bucket()

# IAM execution role that is passed to the SageMaker estimator below.
role = sagemaker.get_execution_role()

## Check that the data is available in S3

In [3]:
# Key prefix ("directory") under which this project's objects live in the bucket.
prefix = 'anomaly'

# List only the objects stored below the project prefix. Iterating over
# `objects.all()` would walk the entire default bucket, which can also hold
# unrelated data from other projects.
for obj in boto3.resource('s3').Bucket(bucket).objects.filter(Prefix=f"{prefix}/"):
    print(obj.key)

anomaly/RandomCutForest-2021-06-22-10-27-23-663/.amazon.manifest
anomaly/RandomCutForest-2021-06-22-10-27-23-663/matrix_0.pbr
anomaly/output/randomcutforest-2021-06-22-10-34-28-100/output/model.tar.gz
anomaly/output/randomcutforest-2021-06-22-10-34-28-100/profiler-output/framework/training_job_end.ts
anomaly/output/randomcutforest-2021-06-22-10-34-28-100/profiler-output/system/incremental/2021062210/1624358220.algo-1.json
anomaly/output/randomcutforest-2021-06-22-10-34-28-100/profiler-output/system/incremental/2021062210/1624358280.algo-1.json
anomaly/output/randomcutforest-2021-06-22-10-34-28-100/profiler-output/system/incremental/2021062210/1624358340.algo-1.json
anomaly/output/randomcutforest-2021-06-22-10-34-28-100/profiler-output/system/training_job_end.ts
anomaly/output/randomcutforest-2021-06-22-10-34-28-100/rule-output/ProfilerReport-1624358068/profiler-output/profiler-report.html
anomaly/output/randomcutforest-2021-06-22-10-34-28-100/rule-output/ProfilerReport-1624358068/profi

In [4]:
# S3 locations of the splits written by the data preparation notebook.
# They are derived from the session's default bucket and the project prefix
# instead of a hardcoded bucket name, so the notebook keeps working in another
# account or region and no account ID is embedded in the code.
train_location = f"s3://{bucket}/{prefix}/train.csv"
val_location = f"s3://{bucket}/{prefix}/val.csv"
test_location = f"s3://{bucket}/{prefix}/test.csv"

In [5]:
# Show the name of the default bucket returned by the SageMaker session.
print(bucket)

sagemaker-us-east-1-517714493426


## Train a model with the SageMaker built-in Random Cut Forest algorithm

In [6]:
from sagemaker import RandomCutForest

# Random Cut Forest estimator: forest hyperparameters first, then the training
# infrastructure, then the S3 locations for staged input data and model output.
rcf = RandomCutForest(
    num_trees=50,
    num_samples_per_tree=512,
    instance_type="ml.m4.xlarge",
    instance_count=1,
    role=role,
    data_location="s3://{}/{}/".format(bucket, prefix),
    output_path="s3://{}/{}/output".format(bucket, prefix),
)

# Abandoned attempt: training directly from the CSV files in S3 with an explicit
# "ShardedByS3Key" distribution for the train channel.
# The documentation linked below still shows `s3_input`, which the current SDK
# replaced by `TrainingInput`. Even with `TrainingInput`, passing a dict of
# channels to fit() fails with:
#   AttributeError: 'dict' object has no attribute 'feature_dim'
#
# train_data = sagemaker.inputs.TrainingInput(s3_data=train_location, content_type='csv', distribution='ShardedByS3Key')
# val_data = sagemaker.inputs.TrainingInput(s3_data=val_location, content_type='csv', distribution='FullyReplicated')
# rcf.fit({'train': train_data, 'test': val_data})
#
# https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html#rcf-input_output



In [7]:
from sagemaker import RandomCutForest

prefix = 'anomaly'

# specify general training job information
rcf = RandomCutForest(
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    data_location=f"s3://{bucket}/{prefix}/",
    output_path=f"s3://{bucket}/{prefix}/output",
    num_samples_per_tree=512,
    num_trees=50,
)

# Training features written by the data preparation notebook.
# Approach follows:
# https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/random_cut_forest/random_cut_forest.html
X_data = pd.read_pickle('X_train.pkl')

# Train on the feature matrix as it is: one record per row, one feature per column.
# The linked example calls `.reshape(-1, 1)` because its input is a single column.
# Applied to a multi-column frame, that reshape turns every individual cell into
# its own one-feature record, so the model would be trained with feature_dim=1
# and could not score rows that carry all feature columns (e.g. test.csv).
rcf.fit(rcf.record_set(X_data.to_numpy()))

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.
Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


2021-06-22 11:41:49 Starting - Starting the training job...
2021-06-22 11:41:56 Starting - Launching requested ML instancesProfilerReport-1624362109: InProgress
......
2021-06-22 11:43:10 Starting - Preparing the instances for training.........
2021-06-22 11:44:50 Downloading - Downloading input data...
2021-06-22 11:45:19 Training - Downloading the training image...
2021-06-22 11:45:51 Training - Training image download completed. Training in progress.[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[06/22/2021 11:45:45 INFO 140524976277312] Reading default configuration from /opt/amazon/lib/python3.7/site-packages/algorithm/resources/default-conf.json: {'num_samples_per_tree': 256, 'num_trees': 100, 'force_dense': 'true', 'eval_metrics': ['accuracy', 'precision_recall_fscore'], 'epochs': 1, 'mini_batch_size': 1000, '_log_level': 'info', '_kvstore': 'dist_async', '_num_kv_servers': 'auto', '_num_gpus': 'auto', '_t

In [8]:
# Name of the training job started by the most recent fit() call.
training_job_name = rcf.latest_training_job.job_name
print(f"Training job name: {training_job_name}")

Training job name: randomcutforest-2021-06-22-11-41-49-152


In [9]:
rcf_transformer = rcf.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

# TODO: Start the transform job. Make sure to specify the content type and the split type of the test data.
rcf_transformer.transform(test_location, content_type='text/csv', split_type='Line')
rcf_transformer.wait()

# copy result on S3 to local notebook instance
!aws s3 cp --recursive $rcf_transformer.output_path $data_dir

Defaulting to the only supported framework/algorithm version: 1. Ignoring framework/algorithm version: 1.


...............................[34mDocker entrypoint called with argument(s): serve[0m
[34mRunning default environment configuration script[0m
[34m[06/22/2021 11:52:12 INFO 139744339183424] loaded entry point class algorithm.serve.server_config:config_api[0m
[34m[06/22/2021 11:52:12 INFO 139744339183424] loading entry points[0m
[34m[06/22/2021 11:52:12 INFO 139744339183424] Loaded iterator creator application/x-recordio-protobuf for content type ('application/x-recordio-protobuf', '1.0')[0m
[34m[06/22/2021 11:52:12 INFO 139744339183424] loaded request iterator application/json[0m
[34m[06/22/2021 11:52:12 INFO 139744339183424] loaded request iterator application/jsonlines[0m
[34m[06/22/2021 11:52:12 INFO 139744339183424] loaded request iterator application/x-recordio-protobuf[0m
[34m[06/22/2021 11:52:12 INFO 139744339183424] loaded request iterator text/csv[0m
[34m[06/22/2021 11:52:12 INFO 139744339183424] loaded response encoder application/json[0m
[34m[06/22/2021 

UnexpectedStatusException: Error for Transform job randomcutforest-2021-06-22-11-47-03-495: Failed. Reason: ClientError: See job logs for more information

In [11]:
# Compare the shape of the test split stored in S3 with the training matrix.
# Only the shape of the downloaded frame is kept; X_test itself is reset to None
# so the frame does not stay in memory.
test_csv_shape = pd.read_csv(test_location).shape
print(f"X_test shape {test_csv_shape}")
X_test = None
print(f"X_data shape {X_data.shape}")


X_test shape (74102, 116)
X_data shape (345821, 116)


In [None]:
#Load S3 Data into AWS SageMaker Notebook

#import pandas as pd
#
#bucket='my-bucket'
#data_key = 'train.csv'
#data_location = 's3://{}/{}'.format(bucket, data_key)
#
#pd.read_csv(data_location)

In [None]:
#rcf_transformer = rcf.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')
#rcf_transformer.transform(test_location, content_type='text/csv', split_type='Line')
#rcf_transformer.wait()

## Inference 

In [None]:
# Host the trained model on a real-time endpoint. The endpoint stays up after
# this cell finishes; call `rcf_inference.delete_endpoint()` once it is no longer needed.
rcf_inference = rcf.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

# SageMaker Python SDK v2 renamed `Predictor.endpoint` to `endpoint_name`;
# the old attribute is deprecated.
print(f"Endpoint name: {rcf_inference.endpoint_name}")

In [None]:
# only works for old versions of the SageMaker SDK
#from sagemaker.predictor import csv_serializer
#
#rcf_inference.content_type = 'text/csv'
#rcf_inference.serializer = csv_serializer

## Data Serialization

In [None]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

# Responses from the endpoint are parsed from JSON; request payloads are sent as CSV.
rcf_inference.deserializer = JSONDeserializer()
rcf_inference.serializer = CSVSerializer()

In [None]:
# Load the pickled test features and test labels from the working directory.
X_test, y_test = map(pd.read_pickle, ("X_test.pkl", "y_test.pkl"))

In [None]:
# Display the test features as a NumPy array.
X_test.to_numpy()

In [None]:
# Keep the test features two-dimensional: one row per observation, one column per
# feature. Flattening with `.reshape(-1, 1)` would send every single cell as its
# own one-feature record, so the endpoint would return one score per cell instead
# of one score per row of X_test.
# The endpoint's model must have been trained on records with the same number of columns.
X_test_numpy = X_test.to_numpy()

# Round-trip check with explicit content types on a handful of rows only.
# Sending the whole matrix in a single request would likely exceed the payload
# limit of a real-time endpoint, and `results` is recomputed for the full test
# set in the next section anyway.
results = rcf_inference.predict(
    X_test_numpy[:5], initial_args={"ContentType": "text/csv", "Accept": "application/json"}
)
results

## Prediction & Anomaly Scores

In [None]:
# Number of rows sent per request. Real-time endpoints limit the size of a single
# request payload, so the test set is scored in slices rather than in one call.
BATCH_ROWS = 500

# One row per observation, one column per feature. The payload is built from
# X_test directly so that exactly one score comes back per row of X_test.
# The endpoint's model must have been trained on records with the same number of columns.
test_features = X_test.to_numpy()

scores = []
for start in range(0, len(test_features), BATCH_ROWS):
    results = rcf_inference.predict(test_features[start:start + BATCH_ROWS])
    scores.extend(entry["score"] for entry in results["scores"])

# The Series below is indexed by X_test, so the counts have to agree.
assert len(scores) == len(X_test), "expected exactly one anomaly score per test row"

# attach the anomaly scores to the index of the test features and show the first few values
y_test_predict = pd.Series(scores, index=X_test.index)
y_test_predict.head()