In [1]:
import pandas as pd
import boto3
import sagemaker
import os

In [2]:
# session and role
sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()

# create an S3 bucket
bucket = sagemaker_session.default_bucket()

## Check if data is available at S3

In [3]:
# set prefix, a descriptive name for a directory  
prefix = 'anomaly'

for obj in boto3.resource('s3').Bucket(bucket).objects.all():
    print(obj.key)

anomaly/RandomCutForest-2021-06-22-05-30-51-789/.amazon.manifest
anomaly/RandomCutForest-2021-06-22-05-30-51-789/matrix_0.pbr
anomaly/output/randomcutforest-2021-06-22-05-37-15-577/output/model.tar.gz
anomaly/output/randomcutforest-2021-06-22-05-37-15-577/profiler-output/framework/training_job_end.ts
anomaly/output/randomcutforest-2021-06-22-05-37-15-577/profiler-output/system/incremental/2021062205/1624340340.algo-1.json
anomaly/output/randomcutforest-2021-06-22-05-37-15-577/profiler-output/system/incremental/2021062205/1624340400.algo-1.json
anomaly/output/randomcutforest-2021-06-22-05-37-15-577/profiler-output/system/incremental/2021062205/1624340460.algo-1.json
anomaly/output/randomcutforest-2021-06-22-05-37-15-577/profiler-output/system/incremental/2021062205/1624340520.algo-1.json
anomaly/output/randomcutforest-2021-06-22-05-37-15-577/profiler-output/system/training_job_end.ts
anomaly/output/randomcutforest-2021-06-22-05-37-15-577/rule-output/ProfilerReport-1624340235/profiler-ou

In [4]:
# set locations from variables from data preperation notebook
train_location = 's3://sagemaker-us-east-1-517714493426/anomaly/train.csv'
val_location = 's3://sagemaker-us-east-1-517714493426/anomaly/val.csv'
test_location = 's3://sagemaker-us-east-1-517714493426/anomaly/test.csv'

In [5]:
print(bucket)

sagemaker-us-east-1-517714493426


## Train a Model with Sagemaker built-in function Random Cut Forests

In [None]:
from sagemaker import RandomCutForest

prefix = 'anomaly'

# specify general training job information
rcf = RandomCutForest(
    role=role,
    instance_count=1,
    instance_type="ml.m4.xlarge",
    data_location=f"s3://{bucket}/{prefix}/",
    output_path=f"s3://{bucket}/{prefix}/output",
    num_samples_per_tree=512,
    num_trees=50,
)

#https://sagemaker-examples.readthedocs.io/en/latest/introduction_to_amazon_algorithms/random_cut_forest/random_cut_forest.html
X_data = pd.read_pickle('X_train.pkl')

rcf.fit(rcf.record_set(X_data.to_numpy().reshape(-1, 1)))

In [None]:
print(f"Training job name: {rcf.latest_training_job.job_name}")

In [6]:
from sagemaker import image_uris
image = image_uris.retrieve(framework='randomcutforest',region='us-east-1')

# Output path
#'382416733822.dkr.ecr.us-east-1.amazonaws.com/randomcutforest:1'


In [7]:
rcf = sagemaker.model.Model(
    image_uri=image,
    model_data='s3://sagemaker-us-east-1-517714493426/anomaly/output/randomcutforest-2021-06-22-05-37-15-577/output/model.tar.gz',
    role=role)

In [None]:
#rcf_transformer = rcf.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

#rcf_transformer.transform(test_location, content_type='text/csv', split_type='Line')
#rcf_transformer.wait()

# copy result on S3 to local notebook instance
#!aws s3 cp --recursive $rcf_transformer.output_path $data_dir

# Customer Error: Feature dimension mismatch: training feature dim is: 1, scoring feature dim is: 117 (caused by ValueError)

#IMDB Sentiment Analysis - XGBoost (Updating a Model) - Solution

In [None]:
#Load S3 Data into AWS SageMaker Notebook

#import pandas as pd
#
#bucket='my-bucket'
#data_key = 'train.csv'
#data_location = 's3://{}/{}'.format(bucket, data_key)
#
#pd.read_csv(data_location)

## Inference 

In [16]:
rcf_inference = rcf.deploy(initial_instance_count=1, instance_type="ml.m4.xlarge")

print(f"Endpoint name: {rcf_inference.endpoint}")

-----------------!

AttributeError: 'NoneType' object has no attribute 'endpoint'

## Data Serialization

In [10]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

rcf_inference.serializer = CSVSerializer() # old version: rcf_inference.content_type = 'text/csv', rcf_inference.serializer = csv_serializer
rcf_inference.deserializer = JSONDeserializer()

AttributeError: 'NoneType' object has no attribute 'serializer'

In [14]:
X_test = pd.read_pickle("X_test.pkl")
y_test = pd.read_pickle("y_test.pkl")

In [15]:
#X_test_numpy = X_test.values.to_numpy().reshape(-1, 1)
X_test_numpy = X_test.values.reshape(-1, 1)
#print(X_test_numpy[:6])
results = rcf_inference.predict(
    X_test_numpy[:6], initial_args={"ContentType": "text/csv", "Accept": "application/json"}
)

AttributeError: 'NoneType' object has no attribute 'predict'

## Predicion & Anomaly Scores

In [None]:
results = rcf_inference.predict(X_test_numpy)
scores = [entry["score"] for entry in results["scores"]]

# add scores to taxi data frame and print first few values
y_test_predict = pd.Series(scores, index=X_test.index)
y_test_predict.head()