In [None]:
import os
!pip install sagemaker
!pip install boto3
!pip install matplotlib

In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import boto3
import botocore
import sagemaker
import sys
from sagemaker import RandomCutForest
# Load the comma-separated training series from the working directory.
# (The "value" column of this frame is what the training cell feeds to RCF.)
network_data = pd.read_csv("network_traffic.csv", sep=",")


In [None]:


session = sagemaker.Session()

# os.getenv() returns None for an unset variable, which the f-strings below
# would silently turn into "s3://None/...". Fail fast with a clear message.
bucket_name = os.getenv("bucket_name")
if not bucket_name:
    raise ValueError(
        "Environment variable 'bucket_name' must be set to the S3 bucket "
        "used for training data and model output."
    )

# Create the RCF estimator
rcf = RandomCutForest(
    role=os.getenv("sagemaker_role"),
    instance_count=1,
    instance_type="ml.m4.xlarge",
    data_location=f"s3://{bucket_name}/data/",
    output_path=f"s3://{bucket_name}/output",
    num_samples_per_tree=512,
    num_trees=50,
    # Reuse the session created above instead of letting the estimator
    # build a second one implicitly.
    sagemaker_session=session,
)

# automatically upload the training data to S3 and run the training job
# The "value" column is reshaped to a single-feature 2-D array (n_rows, 1).
training_values = network_data["value"].to_numpy().reshape(-1, 1)
rcf.fit(rcf.record_set(training_values))

In [None]:
# Report the name of the most recent training job recorded on the estimator.
training_job_name = rcf.latest_training_job.job_name
print(f"Training job name: {training_job_name}")

In [None]:
# Deploy the RCF model; the returned object is used for predictions below.
rcf_inference = rcf.deploy(
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

In [None]:
# SageMaker Python SDK v2 exposes the endpoint as `endpoint_name`;
# `endpoint` is the pre-v2 attribute name.
print(f"Endpoint name: {rcf_inference.endpoint_name}")

In [None]:
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

# Send requests as CSV and parse responses as JSON.
rcf_inference.serializer = CSVSerializer()
rcf_inference.deserializer = JSONDeserializer()

# Load the inference series and reshape its "value" column to (n_rows, 1).
inference_data = pd.read_csv("test.csv", delimiter=",")
network_data_numpy = inference_data.value.to_numpy().reshape(-1, 1)

# Smoke-test the endpoint with the first six rows only.
sample_rows = network_data_numpy[:6]
print(sample_rows)
request_args = {"ContentType": "text/csv", "Accept": "application/json"}
results = rcf_inference.predict(sample_rows, initial_args=request_args)

In [None]:
# Score the whole inference series in a single request.
results = rcf_inference.predict(network_data_numpy)

# Pull the "score" field out of each entry under the response's "scores" key.
scores = []
for datum in results["scores"]:
    scores.append(datum["score"])

# Attach the scores to the frame, aligned on its existing index.
inference_data["score"] = pd.Series(scores, index=inference_data.index)
inference_data.head()

In [None]:
# Two y-axes sharing one x-axis: values on the left, scores on the right.
fig, ax1 = plt.subplots()
ax2 = ax1.twinx()

# Plot window; currently spans the full inference frame.
start, end = 0, len(inference_data)
network_data_subset = inference_data[start:end]

# Left axis: the "value" column.
ax1.plot(network_data_subset["value"], color="C0", alpha=0.8)
ax1.set_ylabel("Traffic Packets", color="C0")
ax1.tick_params("y", colors="C0")
ax1.set_ylim(0, 200)
ax1.grid(which="major", axis="both")

# Right axis: the "score" column, with 40% headroom above the highest score.
ax2.plot(network_data_subset["score"], color="C1")
ax2.set_ylabel("Anomaly Score", color="C1")
ax2.tick_params("y", colors="C1")
lowest_score, highest_score = min(scores), max(scores)
ax2.set_ylim(lowest_score, 1.4 * highest_score)

fig.set_figwidth(10)

In [None]:
# Cutoff = mean + 3 standard deviations of the scores over the whole frame.
score_column = inference_data["score"]
score_mean = score_column.mean()
score_std = score_column.std()
score_cutoff = score_mean + 3 * score_std

# Keep only the rows of the plotted subset whose score exceeds the cutoff.
is_anomaly = network_data_subset["score"] > score_cutoff
anomalies = network_data_subset[is_anomaly]
anomalies

In [None]:
# Overlay the flagged rows as black circles ("ko") on the score axis,
# then re-display the existing figure.
anomaly_scores = anomalies.score
ax2.plot(anomalies.index, anomaly_scores, "ko")
fig

In [None]:
# Tear down the endpoint created by the deploy cell.
# `endpoint_name` is the SDK v2 attribute; `endpoint` is the pre-v2 name.
sagemaker.Session().delete_endpoint(rcf_inference.endpoint_name)