In [1]:
import sagemaker
from sagemaker import get_execution_role

sagemaker_session = sagemaker.Session()

# Get a SageMaker-compatible role used by this Notebook Instance.
role = get_execution_role()

# S3 prefix
bucket = sagemaker_session.default_bucket()
prefix = "Scikit-LinearLearner-snmpcpu"

In [3]:
import pandas as pd
import numpy as np
import pyarrow
from sklearn.model_selection import train_test_split

df=pd.read_parquet("7e940b15d4914503a2c42e3e75ac196c.parquet")
df=df.to_csv("7e940b15d4914503a2c42e3e75ac196c.csv")
df=pd.read_csv("7e940b15d4914503a2c42e3e75ac196c.csv")

df['ts']=pd.to_datetime(df['ts'])
data_cpu=df.drop('Unnamed: 0',axis=1)

#data_cpu['day']=data_cpu['ts'].dt.day
data_cpu['day_of_week']=data_cpu['ts'].dt.day_of_week.astype(np.float)
data_cpu['hour']=data_cpu['ts'].dt.hour.astype(np.float)
data_cpu['minutes']=data_cpu['ts'].dt.minute.astype(np.float)
data_cpu['date']=data_cpu['ts'].dt.strftime("%Y-%m-%d")
data_cpu['date']=pd.to_datetime(data_cpu['date'])

time_series=data_cpu[['minutes','hour','day_of_week','snmpcpu_processor_1','date']]
time_series=time_series.drop('date',axis=1)

columns=list(time_series.columns)


x_train, x_test = train_test_split(time_series, test_size=0.25)
x_eval = x_test[['minutes', 'hour', 'day_of_week',]]
x_train.to_csv("train.csv")
x_test.to_csv("test.csv")
x_eval.to_csv("eval.csv", header=False, index=False)

trainpath = sagemaker_session.upload_data(
    path="train.csv", bucket=bucket, key_prefix="sagemaker/sklearn-train"
)

testpath = sagemaker_session.upload_data(
    path="test.csv", bucket=bucket, key_prefix="sagemaker/sklearn-train"
)

#print(trainpath)
#print(testpath)

sagemaker_session.upload_data(
    path="eval.csv", bucket=bucket, key_prefix="sagemaker/sklearn-eval"
)

eval_s3_prefix = f"s3://{bucket}/sagemaker/sklearn-eval/"
#eval_s3_prefix

In [4]:
%%writefile script.py

import argparse
import joblib
import os

import numpy as np
import pandas as pd
#from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

# inference functions ---------------
def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf


if __name__ == "__main__":

    print("extracting arguments")
    parser = argparse.ArgumentParser()

    # hyperparameters sent by the client are passed as command-line arguments to the script.
    # to simplify the demo we don't use all sklearn RandomForest hyperparameters
    #parser.add_argument("--n-estimators", type=int, default=10)
    #parser.add_argument("--min-samples-leaf", type=int, default=3)

    # Data, model, and output directories
    parser.add_argument("--model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAIN"))
    parser.add_argument("--test", type=str, default=os.environ.get("SM_CHANNEL_TEST"))
    parser.add_argument("--train-file", type=str, default="train.csv")
    parser.add_argument("--test-file", type=str, default="test.csv")
    parser.add_argument(
        "--features", type=str
    )  # in this script we ask user to explicitly name features
    parser.add_argument(
        "--target", type=str
    )  # in this script we ask user to explicitly name the target

    args, _ = parser.parse_known_args()

    print("reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

    print("building training and testing datasets")
    X_train = train_df[args.features.split()]
    X_test = test_df[args.features.split()]
    y_train = train_df[args.target]
    y_test = test_df[args.target]

    # train
    print("training model")
    model =LinearRegression() #RandomForestRegressor(n_estimators=args.n_estimators, min_samples_leaf=args.min_samples_leaf, n_jobs=-1)

    model.fit(X_train, y_train)

    # print abs error
    print("validating model")
    abs_err = np.abs(model.predict(X_test) - y_test)

    # print couple perf metrics
    for q in [10, 50, 90]:
        print("AE-at-" + str(q) + "th-percentile: " + str(np.percentile(a=abs_err, q=q)))

    # persist model
    path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, path)
    print("model persisted at " + path)
    #print(args.min_samples_leaf)

Writing script.py


In [7]:
# We use the Estimator from the SageMaker Python SDK
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "1.0-1"
training_job_1_name = "sklearn-snmp-cpu-1"

sklearn_estimator_1 = SKLearn(
    entry_point="script.py",
    role=get_execution_role(),
    instance_count=1,
    instance_type="ml.m5.xlarge",
    framework_version=FRAMEWORK_VERSION,
    base_job_name=training_job_1_name,
    metric_definitions=[{"Name": "median-AE", "Regex": "AE-at-50th-percentile: ([0-9.]+).*$"}],
    sagemaker_session=sagemaker_session,
    hyperparameters={
        #"n-estimators": 2,
        #"min-samples-leaf": 3,
        "features": "minutes hour day_of_week",
        "target": "snmpcpu_processor_1",
    },
)

In [8]:
sklearn_estimator_1.fit({"train": trainpath, "test": testpath})

2022-11-21 15:40:17 Starting - Starting the training job...
2022-11-21 15:40:43 Starting - Preparing the instances for trainingProfilerReport-1669045217: InProgress
......
2022-11-21 15:41:45 Downloading - Downloading input data...
2022-11-21 15:42:10 Training - Downloading the training image...
2022-11-21 15:42:43 Training - Training image download completed. Training in progress..[34m2022-11-21 15:42:43,097 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training[0m
[34m2022-11-21 15:42:43,100 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-21 15:42:43,108 sagemaker_sklearn_container.training INFO     Invoking user training script.[0m
[34m2022-11-21 15:42:43,477 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-21 15:42:43,488 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-21 15:42:43,498 sagemaker

In [9]:
predictor=sklearn_estimator_1.deploy(initial_instance_count=1,instance_type='ml.m5.xlarge')

----!

In [10]:
predictor.endpoint

The endpoint attribute has been renamed in sagemaker>=2.
See: https://sagemaker.readthedocs.io/en/stable/v2.html for details.


'sklearn-snmp-cpu-1-2022-11-21-15-45-48-272'

[Minutes, hour, weekday]

In [11]:
predictor.predict([[38., 16.,  4.]])

array([2.65591222])

# Batch processing
## Create a Model Package Group for the trained model to be registered

Create a new Model Package Group or use an existing one to register the model

In [12]:
import boto3
import time

client = boto3.client("sagemaker")

model_package_group_name = "sklearn-snmp-cpu" + str(round(time.time()))
model_package_group_input_dict = {
    "ModelPackageGroupName": model_package_group_name,
    "ModelPackageGroupDescription": "My sample sklearn model package group",
}

create_model_pacakge_group_response = client.create_model_package_group(
    **model_package_group_input_dict
)
model_package_arn = create_model_pacakge_group_response["ModelPackageGroupArn"]
print(f"ModelPackageGroup Arn : {model_package_arn}")


ModelPackageGroup Arn : arn:aws:sagemaker:us-east-1:940426109786:model-package-group/sklearn-snmp-cpu1669048935


### Register the model of the training job in the Model Registry

Once the model is registered, you will see it in the Model Registry tab of the SageMaker Studio UI. The model is registered with the approval_status set to "Approved". By default, the model is registered with the approval_status set to PendingManualApproval. Users can then navigate to the Model Registry to manually approve the model based on any criteria set for model evaluation or this can be done via API.

In [13]:
inference_instance_type = "ml.m5.xlarge"
model_package_1 = sklearn_estimator_1.register(
    model_package_group_name=model_package_arn,
    inference_instances=[inference_instance_type],
    transform_instances=[inference_instance_type],
    content_types=["text/csv"],
    response_types=["text/csv"],
    approval_status="Approved",
)

model_package_arn_1 = model_package_1.model_package_arn
print("Model Package ARN : ", model_package_arn_1)

Model Package ARN :  arn:aws:sagemaker:us-east-1:940426109786:model-package/sklearn-snmp-cpu1669048935/1


## Create a transform job with the default configurations from the model of the  training job

In [14]:
sklearn_1_transformer = model_package_1.transformer(
    instance_count=1, instance_type=inference_instance_type
)

In [15]:
sklearn_1_transformer.transform(eval_s3_prefix, split_type="Line", content_type="text/csv")

..........................[34m2022-11-21 16:49:55,293 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-21 16:49:55,295 INFO - sagemaker-containers - No GPUs detected (normal if no gpus installed)[0m
[34m2022-11-21 16:49:55,296 INFO - sagemaker-containers - nginx config: [0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr;[0m
[34mworker_rlimit_nofile 4096;[0m
[34mevents {
  worker_connections 2048;[0m
[34m}[0m
[34mhttp {
  include /etc/nginx/mime.types;
  default_type application/octet-stream;
  access_log /dev/stdout combined;
  upstream gunicorn {
    server unix:/tmp/gunicorn.sock;
  }
  server {
    listen 8080 deferred;
    client_max_body_size 0;
    keepalive_timeout 3;
    location ~ ^/(ping|invocations|execution-parameters) {
      proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
      proxy_set_header Host $http_host;
      proxy_redirect off;
    

Let's inspect the output of the Batch Transform job in S3. It should show the CPU in block group.

**Go to s3 and Download the from the s3 part**

In [16]:
sklearn_1_transformer.output_path

's3://sagemaker-us-east-1-940426109786/1-2022-11-21-16-45-45-088'

In [17]:
output_file_name = "eval.csv.out"

In [18]:
pd.read_csv(output_file_name, sep=",", header=None)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,10790,10791,10792,10793,10794,10795,10796,10797,10798,10799
0,[2.674172600256959,2.656889,2.673542,2.668324,2.661286,2.63597,2.632166,2.662212,2.652649,2.645131,...,2.662187,2.669324,2.664676,2.671665,2.652525,2.665082,2.663946,2.635937,2.638675,2.6566369938885632]


**Actual**

In [22]:
pd.read_csv("test.csv")['snmpcpu_processor_1']

0         2.0
1         2.0
2        38.0
3         4.0
4         2.0
         ... 
10795     2.0
10796     3.0
10797     2.0
10798     4.0
10799     3.0
Name: snmpcpu_processor_1, Length: 10800, dtype: float64