In [None]:
import os
import boto3
import re
import sagemaker


# Execution role and region are taken from the environment the notebook runs in.
role = sagemaker.get_execution_role()
region = boto3.Session().region_name

# Where the training data lives in S3.
# Both the bucket and the prefix may be replaced with your own values.
data_bucket = sagemaker.Session().default_bucket()
data_prefix = "sagemaker/linear-learner-regression-ccpp"

# Where code and model artifacts are saved in S3.
# Both the bucket and the prefix may be replaced with your own values.
output_bucket = sagemaker.Session().default_bucket()
output_prefix = "linear-learner-regression-ccpp"


In [None]:
%%time

import io
import boto3
import random

# Source CSV holding the complete dataset.
FILE_DATA = "Folds5x2_pp.csv"

# Local files that receive the train / validation / test partitions.
FILE_TRAIN, FILE_VALIDATION, FILE_TEST = (
    "ccpp_dataset1_train.csv",
    "ccpp_dataset1_validation.csv",
    "ccpp_dataset1_test.csv",
)

# Share of the rows (in percent) that goes to each partition.
PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST = 70, 15, 15

def data_split(
    FILE_DATA,
    FILE_TRAIN,
    FILE_VALIDATION,
    FILE_TEST,
    PERCENT_TRAIN,
    PERCENT_VALIDATION,
    PERCENT_TEST,
    seed=None,
):
    """Randomly partition the lines of a text file into three output files.

    Every line of ``FILE_DATA`` is treated as one record. The records are
    shuffled and written, without overlap, to the train, validation and test
    files. Each partition receives ``int(percent / 100 * number_of_lines)``
    records, so records left over after truncation are written to none of
    the files.

    Args:
        FILE_DATA: Path of the input file (one record per line).
        FILE_TRAIN: Path of the training partition to write.
        FILE_VALIDATION: Path of the validation partition to write.
        FILE_TEST: Path of the test partition to write.
        PERCENT_TRAIN: Percentage of records for the training partition.
        PERCENT_VALIDATION: Percentage of records for the validation partition.
        PERCENT_TEST: Percentage of records for the test partition.
        seed: Optional seed for a reproducible split. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call still takes effect.

    Raises:
        ValueError: If the requested partitions need more records than the
            input file contains. Raised before any output file is touched.
    """
    # Context manager guarantees the input handle is released.
    with open(FILE_DATA, "r") as source:
        rows = source.readlines()

    # A final line without a newline would be glued to the next record once
    # shuffled into the middle of an output file, so terminate every record.
    rows = [row if row.endswith("\n") else row + "\n" for row in rows]

    total = len(rows)
    percents = (PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST)
    # Negative percentages select nothing (same as an empty range).
    sizes = [max(0, int((percent / 100.0) * total)) for percent in percents]
    if sum(sizes) > total:
        raise ValueError(
            f"Requested partitions need {sum(sizes)} rows but {FILE_DATA} has only {total}"
        )

    # One shuffle followed by slicing yields a uniformly random partition in
    # linear time, instead of repeatedly popping from the middle of a list.
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(rows)

    start = 0
    for path, size in zip((FILE_TRAIN, FILE_VALIDATION, FILE_TEST), sizes):
        with open(path, "w") as out:
            out.writelines(rows[start : start + size])
        start += size
    
# Write the three partition files using the names and percentages configured above.
data_split(
    FILE_DATA, FILE_TRAIN, FILE_VALIDATION, FILE_TEST, PERCENT_TRAIN, PERCENT_VALIDATION, PERCENT_TEST
)

# S3 location the partition files are uploaded to.
# This reuses data_bucket / data_prefix from the configuration cell, because the
# later cells download and train from exactly that location; keeping a second
# copy of the literals here would let the two drift apart when one is edited.
# To use a different bucket or prefix, change data_bucket / data_prefix.
bucket = data_bucket
prefix = data_prefix

def write_to_s3(fobj, bucket, key):
    """Upload the file object ``fobj`` to the S3 object ``key`` in ``bucket``.

    The boto3 session is created for the notebook-level ``region``. The
    return value is whatever ``upload_fileobj`` returns.
    """
    s3_resource = boto3.Session(region_name=region).resource("s3")
    target_object = s3_resource.Bucket(bucket).Object(key)
    return target_object.upload_fileobj(fobj)


def upload_to_s3(bucket, prefix, channel, filename):
    """Upload a local file to ``s3://{bucket}/{prefix}/{channel}/{filename}``.

    Args:
        bucket: Name of the destination S3 bucket.
        prefix: Key prefix placed in front of the channel.
        channel: Sub-folder for the file (the notebook uses "train",
            "validation" and "test").
        filename: Path of the local file; it is also used as the last part
            of the S3 key.

    The destination URL is printed before the upload starts.
    """
    key = f"{prefix}/{channel}/{filename}"
    # The context manager closes the file handle once the upload has finished
    # (or failed), so repeated uploads do not accumulate open descriptors.
    with open(filename, "rb") as fobj:
        print(f"Writing to s3://{bucket}/{key}")
        write_to_s3(fobj, bucket, key)
    
# Upload each partition file to its own channel folder in the S3 bucket.
for channel, filename in (
    ("train", FILE_TRAIN),
    ("validation", FILE_VALIDATION),
    ("test", FILE_TEST),
):
    upload_to_s3(bucket, prefix, channel, filename)

Writing to s3://sagemaker-us-east-1-872507569564/sagemaker/linear-learner-regression-ccpp/train/ccpp_dataset1_train.csv
Writing to s3://sagemaker-us-east-1-872507569564/sagemaker/linear-learner-regression-ccpp/validation/ccpp_dataset1_validation.csv
Writing to s3://sagemaker-us-east-1-872507569564/sagemaker/linear-learner-regression-ccpp/test/ccpp_dataset1_test.csv
CPU times: user 298 ms, sys: 31 ms, total: 329 ms
Wall time: 822 ms


In [None]:

# Fetch the training partition from S3 into the working directory.
FILE_TRAIN = "ccpp_dataset1_train.csv"
s3 = boto3.client("s3")
train_key = f"{data_prefix}/train/{FILE_TRAIN}"
s3.download_file(data_bucket, train_key, FILE_TRAIN)

import pandas as pd  # Read in csv and store in a pandas dataframe

# Column labels are supplied explicitly through `names`.
column_names = ["PE", "AT", "V", "AP", "RH"]
df = pd.read_csv(FILE_TRAIN, sep=",", encoding="latin1", names=column_names)
print(df.head(2))

       PE     AT      V       AP     RH
0  441.12  26.96  72.86  1004.86  59.17
1  489.36   5.51  35.57  1026.30  78.97


In [None]:
# S3 locations handed to fit(): where the training and validation data are
# read from, and where the training artifacts are written.
s3_train_data = f"s3://{data_bucket}/{data_prefix}/train"
print(f"training files will be taken from: {s3_train_data}")
s3_validation_data = f"s3://{data_bucket}/{data_prefix}/validation"
print(f"validtion files will be taken from: {s3_validation_data}")
output_location = f"s3://{output_bucket}/{output_prefix}/output"
print(f"training artifacts output location: {output_location}")

# Both channels are CSV files under an S3 prefix and share every setting
# except the location, so the common settings are declared once.
csv_channel_settings = dict(
    distribution="FullyReplicated",
    content_type="text/csv",
    s3_data_type="S3Prefix",
    record_wrapping=None,
    compression=None,
)

# TrainingInput objects in the form accepted by the SDK's fit().
train_data = sagemaker.inputs.TrainingInput(s3_train_data, **csv_channel_settings)
validation_data = sagemaker.inputs.TrainingInput(s3_validation_data, **csv_channel_settings)

training files will be taken from: s3://sagemaker-us-east-1-872507569564/sagemaker/linear-learner-regression-ccpp/train
validtion files will be taken from: s3://sagemaker-us-east-1-872507569564/sagemaker/linear-learner-regression-ccpp/validation
training artifacts output location: s3://sagemaker-us-east-1-872507569564/linear-learner-regression-ccpp/output


In [None]:
# Look up the Linear Learner container image URI for the current region.
from sagemaker.image_uris import retrieve

current_region = boto3.Session().region_name
container = retrieve("linear-learner", current_region, version="1")
print(container)

382416733822.dkr.ecr.us-east-1.amazonaws.com/linear-learner:1


In [None]:
%%time

from time import gmtime, strftime

# Session object passed to the Estimator below through `sagemaker_session`.
sess = sagemaker.Session()

# Training job names have to be unique within an AWS account and region, so the
# suffix carries the full UTC date and time. A time-of-day-only suffix would
# collide with a job started at the same clock time on an earlier day.
job_name = "linear-learner-regression-ccpp-" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("Training job", job_name)

# Estimator that runs the Linear Learner container on one ml.c4.xlarge
# instance and writes its output under `output_location`.
linear = sagemaker.estimator.Estimator(
    container,
    role,
    input_mode="File",
    instance_count=1,
    instance_type="ml.c4.xlarge",
    output_path=output_location,
    sagemaker_session=sess,
)

# Hyperparameters gathered in one mapping so they can be inspected or tweaked
# before being applied to the estimator.
linear_hyperparameters = {
    "feature_dim": 4,  # presumably the four non-target columns (AT, V, AP, RH) - confirm
    "epochs": 16,
    "wd": 0.01,
    "loss": "absolute_loss",
    "predictor_type": "regressor",
    "normalize_data": True,
    "optimizer": "adam",
    "mini_batch_size": 100,
    "lr_scheduler_step": 100,
    "lr_scheduler_factor": 0.99,
    "lr_scheduler_minimum_lr": 0.0001,
    "learning_rate": 0.1,
}
linear.set_hyperparameters(**linear_hyperparameters)

Training job linear-learner-regression-ccpp-21-21-31
CPU times: user 11.1 ms, sys: 4.17 ms, total: 15.2 ms
Wall time: 14.5 ms


In [None]:
%%time
# Start the training job with the two data channels defined earlier.
training_channels = {"train": train_data, "validation": validation_data}
linear.fit(inputs=training_channels, job_name=job_name)

2021-01-12 21:21:35 Starting - Starting the training job...
2021-01-12 21:21:58 Starting - Launching requested ML instancesProfilerReport-1610486494: InProgress
.........
2021-01-12 21:23:19 Starting - Preparing the instances for training.........
2021-01-12 21:25:01 Downloading - Downloading input data
2021-01-12 21:25:01 Training - Downloading the training image..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[01/12/2021 21:25:21 INFO 139896050251584] Reading default configuration from /opt/amazon/lib/python2.7/site-packages/algorithm/resources/default-input.json: {u'loss_insensitivity': u'0.01', u'epochs': u'15', u'feature_dim': u'auto', u'init_bias': u'0.0', u'lr_scheduler_factor': u'auto', u'num_calibration_samples': u'10000000', u'accuracy_top_k': u'3', u'_num_kv_servers': u'auto', u'use_bias': u'true', u'num_point_for_scaler': u'10000', u'_log_level': u'info', u'quantile': u'0.5', u'bias_lr_mult': u'auto',

In [None]:
%%time
# Deploy the trained model to an endpoint backed by one ml.t2.medium instance.
linear_predictor = linear.deploy(
    initial_instance_count=1,
    instance_type="ml.t2.medium",
)
print(f"\ncreated endpoint: {linear_predictor.endpoint_name}")

-------------------!
created endpoint: linear-learner-2021-01-12-21-31-23-270
CPU times: user 340 ms, sys: 9.67 ms, total: 350 ms
Wall time: 9min 32s


In [None]:
# Configure the predictor to serialize its input as CSV and to parse the
# response as JSON.
from sagemaker.serializers import CSVSerializer
from sagemaker.deserializers import JSONDeserializer

csv_serializer = CSVSerializer()
json_deserializer = JSONDeserializer()
linear_predictor.serializer = csv_serializer
linear_predictor.deserializer = json_deserializer

In [None]:
%%time
import json
from itertools import islice
import math
import struct
import boto3
import random

# Download the test partition from data_bucket.
FILE_TEST = "ccpp_dataset1_test.csv"
s3 = boto3.client("s3")
s3.download_file(data_bucket, f"{data_prefix}/test/{FILE_TEST}", FILE_TEST)

# Read the test rows; the context manager closes the file handle afterwards.
with open(FILE_TEST, "r") as test_file:
    test_data = test_file.readlines()

# Pick one random row. strip() drops the line terminator so the payload sent
# to the endpoint is a clean comma-separated record.
sample = random.choice(test_data).strip().split(",")

# The first field is the actual power output; the remaining fields are the
# feature values sent to the model.
actual_power = sample[0]
actual_age = actual_power  # legacy name, kept so any cell still reading it keeps working
payload = ",".join(sample[1:])

# Invoke the predictor and analyse the result.
response = linear_predictor.predict(payload)

# Extract the predicted value from the parsed response.
result = round(float(response["predictions"][0]["score"]), 2)

# "Accuracy" here is 100 minus the absolute percentage error of this single
# prediction relative to the actual value.
accuracy = str(round(100 - ((abs(float(result) - float(actual_power)) / float(actual_power)) * 100), 2))
print(f"Actual power: {actual_power}\nPrediction: {result}\nAccuracy: {accuracy}")

Actual power: 440.13
Prediction: 440.48
Accuracy: 99.92
CPU times: user 41.5 ms, sys: 0 ns, total: 41.5 ms
Wall time: 164 ms


In [None]:
# Delete the endpoint so it stops running once the notebook is finished.
endpoint_to_delete = linear_predictor.endpoint_name
sagemaker.Session().delete_endpoint(endpoint_to_delete)
print(f"deleted {linear_predictor.endpoint_name} successfully!")

deleted linear-learner-2021-01-12-21-31-23-270 successfully!
