In [1]:
import os
import io
import json
import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn.model_selection import train_test_split

import boto3
import sagemaker
from sagemaker import get_execution_role
from sagemaker.session import TrainingInput
from sagemaker.tensorflow import TensorFlow, TensorFlowModel

In [2]:
sagemaker_session = sagemaker.Session()
s3 = boto3.resource('s3')

role = get_execution_role()
region = sagemaker_session.boto_session.region_name

# sm_boto3 = boto3.client("sagemaker")
sess = sagemaker.Session()
bucket = sess.default_bucket()  # this could also be a hard-coded bucket name
prefix = 'frontier'
print("Using bucket " + bucket)

Using bucket sagemaker-us-east-1-367158743199


In [3]:
# Generate training data
X, y = datasets.make_moons(1000, noise=0.2)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [4]:
np.save('./data/X_train.npy', X_train)
np.save('./data/y_train.npy', y_train)

np.save('./data/X_val.npy', X_test)
np.save('./data/y_val.npy', y_test)


training_data_uri = f"s3://{bucket}/{prefix}/input"

s3.meta.client.upload_file('./data/X_train.npy', bucket, f'{prefix}/input/X_train.npy')
s3.meta.client.upload_file('./data/y_train.npy', bucket, f'{prefix}/input/y_train.npy')
s3.meta.client.upload_file('./data/X_val.npy', bucket, f'{prefix}/input/X_val.npy')
s3.meta.client.upload_file('./data/y_val.npy', bucket, f'{prefix}/input/y_val.npy')

# Train

In [5]:
%%writefile ./src/train.py

import os
import json
import argparse
import numpy as np
import pandas as pd
import tensorflow as tf


def model(x_train, y_train, x_test, y_test):
    """Generate a simple model"""
    model = tf.keras.models.Sequential(
        [
            tf.keras.Input(shape=[x_train.shape[1]]),
            tf.keras.layers.Dense(2, activation=tf.nn.relu),
            tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
        ]
    )
    
    print(model.summary)
    opt = tf.keras.optimizers.Adam(learning_rate=0.001)
    model.compile(optimizer=opt, loss="binary_crossentropy", metrics=["accuracy"])
    model.fit(x_train, y_train)
    model.evaluate(x_test, y_test)

    return model

def _load_training_data(base_dir):
    """Load training data"""
    x_train = np.load(os.path.join(base_dir, "X_train.npy"))
    y_train = np.load(os.path.join(base_dir, "y_train.npy"))
    return x_train, y_train


def _load_validation_data(base_dir):
    """Load testing data"""
    x_test = np.load(os.path.join(base_dir, "X_val.npy"))
    y_test = np.load(os.path.join(base_dir, "y_val.npy"))
    return x_test, y_test

def _parse_args():
    parser = argparse.ArgumentParser()

    # Data, model, and output directories
    # model_dir is always passed in from SageMaker. By default this is a S3 path under the default bucket.
    parser.add_argument("--model_dir", type=str)
    parser.add_argument("--sm-model-dir", type=str, default=os.environ.get("SM_MODEL_DIR"))
    parser.add_argument("--train", type=str, default=os.environ.get("SM_CHANNEL_TRAINING"))

    return parser.parse_known_args()

if __name__ == "__main__":
    args, unknown = _parse_args()
    
    train_data, train_labels = _load_training_data(args.train)
    eval_data, eval_labels = _load_validation_data(args.train)

    clf = model(train_data, train_labels, eval_data, eval_labels)
    clf.save(os.path.join(args.sm_model_dir, "000000001"))

Overwriting ./src/train.py


In [6]:
# # test locally
# !mkdir model
# ! python src/train.py  --sm-model-dir ./model/ \
#                    --train ./data/

mkdir: cannot create directory ‘model’: File exists
2021-06-29 20:30:33.913186: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
2021-06-29 20:30:33.913309: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:105] SageMaker Profiler is not enabled. The timeline writer thread will not be started, future recorded events will be dropped.
2021-06-29 20:30:33.945391: W tensorflow/core/profiler/internal/smprofiler_timeline.cc:460] Initializing the SageMaker Profiler.
2021-06-29 20:30:35.255764: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN)to use the following CPU instructions in performance-critical operations:  AVX512F
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-06-29 20:30:35.263627: I tensorflow/core/platform/profile_utils/cpu_utils.cc:104] CPU Frequency: 2499985000 Hz
2021-06-29 20:30:35

In [7]:
estimator = TensorFlow(
    entry_point="./src/train.py",
    role=role,
    instance_count=1,
    instance_type="ml.p3.2xlarge",
    framework_version="2.1.0",
    py_version="py3",
    output_path = f"s3://{bucket}/{prefix}/output"
)

# estimator.fit({"train": train_input, "validation": validation_input})
estimator.fit(training_data_uri)

2021-06-29 20:30:37 Starting - Starting the training job...
2021-06-29 20:31:01 Starting - Launching requested ML instancesProfilerReport-1624998637: InProgress
......
2021-06-29 20:32:01 Starting - Preparing the instances for training.........
2021-06-29 20:33:26 Downloading - Downloading input data...
2021-06-29 20:34:01 Training - Downloading the training image.........
2021-06-29 20:35:32 Uploading - Uploading generated training model[34m2021-06-29 20:35:22,011 sagemaker-containers INFO     Imported framework sagemaker_tensorflow_container.training[0m
[34m2021-06-29 20:35:22,445 sagemaker-containers INFO     Invoking user script
[0m
[34mTraining Env:
[0m
[34m{
    "additional_framework_parameters": {},
    "channel_input_dirs": {
        "training": "/opt/ml/input/data/training"
    },
    "current_host": "algo-1",
    "framework_module": "sagemaker_tensorflow_container.training:main",
    "hosts": [
        "algo-1"
    ],
    "hyperparameters": {
        "model_dir": "s3:/

In [8]:
estimator._current_job_name

'tensorflow-training-2021-06-29-20-30-37-585'

In [9]:
sm_boto3 = boto3.client("sagemaker")

# Get information about the best training job
artifact = sm_boto3.describe_training_job(
    TrainingJobName=estimator._current_job_name)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)

Model artifact persisted at s3://sagemaker-us-east-1-367158743199/frontier/output/tensorflow-training-2021-06-29-20-30-37-585/output/model.tar.gz


# Batch inference

In [10]:
# Save test data as jsonlines
test_data = []

for i in range(X_test.shape[0]):
    data_row = {'id': int(i),
               'data': [float(x) for x in X[i].tolist()]
               }
    test_data.append(data_row)
      
with open('./data/test_data.jsonl', 'w') as f:
    for entry in test_data:
        json.dump(entry, f)
        f.write('\n')
        
s3.meta.client.upload_file('./data/test_data.jsonl', bucket, f'{prefix}/X_test.jsonl')
test_s3_uri = f"s3://{bucket}/{prefix}/X_test.jsonl"

In [11]:
entry

{'id': 199, 'data': [0.1269050880890254, 0.4463074970347955]}

In [12]:
%%writefile ./src/inference.py

import json
import requests


def handler(data, context):
    """Handle request.
    Args:
        data (obj): the request data
        context (Context): an object containing request and configuration details
    Returns:
        (bytes, string): data to return to client, (optional) response content type
    """
    processed_input = _process_input(data, context)
    response = requests.post(context.rest_uri, data=processed_input)
    return _process_output(response, context)


def _process_input(data, context):
    if context.request_content_type == 'application/json':
        # pass through json (assumes it's correctly formed)
        d = data.read().decode('utf-8')
        print(d)
        return d['feature_data'] if len(d) else ''

    raise ValueError('{{"error": "unsupported content type {}"}}'.format(
        context.request_content_type or "unknown"))


def _process_output(data, context):
    if data.status_code != 200:
        raise ValueError(data.content.decode('utf-8'))

    response_content_type = context.accept_header
    prediction = data.content
    return prediction, response_content_type

Overwriting ./src/inference.py


In [13]:
# artifact = "s3://sagemaker-us-east-1-367158743199/frontier/output/tensorflow-training-2021-06-29-17-35-28-662/output/model.tar.gz"
# create a TF model
model = TensorFlowModel(model_data=artifact,
#                         entry_point='./src/inference.py',
                        role=role,
                        framework_version="2.1.0"                        
                       )

In [14]:
batch_output = f's3://{bucket}/{prefix}/transform/'
print(batch_output)

tf_transformer = model.transformer(
    instance_count=1,
    instance_type='ml.m4.xlarge',
    accept = 'application/jsonlines',
    output_path= batch_output,
    assemble_with = 'Line',

)

tf_transformer.transform(test_s3_uri, 
                         content_type='application/jsonlines',
                         split_type='Line',
                         input_filter = "$.data",
#                          output_filter="$['id','SageMakerOutput']",
#                         join_source = "Input",
)


# I am checking with service team to understand why joining sources with jsonlines is returning error

s3://sagemaker-us-east-1-367158743199/frontier/transform/
............................[34mINFO:__main__:starting services[0m
[34mINFO:__main__:using default model name: model[0m
[35mINFO:__main__:starting services[0m
[35mINFO:__main__:using default model name: model[0m
[34mINFO:__main__:tensorflow serving model config: [0m
[34mmodel_config_list: {
  config: {
    name: "model",
    base_path: "/opt/ml/model",
    model_platform: "tensorflow"
  }[0m
[34m}

[0m
[34mINFO:__main__:nginx config: [0m
[34mload_module modules/ngx_http_js_module.so;
[0m
[34mworker_processes auto;[0m
[34mdaemon off;[0m
[34mpid /tmp/nginx.pid;[0m
[34merror_log  /dev/stderr error;
[0m
[35mINFO:__main__:tensorflow serving model config: [0m
[35mmodel_config_list: {
  config: {
    name: "model",
    base_path: "/opt/ml/model",
    model_platform: "tensorflow"
  }[0m
[35m}

[0m
[35mINFO:__main__:nginx config: [0m
[35mload_module modules/ngx_http_js_module.so;
[0m
[35mworker_proces

In [15]:
!mkdir ./output
!aws s3 cp {batch_output} ./output --recursive

mkdir: cannot create directory ‘./output’: File exists
download: s3://sagemaker-us-east-1-367158743199/frontier/transform/X_test.jsonl.out to output/X_test.jsonl.out


In [16]:
with open('./output/X_test.jsonl.out') as f:
    for line in f:
        j_content = json.loads(line)

In [20]:
j_content['predictions']

[[0.595570147],
 [0.497284353],
 [0.497284353],
 [0.514561117],
 [0.499498218],
 [0.536596298],
 [0.524701118],
 [0.497284353],
 [0.511146247],
 [0.497284353],
 [0.607224226],
 [0.616298795],
 [0.5916394],
 [0.630199194],
 [0.497284353],
 [0.554815233],
 [0.534981251],
 [0.497284353],
 [0.497284353],
 [0.497284353],
 [0.573644102],
 [0.604129851],
 [0.54785192],
 [0.500333428],
 [0.599979818],
 [0.497284353],
 [0.589829922],
 [0.497284353],
 [0.575754106],
 [0.526062548],
 [0.510247886],
 [0.527407765],
 [0.497284353],
 [0.497284353],
 [0.497284353],
 [0.507006884],
 [0.513528645],
 [0.590680182],
 [0.497284353],
 [0.497284353],
 [0.497284353],
 [0.497284353],
 [0.497284353],
 [0.497284353],
 [0.610685349],
 [0.497284353],
 [0.626481473],
 [0.603338182],
 [0.590058684],
 [0.516494036],
 [0.497284353],
 [0.593872845],
 [0.497284353],
 [0.497284353],
 [0.497284353],
 [0.502238452],
 [0.505226135],
 [0.497284353],
 [0.519869864],
 [0.497284353],
 [0.497284353],
 [0.497284353],
 [0.5520173