# Setup

In [None]:
import boto3
import sagemaker
from sagemaker import get_execution_role
import sys
import IPython

# Print the version of the imported `sagemaker` package.
print(sagemaker.__version__)

In [None]:
# IAM role returned by get_execution_role(); reused by later cells.
role = get_execution_role()
print(f"Role = {role}")

# Create a SageMaker session (used below to upload the preprocessing script).
sess = sagemaker.Session()

# Region configured for a default boto3 session.
region = boto3.Session().region_name
print(f"Region = {region}")

# Low-level boto3 SageMaker client, handed to the experiment/trial/tracker calls.
sm = boto3.Session().client("sagemaker")

Library imports

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import os
from time import sleep, gmtime, strftime
import json
import time

Install and import the SageMaker Experiments SDK (`sagemaker-experiments`) so that the steps below can be tracked

In [None]:
!pip install sagemaker-experiments 
from sagemaker.analytics import ExperimentAnalytics
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [None]:
# S3 bucket used throughout this notebook. Later cells read the raw data from
# its data/ prefix, upload the preprocessing script under code/, and write the
# processed splits to train/ and test/.
rawbucket = "dm-raw"

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor
# Settings for the processor that will run preprocessing.py:
# framework version 0.20.0 on a single ml.c5.xlarge instance.
processor_settings = {
    "framework_version": '0.20.0',
    "role": role,
    "instance_type": "ml.c5.xlarge",
    "instance_count": 1,
}
sklearn_processor = SKLearnProcessor(**processor_settings)

In [None]:
%%writefile preprocessing.py
import pandas as pd
import numpy as np
from sklearn import model_selection
import os


def main(input_dir="/opt/ml/processing/input",
         train_dir="/opt/ml/processing/train",
         test_dir="/opt/ml/processing/test",
         test_size=0.30,
         random_state=20):
    """Split the advertising dataset into train/test feature and target CSVs.

    Reads ``Advertising.csv`` from ``input_dir``, selects the ``TV``, ``radio``
    and ``newspaper`` columns as features and ``sales`` as the target, splits
    the rows, and writes ``X_train.csv``/``y_train.csv`` to ``train_dir`` and
    ``X_test.csv``/``y_test.csv`` to ``test_dir``.

    Args:
        input_dir: Directory containing ``Advertising.csv``.
        train_dir: Directory that receives the training split.
        test_dir: Directory that receives the test split.
        test_size: Fraction of rows assigned to the test split.
        random_state: Seed passed to ``train_test_split`` so the split is
            reproducible.

    The defaults reproduce the original hard-coded behaviour, so calling
    ``main()`` with no arguments is unchanged.
    """
    print("Reading input data")

    # The first CSV column is used as the row index.
    df = pd.read_csv(os.path.join(input_dir, "Advertising.csv"), index_col=0)

    print("Complete")

    features = ['TV', 'radio', 'newspaper']
    target = "sales"

    X = df[features]
    y = df[target]

    print("Splitting data")

    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    print("Complete")

    print("Outputting data")

    # Create the output directories when they are missing so the script also
    # runs when the directories have not been prepared in advance.
    for directory in (train_dir, test_dir):
        os.makedirs(directory, exist_ok=True)

    # train output
    X_train.to_csv(os.path.join(train_dir, "X_train.csv"), header=True)
    y_train.to_csv(os.path.join(train_dir, "y_train.csv"), header=True)

    # test output
    X_test.to_csv(os.path.join(test_dir, "X_test.csv"), header=True)
    y_test.to_csv(os.path.join(test_dir, "y_test.csv"), header=True)

    print("Complete")
    
    
if __name__ == "__main__":
    main()

In [None]:
# Upload the generated preprocessing script under the code/ prefix of the bucket.
# The returned value is passed as the `code` argument of the processing job below.
codeupload = sess.upload_data(
    path='preprocessing.py',
    bucket=rawbucket,
    key_prefix="code",
)

print(codeupload)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Raw data prefix -> the input directory that preprocessing.py reads from.
raw_data_input = ProcessingInput(
    source=f"s3://{rawbucket}/data",
    destination='/opt/ml/processing/input',
)

# Directories written by preprocessing.py -> their S3 destinations.
train_data_output = ProcessingOutput(
    output_name='train_data',
    source='/opt/ml/processing/train',
    destination=f"s3://{rawbucket}/train",
)
test_data_output = ProcessingOutput(
    output_name='test_data',
    source="/opt/ml/processing/test",
    destination=f"s3://{rawbucket}/test",
)

# Run the uploaded script with the input and outputs declared above.
sklearn_processor.run(
    code=codeupload,
    inputs=[raw_data_input],
    outputs=[train_data_output, test_data_output],
)

# Describe the most recent processing job and pull out its output locations.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

output_config = preprocessing_job_description['ProcessingOutputConfig']

# Map each output name to its S3 URI. Looking the names up in a dict raises a
# KeyError right here if an expected output is missing, instead of silently
# leaving the variables below undefined.
output_uris = {
    output['OutputName']: output['S3Output']['S3Uri']
    for output in output_config['Outputs']
}
preprocessed_training_data = output_uris['train_data']
preprocessed_test_data = output_uris['test_data']

In [None]:
# Create a SageMaker Experiment whose name carries an epoch-seconds suffix.
experiment_name = f"Build-train-deploy-{int(time.time())}"
cc_experiment = Experiment.create(
    experiment_name=experiment_name,
    description="Predict sales given a marketing mix",
    sagemaker_boto_client=sm,
)

print(cc_experiment)

In [None]:
# Record the datasets involved in the preprocessing step on a tracker.
with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sm) as tracker:
    # Raw dataset consumed by the processing job.
    tracker.log_input(name="ccdefault-raw-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/data")
    # Train/test splits: these URIs match the destinations the processing job
    # writes to (s3://<bucket>/train and s3://<bucket>/test).
    tracker.log_input(name="ccdefault-train-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/train")
    tracker.log_input(name="ccdefault-test-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/test")

In [None]:
# Keep a reference to the tracker's trial component; it is attached to the
# trial created in the next cell.
preprocessing_trial_component = tracker.trial_component

In [None]:
# Create a trial inside the experiment and attach the preprocessing component to it.
trial_name = f"cc-default-training-job-{int(time.time())}"
cc_trial = Trial.create(
    trial_name=trial_name,
    experiment_name=cc_experiment.experiment_name,
    sagemaker_boto_client=sm,
)
cc_trial.add_trial_component(preprocessing_trial_component)

# Job name handed to the estimator's fit() call further down.
cc_training_job_name = f"cc-training-job-{int(time.time())}"

In [None]:
%%writefile train.py
# training script
from sklearn import linear_model, metrics
from sklearn.externals import joblib
import pandas as pd
import argparse
import os

if __name__ == "__main__":
    # Training entry point: fit a linear regression on the training channel,
    # report the in-sample error, and persist the fitted model.

    parser = argparse.ArgumentParser()

    # Data, model, and output directories; defaults are read from the SM_* environment variables.
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))

    # parse_known_args ignores any extra arguments passed to the script.
    args, _ = parser.parse_known_args()

    X_train = pd.read_csv(os.path.join(args.train, "X_train.csv"), index_col=0)
    y_train = pd.read_csv(os.path.join(args.train, "y_train.csv"), index_col=0)

    reg = linear_model.LinearRegression()

    reg.fit(X_train, y_train)

    # mean_squared_error returns the MSE; take the square root so the value
    # printed under the "IS-RMSE" label really is a root-mean-squared error.
    # (`** 0.5` is used instead of `squared=False`, which older scikit-learn
    # releases such as 0.20 do not accept.)
    in_sample_mse = metrics.mean_squared_error(y_train, reg.predict(X_train))
    in_sample_rmse = in_sample_mse ** 0.5

    # Keep this prefix unchanged: the estimator's metric regex matches "IS-RMSE: <number>".
    print(f"IS-RMSE: {in_sample_rmse}")

    joblib.dump(reg, os.path.join(args.model_dir, "model.joblib"))
    

def model_fn(model_dir):
    """Load and return the fitted model stored in ``model_dir``.

    The file name must match the one used when the model is dumped in the
    ``__main__`` section of this script.
    """
    model_path = os.path.join(model_dir, "model.joblib")
    return joblib.load(model_path)

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# Metric extracted from the training logs: the number printed after "IS-RMSE: ".
train_metric_definitions = [{"Name": "train:rmse", "Regex": "IS-RMSE: ([0-9.]+).*$"}]

sklearn_estimator = SKLearn(
    'train.py',
    instance_type="ml.c5.xlarge",
    framework_version='0.20.0',
    role=role,
    metric_definitions=train_metric_definitions,
)

# Input channels: the train/test prefixes written by the processing job.
training_channels = {
    "train": f"s3://{rawbucket}/train",
    "test": f"s3://{rawbucket}/test",
}

# Log the training job in the trial for lineage.
tracking_config = {
    "TrialName": cc_trial.trial_name,
    "TrialComponentDisplayName": "Training",
}

sklearn_estimator.fit(
    inputs=training_channels,
    job_name=cc_training_job_name,
    experiment_config=tracking_config,
)

In [None]:
# Wait for the latest training job, passing logs="None".
training_job = sklearn_estimator.latest_training_job
training_job.wait(logs="None")

# Look up the S3 location of the model artifact from the job description.
job_description = sm.describe_training_job(TrainingJobName=training_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact persisted at " + artifact)

In [None]:
from sagemaker.sklearn.model import SKLearnModel

# Build a model object from the trained artifact. train.py is reused as the
# entry point because it defines model_fn. The `role` resolved in the setup
# cell is reused here, as for the processor and the estimator, instead of
# calling get_execution_role() a second time.
model = SKLearnModel(
    model_data=artifact,
    role=role,
    entry_point="train.py",
    framework_version='0.20.0',
)

In [None]:
#predictor = model.deploy(instance_type="ml.m5.xlarge", initial_instance_count=1)

In [None]:
#X_test = pd.read_csv(f"s3://{rawbucket}/test/X_test.csv", index_col=0)

#X_test.head()

In [None]:
#X_test.head()

In [None]:
#predicted = predictor.predict(X_test)

#actual = pd.read_csv(f"s3://{rawbucket}/test/y_test.csv", index_col=0)

In [None]:
#import matplotlib.pyplot as plt
#from sklearn import metrics

#mse = metrics.mean_squared_error(actual, predicted, squared=False)

#print(f"Test-set RMSE = {mse}")

#plt.scatter(actual, predicted)
#plt.plot([0, 25], [0, 25], '--', linewidth=2, c="r")
#plt.xlabel("Actual Sales")
#plt.ylabel("Predicted Sales")
#plt.show()

In [None]:
#sm.delete_endpoint(EndpointName=predictor.endpoint)