# Train, Test & Deploy: Advertising

> https://www.statlearning.com/s/Advertising.csv

## Setup

In [None]:
import boto3 # aws python sdk
import sagemaker
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json
import time
import sys
import IPython
import os

# report which release of the SageMaker SDK this notebook is running against
print(f"SageMaker Version: {sagemaker.__version__}")

In [None]:
# SageMaker session object (used later to upload files to s3)
sess = sagemaker.Session()

# IAM execution role that the processing, training and hosting jobs will run under
role = sagemaker.get_execution_role()

# region taken from the default boto3 session
boto_session = boto3.session.Session()
region = boto_session.region_name
print(f"Region: {region}")

# boto3 client for the SageMaker API
sm = boto3.Session().client("sagemaker")

In [None]:
# Make sure we have experimental capabilities

!pip install sagemaker-experiments 
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [None]:
# scikit-learn version of the SageMaker containers used for processing,
# training and serving
framework_version = "0.20.0"

# s3 bucket that holds the raw data, the uploaded code and the train/test
# outputs (you will need to change this one to a bucket you own)
rawbucket = "dm-raw"

## Data processing

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor

# settings for the processing job: a single ml.c5.xlarge instance running the
# scikit-learn container selected by framework_version
processor_settings = {
    "framework_version": framework_version,
    "role": role,
    "instance_type": "ml.c5.xlarge",
    "instance_count": 1,
}

# processor object that lets us run a scikit-learn script as a processing job
sklearn_processor = SKLearnProcessor(**processor_settings)

In [None]:
# save file locally
%%writefile preprocessing.py

import pandas as pd
import numpy as np
from sklearn import model_selection
import os

# this is best practice to make sure it is run at the right time
if __name__ == "__main__":
    
    print("Reading input data")
    
    df = pd.read_csv("/opt/ml/processing/input/Advertising.csv", index_col=0)
    
    print("Complete")
    
    # feature selection
    features = [
        'TV'
        , 'radio'
        , 'newspaper'
    ]

    # target
    target = "sales"

    # target
    y = df[target]

    # feature set
    X = df[features]
    
    print("Splitting data")

    # train/test split
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.30, random_state=20)
    
    print("Complete")
    
    # output directories
    train_path = "/opt/ml/processing/train/"
    test_path = "/opt/ml/processing/test/"
    
    print("Transfering data to storeage")
    
    # train output
    X_train.to_csv(train_path + "X_train.csv", header=True)
    y_train.to_csv(train_path + "y_train.csv", header=True)

    # test output
    X_test.to_csv(test_path + "X_test.csv", header=True)
    y_test.to_csv(test_path + "y_test.csv", header=True)
    
    print("Complete")
    


In [None]:
# push the local preprocessing script to the bucket under the "code" prefix
codeupload = sess.upload_data(
    path="preprocessing.py",
    bucket=rawbucket,
    key_prefix="code",
)

# show what the upload returned (passed to the processing job as its code location)
print(codeupload)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# raw data in s3 and the container directory it is made available in
raw_input = ProcessingInput(
    source=f"s3://{rawbucket}/data",
    destination="/opt/ml/processing/input",
)

# container directories written by the script and the s3 prefixes they map to
train_output = ProcessingOutput(
    output_name="train_data",
    source="/opt/ml/processing/train",
    destination=f"s3://{rawbucket}/train",
)
test_output = ProcessingOutput(
    output_name="test_data",
    source="/opt/ml/processing/test",
    destination=f"s3://{rawbucket}/test",
)

# run the uploaded script as a processing job
sklearn_processor.run(
    code=codeupload,
    inputs=[raw_input],
    outputs=[train_output, test_output],
)

# description of the most recent processing job
preprocessing_job_description = sklearn_processor.jobs[-1].describe()

# show us what happened with the processing
print(preprocessing_job_description)

## Experiment setup

In [None]:
# timestamp suffix keeps the experiment name different between runs
experiment_name = f"mmm-train-{int(time.time())}"

# register the experiment with SageMaker
mmm_experiment = Experiment.create(
    sagemaker_boto_client=sm,
    experiment_name=experiment_name,
    description="Predict sales given a marketing mix",
)

# show experiment details
print(mmm_experiment)

In [None]:
# Track the datasets used and produced by the pre-processing step.
with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sm) as tracker:
    # s3 uri of the raw dataset that was fed into the processing job
    tracker.log_input(name="mmm-raw-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/data")
    # s3 uris of the train/test splits; these must match the destinations the
    # processing job writes to (s3://<bucket>/train and s3://<bucket>/test)
    tracker.log_input(name="mmm-train-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/train")
    tracker.log_input(name="mmm-test-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/test")

In [None]:
# timestamped name for the trial
trial_name = f"mmm-training-job-{int(time.time())}"

# create the trial inside the experiment created earlier
mmm_trial = Trial.create(
    sagemaker_boto_client=sm,
    experiment_name=mmm_experiment.experiment_name,
    trial_name=trial_name,
)

# attach the preprocessing tracker's trial component to the trial
mmm_trial.add_trial_component(tracker.trial_component)

# timestamped name for the training job
mmm_training_job_name = f"cc-training-job-{int(time.time())}"

## Training

In [None]:
%%writefile train.py
# training script
from sklearn import linear_model, metrics
from sklearn.externals import joblib
import pandas as pd
import argparse
import os

if __name__ == "__main__":
    
    parser = argparse.ArgumentParser()
    
    # data, model, and output directories. you don't have to specify these
    parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
    parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
    parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
    parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
    
    # get the arguments
    args, _ = parser.parse_known_args()
    
    # load the training data from s3
    X_train = pd.read_csv(os.path.join(args.train, "X_train.csv"), index_col=0)
    y_train = pd.read_csv(os.path.join(args.train, "y_train.csv"), index_col=0)

    # initialise estimator
    reg = linear_model.LinearRegression()
    
    # train
    reg.fit(X_train, y_train)
    
    # calculate in-sample root-mean-squared-error
    in_sample_rmse = metrics.mean_squared_error(y_train, reg.predict(X_train), squared=False)
    
    # print the rmse, this will appear in the log and will be captured by sagemaker
    print(f"IS-RMSE: {in_sample_rmse}")
    
    # save the model to the model directory
    joblib.dump(reg, os.path.join(args.model_dir, "model.joblib"))
    

# this is a required step to successfully deploy the model
def model_fn(model_dir):
    """Deserialized and return fitted model
    Note that this should have the same name as the serialized model in the main method
    """
    reg = joblib.load(os.path.join(model_dir, "model.joblib"))
    return reg

In [None]:
from sagemaker.sklearn.estimator import SKLearn

# define a sklearn estimator that runs train.py in the scikit-learn container
sklearn_estimator = SKLearn(
    entry_point='train.py',
    instance_type="ml.c5.xlarge",
    framework_version=framework_version,
    role=role,
    metric_definitions=[ # these metrics will be logged and picked up by sagemaker for reporting
        # the regex captures the number that train.py prints after "IS-RMSE: "
        {"Name": "train:rmse", "Regex": "IS-RMSE: ([0-9.]+).*$"}
    ]
)

# train the estimator
sklearn_estimator.fit(
    inputs={
        # only the train channel is passed; train.py does not read the test channel
        "train": f"s3://{rawbucket}/train"
    },
    job_name=mmm_training_job_name,
    experiment_config={
        # log the training job in the trial created earlier (mmm_trial) for lineage
        "TrialName": mmm_trial.trial_name,
        "TrialComponentDisplayName": "Training",
    }
)

## Deployment

In [None]:
# wait on the most recent training job of the estimator
latest_job = sklearn_estimator.latest_training_job
latest_job.wait(logs="None")

# ask the SageMaker API where the trained model artifact was stored
job_description = sm.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

# tell us
print(f"Model artifact persisted at {artifact}")

In [None]:
from sagemaker.sklearn.model import SKLearnModel

# settings for the deployable model: the trained artifact, plus train.py as
# entry point because it defines model_fn
model_settings = {
    "model_data": artifact,
    "role": role,
    "entry_point": "train.py",
    "framework_version": framework_version,
}

# build the model object from the stored artifact
model = SKLearnModel(**model_settings)

In [None]:
# create an endpoint backed by one ml.m5.xlarge instance and keep the
# predictor that talks to it (this might take some time)
predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.xlarge",
)

## Test

In [None]:
# s3 location of the test features written by the processing job
x_test_uri = f"s3://{rawbucket}/test/X_test.csv"

# read them back, using the first csv column as the row index
X_test = pd.read_csv(x_test_uri, index_col=0)

# preview the first rows
X_test.head()

In [None]:
# get predictions for the test features from the deployed endpoint
predicted = predictor.predict(X_test)

# read the matching true target values from s3
y_test_uri = f"s3://{rawbucket}/test/y_test.csv"
actual = pd.read_csv(y_test_uri, index_col=0)

In [None]:
from sklearn import metrics

# calculate the rmse. The square root is taken explicitly instead of passing
# `squared=False`, because that keyword is not accepted by every scikit-learn
# release (it is missing before 0.22 and was deprecated in later versions).
mse = metrics.mean_squared_error(actual, predicted)
rmse = mse ** 0.5

print(f"Test-set RMSE = {rmse}")

# show a plot of test-set predictions and actual values
plt.scatter(actual, predicted)
# dashed diagonal from (0, 0) to (25, 25): points on it are perfect predictions
plt.plot([0, 25], [0, 25], '--', linewidth=1, c="b")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.show()

## Clean up

In [None]:
# delete the endpoint created by model.deploy() above; the name is taken from
# the predictor rather than hard-coded, because it is different on every run
sm.delete_endpoint(EndpointName=predictor.endpoint_name)

## References

https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb

https://aws.amazon.com/getting-started/hands-on/build-train-deploy-monitor-machine-learning-model-sagemaker-studio/?trk=gs_card