# Train, Test & Deploy: Advertising

> https://www.statlearning.com/s/Advertising.csv

In [None]:
import boto3 # aws python sdk
import sagemaker # aws sagemaker
import numpy as np # numerical python
import pandas as pd # python for data analysis
import matplotlib.pyplot as plt # plotting
import json
import time
import sys
import IPython
import os

# Report which SageMaker SDK version this notebook is running against.
print(f"SageMaker Version: {sagemaker.__version__}")

In [None]:
# IAM execution role you created when you set everything up; it is passed to
# the processing job and the training estimator further down.
role = sagemaker.get_execution_role()

# create a SageMaker session (used later to upload the preprocessing script)
sess = sagemaker.Session()

# what region are we in?
region = boto3.session.Session().region_name

print(f"Region: {region}")

# low-level boto3 SageMaker client, used for the experiment/trial objects,
# describe_training_job and delete_endpoint calls in later cells
sm = boto3.Session().client("sagemaker")

In [None]:
# Name of the S3 bucket where the raw data is stored (bucket name only; the
# "s3://" prefix is added where it is used). The processing job reads
# Advertising.csv from the "data/" prefix of this bucket.
# NOTE(review): the assignment below is commented out, but later cells
# reference `rawbucket`; uncomment it and fill in your bucket name first,
# otherwise those cells fail with a NameError.
#rawbucket = ""

# scikit-learn framework version handed to SKLearnProcessor
framework_version = "0.20.0"

## Data processing

In [None]:
from sagemaker.sklearn.processing import SKLearnProcessor

# Processor that runs a scikit-learn script as a SageMaker Processing job
# on a single ml.c5.xlarge instance, under the notebook's execution role.
sklearn_processor = SKLearnProcessor(
    role=role,
    framework_version=framework_version,
    instance_count=1,
    instance_type="ml.c5.xlarge",
)

In [None]:
%%writefile preprocessing.py
"""Preprocessing step for the Advertising dataset.

Reads Advertising.csv from the processing input directory, splits it 70/30
into train and test sets, and writes the train set in svmlight format and
the test set as two CSV files (features and target).
"""
import os

import numpy as np
import pandas as pd
from sklearn import datasets
from sklearn import model_selection


def main():
    """Load the raw CSV, split it, and write the train/test artefacts."""
    print("Reading input data")
    # The first CSV column is used as the row index, not as a feature.
    advertising = pd.read_csv("/opt/ml/processing/input/Advertising.csv", index_col=0)
    print("Complete")

    # Advertising spend per channel is used to predict sales.
    feature_columns = ["TV", "radio", "newspaper"]
    target_column = "sales"
    X = advertising[feature_columns]
    y = advertising[target_column]

    print("Splitting data")
    # Fixed random_state keeps the 70/30 split reproducible between runs.
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        X, y, test_size=0.30, random_state=20
    )
    print("Complete")

    # Local directories the script writes its outputs to.
    train_dir = "/opt/ml/processing/train/"
    test_dir = "/opt/ml/processing/test/"

    print("Transfering data to storage")
    # Train set: a single svmlight-format file named "train".
    datasets.dump_svmlight_file(X_train, y_train, os.path.join(train_dir, "train"))
    # Test set: features and target as separate CSV files, header and index kept.
    X_test.to_csv(os.path.join(test_dir, "X_test.csv"), header=True)
    y_test.to_csv(os.path.join(test_dir, "y_test.csv"), header=True)
    print("Complete")


# Only run the pipeline when executed as a script, not when imported.
if __name__ == "__main__":
    main()

In [None]:
# Upload the preprocessing script to the "code" prefix of the raw-data bucket
# so the processing job can fetch it.
codeupload = sess.upload_data(
    "preprocessing.py",
    key_prefix="code",
    bucket=rawbucket,
)

# Show the location returned by the upload.
print(codeupload)

In [None]:
from sagemaker.processing import ProcessingInput, ProcessingOutput

# Run the preprocessing script as a SageMaker Processing job.
# The raw data prefix is mounted where preprocessing.py reads its input.
raw_data_input = ProcessingInput(
    source=f"s3://{rawbucket}/data",
    destination="/opt/ml/processing/input",
)

# The two directories the script writes to are copied back to S3.
train_output = ProcessingOutput(
    output_name="train_data",
    source="/opt/ml/processing/train",
    destination=f"s3://{rawbucket}/train",
)
test_output = ProcessingOutput(
    output_name="test_data",
    source="/opt/ml/processing/test",
    destination=f"s3://{rawbucket}/test",
)

sklearn_processor.run(
    code=codeupload,
    inputs=[raw_data_input],
    outputs=[train_output, test_output],
)

# Describe the most recently launched processing job and show the result.
preprocessing_job_description = sklearn_processor.jobs[-1].describe()
print(preprocessing_job_description)

## Experiment setup

In [None]:
# Make sure we have experimental capabilities

!pip install sagemaker-experiments 
from smexperiments.experiment import Experiment
from smexperiments.trial import Trial
from smexperiments.trial_component import TrialComponent
from smexperiments.tracker import Tracker

In [None]:
# Create a SageMaker Experiment; the timestamp suffix gives each run of this
# cell a distinct experiment name.
mmm_experiment = Experiment.create(
    sagemaker_boto_client=sm,
    experiment_name=f"mmm-train-{int(time.time())}",
    description="Predict sales given a marketing mix",
)

# Show the experiment details.
print(mmm_experiment)

In [None]:
# Start tracking the pre-processing step: create a tracker displayed as
# "Preprocessing" and log the S3 locations of the datasets involved.
# `tracker` is referenced again after this block, when its trial component
# is attached to the trial.
with Tracker.create(display_name="Preprocessing", sagemaker_boto_client=sm) as tracker:
    # log the S3 URI of the raw dataset, followed by the train and test
    # prefixes that the processing job wrote its outputs to
    tracker.log_input(name="mmm-raw-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/data")
    tracker.log_input(name="mmm-train-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/train")
    tracker.log_input(name="mmm-test-dataset", media_type="s3/uri", value=f"s3://{rawbucket}/test")

In [None]:
# Timestamped name for the trial.
trial_name = f"mmm-training-job-{int(time.time())}"

# Create the trial inside the experiment created earlier.
mmm_trial = Trial.create(
    sagemaker_boto_client=sm,
    experiment_name=mmm_experiment.experiment_name,
    trial_name=trial_name,
)

# Attach the preprocessing tracker's trial component to the trial.
mmm_trial.add_trial_component(tracker.trial_component)

# Timestamped name for the training job that is launched next.
mmm_training_job_name = f"mmm-training-job-{int(time.time())}"

## Training

In [None]:
from sagemaker import image_uris

# Look up the built-in XGBoost 1.0-1 container image for the current region.
# image_uris.retrieve is the SDK v2 replacement for the deprecated
# sagemaker.amazon.amazon_estimator.get_image_uri.
container = image_uris.retrieve(framework="xgboost", region=region, version="1.0-1")

# Hyperparameters: squared-error regression objective, 100 boosting rounds.
hyperparameters = {
    "objective": "reg:squarederror",
    "num_round": "100",
}

# XGBoost estimator on a single ml.m5.2xlarge training instance.
# instance_count / instance_type are the SDK v2 names of the deprecated
# train_instance_count / train_instance_type keyword arguments.
estimator = sagemaker.estimator.Estimator(
    container,
    role=role,
    hyperparameters=hyperparameters,
    instance_count=1,
    instance_type="ml.m5.2xlarge",
)

# Train on the data the processing job wrote to the "train" prefix, and
# register the job under the trial so it shows up in the experiment lineage.
estimator.fit(
    inputs={"train": f"s3://{rawbucket}/train"},
    job_name=mmm_training_job_name,
    experiment_config={
        "TrialName": mmm_trial.trial_name,
        "TrialComponentDisplayName": "Training",
    },
)

## Model saving

In [None]:
# Wait on the estimator's latest training job without streaming its logs.
latest_job = estimator.latest_training_job
latest_job.wait(logs="None")

# Look up where the trained model artifact was stored.
job_description = sm.describe_training_job(TrainingJobName=latest_job.name)
artifact = job_description["ModelArtifacts"]["S3ModelArtifacts"]

# Report the artifact location.
print(f"Model artifact persisted at {artifact}")

## Deployment

In [None]:
from sagemaker.serializers import CSVSerializer

# Deploy the trained estimator on one ml.m4.xlarge instance; the CSV
# serializer makes the predictor send its input as CSV.
xgb_predictor = estimator.deploy(
    serializer=CSVSerializer(),
    instance_type="ml.m4.xlarge",
    initial_instance_count=1,
)

## Testing

In [None]:
def predict(data, rows=500, predictor=None):
    """Return endpoint predictions for ``data`` as a 1-D float array.

    The input is split into batches of at most ``rows`` rows, each batch is
    sent to the predictor, and the text responses are concatenated and
    parsed into numbers.

    Args:
        data: Array-like with a ``shape`` attribute, one sample per row.
        rows: Maximum number of rows sent per request.
        predictor: Object with a ``predict(batch)`` method returning the
            predictions as ``bytes`` or ``str``. Defaults to the notebook's
            global ``xgb_predictor``.

    Returns:
        numpy.ndarray of float predictions, in the order of the input rows.
        An empty array is returned for empty input without calling the
        predictor.
    """
    if predictor is None:
        predictor = xgb_predictor

    n_rows = data.shape[0]
    if n_rows == 0:
        # Nothing to score; avoid sending an empty request.
        return np.array([], dtype=float)

    # Ceiling division: the fewest batches that keep each one <= `rows` rows.
    n_batches = (n_rows + rows - 1) // rows

    responses = []
    for batch in np.array_split(data, n_batches):
        response = predictor.predict(batch)
        # Accept both raw bytes and already-decoded text.
        if isinstance(response, (bytes, bytearray)):
            response = response.decode("utf-8")
        responses.append(response)

    # Values are expected comma-separated; newline-separated responses are
    # tolerated as well by treating newlines as separators.
    tokens = ",".join(responses).replace("\n", ",").split(",")
    return np.array([float(tok) for tok in tokens if tok.strip()], dtype=float)

In [None]:
# Load the held-out features written by the processing job (first column is
# the row index) and score them through the endpoint.
test_features = pd.read_csv(f"s3://{rawbucket}/test/X_test.csv", index_col=0)
predicted = predict(test_features.to_numpy())

# Load the matching actual target values.
test_targets = pd.read_csv(f"s3://{rawbucket}/test/y_test.csv", index_col=0)
actual = test_targets.to_numpy()

In [None]:
from sklearn import metrics

# Root-mean-squared error on the test set. Computed as sqrt(MSE) rather than
# mean_squared_error(..., squared=False): the `squared` keyword is deprecated
# and absent from recent scikit-learn releases, while this form works on all.
rmse = np.sqrt(metrics.mean_squared_error(actual, predicted))

print(f"Test-set RMSE = {rmse}")

# show a plot of test-set predictions and actual values
plt.scatter(actual, predicted)
# dashed diagonal marks where predicted == actual
plt.plot([0, 25], [0, 25], '--', linewidth=1, c="b")
plt.xlabel("Actual Values")
plt.ylabel("Predicted Values")
plt.show()

## Cleanup

In [None]:
# Delete the endpoint deployed by this notebook. The name is taken from the
# predictor instead of being hard-coded, so the endpoint created in this run
# is the one that gets removed.
sm.delete_endpoint(EndpointName=xgb_predictor.endpoint_name)

## References

https://github.com/aws/amazon-sagemaker-examples/blob/master/sagemaker-python-sdk/scikit_learn_randomforest/Sklearn_on_SageMaker_end2end.ipynb

https://aws.amazon.com/getting-started/hands-on/build-train-deploy-monitor-machine-learning-model-sagemaker-studio/?trk=gs_card

https://docs.aws.amazon.com/sagemaker/latest/dg/xgboost.html

https://aws.amazon.com/blogs/machine-learning/simplify-machine-learning-with-xgboost-and-amazon-sagemaker/