# Train, Test & Deploy: Advertising

Dataset source: https://www.statlearning.com/s/Advertising.csv

## Setup

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import azureml.core
from azureml.core import Workspace
from datetime import datetime

# what version
print("Azure ML SDK Version: " + azureml.core.VERSION)

%matplotlib inline

In [None]:
# Build the workspace handle from its saved configuration.
ws = Workspace.from_config()

# Echo the workspace identity as a quick sanity check.
print(f"Workspace name: {ws.name}")
print(f"Workspace location: {ws.location}")
print(f"Workspace resource group: {ws.resource_group}")

## Experiment setup

In [None]:
from azureml.core import Experiment

# Name of the Azure ML experiment that the training runs are submitted to.
experiment_name = "mmm-train"

# Experiment handle bound to the workspace loaded earlier.
exp = Experiment(workspace=ws, name=experiment_name)

## Compute clusters

In [None]:
import os

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget

# Cluster settings; each one can be overridden through an environment variable.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "cpu-cluster")

# Environment variables are always strings, so convert the node counts to int
# explicitly; otherwise an overridden value would be passed on as text.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 2))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")

# Look the targets up once and reuse the result for both the test and the fetch.
workspace_targets = ws.compute_targets

if compute_name in workspace_targets:
    # Reuse the compute target that already exists under this name.
    compute_target = workspace_targets[compute_name]
    if isinstance(compute_target, AmlCompute):
        print("found compute target: " + compute_name)
    else:
        # Do not stay silent when the name belongs to a different kind of target:
        # later cells would otherwise submit to it without any hint.
        print("WARNING: compute target '" + compute_name + "' exists but is not an AmlCompute cluster "
              "(found " + type(compute_target).__name__ + ")")
else:
    print("creating new compute target...")
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

In [None]:
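#%%bash
#mkdir data
# One-time setup (kept commented out): uncomment the two lines above on first run
# if ./data does not exist yet; a plain `mkdir` fails when the directory already exists.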

In [None]:
import os

from azureml.core import Dataset

# Fetch the dataset registered as 'advertising' and copy its files into ./data,
# replacing anything already there.
dataset = Dataset.get_by_name(ws, name='advertising')
dataset.download(target_path='./data', overwrite=True)

# Confirm that the CSV the later cells read is present after the download.
if 'Advertising.csv' in os.listdir('./data'):
    print("Dataset downloaded and in the right place!")

In [None]:
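#%%bash
#mkdir training
# One-time setup (kept commented out): uncomment the two lines above on first run
# if ./training does not exist yet, since the next cell writes training/train.py into it.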

## Training

In [None]:
%%writefile training/train.py

import argparse
import os
import pandas as pd
import numpy as np
from sklearn import model_selection, metrics, linear_model
import joblib
from azureml.core import Run

if __name__ == "__main__":
    # let user feed in 2 parameters, the dataset to mount or download, and the regularization rate of the logistic regression model
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-directory', type=str, dest='data_directory', help='data directory mounting point')
    args = parser.parse_args()

    data_directory = args.data_directory

    print("Reading input data")

    df = pd.read_csv(data_directory, index_col=0)

    print("Complete")

    # target
    y = df["sales"]

    # feature set
    X = df.drop(columns="sales")

    print("Splitting data")

    # train/test split
    X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.30, random_state=20)
    
    print("Complete")

    # get hold of the current run
    run = Run.get_context()

    print("Training")

    # initialise estimator
    reg = linear_model.LinearRegression()
    
    # train
    reg.fit(X_train, y_train)

    print("Complete")

    # calculate in-sample root-mean-squared-error
    in_sample_rmse = np.sqrt(metrics.mean_squared_error(y_train, reg.predict(X_train)))
    
    # print the rmse, this will appear in the log and will be captured by sagemaker
    print(f"IS-RMSE: {in_sample_rmse}")

    # log the metric result
    run.log('accuracy', np.float(in_sample_rmse))

    # make an output directory
    os.makedirs('output', exist_ok=True)

    # save the model
    joblib.dump(value=reg, filename='output/reg.pkl')

In [None]:
import os

import joblib
import numpy as np
import pandas as pd
from azureml.core import Run
from sklearn import model_selection, metrics, linear_model

# Local dry run of the training logic against the downloaded CSV.
# Metric logging through an Azure ML Run is intentionally left out here.

# Load the data; the first CSV column is used as the row index.
df = pd.read_csv(os.path.join("data", "Advertising.csv"), index_col=0)
print("Complete")

# "sales" is the target; every other column is a feature.
y = df["sales"]
X = df.drop(columns="sales")

print("Splitting data")

# 70/30 train/test split with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=20,
)
print("Complete")

print("Training")

# Fit an ordinary linear regression on the training portion.
reg = linear_model.LinearRegression()
reg.fit(X_train, y_train)
print("Complete")

# Root-mean-squared error measured on the training data itself.
in_sample_rmse = np.sqrt(
    metrics.mean_squared_error(y_train, reg.predict(X_train))
)

# Show the RMSE in the cell output.
print(f"IS-RMSE: {in_sample_rmse}")

# Persist the fitted model under ./output, creating the folder when needed.
os.makedirs('output', exist_ok=True)
joblib.dump(value=reg, filename='output/reg.pkl')

In [None]:
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.environment import Environment

# Environment used for the training run.
env = Environment('ml-env')

# Packages to install: the Azure ML pieces through pip, a pinned scikit-learn through conda.
cd = CondaDependencies.create(
    pip_packages=['azureml-dataset-runtime[pandas,fuse]', 'azureml-defaults'],
    conda_packages=['scikit-learn==0.22.1'],
)
env.python.conda_dependencies = cd

# Register the environment in the workspace so it can be re-used later.
env.register(workspace=ws)

In [None]:
from azureml.core import ScriptRunConfig

# Hand the mounted dataset to train.py through its --data-directory option.
args = ['--data-directory', dataset.as_mount()]

# Run configuration: which script to run, with which arguments, on which
# compute target and in which environment.
src = ScriptRunConfig(
    source_directory="./training",
    script='train.py',
    arguments=args,
    compute_target=compute_target,
    environment=env,
)

In [None]:
# Submit the run configuration to the experiment; the trailing expression
# displays the resulting run object in the notebook.
run = exp.submit(config=src)
run

In [None]:
# Display the RunDetails widget for the submitted run.
from azureml.widgets import RunDetails
RunDetails(run).show()

In [None]:
# Wait for the run to complete; specify show_output to True for a verbose log
run.wait_for_completion(show_output=True)

In [None]:
# Print the metrics recorded for the run; train.py logs the in-sample RMSE
# under the key 'accuracy'.
print(run.get_metrics())