# Automated ML

Import Dependencies.

In [2]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails

from pprint import pprint
from azureml.core import Model
from train import clean
from sklearn.model_selection import train_test_split

from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.webservice import AciWebservice

import azureml
import pandas as pd
import os
import json
import requests
import joblib

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

### Workspace and compute cluster configuration

In [None]:
# Load the workspace through Workspace.from_config().
ws = Workspace.from_config()

# Name of the experiment used by this notebook.
experiment_name = 'loan-default-automl'
experiment = Experiment(workspace=ws, name=experiment_name)

# Summarise the workspace, one property per output line.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive logging run inside the experiment.
run = experiment.start_logging()

In [None]:
cpu_cluster_name = "capstone-compute"

# Reuse the cluster when the workspace already has one with this name;
# otherwise provision a new one.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(
        vm_size='STANDARD_DS12_V2',
        max_nodes=6,
    )
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
else:
    print(f"Found existing cluster: {cpu_cluster_name} to be used.")

# Wait for the cluster, giving up after 10 minutes.
# NOTE(review): min_node_count=1 is requested here although the provisioning
# configuration above sets no min_nodes - confirm this does not simply run
# into the timeout on an idle cluster.
cpu_cluster.wait_for_completion(
    show_output=True,
    min_node_count=1,
    timeout_in_minutes=10,
)

## Dataset

### Overview

The dataset used in this project is a loan default prediction dataset. The project entails identifying customers who would either default or not default after taking a loan credit. This translates to a binary classification: to default or not to default. Hence, a Logistic regression model would be built using Azure's AutoML functionality. SKLearn's logistic regression algorithm is a well-known supervised learning approach optimized for dichotomous or binary variables.

In [None]:
# Load the CSV file hosted on GitHub as a tabular dataset.
url = "https://raw.githubusercontent.com/ChidiNdego/loan-default-capstone-project/master/starter_file/loan_default_prediction.csv"

data = TabularDatasetFactory.from_delimited_files(path=url)

In [None]:
# Use the `clean` function (imported from the `train` module) to split the
# raw dataset into predictor variables `x` and the target `y`.
x, y = clean(data)

# Add the cleaned target column to the cleaned predictor variables.
# pandas is already imported as `pd` at the top of the notebook, so no
# second import is needed here.
train_data = pd.concat([x, y], axis=1)
train_data.head()

In [None]:
# Hold out 30% of the rows as a test set. The fixed seed keeps the split
# identical across notebook runs so results can be reproduced.
train, test = train_test_split(train_data, test_size=0.3, random_state=42)

In [None]:
# Get default datastore
default_ds = ws.get_default_datastore()

# Register only the training split under the name 'AutoML_data'. The
# held-out `test` split is scored later in this notebook, so it must not be
# part of the data that AutoML trains and cross-validates on.
dataSet = TabularDatasetFactory.register_pandas_dataframe(
    train,
    target=(default_ds, 'AutoMLData'),
    name='AutoML_data',
    show_progress=True,
)

# Look the registered dataset up again by name in the workspace.
automl_data = ws.datasets.get('AutoML_data')

## AutoML Configuration

In [None]:
# Experiment-wide limits and the metric to optimise.
automl_settings = dict(
    experiment_timeout_minutes=30,
    max_concurrent_iterations=5,
    n_cross_validations=5,
    primary_metric='accuracy',
)

# Classification task on the registered dataset, run on the compute cluster.
automl_config = AutoMLConfig(
    task="classification",
    compute_target=cpu_cluster,
    training_data=automl_data,
    label_column_name="loanDefault",
    featurization='auto',
    enable_early_stopping=True,
    enable_onnx_compatible_models=True,
    debug_log="automl_errors.log",
    **automl_settings,
)

### Reasons for automl settings and configuration

*   `"experiment_timeout_minutes": 30`: Maximum amount of time (in minutes) to complete training iterations. Set at 30 minutes because dataset has over 10,000 entries.
*   `"max_concurrent_iterations": 5`: Maximum number of iterations that can be executed simultaneously. Advisably, this value should be less than the number of compute cluster nodes.
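*   `"n_cross_validations": 5`: Cross validation is a model validation technique used to reduce overfitting. The value is the number of cross-validation folds: the training data is split into 5 folds, and each fold is used once for validation while the remaining folds are used for training.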
*   `"primary_metric" : 'accuracy'`: This parameter determines the metric to be used during model training for optimization.
*   `compute_target=cpu_cluster`: This points to the compute cluster configuration created earlier.
*   `task = "classification"`: The problem in view is a classification task.
*   `training_data=automl_data`: Specifies the dataset to be used: an external dataset already registered in azure datastore.
*   `label_column_name="loanDefault"`: Specifies the dependent variable to be predicted.
*   `enable_early_stopping=True`: Allows for an early stopping rule to be applied.
*   `featurization= 'auto'`: Allows azure to automatically perform feature engineering.

In [None]:
# Submit the AutoML configuration as a run of the experiment.
print('Submitting AutoML experiment...')
remote_run = experiment.submit(config=automl_config)

## Run Details

Use the `RunDetails` widget to show the different experiments.

In [None]:
# Show the monitoring widget for the run, then wait for the run to finish
# while streaming its output.
run_widget = RunDetails(remote_run)
run_widget.show()
remote_run.wait_for_completion(show_output=True)

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [None]:
# Fetch the best AutoML run together with its fitted model.
best_automl_run, fitted_model = remote_run.get_output()
print(best_automl_run)

# Print every metric reported for the best run as "name: value".
best_run_metrics = best_automl_run.get_metrics()
for name, value in best_run_metrics.items():
    print(f'{name}: {value}')

In [None]:
# Parameters of the best model
def print_model(model, prefix=""):
    """Print each step of a fitted pipeline together with its parameters.

    For every ``(name, estimator)`` entry in ``model.steps`` the step name is
    printed, preceded by ``prefix``. A step that exposes both ``estimators``
    and ``weights`` is handled as an ensemble: the member names and the
    weights are pretty-printed, then every member is printed recursively with
    ``"<member name> - "`` as its prefix. For any other step the result of
    ``get_params()`` is pretty-printed. A blank line follows each
    pretty-printed block.
    """
    for entry in model.steps:
        print(prefix + entry[0])
        component = entry[1]

        is_ensemble = hasattr(component, 'estimators') and hasattr(component, 'weights')
        if not is_ensemble:
            pprint(component.get_params())
            print()
            continue

        members = component.estimators
        pprint({
            'estimators': [member[0] for member in members],
            'weights': component.weights,
        })
        print()
        for member in members:
            # Recurse into the member pipeline, labelling its steps with the member name.
            print_model(member[1], member[0] + ' - ')

# Print the steps and parameters of the fitted model of the best run.
print_model(fitted_model)

In [None]:
# Register the model file of the best run, recording its accuracy as a property.
bestModel = best_automl_run.register_model(
    model_name='model_automl',
    model_path='outputs/model.pkl',
    tags={'Training context': 'Auto ML'},
    properties={'Accuracy': best_run_metrics['accuracy']},
)

# Print each model registered in the workspace with its tags and properties.
for registered in Model.list(ws):
    print(registered.name, 'version:', registered.version)
    for tag_name, tag in registered.tags.items():
        print('\t', tag_name, ':', tag)
    for prop_name, prop in registered.properties.items():
        print('\t', prop_name, ':', prop)
    print('\n')

## Get Onnx Model

In [None]:
from azureml.automl.runtime.onnx_convert import OnnxConverter

# Fetch the best run again, this time together with its ONNX model, and
# save that model to a local file.
onnx_path = "./best_model.onnx"
best_run, onnx_model = remote_run.get_output(return_onnx_model=True)
OnnxConverter.save_onnx_model(onnx_model, onnx_path)

In [None]:
import sys
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants

# True when the running interpreter is older than the version the SDK marks
# as incompatible with ONNX inference.
python_version_compatible = (
    sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion
)

import onnxruntime
from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

def get_onnx_res(run):
    """Download the ONNX resource file of ``run`` and return it parsed as JSON.

    The run file named by ``constants.MODEL_RESOURCE_PATH_ONNX`` is written to
    ``onnx_resource.json`` (relative to the working directory) and then loaded
    with ``json.load``.
    """
    local_path = 'onnx_resource.json'
    run.download_file(
        name=constants.MODEL_RESOURCE_PATH_ONNX,
        output_file_path=local_path,
    )
    with open(local_path) as resource_file:
        return json.load(resource_file)

# Run the ONNX model locally when the Python version supports it.
if python_version_compatible:
    model_bytes = onnx_model.SerializeToString()
    onnx_res = get_onnx_res(best_run)

    onnxrt_helper = OnnxInferenceHelper(model_bytes, onnx_res)

    # `test` is a split of `train_data` and therefore still contains the
    # 'loanDefault' label column; the model must be scored on features only.
    x_test = test.drop(columns=['loanDefault'])
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(x_test)

    print(pred_onnx)
    print(pred_prob_onnx)
else:
    print('Please use Python version 3.6 or 3.7 to run the inference helper.')

## Model Deployment

Remember you have to deploy only one of the two models you trained. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [None]:
# Download the scoring file and the environment file of the best run,
# storing each under a local file name.
for remote_name, local_name in (
    ('outputs/scoring_file_v_1_0_0.py', 'score.py'),
    ('outputs/conda_env_v_1_0_0.yml', 'envFile.yml'),
):
    best_automl_run.download_file(remote_name, local_name)

TODO: In the cell below, send a request to the web service you deployed to test it.

In [None]:
# Inference setup: the scoring entry script plus the environment of the best run.
best_run_environment = best_automl_run.get_environment()
inference_config = InferenceConfig(
    entry_script='score.py',
    environment=best_run_environment,
)

# ACI deployment settings: 1 CPU core, 1 GB of memory, authentication enabled.
deployment_config = AciWebservice.deploy_configuration(
    cpu_cores=1,
    memory_gb=1,
    auth_enabled=True,
)

# Deploy the registered model as a web service (overwrite=True is passed).
service = Model.deploy(
    workspace=ws,
    name="deployed-best-model",
    models=[bestModel],
    inference_config=inference_config,
    deployment_config=deployment_config,
    overwrite=True,
)

In [None]:
# Display the service object as the output of this cell.
service

In [None]:
# Wait for the deployment to finish, showing its output while waiting.
service.wait_for_deployment(show_output = True)

In [None]:
# Report the state and the endpoints of the deployed service.
print("Service state:", service.state)
print("Scoring URI :", service.scoring_uri)
print("Swagger URI:", service.swagger_uri)
# NOTE(review): this writes the first authentication key into the notebook
# output - confirm the output is cleared before the notebook is shared.
print("Primary key:", service.get_keys()[0])

In [None]:
### Enabling logging
!python3 logs.py

In [None]:
# Draw four random rows from the cleaned data and separate their labels
# from the features.
x_samp = train_data.sample(4)
y_samp = x_samp.pop('loanDefault')

# Serialise the feature rows as a JSON document under the 'data' key.
records = x_samp.to_dict(orient='records')
sample = json.dumps({'data': records})

print(sample)

TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
# option 1: through endpoint.py script
!python3 endpoint.py

In [None]:
# option 2
# Send the sample payload to the deployed endpoint with an HTTP POST request.

# Set the content type and authenticate. The service is deployed with
# auth_enabled=True, so its key has to be sent as a bearer token.
headers = {
    'Content-type': 'application/json',
    'Authorization': f'Bearer {service.get_keys()[0]}',
}

# `sample` is the JSON payload built from the sampled rows earlier in the
# notebook.
response = requests.post(service.scoring_uri, data=sample, headers=headers)

In [None]:
# Print results from the inference
print(response.text)  # raw response body
print(response.status_code)  # HTTP status code of the reply
print(response.elapsed)  # elapsed time reported for the request
print(response.json())  # response body decoded as JSON

In [None]:
# Print original labels
print(y_samp)

In [None]:
# Print the logs of the deployed web service.
print(service.get_logs())

In [None]:
# deleting the service
service.delete()