# Automated ML

The following cell contains commands to import all the dependencies for the project.

In [1]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.pipeline.steps import AutoMLStep

## Dataset

### Overview
In this markdown cell, we give an overview of the dataset used and the task we will be performing.

The data used for training the models is obtained from the publicly available UCI Machine Learning Repository. The dataset contains 1599 records of eleven red wine physicochemical properties and one output variable, 'quality', which is sensory data denoting the perceived quality of the wine according to human taste. Quality is scored from 0 to 10, with 10 denoting the highest quality. Classes are not balanced: there are more 'ordinary' wines than high- or poor-quality ones (P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis., Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.). The data can be used for both regression and classification machine learning tasks.

Our task will be to train the classification model to predict quality of unknown wine by its physicochemical properties. As starting point we will remove all missing data, as we have to be certain that we are using the clean dataset for training. 

The next cell contains the code we use to access the data used in this project. This dataset is external in regard to Microsoft Azure ML. The manual process of providing dataset for training is described in README.md file and if performed it is not necessary to run the next cell.

In [2]:
# Connect to the Azure ML workspace using the locally stored configuration.
ws = Workspace.from_config()

# Designate a name for the experiment
experiment_name = 'wine-quality-automl-experiment'
experiment = Experiment(ws, experiment_name)

# Reuse the dataset when it is already registered in the Workspace; otherwise
# build it from the external, semicolon-separated CSV file and register it so
# that later cells can look it up by name.
key = "wine-quality"
description_text = "Wine Quality DataSet for Udacity Capstone Project"
web_uri = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'

found = key in ws.datasets.keys()
if found:
    ds = ws.datasets[key]
else:
    # Create the tabular dataset from the external data ...
    ds = Dataset.Tabular.from_delimited_files(web_uri, separator=';', header='ALL_FILES_HAVE_SAME_HEADERS')
    # ... and register it in the Workspace under `key`.
    ds = ds.register(workspace=ws,
                     name=key,
                     description=description_text)

dframe = ds.to_pandas_dataframe()

# Only the last expression of a cell is rendered automatically, so the summary
# statistics are displayed explicitly; as a bare mid-cell expression their
# result was silently thrown away.
display(dframe.describe())

# Preview of the first five records (rendered as the cell's output).
ds.take(5).to_pandas_dataframe()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25,67,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15,54,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17,60,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11,34,0.9978,3.51,0.56,9.4,5


If the manual creation of the compute target was skipped, the following two cells should be run. The first one creates the compute resource needed for model training and the second creates the folder structure for outputs.

In [24]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Load the workspace from the local configuration.
ws = Workspace.from_config()

# choose a name for compute
compute_name = "aml-compute"

try:
    # Reuse the cluster when one with this name already exists in the workspace.
    compute_target = ComputeTarget(workspace=ws, name=compute_name)
    print('Found existing compute target')
except ComputeTargetException:
    # The lookup failed, so provision a new cluster that scales up to 10 nodes.
    print('Creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(
        vm_size='Standard_DS3_v2',
        max_nodes=10,
    )
    compute_target = ComputeTarget.create(ws, compute_name, provisioning_config)

# Block until provisioning finishes or 20 minutes pass. No minimum node count
# is requested, so the cluster's own scale settings apply.
compute_target.wait_for_completion(
    show_output=True,
    min_node_count=None,
    timeout_in_minutes=20,
)

# get_status() gives a detailed status for the current cluster.
cluster_status = compute_target.get_status()
print(cluster_status.serialize())

Creating a new compute target...
CreatingAmlCompute is getting created. Consider calling wait_for_completion() first


Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2021-02-16T11:36:51.643000+00:00', 'errors': None, 'creationTime': '2021-02-16T11:36:49.007360+00:00', 'modifiedTime': '2021-02-16T11:37:04.761897+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 10, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_DS3_V2'}


In [4]:
import os

# Working folders used by this notebook; exist_ok=True makes re-running the
# cell harmless when the folders are already there.
for folder in ('./outputs', './aml', './scripts'):
    os.makedirs(folder, exist_ok=True)

In [5]:
# Fetch the registered dataset that the AutoML models will be trained on
dataset_name = 'wine-quality'
dataset = Dataset.get_by_name(ws, name=dataset_name)

## AutoML Configuration

For this project we have selected classification task with accuracy as primary metric. There are many settings for AutoML experiment configuration. Reasoning for selecting certain settings may be saving resources and limiting duration of training time. Here we have limited time for training of all iterations set to 20 minutes and maximum of 5 concurrent iterations. Accuracy is most common metric for model comparison and therefore it was used here. Maximum concurrent iterations was set in order to keep within limits of compute resources available and experiment timeout was set in order to prevent overallocation of resources in case of divergent iterations.

In [6]:
# AutoML run limits and optimisation target: 20 minutes for the whole
# experiment, at most 5 iterations in parallel, models ranked by accuracy.
automl_settings = dict(
    experiment_timeout_minutes=20,
    max_concurrent_iterations=5,
    primary_metric='accuracy',
)

# Experiment definition: a classification task on the 'quality' column,
# run on the compute target named below, with the settings above merged in.
compute_target_name = "aml-compute"
automl_config = AutoMLConfig(
    task="classification",
    compute_target=compute_target_name,
    training_data=dataset,
    label_column_name="quality",
    featurization='auto',
    enable_early_stopping=True,
    path="./aml",
    debug_log="automl_errors.log",
    **automl_settings,
)

In [7]:
# Launch the configured AutoML experiment and keep a handle to the run
remote_run = experiment.submit(config=automl_config)

Running on remote.


## Run Details

Azure Auto ML creates a number of pipelines in parallel and tests different algorithms and parameters. Each iteration produces a model with a training score. In reality, machine learning models operate under certain assumptions. One of these assumptions concerns the data used for model training. If the characteristics of the real data differ much from the assumed data, we may get a poor fit. Some models are more susceptible to overfitting than others. In order to minimize the risk of overfitting, we may combine several good models to possibly get an even better model, a technique which Azure Auto ML calls VotingEnsemble.

After submitting the experiment, one way to monitor its progress directly from the notebook is to use the `RunDetails` widget, which displays information like that shown in the captured screenshot below.

![](https://github.com/DivkovicD/ML-Engineer-w-MS-Azure/blob/master/Screenshots/Screenshot%20of%20RunDetails%20widget%20showing%20the%20progress%20of%20training%20runs%20of%20different%20experiments%20v5.png?raw=true)

In the cell below, we used the `RunDetails` widget to show the different experiments.

In [8]:
from azureml.widgets import RunDetails
# Build the monitoring widget for the submitted run, then render it.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

Different models perform differently in regard to primary metric chosen. AutoML calculates performance metrics, based on the scikit learn implementation for each classification model generated for experiment. Common consideration with all models is class imbalance. Models that are more sensitive to class imbalance show less accuracy.

## Best Model

The following cell contains the code to get the best model from AutoML experiments and displays properties of the model. Then we save the best model.

In [9]:
# Retrieve the best run of the AutoML experiment together with its fitted model
# and print both, one per line.
# attribution, November 2020, https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-and-where?tabs=python
#
best_run, fitted_model = remote_run.get_output()
print(best_run, fitted_model, sep='\n')

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Run(Experiment: wine-quality-automl-experiment,
Id: AutoML_7a5045ae-120b-4594-8551-6f31603fad12_38,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                        max_iter=-1,
                                                                                        probability=True,
                                         

The result of AutoML run was VotingEnsemble model which is a combined performance of best performing models from AutoML experiment run. The ensemble was created from previous AutoML iterations with soft voting. The VotingEnsemble consists of ['XGBoostClassifier', 'KNN', 'LightGBM', 'XGBoostClassifier', 'XGBoostClassifier', 'LightGBM', 'LightGBM', 'XGBoostClassifier', 'LightGBM', 'SVM', 'ExtremeRandomTrees', 'LightGBM'] algorithms, which are top twelve models rated by accuracy. The AutoML Voting Ensemble selected parameters read from azureml-logs:

- 'ensemble_iterations': 35, 27, 0, 50, 1, 39, 44, 45, 8, 7, 28, 31
- 'training_type': 'MeanCrossValidation'
- 'goal': 'accuracy_max'
- 'primary_metric': 'accuracy'

Other AutoML parameters were mostly default values. The detailed list of parameters can also be obtained from Raw JSON file located under "See all properties" of Details blade of the AutoML experiment. 

In [10]:
# Save the best model
# Example of approach at https://benalexkeen.com/using-azure-automl-and-aml-for-assessing-multiple-models-and-deployment/
#
# joblib is imported as the standalone package: the `sklearn.externals.joblib`
# alias was deprecated in scikit-learn 0.21 and removed in 0.23, and the
# scoring script in this notebook also loads the model with standalone joblib.
import joblib

model_path = 'outputs/automl-wine-quality-model.pkl'

# Create the target folder if needed, so this cell does not rely on the
# optional folder-creation cell having been run.
os.makedirs(os.path.dirname(model_path), exist_ok=True)

# Serialize the fitted model; the list of written files is the cell output.
joblib.dump(fitted_model, model_path)

['outputs/automl-wine-quality-model.pkl']

Screenshot of the best model with its run id:

![](https://github.com/DivkovicD/ML-Engineer-w-MS-Azure/blob/master/Screenshots/Screenshot%20of%20the%20best%20model%20(AutoML)%20with%20its%20run%20id.png?raw=true)

## Model Deployment

The result of AutoML run was VotingEnsemble model which achieved higher accuracy in comparison with Hyperparameter tuning approach. The result of VotingEnsemble model is a combined performance of best performing models from AutoML experiment run. Therefore we decide to deploy this model, so the next cell contains the code to register the model, create an inference config and deploy the model as a web service.

In [11]:
# Register the best model that was previously saved
#
from azureml.core.model import Model
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.run import Run

# Upload the saved pickle file and register it in the workspace.
model = Model.register(
    workspace=ws,
    model_path="outputs/automl-wine-quality-model.pkl",
    model_name="automl-wine-quality-model",
    description="AutoML model for prediction of wine quality",
    tags={"VotingEnsemble": "1.0"},
)

# Show name, id and version of the registered model, one per line.
print("\n".join(str(value) for value in (model.name, model.id, model.version)))



Registering model automl-wine-quality-model
automl-wine-quality-model
automl-wine-quality-model:1
1


In [12]:
# Define inference configuration
#
from azureml.core.environment import Environment
from azureml.core.model import InferenceConfig


# Instead of using an environment file, start from the preconfigured
# "AzureML-AutoML" environment.
env = Environment.get(ws, "AzureML-AutoML")

# Add scikit-learn as an extra pip dependency of that environment.
env.python.conda_dependencies.add_pip_package("scikit-learn")

# Pair the scoring entry script with the environment it runs in.
inference_config = InferenceConfig(environment=env,
                                   entry_script='scoring2.py')

In [23]:
# Container sizing for the ACI web service: one CPU core and 1 GB of memory
deployment_config = AciWebservice.deploy_configuration(memory_gb=1, cpu_cores=1)

# Deploy the registered model as a web service, supplying the workspace,
# the service name, the model list and both configurations
service = Model.deploy(
    ws,
    "wine-quality-web-service",
    [model],
    inference_config=inference_config,
    deployment_config=deployment_config,
)

# Block until the deployment has finished, streaming progress to the output
service.wait_for_deployment(show_output=True)

# Record and use the following output for interaction with the deployed service
print(service.scoring_uri)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running.........................................
Succeeded
ACI service creation operation finished, operation "Succeeded"
http://bce1965c-b7a1-42c5-9a6a-f9c94d225770.southcentralus.azurecontainer.io/score


A screenshot showing the model endpoint as active:

![](https://github.com/DivkovicD/ML-Engineer-w-MS-Azure/blob/master/Screenshots/Screenshot%20showing%20model%20endpoint%20as%20active%20-%204%20automl%20notebook.png?raw=true)

In [25]:
# Turns on Application Insights for the deployed service and waits for the
# update to finish. Comment out the lines below and run the cell to leave
# Application Insights disabled.
# NOTE(review): the original note says that "in this case" the script
# score-w-appinsig.py must be used instead of scoring2.py; it is not clear
# from this notebook which case is meant - confirm against the README.
service.update(enable_app_insights=True)
service.wait_for_deployment(True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running....
Succeeded
ACI service creation operation finished, operation "Succeeded"


The cells below contain code that sends a request to the deployed web service. There are several ways to do it. The first demonstrates simple JSON formatting without error catching, the second contains the sample code from the Consume blade of the deployed model, and the third shows how it could be done via the `curl` command.

In [26]:
import requests
import json
from ast import literal_eval
    
# URL for the web service from scoring_uri of the deployed model
scoring_uri = 'http://bce1965c-b7a1-42c5-9a6a-f9c94d225770.southcentralus.azurecontainer.io/score'
## If the service is authenticated, set the key or token and uncomment the line below
# key = '<your key or token>'

# Two sets of data to score, two results back
data = {"data":
        [
            {"fixed acidity": 7.4,
             "volatile acidity": 0.7,
             "citric acid": 0,
             "residual sugar": 1.9,
             "chlorides": 0.076,
             "free sulfur dioxide": 11,
             "total sulfur dioxide": 34,
             "density": 0.9978,
             "pH": 3.51,
             "sulphates": 0.56,
             "alcohol": 9.4
          },
            {"fixed acidity": 11.2,
             "volatile acidity": 0.28,
             "citric acid": 0.56,
             "residual sugar": 1.9,
             "chlorides": 0.075,
             "free sulfur dioxide": 17,
             "total sulfur dioxide": 60,
             "density": 0.998,
             "pH": 3.16,
             "sulphates": 0.58,
             "alcohol": 9.4
          }
        ]
     }

# Convert to JSON string
input_data = json.dumps(data)

# Set appropriate content type
headers = {'Content-Type': 'application/json'}
## Note: If authentication is enabled, set the authorization header, by uncommenting the next line of code
# headers['Authorization'] = f'Bearer {key}'

# Post the formatted request and display the raw response body
response = requests.post(scoring_uri, input_data, headers=headers)
print(response.text)

# An alternative way to display the result: decode the body as JSON.
# json.loads is used rather than ast.literal_eval because the body is JSON,
# not Python source: literal_eval raises on JSON-only tokens such as
# true / false / null.
result = json.loads(response.text)
print(result)

[5, 6]
[5, 6]


In [15]:
# Taken from Consume blade of deployed model
#
import urllib.request
import json
import os
import ssl

def allowSelfSignedHttps(allowed):
    """Optionally switch off server certificate verification on the client side.

    The default HTTPS context factory of the ``ssl`` module is replaced by the
    unverified one only when all of the following hold: ``allowed`` is truthy,
    the ``PYTHONHTTPSVERIFY`` environment variable is unset or empty, and
    ``ssl`` exposes ``_create_unverified_context``. Otherwise nothing changes.

    NOTE: this is a process-wide change to the ``ssl`` module; use it only
    with a scoring service whose certificate is self-signed.
    """
    if not allowed:
        return
    if os.environ.get('PYTHONHTTPSVERIFY', ''):
        return
    if not getattr(ssl, '_create_unverified_context', None):
        return
    ssl._create_default_https_context = ssl._create_unverified_context

# The except clause below names urllib.error.HTTPError, so the submodule is
# imported explicitly instead of relying on urllib.request pulling it in.
import urllib.error

allowSelfSignedHttps(True) # this line is needed if you use self-signed certificate in your scoring service.

# One record to score. Each feature appears exactly once (a repeated
# 'fixed acidity' key was removed; a repeated key in a dict literal is
# silently collapsed, so the payload itself is unchanged).
data = {
    "data":
    [
        {
            'fixed acidity': 7.4,
            'volatile acidity': 0.7,
            'citric acid': 0,
            'residual sugar': 1.9,
            'chlorides': 0.076,
            'free sulfur dioxide': 11,
            'total sulfur dioxide': 34,
            'density': 0.9978,
            'pH': 3.51,
            'sulphates': 0.56,
            'alcohol': 9.4
        },
    ]
}
print(data)
body = str.encode(json.dumps(data))
print(body)
url = 'http://bce1965c-b7a1-42c5-9a6a-f9c94d225770.southcentralus.azurecontainer.io/score'
api_key = '' # Replace this with the API key for the web service and uncomment the following line, while commenting the next one
# headers = {'Content-Type':'application/json', 'Authorization':('Bearer '+ api_key)}
headers = {'Content-Type':'application/json'}

req = urllib.request.Request(url, body, headers)

try:
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(req) as response:
        result = response.read()
    print(result)
except urllib.error.HTTPError as error:
    print("The request failed with status code: " + str(error.code))

    # Print the headers - they include the request ID and the timestamp, which are useful for debugging the failure
    print(error.info())
    print(json.loads(error.read().decode("utf8", 'ignore')))

{'data': [{'fixed acidity': 7.4, 'volatile acidity': 0.7, 'citric acid': 0, 'residual sugar': 1.9, 'chlorides': 0.076, 'free sulfur dioxide': 11, 'total sulfur dioxide': 34, 'density': 0.9978, 'pH': 3.51, 'sulphates': 0.56, 'alcohol': 9.4}]}
b'{"data": [{"fixed acidity": 7.4, "volatile acidity": 0.7, "citric acid": 0, "residual sugar": 1.9, "chlorides": 0.076, "free sulfur dioxide": 11, "total sulfur dioxide": 34, "density": 0.9978, "pH": 3.51, "sulphates": 0.56, "alcohol": 9.4}]}'
b'[5]'


In [16]:
# Sends one record to the deployed web service with the `curl` command line tool.
# Substitute the last line of the cell with scoring_uri for interaction with deployed service
#
from azureml.core.authentication import InteractiveLoginAuthentication

# Obtain an authentication header through interactive login.
# NOTE(review): auth_header is not passed to the curl command below, which
# sends only a Content-Type header - confirm whether it is still needed.
interactive_auth = InteractiveLoginAuthentication()
auth_header = interactive_auth.get_authentication_header()

# POST the JSON payload (a single wine record) to the scoring endpoint.
!curl -X POST \
    -H 'Content-Type':'application/json' \
    -d '{"data":[{"fixed acidity": 7.4, "volatile acidity": 0.7, \
        "citric acid": 0, "residual sugar": 1.9, "chlorides": 0.076, "free sulfur dioxide": 11, \
        "total sulfur dioxide": 34, "density": 0.9978, "pH": 3.51, "sulphates": 0.56, "alcohol": 9.4}]}' \
    http://bce1965c-b7a1-42c5-9a6a-f9c94d225770.southcentralus.azurecontainer.io/score

[5]

The following cell prints out the web service logs and deletes the service and the compute target, thereby releasing the resources to save costs.

In [27]:
# Print the web service logs first, then release the resources by deleting
# the service and the compute target
#
service_logs = service.get_logs()
print(service_logs)
service.delete()
compute_target.delete()

2021-02-16T11:38:16,137168988+00:00 - gunicorn/run 
2021-02-16T11:38:16,137460291+00:00 - rsyslog/run 
2021-02-16T11:38:16,138593903+00:00 - iot-server/run 
/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_09ff55f546b313bb1ab136a466214499/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-02-16T11:38:16,139008608+00:00 - nginx/run 
rsyslogd

Note: The cells below are helper files to be used in case original files are altered.

In [None]:
%%writefile scoring2.py

# Recommended approach
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-and-where?tabs=python
#
# Inspiration for portions of code from https://docs.microsoft.com/en-us/azure/machine-learning/how-to-deploy-advanced-entry-script
# and https://docs.microsoft.com/en-us/azure/machine-learning/tutorial-power-bi-custom-model
#
import json
import pickle
import numpy as np
import pandas as pd
import os
import joblib
from azureml.core.model import Model

from inference_schema.schema_decorators import input_schema, output_schema
from inference_schema.parameter_types.numpy_parameter_type import NumpyParameterType
from inference_schema.parameter_types.pandas_parameter_type import PandasParameterType


def init():
    """Load the pickled model into the module-level ``model`` variable.

    The model file 'automl-wine-quality-model.pkl' is read from the folder
    named by the AZUREML_MODEL_DIR environment variable.
    """
    global model
    model_dir = os.getenv('AZUREML_MODEL_DIR')
    model = joblib.load(os.path.join(model_dir, 'automl-wine-quality-model.pkl'))


# One example record with the eleven feature columns; it is handed to the
# input schema decorator of run() as the sample input.
sample_record = {
    "fixed acidity": 7.4,
    "volatile acidity": 0.7,
    "citric acid": 0,
    "residual sugar": 1.9,
    "chlorides": 0.076,
    "free sulfur dioxide": 11,
    "total sulfur dioxide": 34,
    "density": 0.9978,
    "pH": 3.51,
    "sulphates": 0.56,
    "alcohol": 9.4,
}
input_sample = pd.DataFrame(data=[sample_record])

# Expected result is an integer.
output_sample = np.array([0])

@input_schema('data', PandasParameterType(input_sample))
@output_schema(NumpyParameterType(output_sample))
def run(data):
    """Score ``data`` with the loaded model.

    Prints the columns and type of the input and the prediction result.
    Returns the predictions converted to a list; if anything raises, the
    exception text is returned instead of being propagated.
    """
    try:
        print("Inputs:")
        print(data.columns)
        print(type(data))
        predictions = model.predict(data)
        print("Result:")
        print(predictions)
        return predictions.tolist()
    except Exception as exc:
        return str(exc)