# Automated ML

Import dependencies.

In [1]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails

from pprint import pprint
from azureml.core import Model
from train import clean
from sklearn.model_selection import train_test_split

from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.webservice import AciWebservice

from azureml.automl.runtime.onnx_convert import OnnxConverter
from azureml.automl.core.onnx_convert import OnnxConvertConstants
from azureml.train.automl import constants

import pandas as pd
import azureml
import requests
import joblib
import json
import sys
import os

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


### Workspace and compute cluster configuration

In [2]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'loan-default-automl'

experiment=Experiment(ws, experiment_name)

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = experiment.start_logging()

Workspace name: quick-starts-ws-137409
Azure region: southcentralus
Subscription id: f5091c60-1c3c-430f-8d81-d802f6bf2414
Resource group: aml-quickstarts-137409


In [4]:
cpu_cluster_name = "capstone-compute" 

#verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print(f"Found existing cluster: {cpu_cluster_name} to be used.")
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2', max_nodes=6)

    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)#, min_node_count = 1, timeout_in_minutes = 10)

Found existing cluster: capstone-compute to be used.

Running


## Dataset

### Overview

The dataset used in this project is a loan default prediction dataset. The project entails identifying customers who would either default or not  default after taking a loan credit. This transalates to a binary classification: to default or not to default. Hence, a Logistic regression model would be built using azure's automl funcionality. SKLearn's logistic regression algorithm is a well-known supervised learning approach optimized for dichotomous or binary variables.

In [5]:
# retrieve data from github

url = "https://raw.githubusercontent.com/ChidiNdego/loan-default-capstone-project/master/starter_file/loan_default_prediction.csv"

data = TabularDatasetFactory.from_delimited_files(url)

In [6]:
# use the clean function to polish the data.
x, y = clean(data)

# add cleaned target column to cleaned predictor variables
train_data = pd.concat([x,y],axis=1)
train_data.head()

Unnamed: 0,Column1,firstPaymentDefault,firstPaymentRatio,max_amount_taken,max_tenor_taken,loanAmount,interestRate,clientIncome,clientAge,clientNumberPhoneCOntacts,...,clientLoanPurpose_education,clientLoanPurpose_house,clientLoanPurpose_medical,clientLoanPurpose_other,clientResidentialStauts_Family Owned,clientResidentialStauts_Own Residence,clientResidentialStauts_Rented,clientResidentialStauts_Temp Residence,incomeVerified_True,loanDefault
0,0,0,0.0,1,1,101500,6.0,133613.31,50,1013.0,...,1,0,0,0,0,1,0,0,1,0
1,1,0,0.0,1,1,61000,12.5,160995.02,36,649.0,...,0,1,0,0,0,0,1,0,1,1
2,2,1,0.0,0,0,23000,10.0,140000.0,34,1171.0,...,0,0,0,0,0,0,1,0,0,1
3,3,0,0.0,0,0,46000,12.5,100000.0,35,568.0,...,0,0,0,0,0,0,1,0,1,0
4,4,0,0.0,0,1,27000,12.5,52500.0,24,500.0,...,0,0,0,0,1,0,0,0,0,0


In [7]:
train, test = train_test_split(train_data, test_size=0.3)

In [8]:
# get default datastore
default_ds = ws.get_default_datastore()

# register the dataset
dataSet = TabularDatasetFactory.register_pandas_dataframe(train_data,target=(default_ds,'AutoMLData'),name='AutoML_data',show_progress=True)

automl_data = ws.datasets.get('AutoML_data')

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to AutoMLData/ee620830-ab7c-4f9d-a982-b3fb5779e73f/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


## AutoML Configuration

In [9]:
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 5,
    "n_cross_validations": 5,
    "primary_metric" : 'accuracy'
}

automl_config = AutoMLConfig(compute_target=cpu_cluster,
                             task = "classification",
                             training_data=automl_data,
                             label_column_name="loanDefault",   
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             enable_onnx_compatible_models=True,
                             **automl_settings)

### Reasons for automl settings and configuration

*   `"experiment_timeout_minutes": 30`: Maximum amount of time (in minutes) to complete training itertions. Set at 30 minutes because dataset has over 10,000 entries.
*   `"max_concurrent_iterations": 5`: Maximum number of iterations that can be executed simultaneously. Advisably, this value should be less than the number of compute cluster node.
*   `"n_cross_validations": 5`: Cross validation is a model validation technique used to reduce overfitting. `n` is the number of training examples.
*   `"primary_metric" : 'Accuracy'`: This parameter determines the metric to be used during model training for optimization.
*   `compute_target=cpu_cluster`: This points to the compute cluster configuration created earlier.
*   `task = "classification"`: The problem in view is a classification task.
*   `training_data=automl_data`: Specifies the dataset to be used: an external dataset already registered in azure datastore.
*   `label_column_name="loanDefault"`: Specifies the dependent variable to be predicted.
*   `enable_early_stopping=True`: Allows for an early stopping rule to be applied.
*   `featurization= 'auto'`: Allows azure to automatically perform feature engineering.
*   `enable_onnx_compatible_models=True`: Allows for model to be converted to onnx format.

In [10]:
# submit your experiment
print('Submitting AutoML experiment...')
remote_run = experiment.submit(automl_config)

Submitting AutoML experiment...
Running on remote.


## Run Details

Use the `RunDetails` widget to show the different experiments.

In [11]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.m

{'runId': 'AutoML_455d9474-2160-4678-8424-d7f42202d42a',
 'target': 'capstone-compute',
 'status': 'Completed',
 'startTimeUtc': '2021-02-06T05:38:14.530719Z',
 'endTimeUtc': '2021-02-06T06:04:19.657632Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'capstone-compute',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"ce7a3ee3-3c40-421f-97a9-0551e141a14b\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"AutoMLData/ee620830-ab7c-4f9d-a982-b3fb5779e73f/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-137409\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"f5091c60-1c3c-430

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

`RandomForest` is an ensemble of Decision trees. Random Forests performs Bagging internally. It creates several trees and calculates the best possible model for a given dataset. Instead of considering all features while splitting a node, Random Forest algorithm selects the best feature out of a subset of all features. This trades a higher bias for lower variance, which potentially yields a very good model.

`XGBoostClassifier` is a powerful gradient boosting method that assigns positive and negative values to every decision made. All Trees are weak learners and provide decisions slightly better than a random guess. But collectively averaged out, XGBoost performs really well.

`VotingEnsemble` which happens to be the best model from the run, uses diverse algorithms and then ensemble them to predict the final output. Say, you use a Random Forest Classifier, XGBoost Classifier, SVM Classifier, Linear Regression etc.; models are pitted against each other and selected upon best performance by voting using the VotingClassifier Class from sklearn.ensemble. This probably explains why it turned out to be the best.

## Best Model

Get the best model from the automl experiments and display all the properties of the model.



In [12]:
# retrieve the best automl model

best_automl_run, fitted_model = remote_run.get_output()
print(best_automl_run)

# returns the various metric values for the best run
best_run_metrics = best_automl_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print('{}: {}'.format(metric_name, metric))

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Run(Experiment: loan-default-automl,
Id: AutoML_455d9474-2160-4678-8424-d7f42202d42a_38,
Type: azureml.scriptrun,
Status: Completed)
recall_score_micro: 0.8522456906701731
average_precision_score_macro: 0.8740168864608657
norm_macro_recall: 0.5764955224582013
f1_score_micro: 0.8522456906701731
f1_score_weighted: 0.8472255312055694
AUC_macro: 0.8758648878830533
AUC_micro: 0.9263170862360457
precision_score_micro: 0.8522456906701731
accuracy: 0.8522456906701731
precision_score_weighted: 0.8481184871036564
balanced_accuracy: 0.7882477612291008
matthews_correlation: 0.6172484642540191
recall_score_weighted: 0.8522456906701731
f1_score_macro: 0.8049526241203928
average_precision_score_micro: 0.9247367438706243
average_precision_score_weighted: 0.8978189301427075
AUC_weighted: 0.8758648878810564
log_loss: 0.34408592564745194
weighted_accuracy: 0.8954699918850606
precision_score_macro: 0.8304634446491308
recall_score_macro: 0.7882477612291008
confusion_matrix: aml://artifactId/ExperimentRun/d

In [13]:
# parameters of the best model
def print_model(model, prefix=""):
    for step in model.steps:
        print(prefix + step[0])
        if hasattr(step[1], 'estimators') and hasattr(step[1], 'weights'):
            pprint({'estimators': list(
                e[0] for e in step[1].estimators), 'weights': step[1].weights})
            print()
            for estimator in step[1].estimators:
                print_model(estimator[1], estimator[0] + ' - ')
        else:
            pprint(step[1].get_params())
            print()

print_model(fitted_model)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['1', '0', '19', '6', '9', '35', '14', '28'],
 'weights': [0.2, 0.1, 0.1, 0.1, 0.2, 0.1, 0.1, 0.1]}

1 - maxabsscaler
{'copy': True}

1 - xgboostclassifier
{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': nan,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'tree_method': 'auto',


In [14]:
# register the model
bestModel = best_automl_run.register_model(model_path='outputs/model.pkl', model_name='model_automl',
                        tags={'Training context':'Auto ML'},
                        properties={'Accuracy': best_run_metrics['accuracy']})

# list the registered trained models
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')

model_automl version: 1
	 Training context : Auto ML
	 Accuracy : 0.8522456906701731


HypDriveBestModel version: 1
	 Accuracy : 0.7167860648718645
	 Regularization Strength (C) : 2.0
	 Maximum Iterations (max_iter) : 150




Out of the two models generated, the automl appears to be the best with an Accuracy of `85.22%`. Hence, the automl model is the chosen model for deployment.

However, before deployment let's export or model with ONNX - Open Neural Network Exchange. This model format allows for conversion of models from one framework to another, or even to deploy trained models to devices such as iOS or Android devices.

## Get model with ONNX

In [15]:
best_run , onnx_model = remote_run.get_output(return_onnx_model=True)
onnx_path = "./best_model.onnx"
OnnxConverter.save_onnx_model(onnx_model, onnx_path)

In [16]:
if sys.version_info < OnnxConvertConstants.OnnxIncompatiblePythonVersion:
    python_version_compatible = True
else:
    python_version_compatible = False

import onnxruntime
from azureml.automl.runtime.onnx_convert import OnnxInferenceHelper

def get_onnx_res(run):
    res_path = 'onnx_resource.json'
    run.download_file(name=constants.MODEL_RESOURCE_PATH_ONNX, output_file_path=res_path)
    with open(res_path) as f:
        onnx_res = json.load(f)
    return onnx_res

if python_version_compatible:
    model_bytes = onnx_model.SerializeToString()
    onnx_res = get_onnx_res(best_run)

    onnxrt_helper = OnnxInferenceHelper(model_bytes, onnx_res)
    pred_onnx, pred_prob_onnx = onnxrt_helper.predict(test)

    print(pred_onnx)
    print(pred_prob_onnx)
else:
    print('Please use an updated version of Python to run the inference helper.')

[1 1 1 ... 1 1 1]
[[0.31244847 0.68755156]
 [0.3004808  0.6995192 ]
 [0.290097   0.709903  ]
 ...
 [0.17100433 0.8289957 ]
 [0.29687423 0.70312583]
 [0.20969594 0.79030406]]


## Model Deployment

Deploying the best model: AutoML.

Register the model, create an inference config and deploy the model as a web service.

In [17]:
# download scoring file 
best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')

# download environment file
best_automl_run.download_file('outputs/conda_env_v_1_0_0.yml', 'envFile.yml')

In [18]:
# create an inference configuration

inference_config = InferenceConfig(entry_script='score.py',
                                    environment=best_automl_run.get_environment())

# deploying model

deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1, auth_enabled=True)
service = Model.deploy(workspace = ws, 
                    name = "deployed-best-model", 
                    models = [bestModel], 
                    inference_config = inference_config, 
                    deployment_config = deployment_config,
                    overwrite=True)

In [19]:
service

AciWebservice(workspace=Workspace.create(name='quick-starts-ws-137409', subscription_id='f5091c60-1c3c-430f-8d81-d802f6bf2414', resource_group='aml-quickstarts-137409'), name=deployed-best-model, image_id=None, compute_type=None, state=ACI, scoring_uri=Transitioning, tags=None, properties={}, created_by={})

In [20]:
service.wait_for_deployment(show_output = True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running...............................................................................
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [21]:
# return service information to be used for model consumption

print(f"Service state: {service.state}")
print(f"Scoring URI : {service.scoring_uri}")
print(f"Swagger URI: {service.swagger_uri}")
print(f"Primary key: {service.get_keys()[0]}")

Service state: Healthy
Scoring URI : http://d4aa7520-9f53-427a-9f96-7d7ccb145c0e.southcentralus.azurecontainer.io/score
Swagger URI: http://d4aa7520-9f53-427a-9f96-7d7ccb145c0e.southcentralus.azurecontainer.io/swagger.json
Primary key: dzQpVmOR79g0kDN3K2As1sgjqTRIn5iq


#### Enable logging

Logging is a core pillar of MLOps. It gives information on how the deployed model is behaving.

In [22]:
!python3 logs.py

2021-02-06T06:18:12,802852147+00:00 - iot-server/run 
2021-02-06T06:18:12,805063354+00:00 - gunicorn/run 
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-02-06T06:18:12,807857388+00:00 - nginx/run 
2021-02-06T06:18:12,806434719+00:00 - rsyslog/run 

#### Consume deployed services

Send a request to the web service you deployed to test it.

In [58]:
# select sample data
x_samp = train_data.sample(4) # data is the pandas dataframe of the original data
y_samp = x_samp.pop('loanDefault')

# convert data samples to json format
sample = json.dumps({'data': x_samp.to_dict(orient='records')})

print(sample)

{"data": [{"Column1": 140887, "firstPaymentDefault": 0, "firstPaymentRatio": 0.0, "max_amount_taken": 0, "max_tenor_taken": 1, "loanAmount": 45000, "interestRate": 5.0, "clientIncome": 125000.0, "clientAge": 41, "clientNumberPhoneCOntacts": 3013.0, "clientAvgCallsPerDay": 18.680497925311204, "loanNumber": 14, "clientGender_MALE": 1, "clientMaritalStatus_Married": 1, "clientMaritalStatus_Separated": 0, "clientMaritalStatus_Single": 0, "clientMaritalStatus_Widowed": 0, "clientLoanPurpose_education": 0, "clientLoanPurpose_house": 1, "clientLoanPurpose_medical": 0, "clientLoanPurpose_other": 0, "clientResidentialStauts_Family Owned": 0, "clientResidentialStauts_Own Residence": 0, "clientResidentialStauts_Rented": 1, "clientResidentialStauts_Temp Residence": 0, "incomeVerified_True": 1}, {"Column1": 157934, "firstPaymentDefault": 1, "firstPaymentRatio": 0.0, "max_amount_taken": 1, "max_tenor_taken": 1, "loanAmount": 34500, "interestRate": 12.5, "clientIncome": 17500.0, "clientAge": 29, "cli

Copy sample data, scoring uri, and the service's primary key and update in the `endpoint.py` script before running the cell below.

In [59]:
# response from model endpoint
!python3 endpoint.py

{"result": [0, 1, 1, 1]}


In [60]:
# print original labels
print(y_samp)

140887    0
157934    1
23515     1
5117      1
Name: loanDefault, dtype: int64


Evidently, the model endpoint rightly returns the expected output.

Print the logs of the web service and delete the service

In [44]:
# web service log

print(service.get_logs())

/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-02-06T06:21:34,307110464+00:00 - nginx/run 
2021-02-06T06:21:34,307162366+00:00 - rsyslog/run 
2021-02-06T06:21:34,305457990+00:00 - gunicorn/run 
2021-02-06T06:21:34,307659788+00:00 - iot-server/run 
rsyslogd

In [61]:
# delete the service

service.delete()