# Automated ML

Import Dependencies.

In [20]:
from azureml.core import Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

from azureml.data.dataset_factory import TabularDatasetFactory
from azureml.train.automl import AutoMLConfig
from azureml.widgets import RunDetails

from pprint import pprint
from azureml.core import Model
from train import clean

from azureml.core.model import InferenceConfig
from azureml.core.environment import Environment
from azureml.core.webservice import AciWebservice

import azureml
import pandas as pd
import os
import json
import requests
import joblib

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.20.0


### Workspace and compute cluster configuration

In [2]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'loan-default-automl'

experiment=Experiment(ws, experiment_name)

print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep = '\n')

run = experiment.start_logging()

Workspace name: quick-starts-ws-137363
Azure region: southcentralus
Subscription id: aa7cf8e8-d23f-4bce-a7b9-1f0b4e0ac8ee
Resource group: aml-quickstarts-137363


In [4]:
cpu_cluster_name = "capstone-compute" 

#verify that cluster does not exist already
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print(f"Found existing cluster: {cpu_cluster_name} to be used.")
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_DS12_V2', max_nodes=6)

    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)#, min_node_count = 1, timeout_in_minutes = 10)

Found existing cluster: capstone-compute to be used.

Running


## Dataset

### Overview

The dataset used in this project is a loan default prediction dataset. The project entails identifying customers who would either default or not  default after taking a loan credit. This transalates to a binary classification: to default or not to default. Hence, a Logistic regression model would be built using azure's automl funcionality. SKLearn's logistic regression algorithm is a well-known supervised learning approach optimized for dichotomous or binary variables.

In [5]:
# retrieve data from github

url = "https://raw.githubusercontent.com/ChidiNdego/loan-default-capstone-project/master/starter_file/loan_default_prediction.csv"
data = TabularDatasetFactory.from_delimited_files(url)

In [22]:
# Use the clean_data function to clean your data.
x, y = clean(data)

import pandas as pd
# Add cleaned target column to cleaned predictor variables
train_data = pd.concat([x,y],axis=1)
train_data.head()

Unnamed: 0,firstPaymentDefault,firstPaymentRatio,max_amount_taken,max_tenor_taken,loanAmount,interestRate,clientIncome,clientAge,clientNumberPhoneCOntacts,clientAvgCallsPerDay,...,clientLoanPurpose_education,clientLoanPurpose_house,clientLoanPurpose_medical,clientLoanPurpose_other,clientResidentialStauts_Family Owned,clientResidentialStauts_Own Residence,clientResidentialStauts_Rented,clientResidentialStauts_Temp. Residence,incomeVerified_True,loanDefault
0,0,0.0,1,1,101500,6.0,133613.31,50,1013.0,52.45,...,1,0,0,0,0,1,0,0,1,0
1,0,0.0,1,1,61000,12.5,160995.02,36,649.0,4.67,...,0,1,0,0,0,0,1,0,1,1
2,1,0.0,0,0,23000,10.0,140000.0,34,1171.0,47.17,...,0,0,0,0,0,0,1,0,0,1
3,0,0.0,0,0,46000,12.5,100000.0,35,568.0,127.38,...,0,0,0,0,0,0,1,0,1,0
4,0,0.0,0,1,27000,12.5,52500.0,24,500.0,0.0,...,0,0,0,0,1,0,0,0,0,0


In [7]:
# Get default datastore
default_ds = ws.get_default_datastore()

# Register the dataset with name 'AutoML_data'
dataSet = TabularDatasetFactory.register_pandas_dataframe(train_data,target=(default_ds,'AutoMLData'),name='AutoML_data',show_progress=True)

automl_data = ws.datasets.get('AutoML_data')

Method register_pandas_dataframe: This is an experimental method, and may change at any time.<br/>For more information, see https://aka.ms/azuremlexperimental.


Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to AutoMLData/bb36e146-72e3-40e1-a06f-2fa645bf20af/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


## AutoML Configuration

In [8]:
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 5,
    "n_cross_validations": 5,
    "primary_metric" : 'accuracy'
}

automl_config = AutoMLConfig(compute_target=cpu_cluster,
                             task = "classification",
                             training_data=automl_data,
                             label_column_name="loanDefault",   
                             enable_early_stopping= True,
                             featurization= 'auto',
                             debug_log = "automl_errors.log",
                             enable_onnx_compatible_models=True,
                             **automl_settings)

### Reasons for automl settings and configuration

*   `"experiment_timeout_minutes": 30`: Maximum amount of time (in minutes) to complete training itertions. Set at 30 minutes because dataset has over 10,000 entries.
*   `"max_concurrent_iterations": 5`: Maximum number of iterations that can be executed simultaneously. Advisably, this value should be less than the number of compute cluster node.
*   `"n_cross_validations": 5`: Cross validation is a model validation technique used to reduce overfitting. `n` is the number of training examples.
*   `"primary_metric" : 'Accuracy'`: This parameter determines the metric to be used during model training for optimization.
*   `compute_target=cpu_cluster`: This points to the compute cluster configuration created earlier.
*   `task = "classification"`: The problem in view is a classification task.
*   `training_data=automl_data`: Specifies the dataset to be used: an external dataset already registered in azure datastore.
*   `label_column_name="loanDefault"`: Specifies the dependent variable to be predicted.
*   `enable_early_stopping=True`: Allows for an early stopping rule to be applied.
*   `featurization= 'auto'`: Allows azure to automatically perform feature engineering.

In [9]:
# Submit your experiment
print('Submitting AutoML experiment...')
remote_run = experiment.submit(automl_config)

Submitting AutoML experiment...
Running on remote.


## Run Details

Use the `RunDetails` widget to show the different experiments.

In [10]:
RunDetails(remote_run).show()
remote_run.wait_for_completion(show_output=True)

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…


Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.m

{'runId': 'AutoML_b06b1d76-d0fb-4280-9ed4-7c85891cf392',
 'target': 'capstone-compute',
 'status': 'Completed',
 'startTimeUtc': '2021-02-05T19:41:31.139406Z',
 'endTimeUtc': '2021-02-05T20:08:39.795781Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'capstone-compute',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"c256afe6-a24d-43e3-af7c-4999da1b8cbd\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"AutoMLData/bb36e146-72e3-40e1-a06f-2fa645bf20af/\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"aml-quickstarts-137363\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"aa7cf8e8-d23f-4bc

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [14]:
# Retrieve the best automl model

best_automl_run, fitted_model = remote_run.get_output()
print(best_automl_run)

#Returns the various metric values for the best run
best_run_metrics = best_automl_run.get_metrics()
for metric_name in best_run_metrics:
    metric = best_run_metrics[metric_name]
    print('{}: {}'.format(metric_name, metric))

Package:azureml-automl-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-core, training version:1.21.0.post1, current version:1.20.0
Package:azureml-dataprep, training version:2.8.2, current version:2.7.3
Package:azureml-dataprep-native, training version:28.0.0, current version:27.0.0
Package:azureml-dataprep-rslex, training version:1.6.0, current version:1.5.0
Package:azureml-dataset-runtime, training version:1.21.0, current version:1.20.0
Package:azureml-defaults, training version:1.21.0, current version:1.20.0
Package:azureml-interpret, training version:1.21.0, current version:1.20.0
Package:azureml-pipeline-core, training version:1.21.0, current version:1.20.0
Package:azureml-telemetry, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-client, training version:1.21.0, current version:1.20.0
Package:azureml-train-automl-runtime, training version:1.21.0, current version:1.20.0


Run(Experiment: loan-default-automl,
Id: AutoML_b06b1d76-d0fb-4280-9ed4-7c85891cf392_38,
Type: azureml.scriptrun,
Status: Completed)
balanced_accuracy: 0.7776959877910882
recall_score_micro: 0.852114099159676
matthews_correlation: 0.6138337521086046
f1_score_macro: 0.7996336943774203
average_precision_score_weighted: 0.8971926077551393
precision_score_macro: 0.8393293616209618
AUC_micro: 0.9257051901321102
f1_score_weighted: 0.8447553802786739
log_loss: 0.35610441546303195
f1_score_micro: 0.8521140991596757
precision_score_weighted: 0.8491594340781198
norm_macro_recall: 0.5553919755821762
accuracy: 0.852114099159676
recall_score_weighted: 0.852114099159676
average_precision_score_macro: 0.8735095045774248
weighted_accuracy: 0.9023908783053486
recall_score_macro: 0.7776959877910882
AUC_weighted: 0.8752435353193716
average_precision_score_micro: 0.9225807165012195
precision_score_micro: 0.852114099159676
AUC_macro: 0.8752435346723424
confusion_matrix: aml://artifactId/ExperimentRun/dcid.

In [12]:
# Parameters of the best model
def print_model(model, prefix=""):
    for step in model.steps:
        print(prefix + step[0])
        if hasattr(step[1], 'estimators') and hasattr(step[1], 'weights'):
            pprint({'estimators': list(
                e[0] for e in step[1].estimators), 'weights': step[1].weights})
            print()
            for estimator in step[1].estimators:
                print_model(estimator[1], estimator[0] + ' - ')
        else:
            pprint(step[1].get_params())
            print()

print_model(fitted_model)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['0',
                '1',
                '8',
                '17',
                '9',
                '6',
                '22',
                '26',
                '28',
                '10',
                '29',
                '2'],
 'weights': [0.13333333333333333,
             0.2,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.06666666666666667,
             0.066666666

In [13]:
# Register model
bestModel = best_automl_run.register_model(model_path='outputs/model.pkl', model_name='model_automl',
                        tags={'Training context':'Auto ML'},
                        properties={'Accuracy': best_run_metrics['accuracy']})

# List registered models
for model in Model.list(ws):
    print(model.name, 'version:', model.version)
    for tag_name in model.tags:
        tag = model.tags[tag_name]
        print ('\t',tag_name, ':', tag)
    for prop_name in model.properties:
        prop = model.properties[prop_name]
        print ('\t',prop_name, ':', prop)
    print('\n')

model_automl version: 1
	 Training context : Auto ML
	 Accuracy : 0.852114099159676




## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [39]:
# download scoring file 
best_automl_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')

# download environment file
best_automl_run.download_file('outputs/conda_env_v_1_0_0.yml', 'envFile.yml')

TODO: In the cell below, send a request to the web service you deployed to test it.

In [40]:
inference_config = InferenceConfig(entry_script='score.py',
                                    environment=best_automl_run.get_environment())

# deploying model

deployment_config = AciWebservice.deploy_configuration(cpu_cores = 1, memory_gb = 1,auth_enabled=True)
service = Model.deploy(workspace = ws, 
                    name = "deployed-best-model", 
                    models = [bestModel], 
                    inference_config = inference_config, 
                    deployment_config = deployment_config,
                    overwrite=True)

In [41]:
service

AciWebservice(workspace=Workspace.create(name='quick-starts-ws-137363', subscription_id='aa7cf8e8-d23f-4bce-a7b9-1f0b4e0ac8ee', resource_group='aml-quickstarts-137363'), name=deployed-best-model, image_id=None, compute_type=None, state=ACI, scoring_uri=Healthy, tags=http://7cbe51b8-7a22-4143-994a-5a1a2a68ce63.southcentralus.azurecontainer.io/score, properties=None, created_by={})

In [42]:
service.wait_for_deployment(show_output = True)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running....
Succeeded
ACI service creation operation finished, operation "Succeeded"


In [43]:
print(f"Service state: {service.state}")
print(f"Scoring URI : {service.scoring_uri}")
print(f"Swagger URI: {service.swagger_uri}")

Service state: Healthy
Scoring URI : http://7cbe51b8-7a22-4143-994a-5a1a2a68ce63.southcentralus.azurecontainer.io/score
Swagger URI: http://7cbe51b8-7a22-4143-994a-5a1a2a68ce63.southcentralus.azurecontainer.io/swagger.json


In [44]:
service.get_keys()[0]

'dhUJHew1KNYgHUIfqnUcCDem1fDhpJv2'

In [38]:
### Enabling logging
!python3 logs.py

2021-02-05T20:47:42,682353549+00:00 - iot-server/run 
2021-02-05T20:47:42,683705314+00:00 - nginx/run 
2021-02-05T20:47:42,682783470+00:00 - gunicorn/run 
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libcrypto.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
/usr/sbin/nginx: /azureml-envs/azureml_20a8278aa8b20dd48cc50f56a6d2586c/lib/libssl.so.1.0.0: no version information available (required by /usr/sbin/nginx)
2021-02-05T20:47:42,686571552+00:00 - rsyslog/run 

In [50]:
# select sample data
x_samp = train_data.sample(4) # data is the pandas dataframe of the original data
y_samp = x_samp.pop('loanDefault')

# convert data samples to json format
sample = json.dumps({'data': x_samp.to_dict(orient='records')})

print(sample)

{"data": [{"firstPaymentDefault": 1, "firstPaymentRatio": 0.9487179487179488, "max_amount_taken": 0, "max_tenor_taken": 1, "loanAmount": 15000, "interestRate": 15.0, "clientIncome": 35000.0, "clientAge": 30, "clientNumberPhoneCOntacts": 685.0, "clientAvgCallsPerDay": 45.0, "loanNumber": 2, "clientGender_MALE": 1, "clientMaritalStatus_Married": 0, "clientMaritalStatus_Separated": 0, "clientMaritalStatus_Single": 1, "clientMaritalStatus_Widowed": 0, "clientLoanPurpose_education": 0, "clientLoanPurpose_house": 1, "clientLoanPurpose_medical": 0, "clientLoanPurpose_other": 0, "clientResidentialStauts_Family Owned": 0, "clientResidentialStauts_Own Residence": 0, "clientResidentialStauts_Rented": 1, "clientResidentialStauts_Temp. Residence": 0, "incomeVerified_True": 0}, {"firstPaymentDefault": 0, "firstPaymentRatio": 0.0, "max_amount_taken": 1, "max_tenor_taken": 1, "loanAmount": 37500, "interestRate": 12.5, "clientIncome": 210000.0, "clientAge": 35, "clientNumberPhoneCOntacts": 2743.0, "cli

In [None]:
# Used for http post request

# Set the content type
headers = {'Content-type': 'application/json'}

response = requests.post(service.scoring_uri, sample, headers=headers)

TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
print(response.text)

In [None]:
# Print results from the inference
print(response.text)
print(response.status_code)
print(response.elapsed)
print(response.json())

In [None]:
# Print original labels
print(y_samp)

In [None]:
print(service.get_logs())

In [None]:
service.delete()