# Automated ML


In [30]:
import logging
import os
import csv

from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets
import pkg_resources

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.train.automl import AutoMLConfig
from azureml.core.dataset import Dataset

from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails

from azureml.pipeline.steps import AutoMLStep

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)

SDK version: 1.34.0


## Dataset

### Overview


In [11]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')

# choose a name for experiment
experiment_name = 'automl-heart-failure-prediction'

experiment=Experiment(ws, experiment_name)
experiment

quick-starts-ws-159882
aml-quickstarts-159882
southcentralus
d4ad7261-832d-46b2-b093-22156001df5b


Name,Workspace,Report Page,Docs Page
automl-heart-failure-prediction,quick-starts-ws-159882,Link to Azure Machine Learning studio,Link to Documentation


### Set up compute target

In [12]:
amlcompute_cluster_name = "heart-compute"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=amlcompute_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',max_nodes=4)
    compute_target = ComputeTarget.create(ws, amlcompute_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)


InProgress....
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

In [14]:
found = False
key = "heart-failure-dataset"

if key in ws.datasets.keys(): 
        found = True
        dataset = ws.datasets[key]
        
if not found:
    print("Dataset not found, please upload it!")
    
df = dataset.to_pandas_dataframe()
df.describe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
count,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0,299.0
mean,60.833893,0.431438,581.839465,0.41806,38.083612,0.351171,263358.029264,1.39388,136.625418,0.648829,0.32107,130.26087,0.32107
std,11.894809,0.496107,970.287881,0.494067,11.834841,0.478136,97804.236869,1.03451,4.412477,0.478136,0.46767,77.614208,0.46767
min,40.0,0.0,23.0,0.0,14.0,0.0,25100.0,0.5,113.0,0.0,0.0,4.0,0.0
25%,51.0,0.0,116.5,0.0,30.0,0.0,212500.0,0.9,134.0,0.0,0.0,73.0,0.0
50%,60.0,0.0,250.0,0.0,38.0,0.0,262000.0,1.1,137.0,1.0,0.0,115.0,0.0
75%,70.0,1.0,582.0,1.0,45.0,1.0,303500.0,1.4,140.0,1.0,1.0,203.0,1.0
max,95.0,1.0,7861.0,1.0,80.0,1.0,850000.0,9.4,148.0,1.0,1.0,285.0,1.0


In [19]:
set(df)
df['DEATH_EVENT'].dtype

dtype('int64')

In [25]:
df.info

<bound method DataFrame.info of       age  anaemia  creatinine_phosphokinase  diabetes  ejection_fraction  \
0    75.0        0                       582         0                 20   
1    55.0        0                      7861         0                 38   
2    65.0        0                       146         0                 20   
3    50.0        1                       111         0                 20   
4    65.0        1                       160         1                 20   
..    ...      ...                       ...       ...                ...   
294  62.0        0                        61         1                 38   
295  55.0        0                      1820         0                 38   
296  45.0        0                      2060         1                 60   
297  45.0        0                      2413         0                 38   
298  50.0        0                       196         0                 45   

     high_blood_pressure  platelets  serum_

## AutoML Configuration

Code for Auto ML settings : 
- **experiment_timeout_minutes** - to determine the time duration for an experiment
- **max_iterations** - to determine the maximum number of iterations for model runs
- **max_concurrent_iterations** - to determine the number of parallel runs
- **n_cross_validations** - for model evaluation
- **primary_metric** - model metric to optimise for (auc_roc since the the data s imabalanced)

In [26]:

automl_settings = {
    "experiment_timeout_minutes": 15,
    "iterations": 40,
    "max_concurrent_iterations": 4,
    "n_cross_validations": 5,
    "primary_metric" : 'accuracy'
}


automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="DEATH_EVENT",
                             enable_early_stopping= True,
                             debug_log = "automl_errors.log",
                             **automl_settings
                            )

In [27]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config, show_output = True)

Submitting remote run.
No run_configuration provided, running on heart-compute with default configuration
Running on remote compute: heart-compute


Experiment,Id,Type,Status,Details Page,Docs Page
automl-heart-failure-prediction,AutoML_fa14536e-4625-40e7-a2bc-5285f16808f0,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [31]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [32]:
remote_run.wait_for_completion(show_output=True)

Experiment,Id,Type,Status,Details Page,Docs Page
automl-heart-failure-prediction,AutoML_fa14536e-4625-40e7-a2bc-5285f16808f0,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation




****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

****************************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more abo

{'runId': 'AutoML_fa14536e-4625-40e7-a2bc-5285f16808f0',
 'target': 'heart-compute',
 'status': 'Completed',
 'startTimeUtc': '2021-10-03T10:30:31.580956Z',
 'endTimeUtc': '2021-10-03T10:59:05.988934Z',
 'services': {},
 'properties': {'num_iterations': '40',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'heart-compute',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"a7398654-f59c-416f-8798-c3bd9eaeb682\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': 'False',
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.34.0", "azureml-train": "1.34.0", "azureml-train-restclients-hyperdrive": "1.34.0", "azureml-train-core": "1.34.0", "azureml-train-automl": "1.34.0", "azureml-train-automl-runtime": "1.34.0", "azureml-train-automl-c

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [35]:
best_run, fitted_model = remote_run.get_output()

In [36]:
best_metrics = best_run.get_metrics()

print("Best Run Id : {}".format(best_run.id), 
      "Accuracy : {}".format(best_metrics['accuracy']), 
      "Best metrics : {}".format(best_metrics), 
      "Best model : {}".format(fitted_model), sep = '\n')

Best Run Id : AutoML_fa14536e-4625-40e7-a2bc-5285f16808f0_38
Accuracy : 0.8797175141242939
Best metrics : {'precision_score_binary': 0.8893152613976412, 'AUC_weighted': 0.9080679448135843, 'recall_score_binary': 0.7541666666666667, 'precision_score_macro': 0.8868283836983851, 'log_loss': 0.3844556178799626, 'AUC_macro': 0.9080679448135843, 'precision_score_weighted': 0.8978397339447616, 'f1_score_weighted': 0.8755182681309363, 'f1_score_macro': 0.8550784356251512, 'average_precision_score_macro': 0.8890097891117617, 'f1_score_micro': 0.8797175141242939, 'balanced_accuracy': 0.8533333333333333, 'norm_macro_recall': 0.7066666666666667, 'matthews_correlation': 0.737022258585793, 'recall_score_weighted': 0.8797175141242939, 'AUC_micro': 0.9122754476682946, 'precision_score_micro': 0.8797175141242939, 'recall_score_macro': 0.8533333333333333, 'weighted_accuracy': 0.8973107278267498, 'f1_score_binary': 0.798097131120387, 'average_precision_score_weighted': 0.9159557818244755, 'recall_score_m

In [37]:
print(fitted_model._final_estimator)

PreFittedSoftVotingClassifier(
    estimators=[('36', Pipeline(
        memory=None,
        steps=[('sparsenormalizer', Normalizer(
            copy=True,
            norm='l1'
        )), ('xgboostclassifier', XGBoostClassifier(
            random_state=0,
            n_jobs=1,
            problem_info=ProblemInfo(
                gpu_training_param_dict={'processing_unit_type': 'cpu'}
            ),
            booster='gbtree',
            colsample_bylevel=1,
            colsample_bytree=1,
            eta=0.5,
            gamma=0.01,
            max_depth=7,
            max_leaves=15,
            n_estimators=100,
            objective='reg:logistic',
            reg_alpha=0.5208333333333334,
            reg_lambda=2.291666666666667,
            subsample=1,
            tree_method='auto'
        ))],
        verbose=False
    )), ('37', Pipeline(
        memory=None,
        steps=[('sparsenormalizer', Normalizer(
            copy=True,
            norm='l2'
        )), ('xgboos

In [None]:
#TODO: Save the best model
model = best_run.register_model(model_name = "heart-failure-predict-best-model-automl", model_path = './outputs/model.pkl')
print(model)

In [None]:
for model in Model.list(ws):
    print(model.name, 'version:', model.version)

## Model Deployment

Remember you have to deploy only one of the two models you trained but you still need to register both the models. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [None]:
#Save the best model for the deployement
import os
os.makedirs('./aml-model', exist_ok=True)

best_run.download_file('/outputs/model.pkl',os.path.join('./aml-model','heart-failure-predict-best-model-automl.pkl'))

for f in best_run.get_file_names():
    if f.startswith('outputs'):
        output_file_path = os.path.join('./aml-model', f.split('/')[-1])
        print(f'Downloading from {f} to {output_file_path} ...')
        best_run.download_file(name=f, output_file_path=output_file_path)

In [None]:
model=best_run.register_model(
            model_name = 'heart-failure-best-model-automl', 
            model_path = './outputs/model.pkl'
)

## Set up the environment

In [None]:
best_run.download_file('outputs/conda_env_v_1_0_0.yml', 'conda_env.yml')
environment = Environment.from_conda_specification(name = 'heart-failure-env', file_path = 'conda_env.yml')

## Download the scoring file

TODO: In the cell below, send a request to the web service you deployed to test it.

In [None]:
best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')

## Inference Config

In [None]:
inference_config = InferenceConfig(entry_script= 'score.py', environment=environment)

## ACI config

TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
aci_config = AciWebservice.deploy_configuration(cpu_cores=1, memory_gb=1, auth_enabled=True, enable_app_insights=True)

## Deploy the model

In [None]:
webservice = Model.deploy(workspace=ws, 
                        name='heart-failure-ws', 
                        models=[model], 
                        inference_config=inference_config,
                        deployment_config=aci_config,
                        overwrite=True)

In [None]:
webservice

In [None]:
# wait for deployment to finish and display the scoring uri and swagger uri
webservice.wait_for_deployment(show_output=True)

print("Status : {}".format(webservice.state), 
      "scoring_uri : {}".format(webservice.scoring_uri), 
      "swagger_uri : {}".format(webservice.swagger_uri), sep = '\n')

## Consume

In [None]:
from pprint import pprint

# select samples from the dataframe
test_data = df.sample(5)
test_label = test_data.pop('DEATH_EVENT')

# convert the records to a json data file
data_json = test_data.to_dict(orient='records')

data = json.dumps({'data': data_json})
pprint(data)

## Get ineference

In [None]:
output = webservice.run(data)
print(output)

In [None]:

webservice.get_logs()

In [None]:
# select samples from the dataframe
test_data = df.sample(10)
test_label = test_data.pop('DEATH_EVENT')

# convert the records to a json data file
data_json = test_data.to_dict(orient='records')

data = json.dumps({'data': data_json})

# get inference
output = webservice.run(data)
print(output)

In [None]:
webservice.get_logs()

In [None]:
webservice.delete()
compute_target.delete()

**Submission Checklist**
- I have registered the model.
- I have deployed the model with the best accuracy as a webservice.
- I have tested the webservice by sending a request to the model endpoint.
- I have deleted the webservice and shutdown all the computes that I have used.
- I have taken a screenshot showing the model endpoint as active.
- The project includes a file containing the environment details.
