# Automated ML

TODO: Import Dependencies. In the cell below, import all the dependencies that you will need to complete the project.

In [2]:
import logging

from matplotlib import pyplot as plt
import pandas as pd
import os

import azureml.core
from azureml.core.experiment import Experiment
from azureml.core.workspace import Workspace
from azureml.automl.core.featurization import FeaturizationConfig
from azureml.core.dataset import Dataset
from azureml.train.automl import AutoMLConfig
from azureml.interpret import ExplanationClient
from azureml.automl.runtime.onnx_convert import OnnxConverter
from azureml.data.dataset_factory import TabularDatasetFactory
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
from azureml.core.run import Run
from sklearn.model_selection import train_test_split
from azureml.widgets import RunDetails
from azureml.train.sklearn import SKLearn

print("SDK version:", azureml.core.VERSION)

SDK version: 1.22.0


## Dataset

### Overview
TODO: In this markdown cell, give an overview of the dataset you are using. Also mention the task you will be performing.


TODO: Get data. In the cell below, write code to access the data you will be using in this project. Remember that the dataset needs to be external.

In [3]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required
from azureml.core import Workspace, Dataset

subscription_id = 'fa3dfb7e-5583-41a5-b60c-022e3fcc2942'
resource_group = 'mlops-rg-templateml'
workspace_name = 'mlops-aml-ws-templateml'

workspace = Workspace(subscription_id, resource_group, workspace_name)

dataset = Dataset.get_by_name(workspace, name='HealthCareDataset_StrokeData')
data = dataset.to_pandas_dataframe()

In [4]:
data.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,36517.83,43.23,0.1,0.05,106.15,0.05
std,21161.72,22.61,0.3,0.23,45.28,0.22
min,67.0,0.08,0.0,0.0,55.12,0.0
25%,17741.25,25.0,0.0,0.0,77.24,0.0
50%,36932.0,45.0,0.0,0.0,91.88,0.0
75%,54682.0,61.0,0.0,0.0,114.09,0.0
max,72940.0,82.0,1.0,1.0,271.74,1.0


In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
id                   5110 non-null int64
gender               5110 non-null object
age                  5110 non-null float64
hypertension         5110 non-null int64
heart_disease        5110 non-null int64
ever_married         5110 non-null bool
work_type            5110 non-null object
Residence_type       5110 non-null object
avg_glucose_level    5110 non-null float64
bmi                  5110 non-null object
smoking_status       5110 non-null object
stroke               5110 non-null int64
dtypes: bool(1), float64(2), int64(4), object(5)
memory usage: 444.3+ KB


In [6]:
experiment_name = 'automl-stroke-classification-experiment'

experiment = Experiment(workspace, experiment_name)

print('Workspace name: ' + workspace.name, 
      'Azure region: ' + workspace.location, 
      'Subscription id: ' + workspace.subscription_id, 
      'Resource group: ' + workspace.resource_group, sep = '\n')

run = experiment.start_logging()

Workspace name: mlops-aml-ws-templateml
Azure region: eastus
Subscription id: fa3dfb7e-5583-41a5-b60c-022e3fcc2942
Resource group: mlops-rg-templateml


In [7]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cpu_cluster_name = "F-VM"
try:
    cpu_cluster = ComputeTarget(workspace=workspace, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D2_V2',
                                                           max_nodes=4)
    cpu_cluster = ComputeTarget.create(workspace, cpu_cluster_name, compute_config)

cpu_cluster.wait_for_completion(show_output=True)
print("Cluster details: ", cpu_cluster.get_status().serialize())

Found existing cluster, use it.

Running
Cluster details:  {'errors': [], 'creationTime': '2021-02-25T14:58:59.139839+00:00', 'createdBy': {'userObjectId': 'c3dbd685-d45f-431b-b4a5-5c916d4fe4ac', 'userTenantId': '006c1e48-e342-47e9-ab5d-0dd9ff89bd96', 'userName': None}, 'modifiedTime': '2021-02-25T15:01:48.959845+00:00', 'state': 'Running', 'vmSize': 'STANDARD_DS3_V2'}


# Analyse the data, transform and split the dataset to train and test sets

In [8]:
#Functions to clean data
def Impute_missing_values(df):
    df.drop(columns=['id'],inplace=True)
    
    #fill na
    df['age'].fillna(df['age'].median(), inplace=True)
    df['hypertension'].fillna(df['hypertension'].median(), inplace=True)
    df['heart_disease'].fillna(df['heart_disease'].median(), inplace=True)
    df['avg_glucose_level'].fillna(df['avg_glucose_level'].median(), inplace=True)
    
    #non categorical data
    from sklearn.preprocessing import OrdinalEncoder
    ord_enc = OrdinalEncoder()
    df["gender"] = ord_enc.fit_transform(df[["gender"]])
    df["ever_married"] = ord_enc.fit_transform(df[["ever_married"]])
    df["work_type"] = ord_enc.fit_transform(df[["work_type"]])
    df["Residence_type"] = ord_enc.fit_transform(df[["Residence_type"]])
    df["bmi"] = ord_enc.fit_transform(df[["bmi"]])
    df["smoking_status"] = ord_enc.fit_transform(df[["smoking_status"]])
    
    return df

def clean_data(df):
    df = Impute_missing_values(df)
    df.head()
    x_df = df
    y_df = x_df.pop("stroke")
    return x_df, y_df

x, y = clean_data(data)

In [9]:
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size=0.2)

In [10]:
df_train = pd.concat([x_train,y_train], axis=1)
df_test = pd.concat([x_test,y_test], axis=1)

#Convert x_train and y_train (Which are in pandas DataFrame format) to TabularDataset format.
try:
    os.makedirs('./data', exist_ok=True)
except OSError as error:
    print('New directory cannot be created')
    
path_train = 'data/train.csv'
path_test = 'data/test.csv'
df_train.to_csv(path_train)
df_test.to_csv(path_test)

datastore = workspace.get_default_datastore()
datastore.upload(src_dir='data', target_path='data')

train_data = TabularDatasetFactory.from_delimited_files(path=[(datastore, ('data/train.csv'))])
test_data = TabularDatasetFactory.from_delimited_files(path=[(datastore, ('data/test.csv'))])
print("Successfully converted the dataset to TabularDataset format.")


Uploading an estimated of 2 files
Target already exists. Skipping upload for data/test.csv
Target already exists. Skipping upload for data/train.csv
Uploaded 0 files
Successfully converted the dataset to TabularDataset format.


## AutoML Configuration

TODO: Explain why you chose the automl settings and cofiguration you used below.

In [11]:
from azureml.train.automl import AutoMLConfig

#Set parameters for AutoMLConfig
automl_config = AutoMLConfig(
    experiment_timeout_minutes=30,
    task="classification",
    primary_metric="accuracy",
    training_data=train_data,
    label_column_name="stroke",
    n_cross_validations=5,
    enable_onnx_compatible_models=True,
    compute_target=cpu_cluster)

In [12]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config, show_output=True)

Running on remote.
No run_configuration provided, running on F-VM with default configuration
Running on remote compute: F-VM
Parent Run ID: AutoML_3d077790-c191-4e70-92f9-555383579bac

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+-----

## Run Details

OPTIONAL: Write about the different models trained and their performance. Why do you think some models did better than others?

TODO: In the cell below, use the `RunDetails` widget to show the different experiments.

In [13]:
from azureml.widgets import RunDetails

#Launch the widget to view the progress and results
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [14]:
%%time
remote_run.wait_for_completion()

CPU times: user 175 ms, sys: 3.96 ms, total: 179 ms
Wall time: 3.17 s


{'runId': 'AutoML_3d077790-c191-4e70-92f9-555383579bac',
 'target': 'F-VM',
 'status': 'Completed',
 'startTimeUtc': '2021-04-28T13:01:05.462226Z',
 'endTimeUtc': '2021-04-28T13:38:43.338947Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '5',
  'target': 'F-VM',
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"479837d4-36a4-4fe4-8951-fb527664c168\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"workspaceblobstore\\\\\\", \\\\\\"path\\\\\\": \\\\\\"data/train.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"mlops-rg-templateml\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"fa3dfb7e-5583-41a5-b60c-022e3fcc2942\\\\\\", \\\\\\"workspaceName\\\\\\": \\\\

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [15]:
best_run, onnx_model= remote_run.get_output(return_onnx_model=True)

In [16]:
OnnxConverter.save_onnx_model(onnx_model, file_path="./automl_model.onnx")

In [17]:
best_run.get_properties()

{'runTemplate': 'automl_child',
 'pipeline_id': '__AutoML_Ensemble__',
 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'automl-stroke-classification-experiment\',\'compute_target\':\'F-VM\',\'subscription_id\':\'fa3dfb7e-5583-41a5-b60c-022e3fcc2942\',\'region\':\'eastus\',\'spark_service\':None}","ensemble_run_id":"AutoML_3d077790-c191-4e70-92f9-555383579bac_20","experiment_name":"automl-stroke-classification-experiment","workspace_name":"mlops-aml-ws-templateml","subscription_id":"fa3dfb7e-5583-41a5-b60c-022e3fcc2942","resource_group_name":"mlops-rg-templateml"}}]}',
 'training_percent': '100',
 'predicted_cost': None,
 'iteration': '20',
 '_aml_system_scenario_identification': 'Remote.C

In [18]:
best_run.get_metrics()

{'f1_score_micro': 1.0,
 'average_precision_score_macro': 1.0,
 'recall_score_macro': 1.0,
 'matthews_correlation': 1.0,
 'balanced_accuracy': 1.0,
 'average_precision_score_weighted': 1.0,
 'weighted_accuracy': 1.0,
 'recall_score_micro': 1.0,
 'recall_score_weighted': 1.0,
 'log_loss': 0.050787497746738394,
 'accuracy': 1.0,
 'average_precision_score_micro': 1.0,
 'precision_score_macro': 1.0,
 'AUC_weighted': 1.0,
 'precision_score_micro': 1.0,
 'norm_macro_recall': 1.0,
 'f1_score_macro': 1.0,
 'AUC_macro': 1.0,
 'AUC_micro': 1.0,
 'f1_score_weighted': 1.0,
 'precision_score_weighted': 1.0,
 'accuracy_table': 'aml://artifactId/ExperimentRun/dcid.AutoML_3d077790-c191-4e70-92f9-555383579bac_20/accuracy_table',
 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_3d077790-c191-4e70-92f9-555383579bac_20/confusion_matrix'}

In [19]:
print(best_run)

Run(Experiment: automl-stroke-classification-experiment,
Id: AutoML_3d077790-c191-4e70-92f9-555383579bac_20,
Type: azureml.scriptrun,
Status: Completed)


## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [20]:
model_name = best_run.properties['model_name']
script_file_name = 'score.py'
best_run.download_file('outputs/scoring_file_v_1_0_0.py', 'score.py')

In [21]:
description = 'AutoML Model'
tags = None
model = remote_run.register_model(model_name = model_name, description = description, tags = tags)
print(remote_run.model_id)

AutoML3d077790c20


In [23]:
from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice
from azureml.core.webservice import Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment

inference_config = InferenceConfig(entry_script='score.py')

aciconfig = AciWebservice.deploy_configuration(cpu_cores = 1, 
                                               memory_gb = 1, 
                                               tags = {'type': "automl_classification"}, 
                                               description = 'Heart Data Service')

aci_service_name = 'automl-uciheart1'
print(aci_service_name)
aci_service = Model.deploy(workspace, aci_service_name, [model], inference_config, aciconfig)
aci_service.wait_for_deployment(True)
print(aci_service.state)

automl-uciheart1
Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running..........
Succeeded
ACI service creation operation finished, operation "Succeeded"
Healthy


In [24]:
aci_service.update(enable_app_insights=True)

In [25]:
print(f'\nservice state: {aci_service.state}\n')
print(f'scoring URI: \n{aci_service.scoring_uri}\n')
print(f'swagger URI: \n{aci_service.swagger_uri}\n')


service state: Healthy

scoring URI: 
http://5bbe88a5-7788-4805-8569-39961483f826.eastus.azurecontainer.io/score

swagger URI: 
http://5bbe88a5-7788-4805-8569-39961483f826.eastus.azurecontainer.io/swagger.json



TODO: In the cell below, send a request to the web service you deployed to test it.

In [28]:
import json
import requests

scoring_uri = aci_service.scoring_uri
# Two sets of data to score, so we get two results back
data = {"data":
        [
          { 
            "gender": 45,
            "age": 0,
            "hypertension": 2413,
            "heart_disease": 0,
            "ever_married": 38,
            "work_type": 0,
            "Residence_type": 140000,
            "avg_glucose_level": 1.4,
            "bmi": 140,
            "smoking_status": 1
          },
          {
            "gender": 50,
            "age": 0,
            "hypertension": 196,
            "heart_disease": 0,
            "ever_married": "45",
            "work_type": 0,
            "Residence_type": 395000,
            "avg_glucose_level": 1.6,
            "bmi": 136,
            "smoking_status":1
          },
      ]
    }
# Convert to JSON string
input_data = json.dumps(data)
# Set the content type
headers = {'Content-Type': 'application/json'}
# Make the request and display the response
resp = requests.post(scoring_uri, input_data, headers=headers)
print(resp)

<Response [502]>


TODO: In the cell below, print the logs of the web service and delete the service

In [29]:
aci_service.get_logs()



In [None]:
aci_service.delete()
model.delete()
compute_target.delete()