# Automated ML

Importing Depedencies required for the project

In [1]:
import os
import pandas as pd
import numpy as np
import json
import requests
import joblib
from sklearn.metrics import confusion_matrix
import itertools

from azureml.core import Dataset, Workspace, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
from azureml.train.automl import AutoMLConfig

from azureml.core.model import InferenceConfig
from azureml.core.webservice import AciWebservice, Webservice
from azureml.core.model import Model
from azureml.core.environment import Environment
from azureml.data.dataset_factory import TabularDatasetFactory

In [2]:
ws = Workspace.from_config()

# choose a name for experiment
experiment_name = 'Auto-stoke'

experiment=Experiment(ws, experiment_name)

In [3]:
found = False
key = 'strokeDataset'
description_text = "Prediction of Stroke"

if key in ws.datasets.keys():
    found = True
    dataset = ws.datasets[key]
    
if not found :
    example = 'https://raw.githubusercontent.com/123manju900/Capstone-AzureML/main/stroke-prediction-dataset.csv'
    dataset = Dataset.Tabular.from_delimited_files(example)


    dataset = dataset.register(workspace = ws,
                          name = key , 
                          description = description_text )



In [3]:
from train2 import clean_data

In [4]:
df = dataset.to_pandas_dataframe()

In [5]:
dataset.take(5).to_pandas_dataframe()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,True,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,True,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,True,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,True,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,True,Self-employed,Rural,174.12,24.0,never smoked,1


In [7]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,stroke
count,5110.0,5110.0,5110.0,5110.0,5110.0,5110.0
mean,36517.829354,43.226614,0.097456,0.054012,106.147677,0.048728
std,21161.721625,22.612647,0.296607,0.226063,45.28356,0.21532
min,67.0,0.08,0.0,0.0,55.12,0.0
25%,17741.25,25.0,0.0,0.0,77.245,0.0
50%,36932.0,45.0,0.0,0.0,91.885,0.0
75%,54682.0,61.0,0.0,0.0,114.09,0.0
max,72940.0,82.0,1.0,1.0,271.74,1.0


## Configuring Compute Cluster 

In [8]:
cpu_cluster_name = "cpu-cluster"

# Verify that cluster does not exist already
try:
    compute_target = ComputeTarget(workspace=ws, name=cpu_cluster_name)
    print('Found existing cluster, use it.')
except ComputeTargetException:
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D12_V2',
                                                           max_nodes=5)
    compute_target = ComputeTarget.create(ws, cpu_cluster_name, compute_config)

compute_target.wait_for_completion(show_output=True)


Creating...
SucceededProvisioning operation finished, operation "Succeeded"
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## AutoML Configuration



In [12]:
automl_settings = {
    "experiment_timeout_minutes": 30,
    "max_concurrent_iterations": 4,
    "primary_metric" : 'accuracy',
    "n_cross_validations": 6
}
automl_config = AutoMLConfig(compute_target=compute_target,
                             task = "classification",
                             training_data=dataset,
                             label_column_name="stroke", 
                             enable_early_stopping= True,
                             featurization= 'auto',
                             **automl_settings
                            )

In [13]:
# TODO: Submit your experiment
remote_run = experiment.submit(automl_config, show_output = True)

Submitting remote run.
No run_configuration provided, running on cpu-cluster with default configuration
Running on remote compute: cpu-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
Auto-stoke,AutoML_e5d7b093-8e35-4aa7-bbfb-fac0f1d57cf5,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       ALERTED
DESCRIPTION:  To decrease model bias, please cancel the current run and fix balancing problem.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData
DETAILS:      Imbalanced data can lead to a falsely perceived positive effect of a model's accuracy because the input data has bias towards one class.
+---------------------------------+---------------------------------+--------------------------------------+
|Size of the smallest class       |Name/Label of the smallest class |Number of samples in the training data|
|249                              |1    

## Run Details


In [14]:
remote_run

Experiment,Id,Type,Status,Details Page,Docs Page
Auto-stoke,AutoML_e5d7b093-8e35-4aa7-bbfb-fac0f1d57cf5,automl,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [15]:
RunDetails(remote_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [16]:
remote_run.wait_for_completion()

{'runId': 'AutoML_e5d7b093-8e35-4aa7-bbfb-fac0f1d57cf5',
 'target': 'cpu-cluster',
 'status': 'Completed',
 'startTimeUtc': '2021-04-19T10:43:43.439536Z',
 'endTimeUtc': '2021-04-19T11:04:33.262496Z',
 'properties': {'num_iterations': '1000',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'accuracy',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': '6',
  'target': 'cpu-cluster',
  'DataPrepJsonString': '{\\"training_data\\": {\\"datasetId\\": \\"404e0e1f-d3d2-4a5c-8de2-555ff34852dc\\"}, \\"datasets\\": 0}',
  'EnableSubsampling': None,
  'runTemplate': 'AutoML',
  'azureml.runsource': 'automl',
  'display_task_type': 'classification',
  'dependencies_versions': '{"azureml-widgets": "1.26.0", "azureml-train": "1.26.0", "azureml-train-restclients-hyperdrive": "1.26.0", "azureml-train-core": "1.26.0", "azureml-train-automl": "1.26.0", "azureml-train-automl-runtime": "1.26.0", "azureml-train-automl-client": "1.26.0", "azu

## Best Model

TODO: In the cell below, get the best model from the automl experiments and display all the properties of the model.



In [17]:
best_run, fitted_model = remote_run.get_output()
best_run_metrics = best_run.get_metrics()

In [18]:
print(best_run_metrics)

{'precision_score_micro': 0.7892550502279971, 'precision_score_weighted': 0.7938711188709916, 'precision_score_macro': 0.7924425401907916, 'average_precision_score_weighted': 0.8456835742822025, 'AUC_weighted': 0.8508112120519021, 'matthews_correlation': 0.5816038872679572, 'log_loss': 0.5499116896789149, 'f1_score_micro': 0.789255050227997, 'AUC_micro': 0.853851387239291, 'recall_score_weighted': 0.7892550502279968, 'average_precision_score_macro': 0.8446383122040788, 'weighted_accuracy': 0.7384877834829506, 'accuracy': 0.7892550502279984, 'AUC_macro': 0.8508112120519024, 'average_precision_score_micro': 0.8536727485399561, 'balanced_accuracy': 0.7891903300873637, 'norm_macro_recall': 0.5783806601747327, 'f1_score_weighted': 0.7887942843973047, 'recall_score_micro': 0.7892550502279968, 'f1_score_macro': 0.7880320185559949, 'recall_score_macro': 0.7891903300873637, 'confusion_matrix': 'aml://artifactId/ExperimentRun/dcid.AutoML_e5d7b093-8e35-4aa7-bbfb-fac0f1d57cf5_35/confusion_matrix',

In [19]:
print(best_run.get_details())

{'runId': 'AutoML_e5d7b093-8e35-4aa7-bbfb-fac0f1d57cf5_35', 'target': 'cpu-cluster', 'status': 'Completed', 'startTimeUtc': '2021-04-19T11:03:12.597585Z', 'endTimeUtc': '2021-04-19T11:04:22.42143Z', 'properties': {'runTemplate': 'automl_child', 'pipeline_id': '__AutoML_Ensemble__', 'pipeline_spec': '{"pipeline_id":"__AutoML_Ensemble__","objects":[{"module":"azureml.train.automl.ensemble","class_name":"Ensemble","spec_class":"sklearn","param_args":[],"param_kwargs":{"automl_settings":"{\'task_type\':\'classification\',\'primary_metric\':\'accuracy\',\'verbosity\':20,\'ensemble_iterations\':15,\'is_timeseries\':False,\'name\':\'Auto-stoke\',\'compute_target\':\'cpu-cluster\',\'subscription_id\':\'d7f39349-a66b-446e-aba6-0053c2cf1c11\',\'region\':\'southcentralus\',\'spark_service\':None}","ensemble_run_id":"AutoML_e5d7b093-8e35-4aa7-bbfb-fac0f1d57cf5_35","experiment_name":"Auto-stoke","workspace_name":"quick-starts-ws-143140","subscription_id":"d7f39349-a66b-446e-aba6-0053c2cf1c11","reso

In [20]:
best_run

Experiment,Id,Type,Status,Details Page,Docs Page
Auto-stoke,AutoML_e5d7b093-8e35-4aa7-bbfb-fac0f1d57cf5_35,azureml.scriptrun,Completed,Link to Azure Machine Learning studio,Link to Documentation


In [21]:
fitted_model


Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                reg_alpha=0.2631578947368421,
                                                                                                reg_lambda=1,
                                                                                                silent=True,
                                               

In [22]:
print('Best Run Id: ', best_run.id)
print('\n Accuracy:', best_run_metrics['accuracy'])
print(fitted_model._final_estimator)
print(best_run.get_tags())

Best Run Id:  AutoML_e5d7b093-8e35-4aa7-bbfb-fac0f1d57cf5_35

 Accuracy: 0.7892550502279984
PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('12',
                                           Pipeline(memory=None,
                                                    steps=[('maxabsscaler',
                                                            MaxAbsScaler(copy=True)),
                                                           ('lightgbmclassifier',
                                                            LightGBMClassifier(boosting_type='gbdt',
                                                                               class_weight=None,
                                                                               colsample_bytree=0.6933333333333332,
                                                                               importance_type='split',
                                                                         

In [23]:
os.makedirs('./outputs', exist_ok=True)

joblib.dump(fitted_model, filename='outputs/automl.joblib')

model_name = best_run.properties['model_name']
model_name

'AutoMLe5d7b093835'

In [24]:
from azureml.automl.core.shared import constants 
env = best_run.get_environment()

script_file = 'score.py'

best_run.download_file('outputs/scoring_file_v_1_0_0.py', script_file)
best_run.download_file(constants.CONDA_ENV_FILE_PATH, 'env.yml')

## Model Deployment

Remember you have to deploy only one of the two models you trained.. Perform the steps in the rest of this notebook only if you wish to deploy this model.

TODO: In the cell below, register the model, create an inference config and deploy the model as a web service.

In [25]:
#Register the fitted model
model = remote_run.register_model(model_name = model_name,
                                  description = 'AutoML_model')

In [27]:
inference_config = InferenceConfig(entry_script = script_file, environment = env)

aci_config = AciWebservice.deploy_configuration(cpu_cores = 1,
                                                memory_gb = 1, 
                                                enable_app_insights = True,
                                                auth_enabled = True)
                                            

aci_service_name = 'automl-webservice1'
print(aci_service_name)

automl-webservice1


In [28]:
service = Model.deploy(ws, aci_service_name, [model], inference_config, aci_config)
service.wait_for_deployment(True)
print("State: " + service.state)
print("Scoring URI: " + service.scoring_uri)

Tips: You can try get_logs(): https://aka.ms/debugimage#dockerlog or local deployment: https://aka.ms/debugimage#debug-locally to debug if deployment takes longer than 10 minutes.
Running
2021-04-19 11:24:55+00:00 Creating Container Registry if not exists..
2021-04-19 11:25:06+00:00 Registering the environment..
2021-04-19 11:25:08+00:00 Use the existing image.
2021-04-19 11:25:08+00:00 Generating deployment configuration.
2021-04-19 11:25:10+00:00 Submitting deployment to compute..
2021-04-19 11:25:19+00:00 Checking the status of deployment automl-webservice1..
2021-04-19 11:28:36+00:00 Checking the status of inference endpoint automl-webservice1.
Succeeded
ACI service creation operation finished, operation "Succeeded"
State: Healthy
Scoring URI: http://b9ec1a7a-ef14-46e0-b20d-e22061ad97b3.southcentralus.azurecontainer.io/score


In [29]:
print("State: " + service.state)
print("Scoring URI: " + service.scoring_uri)
print("Keys: " + service.get_keys()[0])
print("Swagger URI: " + service.swagger_uri)

State: Healthy
Scoring URI: http://b9ec1a7a-ef14-46e0-b20d-e22061ad97b3.southcentralus.azurecontainer.io/score
Keys: i0WTcGGafL3lDixE5P28o99UAU6KffPJ
Swagger URI: http://b9ec1a7a-ef14-46e0-b20d-e22061ad97b3.southcentralus.azurecontainer.io/swagger.json


TODO: In the cell below, send a request to the web service you deployed to test it.

In [30]:
data_test = df.dropna()
data_sample = data_test.sample(3)
y_true = data_sample.pop('stroke')
sample_json = json.dumps({'data':data_sample.to_dict(orient='records')})
print(sample_json)

{"data": [{"id": 60088, "gender": "Male", "age": 49.0, "hypertension": 1, "heart_disease": 0, "ever_married": true, "work_type": "Self-employed", "Residence_type": "Rural", "avg_glucose_level": 92.26, "bmi": "33.1", "smoking_status": "formerly smoked"}, {"id": 55766, "gender": "Male", "age": 41.0, "hypertension": 0, "heart_disease": 0, "ever_married": true, "work_type": "Private", "Residence_type": "Rural", "avg_glucose_level": 119.32, "bmi": "30.6", "smoking_status": "Unknown"}, {"id": 66680, "gender": "Female", "age": 49.0, "hypertension": 1, "heart_disease": 0, "ever_married": true, "work_type": "Private", "Residence_type": "Rural", "avg_glucose_level": 65.34, "bmi": "39.4", "smoking_status": "never smoked"}]}


In [31]:
primary, secondary = service.get_keys()


In [32]:
print(primary)

i0WTcGGafL3lDixE5P28o99UAU6KffPJ


In [33]:
key = primary 

In [None]:
headers['Authorization'] = f'Bearer{key}'

In [37]:
key = 'i0WTcGGafL3lDixE5P28o99UAU6KffPJ'

In [38]:
import requests
headers = {'Content-type': 'application/json'}

headers['Authorization'] = f'Bearer {key}'


response = requests.post(service.scoring_uri,sample_json, headers = headers )

In [39]:
print(response.text)

"{\"result\": [0, 0, 0]}"


In [40]:
service.get_logs(())



TODO: In the cell below, print the logs of the web service and delete the service

In [None]:
service.delete()