# Automated ML

In [1]:
#Import Dependencies
from azureml.data.dataset_factory import TabularDatasetFactory
import joblib
from azureml.train.automl import AutoMLConfig
from azureml.core import Workspace, Experiment, Dataset
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
import os
import joblib
from azureml.core import Experiment
from azureml.core.model import Model

In [2]:
# Load the workspace with Workspace.from_config() and open the experiment
# named "AutomatedML" in it.
ws = Workspace.from_config()
exp = Experiment(workspace=ws, name="AutomatedML")

# One line per workspace property, printed newline-separated.
workspace_summary = [
    'Workspace name: ' + ws.name,
    'Azure region: ' + ws.location,
    'Subscription id: ' + ws.subscription_id,
    'Resource group: ' + ws.resource_group,
]
print(*workspace_summary, sep='\n')

# Start an interactive run on the experiment.
# NOTE(review): `run` is not used or completed in the cells visible here - confirm it is still needed.
run = exp.start_logging()

# Name of the CPU cluster used for training
cpu_cluster_name = "cpuabccluster"

# Reuse the cluster when it already exists in the workspace; otherwise create it.
try:
    cpu_cluster = ComputeTarget(workspace=ws, name=cpu_cluster_name)
except ComputeTargetException:
    # Lookup failed: create a new cluster with vm_size='STANDARD_D12_V2' and at most 4 nodes.
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_D12_V2', max_nodes=4)
    cpu_cluster = ComputeTarget.create(ws, cpu_cluster_name, compute_config)
    print("New cluster is created")
else:
    print('Found existing cluster')

# Block until the cluster operation has finished, echoing its progress.
cpu_cluster.wait_for_completion(show_output=True)


Workspace name: quick-starts-ws-136095
Azure region: southcentralus
Subscription id: 3d1a56d2-7c81-4118-9790-f85d1acf0c77
Resource group: aml-quickstarts-136095
New cluster is created
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned


## Dataset

### Overview
I used an open dataset from Kaggle, Heart Failure Prediction (https://www.kaggle.com/andrewmvd/heart-failure-clinical-data). The dataset contains 299 patient records.

#### Task
Heart failure is a common event caused by Cardiovascular diseases (CVDs) and this dataset contains 12 features that can be used to predict mortality by heart failure.

#### Access
The dataset is stored at
https://raw.githubusercontent.com/1AishwaryaSH/ML-Engineer-with-Azure-Capstone/main/heart_failure_clinical_records_dataset.csv

I imported the data into the Azure ML platform by uploading the file from my local machine and registering it under Datasets (as `heart`), so that it can be used by the automated ML experiment.

In [3]:
# azureml-core of version 1.0.72 or higher is required
# azureml-dataprep[pandas] of version 1.1.34 or higher is required

# Reuse the workspace handle `ws` obtained earlier via Workspace.from_config()
# instead of hard-coding a subscription id / resource group / workspace name:
# hard-coded identifiers only work in one specific lab workspace and publish
# account details in the notebook. `Dataset` comes from the import cell at the
# top of the notebook.
subscription_id = ws.subscription_id
resource_group = ws.resource_group
workspace_name = ws.name

# Kept under its original name for any later cell that refers to `workspace`.
workspace = ws

# Fetch the registered dataset named 'heart' and display it as a pandas DataFrame.
dataset = Dataset.get_by_name(workspace, name='heart')
dataset.to_pandas_dataframe()

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


## AutoML Configuration

* `n_cross_validations=5` sets how many cross-validations to perform (5 folds).
* `iterations=30` specifies the total number of different algorithm and parameter combinations to test during the automated ML experiment.
* `max_concurrent_iterations=4` is the maximum number of iterations that are executed in parallel.
* `primary_metric='accuracy'` is the metric that Automated Machine Learning optimizes for model selection. Automated Machine Learning collects more metrics than it can optimize.
* `label_column_name='DEATH_EVENT'` is the target column.
* `task='classification'` refers to a predictive modeling problem where a class label is predicted for a given example of input data.
* `experiment_timeout_minutes=30` is the maximum amount of time, in minutes, that all iterations combined can take before the experiment terminates.
* `compute_target` is the Azure Machine Learning compute target on which the Automated Machine Learning experiment runs; it is set to the `cpu_cluster` created earlier.


In [4]:
# Settings that are forwarded to AutoMLConfig as keyword arguments.
automl_settings = dict(
    experiment_timeout_minutes=30,
    task='classification',
    primary_metric='accuracy',
    training_data=dataset,
    label_column_name='DEATH_EVENT',
    iterations=30,
    max_concurrent_iterations=4,
    n_cross_validations=5,
)

# Build the AutoML configuration, targeting the CPU cluster.
automl_config = AutoMLConfig(compute_target=cpu_cluster, **automl_settings)


In [5]:
# Submit the AutoML configuration to the experiment and show its output in the cell.
remote_run = exp.submit(config=automl_config, show_output=True)


Running on remote.
No run_configuration provided, running on cpuabccluster with default configuration
Running on remote compute: cpuabccluster
Parent Run ID: AutoML_3857386a-722a-4619-b6ff-1b5c7cb813c5

Current status: FeaturesGeneration. Generating features for the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputati

## Run Details

In [6]:
# Build the RunDetails widget for the submitted run, then render it.
run_details_widget = RunDetails(remote_run)
run_details_widget.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [7]:

# Fetch the best run together with its fitted model, then collect its metrics
# and the arguments recorded in its run definition.
best_run_auto, fitted_model_auto = remote_run.get_output()
best_run_metrics = best_run_auto.get_metrics()
best_run_details = best_run_auto.get_details()
parameter_values = best_run_details['runDefinition']['arguments']

# Report the run, its id and its accuracy.
best_run_accuracy = best_run_metrics['accuracy']
print('The best run automl model\n', best_run_auto)
print('\nThe Id for best run is:', best_run_auto.id)
print('The Accuracy: is', best_run_accuracy)



The best run automl model
 Run(Experiment: AutomatedML,
Id: AutoML_3857386a-722a-4619-b6ff-1b5c7cb813c5_28,
Type: azureml.scriptrun,
Status: Completed)

The Id for best run is: AutoML_3857386a-722a-4619-b6ff-1b5c7cb813c5_28
The Accuracy: is 0.8729943502824857


## Best Model

The cells below retrieve the best model from the AutoML experiment, save it locally, and display the properties of the model.



In [8]:
# Retrieve the best run and its fitted model from the AutoML run.
best_run_model, fitted_model_a = remote_run.get_output()

# Persist the fitted model to a local file.
joblib.dump(value=fitted_model_a, filename="model.joblib")

# Show the run, then the fitted pipeline, one after the other.
print(best_run_model, fitted_model_a, sep='\n')

Run(Experiment: AutomatedML,
Id: AutoML_3857386a-722a-4619-b6ff-1b5c7cb813c5_28,
Type: azureml.scriptrun,
Status: Completed)
Pipeline(memory=None,
         steps=[('datatransformer',
                 DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
                                 feature_sweeping_config=None,
                                 feature_sweeping_timeout=None,
                                 featurization_config=None, force_text_dnn=None,
                                 is_cross_validation=None,
                                 is_onnx_compatible=None, logger=None,
                                 observer=None, task=None, working_dir=None)),
                ('prefittedsoftvotingclassifier',...
                                                                                                objective=None,
                                                                                                random_state=None,
                                        

In [9]:
#print the estimators
from pprint import pprint

def print_model(model, prefix=""):
    """Print every step of a pipeline-like model, descending into ensembles.

    Args:
        model: object exposing ``steps``, a sequence of ``(name, estimator)`` entries.
        prefix: text printed in front of each step name.

    A step whose estimator has both ``estimators`` and ``weights`` attributes
    gets its member names and weights pretty-printed, after which each member
    is printed recursively with ``"<member name> - "`` as the prefix. Any other
    step has the result of its ``get_params()`` pretty-printed.
    """
    for step in model.steps:
        step_name, estimator = step[0], step[1]
        print(prefix + step_name)

        is_ensemble = hasattr(estimator, 'estimators') and hasattr(estimator, 'weights')
        if not is_ensemble:
            # Plain step: dump its parameters.
            pprint(estimator.get_params())
            print()
            continue

        # Ensemble step: summarise the members first, then expand each one.
        member_names = [member[0] for member in estimator.estimators]
        pprint({'estimators': member_names, 'weights': estimator.weights})
        print()
        for member in estimator.estimators:
            # The prefix is the member's own name; the parent prefix is not carried over.
            print_model(member[1], member[0] + ' - ')

# Print the step-by-step structure of the fitted model retrieved from the AutoML run.
print_model(fitted_model_a)

datatransformer
{'enable_dnn': None,
 'enable_feature_sweeping': None,
 'feature_sweeping_config': None,
 'feature_sweeping_timeout': None,
 'featurization_config': None,
 'force_text_dnn': None,
 'is_cross_validation': None,
 'is_onnx_compatible': None,
 'logger': None,
 'observer': None,
 'task': None,
 'working_dir': None}

prefittedsoftvotingclassifier
{'estimators': ['19', '14', '21', '17', '6', '5'],
 'weights': [0.14285714285714285,
             0.14285714285714285,
             0.14285714285714285,
             0.14285714285714285,
             0.2857142857142857,
             0.14285714285714285]}

19 - sparsenormalizer
{'copy': True, 'norm': 'l1'}

19 - lightgbmclassifier
{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 0.7922222222222222,
 'importance_type': 'split',
 'learning_rate': 0.03158578947368421,
 'max_bin': 140,
 'max_depth': 7,
 'min_child_samples': 5,
 'min_child_weight': 4,
 'min_split_gain': 0.3157894736842105,
 'n_estimators': 200,
 'n_job

In [11]:
# Show the final estimator, i.e. the last step of the fitted pipeline.
# The public `steps` list is used instead of the private `_final_estimator`
# attribute, which is not part of the pipeline's documented interface.
final_step_name, final_estimator = fitted_model_a.steps[-1]
print(final_estimator)


PreFittedSoftVotingClassifier(classification_labels=None,
                              estimators=[('19',
                                           Pipeline(memory=None,
                                                    steps=[('sparsenormalizer',
                                                            <azureml.automl.runtime.shared.model_wrappers.SparseNormalizer object at 0x7f6730451860>),
                                                           ('lightgbmclassifier',
                                                            LightGBMClassifier(boosting_type='gbdt',
                                                                               class_weight=None,
                                                                               colsample_bytree=0.7922222222222222,
                                                                               importance_type='split',...
                                                                               num_leaves=155

In [12]:
# Register the best run's model in the workspace under the name "best-model_auto.joblib".
# NOTE(review): model_path is passed as './'. AutoML examples usually register
# 'outputs/model.pkl' from the run - confirm that './' is the intended path.
the_bestmodel = best_run_model.register_model(
    model_name='best-model_auto.joblib',
    model_path='./',
)

In [13]:
# Display the Model object returned by register_model (workspace, name, id, version, tags, properties).
the_bestmodel

Model(workspace=Workspace.create(name='quick-starts-ws-136095', subscription_id='3d1a56d2-7c81-4118-9790-f85d1acf0c77', resource_group='aml-quickstarts-136095'), name=best-model_auto.joblib, id=best-model_auto.joblib:1, version=1, tags={}, properties={})