In [1]:
import numpy as np
import pandas as pd

from azureml.core import Workspace, Dataset, Datastore, Experiment
from azureml.core.compute import ComputeTarget
from azureml.core.model import Model
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails

# Initializing The Compute Cluster

In [2]:
# Name of the compute target this notebook runs AutoML on.
compute_name = 'automl-cluster'

# Connect to the workspace described by the local config file, then look the
# compute target up by name inside that workspace.
ws = Workspace.from_config(path='config.json')
compute_target = ComputeTarget(workspace=ws, name=compute_name)

# Preparing The Diabetes Dataset

In [3]:
# Default datastore of the workspace; used below as the upload target when
# registering pandas DataFrames as datasets.
datastore = Datastore.get_default(workspace=ws)

In [4]:
# Fetch the latest registered version of the 'Diabetes Sample' dataset.
diabetes_dataset = Dataset.get_by_name(
    workspace=ws,
    name='Diabetes Sample',
    version='latest',
)

# Preview only the first 10 rows; kept as the last expression so the frame is
# rendered as the cell output.
diabetes_dataset.take(10).to_pandas_dataframe()

Unnamed: 0,AGE,SEX,BMI,BP,S1,S2,S3,S4,S5,S6,Y
0,59,2,32.1,101.0,157,93.2,38.0,4.0,4.8598,87,151
1,48,1,21.6,87.0,183,103.2,70.0,3.0,3.8918,69,75
2,72,2,30.5,93.0,156,93.6,41.0,4.0,4.6728,85,141
3,24,1,25.3,84.0,198,131.4,40.0,5.0,4.8903,89,206
4,50,1,23.0,101.0,192,125.4,52.0,4.0,4.2905,80,135
5,23,1,22.6,89.0,139,64.8,61.0,2.0,4.1897,68,97
6,36,2,22.0,90.0,160,99.6,50.0,3.0,3.9512,82,138
7,66,2,26.2,114.0,255,185.0,56.0,4.55,4.2485,92,63
8,60,2,32.1,83.0,179,119.4,42.0,4.0,4.4773,94,110
9,29,1,30.0,85.0,180,93.4,43.0,4.0,5.3845,88,310


In [5]:
# Pull the full registered dataset into a pandas DataFrame for local work.
diabetes_df = diabetes_dataset.to_pandas_dataframe()

# Build a separate frame without the AGE and SEX columns. drop() returns a new
# DataFrame, so diabetes_df keeps every column for the cells that follow.
diabetes_df_cleaned = diabetes_df.drop(columns=['AGE', 'SEX'])

# Register the reduced frame as its own dataset. The factory is reached via
# the Dataset class (the documented entry point) instead of through a dataset
# instance. The call stays the last expression so the registered dataset is
# shown as the cell output.
# NOTE(review): the recorded output shows a version number above 1, so each
# execution of this cell appears to register a new version - confirm.
Dataset.Tabular.register_pandas_dataframe(
    dataframe=diabetes_df_cleaned,
    target=datastore,
    name='Diabetes Sample With Age & Sex Dropped',
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/9931cbf2-5e30-4a23-876b-a956d763d143/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


{
  "source": [
    "('workspaceblobstore', 'managed-dataset/9931cbf2-5e30-4a23-876b-a956d763d143/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "297c4497-fc87-40a6-b39e-580ec2673918",
    "name": "Diabetes Sample With Age & Sex Dropped",
    "version": 4,
    "workspace": "Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg')"
  }
}

In [6]:
# Cap AGE at (mean + 3 standard deviations), rounded to the nearest integer.
age_mean = np.mean(diabetes_df['AGE'])
age_std = np.std(diabetes_df['AGE'])
outlier_threshold = round(age_mean + 3 * age_std)

# Replace every AGE strictly above the threshold with the threshold itself.
# NOTE(review): this overwrites diabetes_df['AGE'] in place, so re-running the
# cell recomputes the threshold from already-capped values - confirm that is
# acceptable, or rebuild diabetes_df before re-running.
is_age_outlier = diabetes_df['AGE'] > outlier_threshold
diabetes_df['AGE'] = diabetes_df['AGE'].mask(is_age_outlier, outlier_threshold)

In [7]:
# BMI cut-off used for the binary OBESE flag (strictly greater than => 1).
OBESITY_BMI_CUTOFF = 30

# Add a 0/1 indicator column derived from BMI.
exceeds_cutoff = diabetes_df['BMI'] > OBESITY_BMI_CUTOFF
diabetes_df['OBESE'] = np.where(exceeds_cutoff, 1, 0)

In [8]:
# Register the transformed frame (AGE capped, OBESE column added by the cells
# above) as its own dataset. The factory is reached via the Dataset class (the
# documented entry point) instead of through a dataset instance. The call stays
# the last expression so the registered dataset is shown as the cell output.
Dataset.Tabular.register_pandas_dataframe(
    dataframe=diabetes_df,
    target=datastore,
    name='Diabetes Sample After Data Transformation',
)

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to managed-dataset/43de3c93-39de-4359-8f02-1e05949ac36c/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


{
  "source": [
    "('workspaceblobstore', 'managed-dataset/43de3c93-39de-4359-8f02-1e05949ac36c/')"
  ],
  "definition": [
    "GetDatastoreFiles",
    "ReadParquetFile",
    "DropColumns"
  ],
  "registration": {
    "id": "25f7c570-d539-4de6-82ba-a14f3d82a0a1",
    "name": "Diabetes Sample After Data Transformation",
    "version": 2,
    "workspace": "Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg')"
  }
}

# Model Training

In [9]:
# Experiment that groups the AutoML runs submitted from this notebook.
diabetes_experiment = Experiment(workspace=ws, name='Diabetes-Sample-Regression')

# AutoML settings collected in one place so each knob is easy to find and tune.
# NOTE(review): training_data is the originally registered dataset, not either
# of the transformed datasets registered earlier in the notebook - confirm
# this is intentional.
automl_settings = {
    'task': 'regression',
    'primary_metric': 'normalized_root_mean_squared_error',
    'featurization': 'auto',
    'compute_target': compute_target,
    'training_data': diabetes_dataset,
    'label_column_name': 'Y',
    'experiment_timeout_minutes': 30,
    'enable_early_stopping': True,
    'n_cross_validations': 10,
    'model_explainability': True,
}
run_configuration = AutoMLConfig(**automl_settings)

# Submit to the remote compute and stream progress into the cell output.
AutoML_run = diabetes_experiment.submit(config=run_configuration, show_output=True)

# Show the run-details widget for the submitted run.
RunDetails(AutoML_run).show()

Submitting remote run.
No run_configuration provided, running on automl-cluster with default configuration
Running on remote compute: automl-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
Diabetes-Sample-Regression,AutoML_05a05f45-6d7e-45e1-9b2a-41ff3dd9d4af,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

********************************************************************************************

TYPE:         High cardinality feature detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and no high cardinality features were detected.
              Learn more about high cardinality feature handling: https://aka.ms/AutomatedMLFeaturization

************************************************************

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

# Model Registration For Future Use

In [10]:
# Metadata attached to the registered model.
model_tags = {'project': 'Diabetes', 'creator': 'Edwin Goh'}
model_description = 'Best AutoML Regression Run using the Diabetes Sample Data. This model uses the AGE and SEX columns.'

# Register a model from the AutoML run; no metric is passed here, so the
# selection is left to register_model's default. Kept as the last expression
# so the resulting Model is shown as the cell output.
AutoML_run.register_model(
    model_name='Diabetes-AllColumns-Regression-AutoML',
    description=model_description,
    tags=model_tags,
)

Model(workspace=Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg'), name=Diabetes-AllColumns-Regression-AutoML, id=Diabetes-AllColumns-Regression-AutoML:2, version=2, tags={'project': 'Diabetes', 'creator': 'Edwin Goh'}, properties={})

## Using $R^2$ As The Evaluation Metric

In [11]:
# Metadata for the R-squared variant; the extra 'metric' tag records which
# metric was passed to register_model.
r2_model_tags = {'project': 'Diabetes', 'creator': 'Edwin Goh', 'metric': 'R-Squared'}
r2_model_description = 'Best AutoML Regression Run using the Diabetes Sample Data. This model uses the AGE and SEX columns.'

# Register a model from the same AutoML run, this time passing
# metric='r2_score' explicitly. Kept as the last expression so the resulting
# Model is shown as the cell output.
AutoML_run.register_model(
    model_name='Diabetes-AllColumns-Regression-AutoML-R2',
    description=r2_model_description,
    tags=r2_model_tags,
    metric='r2_score',
)

Model(workspace=Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg'), name=Diabetes-AllColumns-Regression-AutoML-R2, id=Diabetes-AllColumns-Regression-AutoML-R2:2, version=2, tags={'project': 'Diabetes', 'creator': 'Edwin Goh', 'metric': 'R-Squared'}, properties={})

In [12]:
# Look up the registered R-squared model by name in the workspace. `Model` is
# imported in the notebook's top import cell together with the other azureml
# names, so this cell no longer carries its own import.
diabetes_regression_model_r2 = Model(workspace=ws, name='Diabetes-AllColumns-Regression-AutoML-R2')