In [1]:
import pandas as pd
import numpy as np
from azureml.core import Workspace, Dataset, Datastore, Experiment
from azureml.core.compute import ComputeTarget
from azureml.train.automl import AutoMLConfig
from azureml.train.automl.run import AutoMLRun
from azureml.widgets import RunDetails

# Connecting To The Workspace And Retrieving The Compute Cluster

In [2]:
# Name of the compute cluster the AutoML runs are submitted to.
compute_name = 'automl-cluster'

# Build the workspace handle from the local config file, then look up the
# compute target with the given name inside that workspace.
ws = Workspace.from_config(path='config.json')
compute_target = ComputeTarget(workspace=ws, name=compute_name)

# Binary Classification Using The Titanic Dataset

In [3]:
# Default datastore of the workspace; the transformed data is registered against it later.
datastore = Datastore.get_default(ws)

# Fetch the latest registered version of the raw Titanic training data.
titanic_dataset = Dataset.get_by_name(
    ws,
    'Titanic_Training_Data',
    version='latest',
)

# Preview the first 10 rows as a pandas DataFrame.
titanic_dataset.take(10).to_pandas_dataframe()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,False,3,male,22.0,1,0,7.25,S
1,True,1,female,38.0,1,0,71.2833,C
2,True,3,female,26.0,0,0,7.925,S
3,True,1,female,35.0,1,0,53.1,S
4,False,3,male,35.0,0,0,8.05,S
5,False,3,male,,0,0,8.4583,Q
6,False,1,male,54.0,0,0,51.8625,S
7,False,3,male,2.0,3,1,21.075,S
8,True,3,female,27.0,0,2,11.1333,S
9,True,2,female,14.0,1,0,30.0708,C


## Data Preparation For The Titanic Dataset

### Exploring The Age Column Since The Column Contains Missing Values

In [4]:
# Materialize the full dataset as a pandas DataFrame for local preparation.
titanic_df = titanic_dataset.to_pandas_dataframe()

# Mean age per sex, rounded to whole years.
titanic_df.groupby('Sex')['Age'].mean().round()

Sex
female    28.0
male      31.0
Name: Age, dtype: float64

In [5]:
# Fill missing ages with the rounded mean age of the passenger's own sex.
# The means are computed from the data instead of being hard-coded, so the
# imputation stays correct when a newer dataset version is loaded. The fill is
# vectorized rather than evaluated row by row.
# Rows whose 'Sex' is missing have no group mean and therefore keep a missing age.
mean_age_by_sex = titanic_df.groupby('Sex')['Age'].transform('mean').round()
titanic_df['Age'] = titanic_df['Age'].fillna(mean_age_by_sex)

### Binning The "Age" Column Into 4 Age Groups

In [6]:
# One-hot style indicator columns for four mutually exclusive age groups.
# Boundaries are half-open so fractional ages (e.g. 14.5, 34.5, 60.5) land in
# exactly one group; whole-number ages are assigned as before.
# A missing age compares False everywhere and yields 0 in all four columns.
passenger_age = titanic_df['Age']
titanic_df['Age Below 15'] = np.where(passenger_age < 15, 1, 0)
titanic_df['Age Between 15 and 34'] = np.where((passenger_age >= 15) & (passenger_age < 35), 1, 0)
titanic_df['Age Between 35 and 60'] = np.where((passenger_age >= 35) & (passenger_age <= 60), 1, 0)
titanic_df['Age Above 60'] = np.where(passenger_age > 60, 1, 0)

In [7]:
# Remove the raw 'Age' column from the frame.
titanic_df = titanic_df.drop(columns=['Age'])

In [None]:
# Register the prepared DataFrame as a tabular dataset in the workspace.
# The factory is exposed on the Dataset class (Dataset.Tabular), not on a
# dataset instance such as titanic_dataset.
Dataset.Tabular.register_pandas_dataframe(titanic_df, datastore, 'Titanic Data After Transformation')

## Model Training

In [None]:
# Experiment that groups the Titanic AutoML runs, plus the registered
# transformed dataset used as training data.
titanic_experiment = Experiment(workspace=ws, name='Titanic-Classification')
titanic_transformed_dataset = Dataset.get_by_name(
    ws,
    'Titanic Data After Transformation',
    version='latest',
)

# AutoML settings: classification on 'Survived', scored by accuracy,
# 10-fold cross-validation, 15-minute budget, both ensemble types enabled.
run_configuration = AutoMLConfig(
    task='classification',
    primary_metric='accuracy',
    num_classes=titanic_df['Survived'].nunique(),
    featurization='auto',
    compute_target=compute_target,
    training_data=titanic_transformed_dataset,
    label_column_name='Survived',
    experiment_timeout_minutes=15,
    enable_early_stopping=True,
    n_cross_validations=10,
    model_explainability=True,
    enable_stack_ensemble=True,
    enable_voting_ensemble=True,
)

# Submit the run with console output enabled, then show the run widget.
AutoML_run = titanic_experiment.submit(run_configuration, show_output=True)
RunDetails(AutoML_run).show()

## Model Registration For Future Use

In [None]:
# Register the model from the Titanic AutoML run under a fixed name, with
# project/creator tags attached.
AutoML_run.register_model(
    model_name='Titanic-Classification-AutoML',
    description='Best AutoML Classification Model Run using the Transformed Titanic Dataset',
    tags={'Project': 'Titanic', 'Creator': 'Edwin Goh'},
)

### Using *Normalized Macro Recall* As The Evaluation Metric

In [None]:
# Register the Titanic model selected by 'norm_macro_recall' rather than the
# run's primary metric. The project tag names the Titanic project, matching
# the model name and description.
AutoML_run.register_model(
    model_name='Titanic-Classification-AutoML-macro_recall',
    description='Best AutoML Classification Model Run using the Transformed Titanic Dataset',
    tags={'project': 'Titanic', 'creator': 'Edwin Goh', 'metric': 'norm_macro_recall'},
    metric='norm_macro_recall',
)

# Multi-Class Classification Using The Iris Dataset

In [8]:
# Fetch the latest registered version of the Iris dataset.
iris_dataset = Dataset.get_by_name(
    ws,
    'Iris_Dataset',
    version='latest',
)

# Preview the first 10 rows as a pandas DataFrame.
iris_dataset.take(10).to_pandas_dataframe()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
5,5.4,3.9,1.7,0.4,Iris-setosa
6,4.6,3.4,1.4,0.3,Iris-setosa
7,5.0,3.4,1.5,0.2,Iris-setosa
8,4.4,2.9,1.4,0.2,Iris-setosa
9,4.9,3.1,1.5,0.1,Iris-setosa


In [9]:
# Distinct values of the 'species' label column.
iris_species = pd.unique(iris_dataset.to_pandas_dataframe()['species'])

## Model Training

In [10]:
# Experiment that groups the Iris AutoML runs.
iris_experiment = Experiment(workspace=ws, name='Iris-Multi-Class-Classification')

# AutoML settings: multi-class classification on 'species', scored by accuracy,
# 10-fold cross-validation, 15-minute budget, both ensemble types enabled.
# The class count is taken from the distinct species found in the data
# (iris_species) instead of a hard-coded 3.
run_configuration = AutoMLConfig(
    task='classification',
    primary_metric='accuracy',
    num_classes=len(iris_species),
    featurization='auto',
    compute_target=compute_target,
    training_data=iris_dataset,
    label_column_name='species',
    experiment_timeout_minutes=15,
    enable_early_stopping=True,
    n_cross_validations=10,
    model_explainability=True,
    enable_stack_ensemble=True,
    enable_voting_ensemble=True,
)

# Submit the run with console output enabled, then show the run widget.
AutoML_run = iris_experiment.submit(run_configuration, show_output=True)
RunDetails(AutoML_run).show()

Submitting remote run.
No run_configuration provided, running on automl-cluster with default configuration
Running on remote compute: automl-cluster


Experiment,Id,Type,Status,Details Page,Docs Page
Iris-Multi-Class-Classification,AutoML_15fe9a5f-4c50-4fbf-a51e-bb07ed4daba7,automl,NotStarted,Link to Azure Machine Learning studio,Link to Documentation



Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.
Current status: ModelSelection. Beginning model selection.

********************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

********************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       PASSED
DESCRIPTION:  No feature missing values were detected in the training data.
              Learn more about missing value imputation: https://aka.ms/AutomatedMLFeaturization

**********************************************************************************

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

## Model Registration For Future Use

In [11]:
# Register the model from the Iris AutoML run under a fixed name, with
# project/creator tags attached.
AutoML_run.register_model(
    model_name='Iris-MultiClass-Classification-AutoML',
    description='AutoML Multi-Class Classification Run using The Iris Dataset',
    tags={'Project': 'Iris', 'Creator': 'Edwin Goh'},
)

Model(workspace=Workspace.create(name='auotml-example-workspace', subscription_id='0c19fc19-85fd-4aa4-b133-61dd20fa93df', resource_group='edwin.spartan117-rg'), name=Iris-MultiClass-Classification-AutoML, id=Iris-MultiClass-Classification-AutoML:1, version=1, tags={'Project': 'Iris', 'Creator': 'Edwin Goh'}, properties={})