In [1]:
import pandas as pd
import numpy as np

In [2]:
import azureml.core
from azureml.core import Workspace

# Report which Azure ML core SDK version this kernel has loaded.
sdk_version = azureml.core.VERSION
print("Azure ML SDK Version: ", sdk_version)


Azure ML SDK Version:  1.6.0


In [3]:
# Load the workspace configuration from the config.json file in the current folder.
# `ws` is the workspace handle the later cells rely on (experiment creation and
# compute-target lookup/creation).
ws = Workspace.from_config()


In [32]:
from azureml.core import Experiment
# Name of the experiment that the AutoML run below is submitted to.
experiment_name = 'penguins-automatedML'

# Experiment handle bound to the workspace loaded earlier.
experiment = Experiment(ws, experiment_name)

In [25]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# Cluster settings. Each value can be overridden through an environment variable;
# the second argument is the fallback used when the variable is not set.
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "ds3-comptute-cls")

# Environment variables are always strings, so the node counts are converted to
# int explicitly. Without the cast, an overridden value such as "4" would be
# forwarded to the provisioning configuration as a string.
compute_min_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0))
compute_max_nodes = int(os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 2))

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    # A target with this name is already registered in the workspace: reuse it.
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
    else:
        # The name is taken by something that is not an AmlCompute cluster.
        # It is still assigned to `compute_target` (as before), but say so
        # instead of continuing silently.
        print('warning: compute target ' + compute_name +
              ' exists but is not an AmlCompute cluster: ' +
              type(compute_target).__name__)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

creating a new compute target...
Creating
Succeeded
AmlCompute wait for completion finished

Minimum number of nodes requested have been provisioned
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2020-06-15T09:55:57.874000+00:00', 'errors': None, 'creationTime': '2020-06-15T09:55:50.001362+00:00', 'modifiedTime': '2020-06-15T09:56:05.614194+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 2, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_D2_V2'}


In [6]:
# Raw Palmer penguins data, read directly from the palmerpenguins GitHub repository.
PENGUINS_RAW_URL = 'https://raw.githubusercontent.com/allisonhorst/palmerpenguins/master/data-raw/penguins_raw.csv'

df = pd.read_csv(PENGUINS_RAW_URL)

# Preview the first rows.
df.head()


Unnamed: 0,studyName,Sample Number,Species,Region,Island,Stage,Individual ID,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo),Comments
0,PAL0708,1,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A1,Yes,2007-11-11,39.1,18.7,181.0,3750.0,MALE,,,Not enough blood for isotopes.
1,PAL0708,2,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N1A2,Yes,2007-11-11,39.5,17.4,186.0,3800.0,FEMALE,8.94956,-24.69454,
2,PAL0708,3,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A1,Yes,2007-11-16,40.3,18.0,195.0,3250.0,FEMALE,8.36821,-25.33302,
3,PAL0708,4,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N2A2,Yes,2007-11-16,,,,,,,,Adult not sampled.
4,PAL0708,5,Adelie Penguin (Pygoscelis adeliae),Anvers,Torgersen,"Adult, 1 Egg Stage",N3A1,Yes,2007-11-16,36.7,19.3,193.0,3450.0,FEMALE,8.76651,-25.32426,


In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df.
df.describe()

Unnamed: 0,Sample Number,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Delta 15 N (o/oo),Delta 13 C (o/oo)
count,344.0,342.0,342.0,342.0,342.0,330.0,331.0
mean,63.151163,43.92193,17.15117,200.915205,4201.754386,8.733382,-25.686292
std,40.430199,5.459584,1.974793,14.061714,801.954536,0.55177,0.793961
min,1.0,32.1,13.1,172.0,2700.0,7.6322,-27.01854
25%,29.0,39.225,15.6,190.0,3550.0,8.29989,-26.320305
50%,58.0,44.45,17.3,197.0,4050.0,8.652405,-25.83352
75%,95.25,48.5,18.7,213.0,4750.0,9.172123,-25.06205
max,152.0,59.6,21.5,231.0,6300.0,10.02544,-23.78767


In [13]:
# Number of rows per species, to check how the classes are distributed.
df.groupby('Species').size()

Species
Adelie Penguin (Pygoscelis adeliae)          152
Chinstrap penguin (Pygoscelis antarctica)     68
Gentoo penguin (Pygoscelis papua)            124
dtype: int64

In [22]:
from sklearn.model_selection import train_test_split

# Column the model is trained to predict.
label = "Species"

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(
    df,
    random_state=42,
    test_size=0.2,
)

In [28]:
import logging

# Keyword arguments unpacked into AutoMLConfig in a later cell.
automl_settings = {
    # Time limit for a single iteration, in minutes.
    "iteration_timeout_minutes": 2,
    # Number of iterations to run.
    "iterations": 20,
    # Metric used to rank the candidate models.
    "primary_metric": 'AUC_weighted',
    # `preprocess` is deprecated; `featurization='auto'` is its documented
    # replacement and keeps automatic featurization switched on.
    "featurization": 'auto',
    "verbosity": logging.INFO,
    # Number of cross-validation folds.
    "n_cross_validations": 2
}

In [None]:
# Reference: featurization in automated ML
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-features#featurization

# Reference: customizing featurization
# https://docs.microsoft.com/en-us/azure/machine-learning/how-to-configure-auto-features#customize-featurization


In [29]:
#https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py
from azureml.train.automl import AutoMLConfig

# Arguments specific to this classification task; the shared tuning knobs
# come from `automl_settings`.
automl_task_args = {
    'task': 'classification',
    'debug_log': 'automated_ml_errors.log',
    'training_data': train_data,
    'label_column_name': label,
    # 'blacklist_models': [],  # https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.constants.supportedmodels.classification?view=azure-ml-py
}

# Both dicts are unpacked as keyword arguments, so a key present in both
# still raises TypeError, as it would with explicit keywords.
automl_config = AutoMLConfig(**automl_task_args, **automl_settings)




In [33]:
# Submit the AutoML configuration to the experiment and stream progress into the cell output.
local_run = experiment.submit(
    config=automl_config,
    show_output=True,
)

Running on local machine
Parent Run ID: AutoML_26369b95-c5c3-4592-977a-3be41efa93a5

Current status: DatasetEvaluation. Gathering dataset statistics.
Current status: FeaturesGeneration. Generating features for the dataset.
Current status: DatasetFeaturization. Beginning to fit featurizers and featurize the dataset.
Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: DatasetCrossValidationSplit. Generating individually featurized CV splits.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the 

In [35]:
from azureml.widgets import RunDetails

# Display the run-details widget for the submitted AutoML run.
run_details = RunDetails(local_run)
run_details.show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [36]:
# get_output() yields a pair: the selected run and its fitted model pipeline.
best_run, fitted_model = local_run.get_output()

# Print both, one per line.
print(best_run, fitted_model, sep='\n')

Run(Experiment: penguins-automatedML,
Id: AutoML_26369b95-c5c3-4592-977a-3be41efa93a5_18,
Type: None,
Status: Completed)
Pipeline(memory=None,
     steps=[('datatransformer', DataTransformer(enable_dnn=None, enable_feature_sweeping=None,
        feature_sweeping_config=None, feature_sweeping_timeout=None,
        featurization_config=None, force_text_dnn=None,
        is_cross_validation=None, is_onnx_compatible=None, logger=None,
        obser...666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666, 0.16666666666666666]))])
Y_transformer(['LabelEncoder', LabelEncoder()])


In [37]:
# Predict on the held-out rows with the label column removed from the inputs.
test_features = test_data.drop(columns=[label])
y_predict = fitted_model.predict(test_features)

# Show the first ten predicted labels.
print(y_predict[:10])

['Gentoo penguin (Pygoscelis papua)' 'Gentoo penguin (Pygoscelis papua)'
 'Gentoo penguin (Pygoscelis papua)' 'Gentoo penguin (Pygoscelis papua)'
 'Chinstrap penguin (Pygoscelis antarctica)'
 'Chinstrap penguin (Pygoscelis antarctica)'
 'Chinstrap penguin (Pygoscelis antarctica)'
 'Gentoo penguin (Pygoscelis papua)' 'Adelie Penguin (Pygoscelis adeliae)'
 'Gentoo penguin (Pygoscelis papua)']


In [38]:
# Per-class probabilities for the held-out rows (label column excluded from the inputs).
class_prob = fitted_model.predict_proba(
    test_data.drop(columns=[label])
)

# Show the probabilities for the first ten rows.
print(class_prob[:10])

[[0.03526527 0.020852   0.94388272]
 [0.02766617 0.03200397 0.94032986]
 [0.02990272 0.02055203 0.94954524]
 [0.06163279 0.02014841 0.91821878]
 [0.14831231 0.81911638 0.03257132]
 [0.03166373 0.93430202 0.03403425]
 [0.14102185 0.82069391 0.03828424]
 [0.0799037  0.01980348 0.90029282]
 [0.8979937  0.01919649 0.0828098 ]
 [0.05562317 0.02168007 0.92269677]]


In [None]:
#https://github.com/Azure/MachineLearningNotebooks/tree/master/how-to-use-azureml/automated-machine-learning/forecasting-energy-demand

In [None]:
# The fitted classification pipeline printed earlier has a 'datatransformer'
# step. 'timeseriestransformer' exists only in forecasting pipelines, so
# looking it up here would raise KeyError.
fitted_model.named_steps['datatransformer'].get_engineered_feature_names()

In [None]:
# Summarise the featurization applied to each input column. The step is named
# 'datatransformer' in this classification pipeline; 'timeseriestransformer'
# is only present in forecasting pipelines.
fitted_model.named_steps['datatransformer'].get_featurization_summary()