In [None]:
!pip install --upgrade azureml-sdk

In [None]:
!pip install --upgrade azureml-widgets

In [None]:
!pip install --upgrade azureml-train-automl-runtime

In [1]:
import pandas as pd
import numpy as np
import azureml.core

In [2]:
from azureml.core import Workspace
ws = Workspace.from_config()

In [3]:
from azureml.core import Experiment
experiment_name = 'penguins-automatedML'

experiment = Experiment(workspace=ws, name=experiment_name)

In [13]:
from azureml.core.compute import AmlCompute
from azureml.core.compute import ComputeTarget
import os

# choose a name for your cluster
compute_name = os.environ.get("AML_COMPUTE_CLUSTER_NAME", "myComputeCluster")
compute_min_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MIN_NODES", 0)
compute_max_nodes = os.environ.get("AML_COMPUTE_CLUSTER_MAX_NODES", 2)

# This example uses CPU VM. For using GPU VM, set SKU to STANDARD_NC6
vm_size = os.environ.get("AML_COMPUTE_CLUSTER_SKU", "STANDARD_D2_V2")


if compute_name in ws.compute_targets:
    compute_target = ws.compute_targets[compute_name]
    if compute_target and type(compute_target) is AmlCompute:
        print('found compute target. just use it. ' + compute_name)
else:
    print('creating a new compute target...')
    provisioning_config = AmlCompute.provisioning_configuration(vm_size=vm_size,
                                                                min_nodes=compute_min_nodes,
                                                                max_nodes=compute_max_nodes)

    # create the cluster
    compute_target = ComputeTarget.create(
        ws, compute_name, provisioning_config)

    # can poll for a minimum number of nodes and for a specific timeout.
    # if no min node count is provided it will use the scale settings for the cluster
    compute_target.wait_for_completion(
        show_output=True, min_node_count=None, timeout_in_minutes=20)

    # For a more detailed view of current AmlCompute status, use get_status()
    print(compute_target.get_status().serialize())

found compute target. just use it. myComputeCluster


In [4]:
from azureml.core import Dataset

dataset = Dataset.get_by_name(ws, name='Penguins').drop_columns(['Individual ID', 'studyName','Sample Number', 'Stage','Region','Island','Comments'])
dataset.take(3).to_pandas_dataframe()

Unnamed: 0,Species,Clutch Completion,Date Egg,Culmen Length (mm),Culmen Depth (mm),Flipper Length (mm),Body Mass (g),Sex,Delta 15 N (o/oo),Delta 13 C (o/oo)
0,Adelie Penguin (Pygoscelis adeliae),True,2007-11-11,39.1,18.7,181,3750,MALE,,
1,Adelie Penguin (Pygoscelis adeliae),True,2007-11-11,39.5,17.4,186,3800,FEMALE,8.94956,-24.69454
2,Adelie Penguin (Pygoscelis adeliae),True,2007-11-16,40.3,18.0,195,3250,FEMALE,8.36821,-25.33302


In [5]:
dataset.to_pandas_dataframe().groupby(['Species']).size()

Species
Adelie Penguin (Pygoscelis adeliae)          152
Chinstrap penguin (Pygoscelis antarctica)     68
Gentoo penguin (Pygoscelis papua)            124
dtype: int64

In [6]:
df = dataset.to_pandas_dataframe()
df['Species'].value_counts(normalize=True)

Adelie Penguin (Pygoscelis adeliae)          0.441860
Gentoo penguin (Pygoscelis papua)            0.360465
Chinstrap penguin (Pygoscelis antarctica)    0.197674
Name: Species, dtype: float64

In [7]:
train_data, test_data = dataset.random_split(percentage=0.8, seed=42)
label = "Species"

In [8]:
train_data.to_pandas_dataframe()['Species'].value_counts(normalize=True)

Adelie Penguin (Pygoscelis adeliae)          0.460145
Gentoo penguin (Pygoscelis papua)            0.358696
Chinstrap penguin (Pygoscelis antarctica)    0.181159
Name: Species, dtype: float64

In [9]:
test_data.to_pandas_dataframe()['Species'].value_counts(normalize=True)

Gentoo penguin (Pygoscelis papua)            0.367647
Adelie Penguin (Pygoscelis adeliae)          0.367647
Chinstrap penguin (Pygoscelis antarctica)    0.264706
Name: Species, dtype: float64

In [10]:
#https://docs.microsoft.com/en-us/python/api/azureml-automl-core/azureml.automl.core.featurization.featurizationconfig.featurizationconfig?view=azure-ml-py

from azureml.automl.core.featurization import FeaturizationConfig

featurization_config = FeaturizationConfig()
featurization_config.blocked_transformers = ['OneHotEncoder'] #A list of transformer names to be blocked during featurization
#"Supported value(s): 'DateTimeTransformer, WoETargetEncoder, CountVectorizer, TfIdf, NaiveBayes, TextTargetEncoder, ImputationMarker, CatImputer, HashOneHotEncoder, Imputer, CatTargetEncoder, MaxAbsScaler, WordEmbedding, OneHotEncoder, StringCast, LabelEncoder'."

#Add or drop columns
featurization_config.drop_columns = ['Date egg'] #Specifies columns to drop from being featurized
featurization_config.add_column_purpose('Body Mass (g)', 'Numeric') #Override the feature type for the specified column
featurization_config.add_column_purpose('Comments', 'CategoricalHash')

#Transformers
featurization_config.add_transformer_params('Imputer', ['Culmen Length (mm)'], {"strategy": "median"})
featurization_config.add_transformer_params('Imputer', ['Culmen Depth (mm)'], {"strategy": "median"})
featurization_config.add_transformer_params('Imputer', ['Sex'], {"strategy": "most_frequent"})
featurization_config.add_transformer_params('OneHotEncoder', ['Sex'], {"number_of_bits": 2})

In [None]:
#For experiments that you configure with the Python SDK, you can enable or disable the featurization setting and further specify the featurization steps to be used for your experiment. 

In [11]:
import logging

automl_settings = {
    "iteration_timeout_minutes": 1,
    "iterations": 10,
    "primary_metric": 'norm_macro_recall', #precision_score_weighted, average_precision_score_weighted
    "featurization": featurization_config, #off, auto, FeaturizationConfig
    "verbosity": logging.INFO
    #,"enable_local_managed": True # to submit a local conda or local docker run
}

In [14]:
#https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig.automlconfig?view=azure-ml-py
from azureml.train.automl import AutoMLConfig

automl_config = AutoMLConfig(task='classification',
                             debug_log='automated_ml_errors.log',
                             compute_target = compute_target,
                             training_data=train_data,
                             validation_data=test_data,
                             label_column_name=label,
                             blocked_models=['ExtremeRandomTrees','SVM'], # https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.constants.supportedmodels.classification?view=azure-ml-py
                             **automl_settings)


In [15]:
import azureml.train.automl.runtime

local_run = experiment.submit(automl_config, show_output=True)

Running on remote or ADB.
Running on remote compute: myComputeCluster
Parent Run ID: AutoML_d170c5d8-206e-4996-9873-08cb43a92387

Current status: DatasetFeaturizationCompleted. Completed fit featurizers and featurizing the dataset.
Current status: ModelSelection. Beginning model selection.

****************************************************************************************************
DATA GUARDRAILS: 

TYPE:         Class balancing detection
STATUS:       PASSED
DESCRIPTION:  Your inputs were analyzed, and all classes are balanced in your training data.
              Learn more about imbalanced data: https://aka.ms/AutomatedMLImbalancedData

****************************************************************************************************

TYPE:         Missing feature values imputation
STATUS:       DONE
DESCRIPTION:  If the missing values are expected, let the run complete. Otherwise cancel the current run and use a script to customize the handling of missing feature values t

In [16]:
from azureml.widgets import RunDetails

RunDetails(local_run).show()

_AutoMLWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', 's…

In [17]:
local_run.wait_for_completion()

{'runId': 'AutoML_d170c5d8-206e-4996-9873-08cb43a92387',
 'target': 'myComputeCluster',
 'status': 'Completed',
 'startTimeUtc': '2020-07-31T11:15:37.52927Z',
 'endTimeUtc': '2020-07-31T11:27:22.936562Z',
 'properties': {'num_iterations': '10',
  'training_type': 'TrainFull',
  'acquisition_function': 'EI',
  'primary_metric': 'norm_macro_recall',
  'train_split': '0',
  'acquisition_parameter': '0',
  'num_cross_validation': None,
  'target': 'myComputeCluster',
  'RawAMLSettingsString': None,
  'DataPrepJsonString': '{\\"training_data\\": \\"{\\\\\\"blocks\\\\\\": [{\\\\\\"id\\\\\\": \\\\\\"17d96df9-3c39-401f-8b46-84850bacf606\\\\\\", \\\\\\"type\\\\\\": \\\\\\"Microsoft.DPrep.GetDatastoreFilesBlock\\\\\\", \\\\\\"arguments\\\\\\": {\\\\\\"datastores\\\\\\": [{\\\\\\"datastoreName\\\\\\": \\\\\\"sandbox_penguins\\\\\\", \\\\\\"path\\\\\\": \\\\\\"penguins_raw.csv\\\\\\", \\\\\\"resourceGroup\\\\\\": \\\\\\"adlsgen2\\\\\\", \\\\\\"subscription\\\\\\": \\\\\\"f80606e5-788f-4dc3-a9ea-2e

In [19]:
best_run, fitted_model = local_run.get_output()

print(best_run)
print(fitted_model)

ImportError: cannot import name 'RollingOriginValidator'

In [None]:
print(best_run.get_file_names())

In [None]:
# register best model
from azureml.core.model import Model

model = best_run.register_model(model_name='penguins_class_best_model', model_path='outputs/model.pkl')

print(model.name, model.version, sep='\t')

In [None]:
# trouver les valeurs des hyperparamètres du modèle et de la featurization ?
# get_transformer_params ?

In [None]:
# load an existing model
from azureml.core.model import Model

model = Model(ws, "penguins_class_best_model")

In [None]:
y_predict = fitted_model.predict(test_data.to_pandas_dataframe().drop(label, axis=1))
print(y_predict[:68])

In [None]:
class_prob = fitted_model.predict_proba(test_data.to_pandas_dataframe().drop(label, axis=1))
print(class_prob[:68])

In [None]:
from sklearn.metrics import confusion_matrix

y_true = test_data.keep_columns(label).to_pandas_dataframe()
confusion_matrix(y_true, y_predict, labels=['Adelie Penguin (Pygoscelis adeliae)','Gentoo penguin (Pygoscelis papua)','Chinstrap penguin (Pygoscelis antarctica)'])