# Prerequisites

1. A Workspace and a Compute target exist, and are created outside of this notebook
2. A Tabular Dataset exists, and is created and registered outside of this notebook
3. Pip requirements: 
    - dpv2-sdk (for creating and accessing AzureML resources)
    - azureml-mlflow (for loading the tracking URI)

# Setup

These environment variables enable private preview features, such as AutoML

In [1]:
# NOTE(review): this is an absolute path on one developer's machine (presumably a
# local checkout of the CLI extension sources) — change it to your own location.
%env AZURE_EXTENSION_DIR=/home/schrodinger/automl/sdk-cli-v2/src/cli/src
# Switches on the private-preview CLI features this notebook relies on.
%env AZURE_ML_CLI_PRIVATE_FEATURES_ENABLED=true

env: AZURE_EXTENSION_DIR=/home/schrodinger/automl/sdk-cli-v2/src/cli/src
env: AZURE_ML_CLI_PRIVATE_FEATURES_ENABLED=true


## Imports

In [3]:
from azure.ml import MLClient
import mlflow

## Setting necessary context

In [4]:
# Azure scope in which the workspace lives.
subscription_id = "381b38e9-9840-4719-a5a0-61d9585e1e91"
resource_group_name = "gasi_rg_centraleuap"

# Workspace that receives the logged experiments and trials.
workspace_name = "gasi_ws_centraleuap"

# Experiment name AutoML uses to track its trials and artifacts.
experiment_name = "automl-classification-bmarketing-all"

# Compute target on which AutoML executes its trials.
compute_name = "cpu-cluster"

# Datasets, each written together with its version.
training_dataset, test_dataset, validation_dataset = (
    "bankmarketing_train:1",
    "bankmarketing_test:1",
    "bankmarketing_valid:1",
)

### Question:

Should the dataset names above be prefixed with 'azureml:...'?

## Initialize MLClient
Create an MLClient object - which is used to manage all Azure ML resources, such as workspaces, jobs, models, etc.

In [5]:
# Build the client from the subscription, resource group and workspace set earlier.
client = MLClient(
    subscription_id,
    resource_group_name,
    workspace_name=workspace_name,
)
# Sanity check: fail right here if no client object was produced.
assert client is not None

## Initialize MLFlow Client

The models and artifacts produced by AutoML can be accessed through the MLFlow interface. Initialize the MLFlow client here, and set Azure ML as its backend via the MLFlow client.

In [7]:
################################################################################
# TODO: The API to get the tracking URI is not yet available on the Workspace
# object, so the URI is looked up through the `azureml.core` Workspace class
# (imported here as WorkspaceV1) instead.
from azureml.core import Workspace as WorkspaceV1

ws = WorkspaceV1(
    workspace_name=workspace_name,
    resource_group=resource_group_name,
    subscription_id=subscription_id,
)
tracking_uri = ws.get_mlflow_tracking_uri()
del ws  # the handle was only needed for the URI lookup
################################################################################

# Point MLflow at the workspace's tracking URI and select the experiment.
mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment(experiment_name)

# Echo the URI MLflow actually ended up with.
print("\nCurrent tracking uri: {}".format(mlflow.get_tracking_uri()))


Current tracking uri: azureml://master.experiments.azureml-test.net/mlflow/v1.0/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourceGroups/gasi_rg_centraleuap/providers/Microsoft.MachineLearningServices/workspaces/gasi_ws_centraleuap?


### Questions
Q: Can we set this (the tracking URI) inside AutoML, given that nothing works at all without first setting the MLFlow context above?

Q: Do we need MLFlow client for job submissions?

# AutoML Job

## Job Configuration - A minimal example

Using default primary metric (accuracy for classification)

In [8]:
from azure.ml.entities import AutoMLJob

# One dataset reference per split.
dataset_splits = {
    "train": training_dataset,
    "test": test_dataset,
    "valid": validation_dataset,
}

# Job-level options; no primary metric is given, so the default applies.
job_configuration = {
    "blocked_models": ["KNN", "LinearSVM"],
    "exit_criterion": {"timeout_hours": 1},
    "max_concurrent_trials": 4,
    "validation": {"n_cross_validations": 5},
}

automl_job = AutoMLJob(
    compute=compute_name,
    task="classification",
    target="y",
    # proposed: data={"dataset": "", "test_dataset": "", "validation_dataset": ""},
    dataset=dataset_splits,
    configuration=job_configuration,
    properties={"save_mlflow": True},  # This should be enabled by default
)

In [9]:
# Send the job definition built in the previous cell to the service via the client.
created_job = client.jobs.create_or_update(automl_job)
# Bare last expression so the notebook displays the returned job object.
created_job

AutoMLJob({'type': 'automl_job', 'status': 'NotStarted', 'output': None, 'log_files': None, 'name': '40cb801c-53cc-4dad-bf07-988c6577c001', 'description': None, 'tags': {'model_explain_run': 'best_run', '_aml_system_azureml.automlComponent': 'AutoML'}, 'properties': {'num_iterations': '1000', 'training_type': 'TrainFull', 'acquisition_function': 'EI', 'primary_metric': 'accuracy', 'train_split': '0', 'acquisition_parameter': '0', 'num_cross_validation': '5', 'target': 'cpu-cluster', 'AMLSettingsJsonString': '{"path":"./sample_projects/","subscription_id":"381b38e9-9840-4719-a5a0-61d9585e1e91","resource_group":"gasi_rg_centraleuap","workspace_name":"gasi_ws_centraleuap","compute_target":"cpu-cluster","iterations":1000,"primary_metric":"accuracy","task_type":"classification","IsImageTask":false,"IsTextDNNTask":false,"n_cross_validations":5,"preprocess":true,"is_timeseries":false,"time_column_name":null,"grain_column_names":null,"max_cores_per_iteration":-1,"max_concurrent_iterations":4,"

In [10]:
# Look up the "Studio" entry among the job's interaction endpoints and print its URL.
studio_endpoint = created_job.interaction_endpoints["Studio"]
print("Studio URL: ", studio_endpoint.endpoint)

Studio URL:  https://ml.azure.com/runs/40cb801c-53cc-4dad-bf07-988c6577c001?wsid=/subscriptions/381b38e9-9840-4719-a5a0-61d9585e1e91/resourcegroups/gasi_rg_centraleuap/workspaces/gasi_ws_centraleuap&tid=72f988bf-86f1-41af-91ab-2d7cd011db47


## Alternate Job Configurations

### Enable ONNX compatible Models

Shows a mixed use of promoted properties along with the original entity (TrainingSettings)

In [11]:
from azure.ml.entities import AutoMLJob
from azure.ml.entities._job.automl.training_settings import TrainingSettings

# Explicit entity: request ONNX-compatible models.
training_settings = TrainingSettings(enable_onnx_compatible_models=True)

# One dataset reference per split.
dataset_splits = {
    "train": training_dataset,
    "test": test_dataset,
    "valid": validation_dataset,
}

# Promoted properties, passed alongside the explicit TrainingSettings entity.
job_configuration = {
    "blocked_models": ["KNN", "LinearSVM"],
    "exit_criterion": {"timeout_hours": 1},
    "max_concurrent_trials": 4,
    "validation": {"n_cross_validations": 5},
}

automl_job = AutoMLJob(
    compute=compute_name,
    task="classification",
    target="y",
    dataset=dataset_splits,
    training_settings=training_settings,
    configuration=job_configuration,
    properties={"save_mlflow": True},
)

In [12]:
# Show the attribute dictionary of the job's training settings.
vars(automl_job.training_settings)

{'block_list_models': ['KNN', 'LinearSVM'],
 'allow_list_models': None,
 'enable_dnn_training': None,
 'enable_onnx_compatible_models': True,
 'enable_stack_ensemble': None,
 'enable_vote_ensemble': None,
 'ensemble_model_download_timeout': None,
 'stack_ensemble_settings': None}

### Use a non-default primary metric

In [17]:
from azure.ml.entities import AutoMLJob
from azure.ml._restclient.v2020_09_01_preview.models import GeneralSettings

# Choose "auc_weighted" as the primary metric instead of the default.
general_settings = GeneralSettings(primary_metric="auc_weighted")

# One dataset reference per split.
dataset_splits = {
    "train": training_dataset,
    "test": test_dataset,
    "valid": validation_dataset,
}

# Remaining job-level options.
job_configuration = {
    "blocked_models": ["KNN", "LinearSVM"],
    "exit_criterion": {"timeout_hours": 1},
    "max_concurrent_trials": 4,
    "validation": {"n_cross_validations": 5},
}

automl_job = AutoMLJob(
    compute=compute_name,
    task="classification",
    target="y",
    dataset=dataset_splits,
    general_settings=general_settings,
    configuration=job_configuration,
    properties={"save_mlflow": True},
)

In [18]:
# Show the attribute dictionary of the job's general settings.
vars(automl_job.general_settings)

{'additional_properties': {},
 'enable_model_explainability': None,
 'log_verbosity': None,
 'primary_metric': 'auc_weighted',
 'task_type': 'classification'}

### Enable Deep Neural Nets + train-valid percentage split

In [20]:
from azure.ml.entities import AutoMLJob

# One dataset reference per split.
dataset_splits = {
    "train": training_dataset,
    "test": test_dataset,
    "valid": validation_dataset,
}

# Turn on DNN training and ask for a 0.2 validation percentage.
job_configuration = {
    "blocked_models": ["KNN", "LinearSVM"],
    "exit_criterion": {"timeout_hours": 1},
    "max_concurrent_trials": 4,
    "enable_dnn": True,
    "validation": {"valid_percent": 0.2},
}

automl_job = AutoMLJob(
    compute=compute_name,
    task="classification",
    target="y",
    dataset=dataset_splits,
    configuration=job_configuration,
    properties={"save_mlflow": True},  # This should be enabled by default
)

In [25]:
# Show the training settings and the validation-data settings, with a separator between them.
(
    vars(automl_job.training_settings),
    "-----",
    vars(automl_job.data_settings.validation_data),
)

({'block_list_models': ['KNN', 'LinearSVM'],
  'allow_list_models': None,
  'enable_dnn_training': True,
  'enable_onnx_compatible_models': None,
  'enable_stack_ensemble': None,
  'enable_vote_ensemble': None,
  'ensemble_model_download_timeout': None,
  'stack_ensemble_settings': None},
 '-----',
 {'additional_properties': {},
  'cv_split_column_names': None,
  'dataset_arm_id': 'bankmarketing_valid:1',
  'n_cross_validations': None,
  'validation_data_size': 0.2})

### Disable Ensembling

In [31]:
from azure.ml.entities import AutoMLJob
from azure.ml.entities._job.automl.training_settings import TrainingSettings

# Switch off both the vote ensemble and the stack ensemble.
training_settings = TrainingSettings(
    enable_vote_ensemble=False,
    enable_stack_ensemble=False,
)

# One dataset reference per split.
dataset_splits = {
    "train": training_dataset,
    "test": test_dataset,
    "valid": validation_dataset,
}

# Promoted properties, passed alongside the explicit TrainingSettings entity.
job_configuration = {
    "blocked_models": ["KNN", "LinearSVM"],
    "exit_criterion": {"timeout_hours": 1},
    "max_concurrent_trials": 4,
    "validation": {"n_cross_validations": 5},
    "enable_dnn": True,
}

automl_job = AutoMLJob(
    compute=compute_name,
    task="classification",
    target="y",
    dataset=dataset_splits,
    training_settings=training_settings,
    configuration=job_configuration,
    properties={"save_mlflow": True},
)

In [32]:
# Show the attribute dictionary of the job's training settings.
vars(automl_job.training_settings)

{'block_list_models': ['KNN', 'LinearSVM'],
 'allow_list_models': None,
 'enable_dnn_training': True,
 'enable_onnx_compatible_models': None,
 'enable_stack_ensemble': False,
 'enable_vote_ensemble': False,
 'ensemble_model_download_timeout': None,
 'stack_ensemble_settings': None}

### Forecasting

In [33]:
from azure.ml.entities import AutoMLJob
from azure.ml.entities._job.automl.forecasting import ForecastingSettings

# Forecasting-specific options: time column, horizon and frequency.
forecast_settings = ForecastingSettings(
    time_column_name="DATE",
    forecast_horizon=12,
    frequency='MS',
)

# NOTE(review): these are the same bankmarketing datasets and blocked models used in
# the classification examples, while the target is "BeerProduction" — confirm that
# this cell is only meant to illustrate the settings object.
dataset_splits = {
    "train": training_dataset,
    "test": test_dataset,
    "valid": validation_dataset,
}

job_configuration = {
    "blocked_models": ["KNN", "LinearSVM"],
    "exit_criterion": {"timeout_hours": 1},
    "max_concurrent_trials": 4,
    "validation": {"n_cross_validations": 5},
    "enable_dnn": True,
}

automl_job = AutoMLJob(
    compute=compute_name,
    task="forecasting",
    target="BeerProduction",
    dataset=dataset_splits,
    forecasting_settings=forecast_settings,
    configuration=job_configuration,
    properties={"save_mlflow": True},
)

In [35]:
# Show the attribute dictionary of the job's forecasting settings.
vars(automl_job.forecasting_settings)

{'country_or_region_for_holidays': None,
 'forecast_horizon': 12,
 'target_lags': None,
 'target_rolling_window_size': None,
 'frequency': 'MS',
 'feature_lags': None,
 'seasonality': None,
 'use_stl': None,
 'short_series_handling_config': None,
 'target_aggregate_function': None,
 'time_column_name': 'DATE',
 'time_series_id_column_names': None}

### Custom Featurization Settings

In [36]:
from azure.ml.entities._job.automl.featurization import ColumnTransformer, FeaturizationConfig

featurization_config = FeaturizationConfig()

# Transformers to block and columns to drop.
featurization_config.blocked_transformers = ['LabelEncoder']
featurization_config.drop_columns = ['MMIN']

# Explicit purpose for selected columns.
featurization_config.column_purposes = {
    'MYCT': 'Numeric',
    'VendorName': 'CategoricalHash',
}

# The default imputation strategy is mean; override it for three columns.
imputer_overrides = [
    ColumnTransformer(fields=[column], parameters={'strategy': strategy})
    for column, strategy in (
        ("CACH", 'median'),
        ("CHMIN", 'median'),
        ("PRP", 'most_frequent'),
    )
]

# Encoder parameters given with an empty field list.
hash_encoder_overrides = [
    ColumnTransformer(fields=[], parameters={'number_of_bits': 3.0}),
]

transformer_params_dict = {
    "Imputer": imputer_overrides,
    "HashOneHotEncoder": hash_encoder_overrides,
}

featurization_config.transformer_params = transformer_params_dict

In [37]:
from azure.ml.entities import AutoMLJob
from azure.ml.entities._job.automl.featurization import FeaturizationSettings

# Wrap the custom featurization config in the settings entity the job accepts.
featurization_settings = FeaturizationSettings(featurization_config=featurization_config)

# One dataset reference per split.
dataset_splits = {
    "train": training_dataset,
    "test": test_dataset,
    "valid": validation_dataset,
}

# Job-level options, including a 0.2 validation percentage.
job_configuration = {
    "blocked_models": ["KNN", "LinearSVM"],
    "exit_criterion": {"timeout_hours": 1},
    "max_concurrent_trials": 4,
    "validation": {"valid_percent": 0.2},
}

automl_job = AutoMLJob(
    compute=compute_name,
    task="classification",
    target="y",
    dataset=dataset_splits,
    featurization_settings=featurization_settings,
    configuration=job_configuration,
    properties={"save_mlflow": True},  # This should be enabled by default
)

In [39]:
# Show the attribute dictionary of the featurization config attached to the job.
vars(automl_job.featurization_settings.featurization_config)

{'blocked_transformers': ['LabelEncoder'],
 'column_purposes': {'MYCT': 'Numeric', 'VendorName': 'CategoricalHash'},
 'transformer_params': {'Imputer': [<azure.ml.entities._job.automl.featurization.ColumnTransformer at 0x7f6cc0e712d0>,
   <azure.ml.entities._job.automl.featurization.ColumnTransformer at 0x7f6cc0e715d0>,
   <azure.ml.entities._job.automl.featurization.ColumnTransformer at 0x7f6cc0e71dd0>],
  'HashOneHotEncoder': [<azure.ml.entities._job.automl.featurization.ColumnTransformer at 0x7f6cc0e71c90>]},
 'dataset_language': None,
 'drop_columns': ['MMIN']}