# SDK Demo - Flight Delay
This demo notebook shows how to use the AzureML Python SDK to train a classification model with AutoML, as we did from the GUI.

Note: the last section shows AutoML with many models, parallel runstep and pipelines.

## Update Environment

Before running the notebook, make sure the correct versions of these libraries are installed.

TODO: Confirm which packages and versions need to be installed with AzureML Python 3.8 kernel

In [None]:
! pip install azureml-responsibleai azureml-train-automl-client azureml-widgets azureml-train-automl azureml-automl-core azureml-train-automl-runtime==1.38.0
!pip install --upgrade azureml-responsibleai
!pip install liac-arff

In [None]:
import warnings
warnings.filterwarnings("ignore")

import logging
logging.basicConfig(level = logging.ERROR)

## Setup working directory

The cell below creates our working directory. This will hold our generated scripts.

In [5]:
import os

project_folder = './scripts'

# Working directory that will hold the generated scripts.
# exist_ok=True keeps the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(project_folder, exist_ok=True)

## Write helper file


In [None]:
%%writefile $project_folder/helper.py

import pandas as pd
import os
import shutil
import sys
import json
from azureml.core import Experiment
from azureml.core.run import Run
from azureml.core import Workspace
from azureml.train.automl._azureautomlsettings import AzureAutoMLSettings
sys.path.append("..")



def split_data(data_path):
    """Copy every data file under ``data_path`` into train and inference folders.

    Despite the name, no rows are held out: each input file is read with
    ``pandas.read_csv`` and written in full to both
    ``<data_path>/upload_train_data`` and ``<data_path>/upload_inference_data``.
    Files are written under their base name, so inputs that share a base name
    in different sub-folders overwrite each other.

    Args:
        data_path: Directory that is walked recursively for input files.

    Returns:
        Tuple ``(train_data_path, inference_data_path)``.
    """
    train_data_path = os.path.join(data_path, "upload_train_data")
    inference_data_path = os.path.join(data_path, "upload_inference_data")
    os.makedirs(train_data_path, exist_ok=True)
    os.makedirs(inference_data_path, exist_ok=True)

    # The input list is built before anything is written, and the two output
    # folders are excluded so that a re-run does not pick up earlier copies.
    output_dirs = (train_data_path, inference_data_path)
    files_list = [
        os.path.join(path, f)
        for path, _, files in os.walk(data_path)
        if path not in output_dirs
        for f in files
    ]

    for file in files_list:
        file_name = os.path.basename(file)
        df = pd.read_csv(file)
        df.to_csv(os.path.join(train_data_path, file_name), index=False, header=True)
        df.to_csv(os.path.join(inference_data_path, file_name), index=False, header=True)

    return train_data_path, inference_data_path

def validate_parallel_run_config(parallel_run_config):
    """Reject parallel-run configs whose total worker count exceeds the limit.

    The total is ``process_count_per_node * node_count``. When it is greater
    than 20 a hint is printed and ``ValueError`` is raised; otherwise the
    function returns ``None``.
    """
    max_concurrency = 20
    total_workers = parallel_run_config.process_count_per_node * parallel_run_config.node_count
    if not total_workers > max_concurrency:
        return

    print("Please decrease concurrency to maximum of 20 as currently AutoML does not support it.")
    message = "node_count*process_count_per_node must be between 1 and max_concurrency {}"
    raise ValueError(message.format(max_concurrency))


def get_automl_environmentx(workspace: Workspace, automl_settings_dict: AzureAutoMLSettings):
    """Return the environment AutoML configures for the given settings.

    ``workspace`` is accepted but not used. The settings are parsed with
    ``AzureAutoMLSettings.from_string_or_dict`` and applied to a fresh
    ``RunConfiguration``; three environment variables are then set on that
    configuration's environment, and ``run_configuration.environment`` is
    returned.
    """
    from azureml.core import RunConfiguration
    from azureml.train.automl._environment_utilities import modify_run_configuration
    import logging

    # Logger with only a NullHandler and propagation switched off.
    silent_logger = logging.getLogger("manymodels_null_logger")
    silent_logger.addHandler(logging.NullHandler())
    silent_logger.propagate = False

    parsed_settings = AzureAutoMLSettings.from_string_or_dict(automl_settings_dict)
    run_configuration = modify_run_configuration(
        parsed_settings, RunConfiguration(), logger=silent_logger)

    train_env = run_configuration.environment
    # NOTE(review): DISABLE_ENV_MISMATCH is a bool while the other two values
    # are strings - confirm the SDK accepts a non-string here.
    overrides = (
        ('DISABLE_ENV_MISMATCH', True),
        ('AZUREML_FLUSH_INGEST_WAIT', ''),
        ('AZUREML_METRICS_POLLING_INTERVAL', '30'),
    )
    for variable, value in overrides:
        train_env.environment_variables[variable] = value
    return run_configuration.environment


def get_output(run, results_name, output_name):
    """Download a run's parallel-run output and return the path of its result file.

    Any existing ``results_name`` folder is deleted first. The output named
    ``output_name`` of the first child of ``run`` is downloaded into
    ``results_name``, the downloaded tree is flattened with
    ``keep_root_folder``, and the first file whose name ends with
    ``parallel_run_step.txt`` is returned.

    Args:
        run: Parent run whose first child holds the output data.
        results_name: Local folder to download into (recreated on every call).
        output_name: Name of the output to fetch from the child run.

    Returns:
        Path of the downloaded ``parallel_run_step.txt`` file.

    Raises:
        FileNotFoundError: If the download contains no such file. Previously
            this case surfaced as an ``UnboundLocalError``.
    """
    # remove previous run results, if present
    shutil.rmtree(results_name, ignore_errors=True)

    parallel_run_output_file_name = "parallel_run_step.txt"

    # download the contents of the output folder
    batch_run = next(run.get_children())
    batch_output = batch_run.get_output_data(output_name)
    batch_output.download(local_path=results_name)

    keep_root_folder(results_name, results_name)
    for root, _, files in os.walk(results_name):
        for file in files:
            if file.endswith(parallel_run_output_file_name):
                # Return at once: a bare `break` would only leave the inner loop.
                return os.path.join(root, file)

    raise FileNotFoundError(
        "No '{}' file found in downloaded output '{}'".format(
            parallel_run_output_file_name, results_name))


def keep_root_folder(root_path, cur_path):
    """Flatten the tree under ``cur_path`` into ``root_path``.

    Every file found below ``cur_path`` is moved directly into ``root_path``
    and each emptied sub-folder is removed; ``root_path`` itself is kept.
    An entry that is neither a regular file nor a directory terminates the
    process through ``sys.exit``.
    """
    for entry in os.listdir(cur_path):
        source = os.path.join(cur_path, entry)
        if os.path.isfile(source):
            shutil.move(source, os.path.join(root_path, entry))
            continue
        if not os.path.isdir(source):
            sys.exit("No files found.")
        keep_root_folder(root_path, source)

    # remove empty folders
    if cur_path != root_path:
        os.rmdir(cur_path)

def write_automl_settings_to_file(automl_settings):
    """Write ``automl_settings`` as indented UTF-8 JSON to ``scripts//automlconfig.json``.

    The path is relative to the current working directory and the ``scripts``
    folder must already exist. Non-ASCII characters are written unescaped.
    """
    target = 'scripts//automlconfig.json'
    with open(target, mode='w', encoding='utf-8') as config_file:
        json.dump(automl_settings, config_file, indent=4, ensure_ascii=False)


def cancel_runs_in_experiment(ws, experiment):
    """Cancel every run of the named experiment whose status is ``'Running'``.

    Args:
        ws: Workspace that owns the experiment.
        experiment: Name of the experiment whose runs are inspected.

    Any exception raised while inspecting or cancelling a single run is
    printed and the loop moves on to the next run.
    """
    failed_experiment = Experiment(ws, experiment)
    for candidate in failed_experiment.get_runs():
        try:
            if candidate.status != 'Running':
                continue
            active_run = Run(failed_experiment, candidate.id)
            print('Canceling run: ', active_run)
            active_run.cancel()
        except Exception as e:
            print('Canceling run failed due to ', e)


def build_parallel_run_config(train_env, compute, nodecount, workercount, timeout):
    """Create and validate the ``ParallelRunConfig`` for the training step.

    Args:
        train_env: Environment the entry script runs in.
        compute: Compute target for the step.
        nodecount: Number of nodes to use.
        workercount: Processes started per node.
        timeout: Value passed as ``run_invocation_timeout``.

    Returns:
        The ``ParallelRunConfig``, after ``validate_parallel_run_config`` has
        accepted it (that call raises ``ValueError`` for too many workers).
    """
    from azureml.pipeline.steps import ParallelRunConfig

    config_kwargs = dict(
        source_directory='./scripts',
        entry_script='train_minibatch.py',
        mini_batch_size="1",  # do not modify this setting
        run_invocation_timeout=timeout,
        error_threshold=-1,
        output_action="append_row",
        environment=train_env,
        process_count_per_node=workercount,
        compute_target=compute,
        node_count=nodecount,
    )
    parallel_run_config = ParallelRunConfig(**config_kwargs)
    validate_parallel_run_config(parallel_run_config)
    return parallel_run_config


def get_automl_environment(workspace: Workspace, automl_settings_dict: dict):
    """Return the AutoML training environment for the given settings.

    Thin alias that forwards both arguments unchanged to
    ``get_automl_environmentx`` and returns its result.
    """
    return get_automl_environmentx(workspace, automl_settings_dict)


def get_training_output(run, training_results_name, training_output_name):
    """Download the training output of ``run`` and return the result file path.

    Thin alias that forwards all three arguments unchanged to ``get_output``.
    """
    return get_output(run, training_results_name, training_output_name)

## Define the training scripts

In [None]:
%%writefile $project_folder/train_helper.py
import argparse
import os
import pickle


class MetadataFileHandler:
    """Read and write pickled metadata files that live in one directory.

    Four files are managed: parsed arguments, AutoML settings, logs and the
    run DTO. NOTE: the files are loaded with ``pickle``, so the directory must
    only ever contain data written by trusted code.
    """

    # Metadata file names
    ARGS_FILE_NAME = "args.pkl"
    AUTOML_SETTINGS_FILE_NAME = "automl_settings.pkl"
    LOGS_FILE_NAME = "logs.pkl"
    RUN_DTO_FILE_NAME = "run_dto.pkl"

    def __init__(self, data_dir):
        """Remember ``data_dir`` and pre-compute the path of every metadata file."""
        # Directory where metadata files live
        self.data_dir = data_dir

        def in_data_dir(file_name):
            return os.path.join(self.data_dir, file_name)

        # Full paths to metadata files
        self._args_file_path = in_data_dir(self.ARGS_FILE_NAME)
        self._automl_settings_file_path = in_data_dir(self.AUTOML_SETTINGS_FILE_NAME)
        self._logs_file_path = in_data_dir(self.LOGS_FILE_NAME)
        self._run_dto_file_name = in_data_dir(self.RUN_DTO_FILE_NAME)

    def delete_logs_file_if_exists(self):
        """Remove the logs file; do nothing when it is absent."""
        if os.path.exists(self._logs_file_path):
            os.remove(self._logs_file_path)

    def load_automl_settings(self):
        """Return the unpickled AutoML settings."""
        settings_path = self._automl_settings_file_path
        return self.load_obj_from_disk(settings_path)

    def load_args(self):
        """Return the unpickled arguments object."""
        args_path = self._args_file_path
        return self.load_obj_from_disk(args_path)

    def load_logs(self):
        """Return the unpickled logs."""
        logs_path = self._logs_file_path
        return self.load_obj_from_disk(logs_path)

    def load_run_dto(self):
        """Return the unpickled run DTO."""
        run_dto_path = self._run_dto_file_name
        return self.load_obj_from_disk(run_dto_path)

    def write_args_to_disk(self, args):
        """Pickle ``args`` to the arguments file."""
        self.serialize_obj_to_disk(args, self._args_file_path)

    def write_automl_settings_to_disk(self, automl_settings):
        """Pickle ``automl_settings`` to the settings file."""
        self.serialize_obj_to_disk(automl_settings, self._automl_settings_file_path)

    def write_logs_to_disk(self, logs):
        """Pickle ``logs`` to the logs file."""
        self.serialize_obj_to_disk(logs, self._logs_file_path)

    def write_run_dto_to_disk(self, run_dto):
        """Pickle ``run_dto`` to the run DTO file."""
        self.serialize_obj_to_disk(run_dto, self._run_dto_file_name)

    @classmethod
    def load_obj_from_disk(cls, file_path):
        """Unpickle and return the object stored at ``file_path``."""
        with open(file_path, 'rb') as source:
            return pickle.load(source)

    @classmethod
    def serialize_obj_to_disk(cls, obj, file_path):
        """Pickle ``obj`` to ``file_path``, replacing any existing file."""
        with open(file_path, 'wb') as sink:
            pickle.dump(obj, sink)


class TrainUtil:
    """Small helpers used by the training entry script."""

    @staticmethod
    def str2bool(v):
        """Convert a command-line style boolean to ``bool``.

        ``bool`` inputs are returned unchanged. Strings are compared
        case-insensitively against yes/true/t/y/1 and no/false/f/n/0.

        Raises:
            argparse.ArgumentTypeError: If the text matches neither set.
        """
        if isinstance(v, bool):
            return v

        truthy = ('yes', 'true', 't', 'y', '1')
        falsy = ('no', 'false', 'f', 'n', '0')
        lowered = v.lower()
        if lowered in truthy:
            return True
        if lowered in falsy:
            return False
        raise argparse.ArgumentTypeError('Boolean value expected.')

In [None]:
%%writefile $project_folder/train_minibatch.py

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import argparse
import json
import os
import sys
import tempfile
from multiprocessing import current_process
from pathlib import Path
from random import randint
from subprocess import PIPE, Popen
from time import sleep

import pandas as pd
from azureml.automl.core.shared import log_server
from azureml.core import Run

from train_helper import MetadataFileHandler, TrainUtil


# This is used by UI to display the many model settings
many_model_run_properties = {'many_models_run': True}

# Command-line options of the entry script.
parser = argparse.ArgumentParser("split")
parser.add_argument("--process_count_per_node", default=1, type=int, help="number of processes per node")
parser.add_argument(
    "--retrain_failed_models", default=False, type=TrainUtil.str2bool, help="retrain failed models only")

# parse_known_args tolerates extra arguments this script does not define;
# the list of unrecognised arguments is discarded.
args, _ = parser.parse_known_args()


def read_from_json():
    """Load and return the contents of ``automlconfig.json`` next to this script."""
    script_dir = Path(__file__).absolute().parent
    config_path = str(script_dir) + "/automlconfig.json"
    with open(config_path) as config_file:
        settings = json.load(config_file)
    return settings


# Module-level state shared by init() and run().
automl_settings = read_from_json()
current_step_run = Run.get_context()
# Metadata files are kept in a fresh temporary directory created at import time.
metadata_file_handler = MetadataFileHandler(tempfile.mkdtemp())

# None when the settings file does not define a label column.
target_column = automl_settings.get('label_column_name', None)

print("target_column: {}".format(target_column))
print("retrain_failed_models: {}".format(args.retrain_failed_models))


def init():
    """Prepare the worker process before any mini batch is handled.

    Creates the per-process log directory, marks the AutoML settings as a
    many-models run, tries to stop the AutoML log server, and pickles the
    arguments, settings and run DTO so the training subprocess can load them.
    Finally sleeps for a random 1-120 seconds.
    """
    # output_folder is only printed below; nothing is written to it here.
    output_folder = os.path.join(os.environ.get("AZ_BATCHAI_INPUT_AZUREML", ""), "temp/output")
    working_dir = os.environ.get("AZ_BATCHAI_OUTPUT_logs", "")
    ip_addr = os.environ.get("AZ_BATCHAI_WORKER_IP", "")
    # Log directory layout: <working_dir>/user/<worker ip>/<process name>.
    log_dir = os.path.join(working_dir, "user", ip_addr, current_process().name)
    t_log_dir = Path(log_dir)
    t_log_dir.mkdir(parents=True, exist_ok=True)
    automl_settings['many_models'] = True
    automl_settings['many_models_process_count_per_node'] = args.process_count_per_node

    # Try stopping logging server in the parent minibatch process.
    # Otherwise, the logging server will progressively consume more and more CPU, leading to
    # CPU starvation on the box. TODO: diagnose why this happens and fix
    try:
        log_server.server.stop()
    except Exception as e:
        print("Stopping the AutoML logging server in the entry script parent process failed with exception: {}"
              .format(e))

    # When a debug log is configured, place it in the per-process log directory.
    # Note that 'path' is only assigned a temporary directory in this branch.
    debug_log = automl_settings.get('debug_log', None)
    if debug_log is not None:
        automl_settings['debug_log'] = os.path.join(log_dir, debug_log)
        automl_settings['path'] = tempfile.mkdtemp()
        print(f"{__file__}.AutoML debug log:{automl_settings['debug_log']}")

    # Write metadata files to disk, so they can be consumed by subprocesses that run AutoML
    metadata_file_handler.write_args_to_disk(args)
    metadata_file_handler.write_automl_settings_to_disk(automl_settings)
    metadata_file_handler.write_run_dto_to_disk(current_step_run._client.run_dto)

    print(f"{__file__}.output_folder:{output_folder}")
    print("init()")
    # NOTE(review): presumably staggers the start of parallel workers - confirm the intent.
    sleep(randint(1, 120))


def run(input_data_files):
    """Train one AutoML model per input file, each in its own subprocess.

    For every path in ``input_data_files`` the ``train_model.py`` script next
    to this file is launched with the data file path and the metadata
    directory as arguments. The child's stdout is echoed line by line; its
    stderr is captured and printed after the child has exited. The logs the
    child pickled into the metadata directory are then loaded and the logs
    file is deleted before the next file is processed.

    Args:
        input_data_files: Iterable of data file paths forming the mini batch.

    Returns:
        pandas.DataFrame built from the collected logs, one row per file.

    Raises:
        Exception: If a training subprocess exits with a non-zero code.
    """
    print("Entering run()")
    os.makedirs('./outputs', exist_ok=True)
    train_script = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'train_model.py')
    resultList = []
    for input_data_file in input_data_files:
        print("Launch subprocess to run AutoML on the data")
        env = os.environ.copy()
        # NOTE(review): the original comment claimed this buffers I/O aggressively,
        # but CPython treats any non-empty PYTHONUNBUFFERED value (including '0')
        # as "unbuffered". The value is kept as-is to preserve behavior.
        env['PYTHONUNBUFFERED'] = '0'
        # stderr is sent to a temporary file instead of a second pipe: draining
        # stdout while the child blocks on a full stderr pipe would deadlock.
        with tempfile.TemporaryFile() as stderr_file:
            with Popen(
                    [sys.executable, train_script, input_data_file, metadata_file_handler.data_dir],
                    env=env, stdout=PIPE, stderr=stderr_file) as proc:
                for line in proc.stdout:
                    print(line.decode().rstrip())
                proc.wait()
            stderr_file.seek(0)
            subprocess_stderr = stderr_file.read().decode().rstrip()
        print("Subprocess completed with exit code: {}".format(proc.returncode))
        if subprocess_stderr:
            print("stderr from subprocess:\n{}\n".format(subprocess_stderr))
        if proc.returncode != 0:
            raise Exception("AutoML training subprocess exited unsuccesffuly with error code: {}\n"
                            "stderr from subprocess: \n{}\n".format(proc.returncode, subprocess_stderr))
        logs = metadata_file_handler.load_logs()
        resultList.append(logs)
        metadata_file_handler.delete_logs_file_if_exists()
    print("Constructing DataFrame from results")
    result = pd.DataFrame(data=resultList)
    print("Ending run()\n")
    return result

In [None]:
%%writefile $project_folder/train_model.py

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

from train_helper import MetadataFileHandler

# TODO: Remove once Batch AI has fixed this issue.
# Exclude mounted blobfuse folders from sys.path, preventing Python from scanning
# folders in the blob container when resolving import statements. This significantly reduces traffic
# to the storage account.
import sys
sys.path = [p for p in sys.path if not p.startswith('/mnt/batch')]

import datetime  # noqa: E402
import hashlib  # noqa: E402
import os  # noqa: E402

import pandas as pd  # noqa: E402
from azureml.automl.core.shared import constants  # noqa: E402
from azureml.automl.core.shared.exceptions import AutoMLException, ClientException, ErrorTypes  # noqa: E402
from azureml.automl.core.shared.utilities import get_error_code  # noqa: E402
from azureml.core import Run  # noqa: E402
from azureml.core.model import Model  # noqa: E402
from azureml.train.automl import AutoMLConfig  # noqa: E402


# This is used by UI to display the many model settings
many_model_run_properties = {'many_models_run': True}


def compose_logs(file_name, model, start_time):
    """Build the 11-entry log row for a model that already exists.

    Positions follow the log rows built in ``run``: type, file name, run id,
    run status, model name, model tags, start time, end time, error type,
    error code and error message. Run id, status and the three error entries
    are ``None``; the end time is the current time.
    """
    return [
        'AutoML',
        file_name,
        None,
        None,
        model.name,
        model.tags,
        start_time,
        datetime.datetime.now(),
        None,
        None,
        None,
    ]


def train_model(file_path, data, automl_settings, current_step_run):
    """Submit an AutoML child run on ``data`` and return its best result.

    Args:
        file_path: Path of the input file; only used to print its base name.
        data: Training data passed to ``AutoMLConfig`` as ``training_data``.
        automl_settings: Keyword arguments forwarded to ``AutoMLConfig``.
        current_step_run: Run under which the AutoML child run is submitted.

    Returns:
        Tuple ``(fitted_model, local_run, best_child_run)``.
    """
    # splitext copes with extensions of any length (e.g. ".parquet"); slicing
    # off the last four characters only worked for three-letter extensions.
    file_name = os.path.splitext(os.path.basename(file_path))[0]
    print(file_name)
    print("in train_model")
    print('data')
    print(data.head(5))
    print(automl_settings)
    automl_config = AutoMLConfig(training_data=data, **automl_settings)

    print("submit_child")
    local_run = current_step_run.submit_child(automl_config, show_output=True)

    # Copy the many-models properties onto the child run as strings.
    local_run.add_properties({
        k: str(many_model_run_properties[k])
        for k in many_model_run_properties
    })

    print(local_run)

    best_child_run, fitted_model = local_run.get_output()

    return fitted_model, local_run, best_child_run


def run(file_path, args, automl_settings, current_step_run):
    """Train and register an AutoML model for one data file; return its log row.

    The file is read as Parquet when its extension is ``.parquet`` and as CSV
    otherwise. When ``args.retrain_failed_models`` is set and a registered
    AutoML model tagged with this input file already exists, training is
    skipped and a log row describing that model is returned instead.

    Args:
        file_path: Path of the data file to train on.
        args: Parsed entry-script arguments; only ``retrain_failed_models`` is read.
        automl_settings: Keyword arguments forwarded to ``AutoMLConfig``.
        current_step_run: Run under which the AutoML child run is submitted.

    Returns:
        list: 11 entries - 'AutoML', file name without extension, run id, run
        status, model name, tags, start time, end time, error type, error code
        and error message. ``ClientException`` and ``AutoMLException`` raised
        during training are reported in the error entries instead of being
        re-raised; other exceptions propagate.
    """
    model_name = None
    current_run = None
    error_message = None
    error_code = None
    error_type = None
    tags_dict = None

    logs = []
    date1 = datetime.datetime.now()
    print('start (' + file_path + ') ' + str(datetime.datetime.now()))

    file_name_with_extension = os.path.basename(file_path)
    file_name, file_extension = os.path.splitext(file_name_with_extension)

    try:
        if file_extension.lower() == ".parquet":
            data = pd.read_parquet(file_path)
        else:
            data = pd.read_csv(file_path)

        # InputData is part of the tags from the start so that the lookup below
        # only matches models trained on this file. Filtering on ModelType alone
        # matched any registered AutoML model and skipped every file.
        tags_dict = {'ModelType': 'AutoML', 'InputData': file_name_with_extension}

        if args.retrain_failed_models:
            print('querying for existing models')
            try:
                tags = [[k, v] for k, v in tags_dict.items()]
                models = Model.list(current_step_run.experiment.workspace, tags=tags, latest=True)

                if models:
                    print("model already exists for the dataset " + models[0].name)
                    return compose_logs(file_name, models[0], date1)
            except Exception as error:
                # Best effort: a failed lookup falls through to a normal training run.
                print('Failed to list the models. ' + 'Error message: ' + str(error))

        tags_dict.update({'StepRunId': current_step_run.id})
        tags_dict.update({'RunId': current_step_run.parent.id})

        # train model
        many_model_run_properties['many_models_input_file'] = file_name_with_extension

        fitted_model, current_run, best_child_run = train_model(file_path, data, automl_settings, current_step_run)
        # NOTE(review): file_name is a string, so this joins its characters with
        # underscores ("abc" -> "a_b_c"). Left unchanged because the registered
        # model name is the hash of this exact string.
        model_string = '_'.join(file_name)
        print("model string to encode " + model_string)
        sha = hashlib.sha256()
        sha.update(model_string.encode())
        model_name = 'automl_' + sha.hexdigest()
        tags_dict.update({'Hash': sha.hexdigest()})
        try:
            print('done training')
            print('Trained best model ' + model_name)

            print(best_child_run)
            print(fitted_model)
            print(model_name)

            print('register model')

            best_child_run.register_model(
                model_name=model_name, model_path=constants.MODEL_PATH, description='AutoML', tags=tags_dict)
            print('Registered ' + model_name)
        except Exception as error:
            # A registration failure is recorded in the log row, not raised.
            error_type = ErrorTypes.Unclassified
            error_message = 'Failed to register the model. ' + 'Error message: ' + str(error)
            print(error_message)

        date2 = datetime.datetime.now()

        logs.append('AutoML')
        logs.append(file_name)
        logs.append(current_run.id)
        logs.append(current_run.get_status())
        logs.append(model_name)
        logs.append(tags_dict)
        logs.append(str(date1))
        logs.append(str(date2))
        logs.append(error_type)
        logs.append(error_code)
        logs.append(error_message)

        print('ending (' + file_path + ') ' + str(date2))

    # 10.1 Log the error message if an exception occurs
    except (ClientException, AutoMLException) as error:
        date2 = datetime.datetime.now()
        error_message = 'Failed to train the model. ' + 'Error : ' + str(error)

        logs.append('AutoML')
        logs.append(file_name)

        if current_run:
            logs.append(current_run.id)
            logs.append(current_run.get_status())
        else:
            # No child run was created: log None as the id and 'Failed' as the status.
            logs.append(current_run)
            logs.append('Failed')

        logs.append(model_name)
        logs.append(tags_dict)
        logs.append(str(date1))
        logs.append(str(date2))
        if isinstance(error, AutoMLException):
            logs.append(error.error_type)
        else:
            logs.append(None)
        logs.append(get_error_code(error))
        logs.append(error_message)

        print(error_message)
        print('ending (' + file_path + ') ' + str(date2))

    return logs


if __name__ == '__main__':
    # Invoked as: train_model.py <data file path> <metadata directory>
    data_file_path = sys.argv[1]
    data_dir = sys.argv[2]
    # Load the arguments, settings and run DTO pickled into the metadata directory.
    metadata_file_handler = MetadataFileHandler(data_dir)
    args = metadata_file_handler.load_args()
    automl_settings = metadata_file_handler.load_automl_settings()
    run_dto = metadata_file_handler.load_run_dto()
    # Rebuild the step run from the run scope and the stored DTO.
    # NOTE(review): _load_scope and _run_dto are private SDK APIs - verify on SDK upgrades.
    experiment, run_id = Run._load_scope()
    current_step_run = Run(experiment, run_id, _run_dto=run_dto)
    logs = run(data_file_path, args, automl_settings, current_step_run)
    # Persist the log row to the metadata directory as the logs file.
    metadata_file_handler.write_logs_to_disk(logs)

# Import and verify the Azure ML SDK.

In [None]:
import azureml.core

azureml.core.VERSION

## Load Data from Azure Dataset Registry

First step is to get our data using the Dataset module, the function `Dataset.get_by_name()` returns a registered Dataset from a given `workspace` and its registration `name`.

Then the tabular dataset is converted to a Pandas DataFrame.

For more information on **Dataset**, please visit: [Microsoft Dataset Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.dataset.dataset?view=azure-ml-py#get-by-name-workspace--name--version--latest--)


In [None]:
from azureml.core.workspace import Workspace
from azureml.core import Dataset

# Load the workspace from its configuration file.
ws = Workspace.from_config()

# Fetch the registered dataset by name. NOTE: a dataset with this name is registered
# by the "Create Dataset" cell further down; run that cell first if this lookup fails.
tabular = Dataset.get_by_name(ws, '/flightdelay/flightdelayweather_ds')

# Keep the full dataset as a pandas DataFrame, then display the first three rows.
data = tabular.to_pandas_dataframe()
tabular.take(3).to_pandas_dataframe()

## Create AzureML Compute Cluster

Firstly, check for the existence of the cluster. If it already exists, we are able to reuse it. Checking for the existence of the cluster can be performed by calling the constructor `ComputeTarget()` with the current workspace and name of the cluster.

In case the cluster does not exist, the next step will be to provide a configuration for the new AML cluster by calling the function `AmlCompute.provisioning_configuration()`. It takes as parameters the VM size and the max number of nodes that the cluster can scale up to. After the configuration has been created, `ComputeTarget.create()` should be called with the workspace object, the cluster name and the previously created configuration object.

For more information on **ComputeTarget**, please visit: [Microsoft ComputeTarget Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute.computetarget?view=azure-ml-py)

For more information on **AmlCompute**, please visit: [Microsoft AmlCompute Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.compute.amlcompute.amlcompute?view=azure-ml-py)


**Note:** Please wait for the execution of the cell to finish before moving forward.

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

### Create AML CPU Compute Cluster

try:
    # Look up the existing 'cpu-cluster'; ComputeTargetException is handled
    # below by creating the cluster.
    compute_target = ComputeTarget(workspace=ws, name='cpu-cluster')
    print('Found existing compute target.')
except ComputeTargetException:
    print('Creating a new compute target...')
    # The cluster may scale up to four Standard_DS12_v2 nodes.
    compute_config = AmlCompute.provisioning_configuration(vm_size='Standard_DS12_v2',
                                                           max_nodes=4)

    # create the cluster
    compute_target = ComputeTarget.create(ws, 'cpu-cluster', compute_config)

    # Only a newly created cluster is waited for.
    compute_target.wait_for_completion(show_output=True)

# Create Dataset
This step uploads the training data and creates a dataset if it was not previously created

In [None]:
from azureml.core import Workspace, Datastore, Dataset
from azureml.data.datapath import DataPath

ws = Workspace.from_config()
datastore = Datastore.get(ws, 'workspaceblobstore')
# Upload the local data folder to '/flightdelay' on the datastore, overwriting existing files.
ds = Dataset.File.upload_directory(src_dir='../../azureStorageFiles',
    target=DataPath(datastore,'/flightdelay'),
    overwrite=True,
    show_progress=True)

# Build a tabular dataset from the uploaded CSV file.
flight_dataset_2008_with_weather = Dataset.Tabular.from_delimited_files(path=[(datastore, '/flightdelay/flight_dataset_2008_with_weather.csv')])

# Register it under the name the "Load Data" cell looks up; every run adds a new version.
flightdelayweather_ds = flight_dataset_2008_with_weather.register(workspace=ws, name='/flightdelay/flightdelayweather_ds', create_new_version=True)

# Automated Machine Learning

![automl](./automl.gif)

## Instantiate an Automated ML Config

Before the execution of an Automated ML run, the `AutoMLConfig` should be set up. `AutoMLConfig` is a configuration object that contains and persists the parameters for configuring the experiment run. This configuration is a key element in the execution of the run since it defines things such as the number of iterations and the primary metric to optimize. In the example below the run is set up to execute a classification task with 3 iterations, using `accuracy` as the primary metric.

For more information on **AutoMLConfig**, please visit: [Microsoft AutoMLConfig Documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.automlconfig?view=azure-ml-py)

In [16]:
from azureml.train.automl import AutoMLConfig

# Randomly split the registered dataset 90% / 10% (fixed seed) into training and validation data.
# `tabular`, `compute_target` and `project_folder` come from earlier cells.
training_data, validation_data = tabular.random_split(percentage=0.9, seed=1)

# Classification on the 'ArrDelay15' label, optimising accuracy. The run is kept
# short for the demo: at most 3 iterations of up to 5 minutes each.
automl_config = AutoMLConfig(task = 'classification',
                             max_iterations = 3,
                             iteration_timeout_minutes = 5, 
                             max_cores_per_iteration = 4,
                             primary_metric = 'accuracy',
                             debug_log = 'automl.log',
                             training_data = training_data,
                             validation_data = validation_data,
                             label_column_name = "ArrDelay15",
                             compute_target = compute_target,
                             path = project_folder,
                             model_explainability = True,
                             experiment_exit_score = 0.9,
                             enable_early_stopping = True,
                             enable_onnx_compatible_models=True)

## Run our Experiment on the Compute Cluster

The `Experiment` constructor creates an experiment instance. It takes the current workspace, which is fetched by calling `Workspace.from_config()`, and an experiment name.

The `experiment.submit()` function is called to send the experiment for execution. The only parameter received by this function is the `AutoMLConfig` object instantiated previously in this module.

For more information on **Experiment**, please visit: [Microsoft Experiment Documentation](https://docs.microsoft.com/en-us/python/api/azureml-core/azureml.core.experiment.experiment?view=azure-ml-py)

The Compute Cluster will take a few minutes to scale up. You can monitor this in the Studio from the link.

In [None]:
from azureml.core.experiment import Experiment

# Get an instance of the Workspace from the config file
ws = Workspace.from_config()

# Create Experiment
experiment = Experiment(ws, 'flight-delay-exp')

remote_run = experiment.submit(automl_config, show_output=False)
remote_run

## Display Automated ML Run Details

The creation of an object of type `AutoMLRun` will enable us to observe the experiment progress and results. The object is created by calling the constructor `AutoMLRun()`. It takes as arguments the experiment and the identifier of the run to fetch. After the object has been instantiated, the `RunDetails()` function will retrieve the progress, metrics, and tasks for the specified run. They will be displayed by calling the function `show()` over the mentioned object.

For more information on **AutoMLRun**, please visit: [Microsoft AutoMLRun Documentation](https://docs.microsoft.com/en-us/python/api/azureml-train-automl-client/azureml.train.automl.run.automlrun?view=azure-ml-py)

For more information on **RunDetails**, please visit: [Microsoft RunDetails Documentation](https://docs.microsoft.com/en-us/python/api/azureml-widgets/azureml.widgets.rundetails?view=azure-ml-py)


**Note:** Please wait for the execution of the cell to finish before moving forward. (Status should be **Completed**)

In [None]:
from azureml.train.automl.run import AutoMLRun 
from azureml.widgets import RunDetails
from azureml.core.experiment import Experiment

# Re-attach to the submitted run by id; `remote_run` must exist from the previous cell.
experiment = Experiment(ws, 'flight-delay-exp')
remote_run = AutoMLRun(experiment=experiment, run_id=remote_run.id)

# Show the run details widget for the run.
RunDetails(remote_run).show()

## Show best run

Select the best model from your iterations. The `get_output` function returns the best run and the fitted model for the last fit invocation. By using the overloads on get_output, you can retrieve the best run and fitted model for any logged metric or a particular iteration.

In [None]:
remote_run.wait_for_completion()
best_run, fitted_model = remote_run.get_output()
print(best_run)
print(fitted_model)

## Show best model type

Models generated by AutoML are not black boxes and can be inspected.

## Class balancing detection

Imbalanced data is commonly found in data for machine learning classification scenarios, and refers to data that contains a disproportionate ratio of observations in each class. This imbalance can lead to a falsely perceived positive effect of a model's accuracy, because the input data has bias towards one class, which results in the trained model to mimic that bias.

`get_guardrails()` prints and returns detailed results from running Guardrail verification.

In [None]:
guardrails = remote_run.get_guardrails()

## Feature Engineering

In Azure Machine Learning, data-scaling and normalization techniques are applied to make feature engineering easier. Collectively, these techniques and this feature engineering are called featurization in automated machine learning, or AutoML, experiments.

Automated ML makes it transparent to get this information from the fitted_model output from automated ML.

By calling `get_featurization_summary()` we will be able to retrieve the featurization summary for all the input features.

In [None]:
import pandas as pd
feature_engineered_sum = pd.DataFrame(fitted_model.named_steps['datatransformer'].get_featurization_summary())
feature_engineered_sum

## Engineered Features Importance

The ExplanationClient object defines the client that uploads and downloads explanations. The best run is passed as an argument, and from it the explanations are stored as raw_explanations.

For more information on **ExplanationClient**, please visit: [Microsoft ExplanationClient Class Documentation](https://docs.microsoft.com/en-us/python/api/azureml-interpret/azureml.interpret.explanationclient?view=azure-ml-py)

In [None]:
import matplotlib.pyplot as plt
from azureml.interpret import ExplanationClient
import seaborn as sns

client = ExplanationClient.from_run(best_run)

## Raw Features Importance

The `download_model_explanation()` function downloads a model explanation that has been stored in run history.

The `get_feature_importance_dict()` function returns a dictionary that holds the feature names and their importance values.

In [None]:
# Download the raw-feature explanation through `client` (created in an earlier cell).
raw_explanations = client.download_model_explanation(raw=True)
# Single-row frame: one column per feature, row label 'Importance'.
summ = pd.DataFrame(raw_explanations.get_feature_importance_dict(), index = ['Importance'])
with sns.plotting_context('notebook', font_scale=1.4):
    plt.subplots(figsize=(13,9))
    # Plot at most the first ten feature columns as horizontal bars.
    sns.barplot(data=summ.iloc[:10,:10], orient='h').set_title('Raw Explanations Feature Importance')

In [None]:
print("You can visualize the engineered explanations under the 'Explanations' tab in the AutoML run at:\n" + best_run.get_portal_url())

# Responsible ML

## InterpretML

Interpretability is critical for data scientists, auditors, and business decision makers alike to ensure compliance with company policies, industry standards, and government regulations.

Using the classes and methods in the SDK, you can:

* Explain model prediction by generating feature importance values for the entire model and/or individual datapoints.
* Achieve model interpretability on real-world datasets at scale, during training and inference.
* Use an interactive visualization dashboard to discover patterns in data and explanations at training time

As our next step we will retrieve our trained model and instantiate the Explainability Dashboard with the data we encoded above.

After the Explainability Dashboard has loaded you will be able to navigate through the user interface to identify the most important features of your new model.

In [None]:
import joblib
from azureml.interpret import ExplanationClient
from interpret_community.widget import ExplanationDashboard

# Fetch the serialized model produced by the best run and load it locally.
# NOTE: joblib.load unpickles the file, so only load models from runs you trust.
best_run.download_file('outputs/model.pkl')
model_testing = joblib.load('model.pkl')

# Raw-feature explanation stored with the run.
explanations = client.download_model_explanation(raw=True)

# Validation features only: the label column is dropped before the data is
# handed to the dashboard.
val = validation_data.to_pandas_dataframe().drop(columns=['ArrDelay15'])

ExplanationDashboard(explanations, model_testing, datasetX=val)

## Fairlearn

Artificial intelligence and machine learning systems can display unfair behavior.

Let's use Fairlearn open-source Python package with Azure Machine Learning to perform the following tasks:

* Assess the fairness of your model predictions. To learn more about fairness in machine learning, see the fairness in machine learning article.
* Upload, list and download fairness assessment insights to/from Azure Machine Learning studio.
* See a fairness assessment dashboard in Azure Machine Learning studio to interact with your model(s)' fairness insights.

In [28]:
from sklearn.preprocessing import LabelEncoder

# Encode the ArrDelay15 label column as integer class ids.
le = LabelEncoder()
Y = le.fit_transform(data['ArrDelay15'].values)

# NOTE(review): the train and test features/labels are all derived from the
# same `data` frame, so the fairness assessment in the following cells is
# computed on the data the model is fitted on.
X_train = data.drop(columns=['ArrDelay15'])
X_test = data.drop(columns=['ArrDelay15'])
Y_train = Y
# Sensitive feature used for the fairness assessment: the carrier column.
A_test = data['UniqueCarrier'].to_frame()
Y_test = Y

## Display Fairlearn Dashboard

Let's assess how a model’s predictions impact different groups, and compare multiple models along different fairness and performance metrics.

The `FairlearnDashboard` class wraps the dashboard component.

In [None]:
from fairlearn.widget import FairlearnDashboard

# Fit the downloaded model on the label-encoded target so that its predictions
# use the same encoding as Y_test.
# NOTE(review): fit() is called on model_testing itself, so the model loaded
# from the run is presumably refitted in place; model_2 is not referenced
# again in the visible cells.
model_2 = model_testing.fit(X_train, Y_train)
preds = model_testing.predict(X_test)

# y_pred is passed as a list holding one list of predictions (a single model).
FairlearnDashboard(sensitive_features=A_test,
                   sensitive_feature_names=['UniqueCarrier'],
                   y_true=Y_test.tolist(),
                   y_pred=[preds.tolist()])

# Register Model

Next, register the model obtained from the best run. To do so, call the `register_model()` function on the best run; it takes care of registering the model file that the run produced.

In [None]:
# Register the model file produced by the best run so it can be deployed.
# The description and tags are stored with the registered model and act as a
# lightweight model card (datasheet, intended use, caveats).
model = best_run.register_model(
    model_name='flight_delay_weather',
    model_path='outputs/model.pkl',
    # Link the registered model to the dataset it was trained on.
    datasets=[(Dataset.Scenario.TRAINING, tabular)],
    # Describes THIS model (the previous text was copied from an unrelated model card).
    description='AutoML classification model that predicts the ArrDelay15 flight arrival delay label; built for the Flight Delay demo.',
    tags={
        'title': 'Flight Delay Model',
        'datasheet_description':
"""
Last updated: October 2020

Based on dataset from [Statistical Computing Statistical Graphics](http://stat-computing.org/dataexpo/2009/the-data.html)

""",
        'details': 'This model was developed for Microsoft.',
        'date': 'October 2020, trained on data that cuts off at the end of 2008.',
        'type': 'Classification model',
        'version': '1.0',
        'help': 'https://www.azure.com/',
        'usecase_primary':
"""
Developed for Flight Delay Demo.

""",
        'usecase_secondary':
"""
Field demos and marketing.

""",
        'usecase_outofscope':
"""
Do not use for production environments.

""",
        'dataset_description':
"""
The data comes originally from RITA where it is described in detail. You can download the data there, or from the bzipped csv files listed below. These files have derivable variables removed, are packaged in yearly chunks and have been more heavily compressed than the originals.

""",
        'motivation': 'Demo the main features behind the Azure ML Workspace environment',
        'caveats':
"""
""",
    },
)

print("Model name: " + model.name, "Model version: " + str(model.version), sep="\n")

## Upload Fairlearn Dashboard to run

Create a dashboard dictionary using Fairlearn's metrics package. The `_create_group_metric_set` method has arguments similar to the Dashboard constructor, except that the sensitive features are passed as a dictionary (to ensure that names are available).

In [31]:
from fairlearn.metrics._group_metric_set import _create_group_metric_set

# Sensitive features are passed by name so the dashboard can label them.
sf = {'UniqueCarrier': A_test['UniqueCarrier']}

# Predictions of the model(s) to assess, keyed by the registered model id.
ys_pred = {model.id: model_testing.predict(X_test)}

dash_dict = _create_group_metric_set(
    y_true=Y_test,
    predictions=ys_pred,
    sensitive_features=sf,
    prediction_type='binary_classification',
)

Now we can fetch the Experiment, then a Run, and upload our dashboard to it:

In [None]:
from azureml.contrib.fairness import upload_dashboard_dictionary, download_dashboard_by_upload_id

dashboard_title = "Fairness in Flight Delay Classifier"

# Start an interactive run to attach the dashboard to (no snapshot directory).
run = experiment.start_logging(snapshot_directory=None)
try:
    upload_id = upload_dashboard_dictionary(run, dash_dict, dashboard_name=dashboard_title)
    print("\nUploaded to id: {0}\n".format(upload_id))

    # Read the uploaded dashboard back by its upload id.
    downloaded_dict = download_dashboard_by_upload_id(run, upload_id)
finally:
    # Always mark the run as completed, even if the upload or download failed.
    run.complete()

# Deployment

## Create managed-endpoints directory

Create a new directory to hold the configuration files for deploying a managed endpoint.

In [None]:
import os

managed_endpoints = './managed-endpoints'

# Working directory for the endpoint configuration files.
# exist_ok=True makes the cell safe to re-run without a separate existence check.
os.makedirs(managed_endpoints, exist_ok=True)

# Remove a leftover .amlignore file from the directory, if one is present.
amlignore_path = os.path.join(managed_endpoints, ".amlignore")
if os.path.exists(amlignore_path):
    os.remove(amlignore_path)

## Create Scoring File

Creating the scoring file is the next step before deploying the service. This file is responsible for the actual generation of predictions using the model. The values or scores generated can represent predictions of future values, but they might also represent a likely category or outcome.

The first thing to do in the scoring file is to load the model. In `init()`, the model file is located through the `AZUREML_MODEL_DIR` environment variable and loaded with `joblib.load()`.

After the model has been loaded, the `model.predict()` function is called in `run()` to score each request.

For more information on **Machine Learning - Score**, please visit: [Microsoft Machine Learning - Score Documentation](https://docs.microsoft.com/en-us/azure/machine-learning/studio-module-reference/machine-learning-score)

In [None]:
%%writefile $managed_endpoints/score.py
import os
import pickle
import json
import joblib
import time
import numpy as np
import pandas as pd
import azureml.automl.core
from azureml.core.model import Model
 
def init():
    """Load the model into the module-level ``model`` global.

    The model file ``model.pkl`` is looked up in the directory named by the
    ``AZUREML_MODEL_DIR`` environment variable. The hosting service is expected
    to call this once before any ``run()`` request (NOTE(review): confirm
    against the deployment target).
    """
    global model
    model_path = os.path.join(os.getenv("AZUREML_MODEL_DIR"), "model.pkl")
    model = joblib.load(model_path)
    # Log only after the load succeeded, so the message is truthful, and keep
    # a space between the text and the timestamp.
    print("model initialized " + time.strftime("%H:%M:%S"))
    
def run(data):
    """Score one JSON request with the loaded model.

    ``data`` is a JSON string whose top-level ``"data"`` key holds a list of
    rows; each row lists the 30 feature values in the column order below.

    Returns ``{"result": [...]}`` with the predictions on success, or
    ``{"error": "<message>"}`` if parsing, frame construction or prediction
    raises.
    """
    feature_columns = [
        'Month', 'DayofMonth', 'DayOfWeek', 'CRSDepTime', 'CRSArrTime',
        'UniqueCarrier', 'CRSElapsedTime', 'Origin', 'Dest', 'Distance',
        'Origin_Lat', 'Origin_Lon', 'Origin_State',
        'Dest_Lat', 'Dest_Lon', 'Dest_State',
        'Origin_dayl', 'Dest_dayl', 'Origin_prcp', 'Dest_prcp',
        'Origin_srad', 'Dest_srad', 'Origin_swe', 'Dest_swe',
        'Origin_tmax', 'Dest_tmax', 'Origin_tmin', 'Dest_tmin',
        'Origin_vp', 'Dest_vp',
    ]
    try:
        rows = json.loads(data)["data"]
        frame = pd.DataFrame(rows, columns=feature_columns)
        result = model.predict(frame)
    except Exception as e:
        # Report the failure to the caller instead of raising.
        return {"error": str(e)}
    else:
        return {"result": result.tolist()}

## Create the environment definition

The following file contains the details of the environment to host the model and code. 

In [None]:
%%writefile $managed_endpoints/score-new.yml
# Conda environment for the managed endpoint deployment
# (referenced as conda_file in endpointconfig.yml).
name: fd-endpoint-managed-env
dependencies:
- python=3.6.2
- pip:
  # NOTE(review): the notebook's setup cell installs
  # azureml-train-automl-runtime==1.38.0 while these pins are 1.32.x; confirm
  # that a model trained with the newer SDK loads in this environment.
  - azureml-sdk[notebooks,automl]~=1.32.0
  - azureml-defaults~=1.32.0
  - inference-schema
  - azureml-monitoring
  - shap==0.39.0
- numpy
- scikit-learn==0.22.1
channels:
- anaconda
- conda-forge

## Define the endpoint configuration
Specific inputs are required to deploy a model on an online endpoint:

1. Model files.
1. The code that's required to score the model.
1. An environment in which your model runs.
1. Settings to specify the instance type and scaling capacity.

In [None]:
%%writefile $managed_endpoints/endpointconfig.yml
# Managed online endpoint definition with a single deployment named "blue".
# NOTE(review): the CLI cells pass `-n fd-managed-endpoint`, which differs from
# the name below; confirm which name the endpoint is expected to have.
name: fd-endpoint-managed
type: online
auth_mode: key
# All traffic is routed to the blue deployment.
traffic:
  blue: 100

deployments:
  #blue deployment
  - name: blue
    # NOTE(review): pins version 1 of the registered model; confirm it matches
    # the version printed by the registration cell.
    model: azureml:flight_delay_weather:1
    code_configuration:
      code:
        local_path: ./
      scoring_script: score.py
    environment: 
      name: fd-endpoint-managed-env
      version: 1
      path: ./
      # Conda dependencies come from the score-new.yml file in this directory.
      conda_file: file:./score-new.yml
      docker:
          image: mcr.microsoft.com/azureml/openmpi3.1.2-ubuntu18.04:20210727.v1
    instance_type: Standard_DS3_v2
    scale_settings:
      scale_type: manual
      instance_count: 1
      min_instances: 1
      max_instances: 2

## Deploy your managed online endpoint to Azure

This deployment might take up to 15 minutes, depending on whether the underlying environment or image is being built for the first time. Subsequent deployments that use the same environment will finish processing more quickly.

In [None]:
!az ml endpoint create -g [your resource group name] -w [your AML workspace name] -n fd-managed-endpoint -f ./managed-endpoints/endpointconfig.yml

## Generate a sample request JSON file

Export some test data to a JSON file we can send to the endpoint.

In [None]:
%%writefile $managed_endpoints/sample-request.json
{"data": [
[1, 30, 3, 1312, 1459, "DL", 287, "CVG", "LAX", 1900, 39.04614278, -84.6621725, "KY", 33.94253611, -118.4080744, "CA", 35942.4, 36979.2, 9.0, 0.0, 268.8, 304.0, 8.0, 0.0, 5.0, 15.5, -10.5, 6.0, 280.0, 920.0],
[5, 30, 5, 1920, 2202, "OH", 102, "HSV", "DCA", 613, 34.6404475, -86.77310944, "AL", 38.85208333, -77.03772222, "VA", 50803.2, 52185.6, 0.0, 0.0, 393.6, 499.2, 0.0, 0.0, 28.5, 26.5, 18.0, 11.5, 2080.0, 1360.0],
[7, 14, 1, 1310, 1550, "UA", 160, "LAX", "SEA", 954, 33.94253611, -118.4080744, "CA", 47.44898194, -122.3093131, "WA", 50457.6, 55296.0, 0.0, 0.0, 441.6, 454.4, 0.0, 0.0, 27.0, 26.5, 19.0, 12.5, 840.0, 1080.0],
[3, 4, 2, 1735, 2100, "AA", 265, "BOS", "DFW", 1562, 42.3643475, -71.00517917, "MA", 32.89595056, -97.0372, "TX", 40089.6, 41126.4, 10.0, 7.0, 316.8, 262.4, 64.0, 0.0, 14.5, 12.5, 2.0, 0.5, 720.0, 640.0],
[7, 18, 5, 730, 945, "AA", 75, "ORD", "DTW", 235, 41.979595, -87.90446417, "IL", 42.21205889, -83.34883583, "MI", 52876.8, 52876.8, 0.0, 0.0, 329.6, 396.8, 0.0, 0.0, 31.5, 32.0, 22.5, 20.0, 2720.0, 2320.0],
[10, 15, 3, 1750, 1850, "AA", 60, "DFW", "SAT", 247, 32.89595056, -97.0372, "TX", 29.53369444, -98.46977778, "TX", 40089.6, 40435.2, 17.0, 9.0, 172.8, 185.6, 0.0, 0.0, 25.0, 30.0, 17.0, 20.5, 1920.0, 2400.0],
[5, 14, 3, 1835, 1928, "US", 53, "BWI", "PHL", 90, 39.17540167, -76.66819833, "MD", 39.87195278, -75.24114083, "PA", 50803.2, 51148.8, 0.0, 0.0, 499.2, 505.6, 0.0, 0.0, 22.5, 24.5, 8.5, 8.5, 1120.0, 1120.0],
[2, 27, 3, 1600, 1830, "FL", 150, "LGA", "ATL", 761, 40.77724306, -73.87260917, "NY", 33.64044444, -84.42694444, "GA", 39398.4, 40435.2, 0.0, 7.0, 352.0, 256.0, 0.0, 0.0, 7.0, 9.5, -4.0, -2.5, 440.0, 520.0],
[11, 24, 1, 1837, 2015, "OO", 98, "LNK", "ORD", 466, 40.85097222, -96.75925, "NE", 41.979595, -87.90446417, "IL", 33868.8, 33523.2, 0.0, 2.0, 240.0, 172.8, 0.0, 0.0, 11.0, 5.0, -4.5, -2.0, 440.0, 520.0]]}

## Invoke the endpoint to score data by using your model

You can use either the invoke command or a REST client of your choice to invoke the endpoint and score against it.


In [None]:
!az ml endpoint invoke -g [your resource group name] -w [your AML workspace name] -n fd-managed-endpoint --request-file ./managed-endpoints/sample-request.json

# ML Pipelines

![pipeline](./pipeline.gif)

## Solution Accelerator: Many Models

In the real world, many problems can be too complex to be solved by a single machine learning model. Whether that be predicting sales for each individual store, building a predictive maintenance model for hundreds of oil wells, or tailoring an experience to individual users, building a model for each instance can lead to improved results on many machine learning problems.

Azure Machine Learning (AML) makes it easy to train, operate, and manage hundreds or even thousands of models.

## Split data in two sets

Let's set up the path that holds the files for the individual airports and split the data into training and testing/inferencing sets.

In [None]:
from scripts.helper import split_data

# Folder that holds the per-airport files (created if it does not exist yet).
target_path = 'airports' 
os.makedirs(target_path, exist_ok=True)

# Split each file and store in corresponding directory
# (the helper presumably returns the train and inference folder paths).
train_path, inference_path = split_data(target_path)

## Upload data to Datastore in AML Workspace

The next step is to upload our split data into our Workspace default datastore.

In [None]:
# Connect to default datastore
datastore = ws.get_default_datastore()

# Upload train data
ds_train_path = target_path + '_train'
datastore.upload(src_dir=train_path, target_path=ds_train_path, overwrite=True)

# Upload inference data
ds_inference_path = target_path + '_inference'
datastore.upload(src_dir=inference_path, target_path=ds_inference_path, overwrite=True)

## Register dataset

From our datastore, let's register our new file datasets.

In [None]:
from azureml.core.dataset import Dataset

# Names under which the two file datasets are registered in the workspace
dataset_name = 'airport_data'
train_dataset_name = dataset_name + '_train'
inference_dataset_name = dataset_name + '_inference'

# Create a file dataset over each uploaded datastore folder
train_files = datastore.path(ds_train_path)
ds_train = Dataset.File.from_files(path=train_files, validate=False)
inference_files = datastore.path(ds_inference_path)
ds_inference = Dataset.File.from_files(path=inference_files, validate=False)

# Register both datasets; an existing name gets a new version
ds_train.register(ws, train_dataset_name, create_new_version=True)
ds_inference.register(ws, inference_dataset_name, create_new_version=True)

## Create Experiment

Afterwards, let's set up a new Experiment for our Pipeline.

In [None]:
from azureml.core import Experiment

# Experiment that groups the many-models pipeline runs.
# NOTE: this rebinds `experiment`, which the fairness upload cell also used.
experiment = Experiment(ws, 'manymodels-training-pipeline')

print('Experiment name: ' + experiment.name)

## Fetch registered dataset

Fetch the training and testing data from the Workspace Dataset registry.

In [None]:
from azureml.core.dataset import Dataset

# 'airport_data_train' is the name the training file dataset was registered under.
filedst_5_models = Dataset.get_by_name(ws, name='airport_data_train')
# Give the dataset the input name 'train_5_models' for use as a pipeline step input.
filedst_5_models_input = filedst_5_models.as_named_input('train_5_models')

## Build the training pipeline

This dictionary defines the AutoML settings.

In [None]:
import logging
from scripts.helper import write_automl_settings_to_file

# AutoML settings for the many-models training runs: a classification task on
# the 'ArrDelay15' label, optimised for accuracy.
# NOTE(review): confirm the limits (25 iterations, 30-minute iteration timeout,
# exit score 0.8) suit the per-airport data volumes.
automl_settings = {
    "task" : 'classification',
    "primary_metric" : 'accuracy',
    "iteration_timeout_minutes" : 30,
    "iterations" : 25,
    "label_column_name" : 'ArrDelay15',
    "verbosity" : logging.INFO, 
    "debug_log": 'automl_many_models_debug.txt',
    "experiment_exit_score": 0.8
}

# Persist the settings through the helper (presumably to a file that the
# training step reads; see scripts/helper.py).
write_automl_settings_to_file(automl_settings)

## Set up environment for ParallelRunStep


Environment defines a collection of resources that we will need to run our pipelines. We configure a reproducible Python environment for our training script.

In [None]:
from scripts.helper import get_automl_environment
# Environment for the training step, obtained from the helper using the
# workspace and the AutoML settings defined above.
train_env = get_automl_environment(workspace=ws, automl_settings_dict=automl_settings)

## Set up ParallelRunConfig

ParallelRunConfig is the configuration for the parallel run step. You will need to determine the number of workers and nodes appropriate for your use case. The process_count_per_node is based on the number of cores of the compute VM. The node_count determines the number of compute nodes to use; increasing the node count will speed up the training process.

node_count: The number of compute nodes to be used for running the user script. We recommend starting with 3 and increasing the node_count if the training is taking too long.

process_count_per_node: The number of processes per node.

run_invocation_timeout: The run() method invocation timeout in seconds. The timeout should be set to the maximum training time of one AutoML run (with some buffer). By default, it is 60 seconds.

In [None]:
from scripts.helper import build_parallel_run_config

# Sizing of the parallel step: one node running six processes.
node_count=1
process_count_per_node=6
# Timeout for a single run() invocation, in seconds.
run_invocation_timeout=3700

parallel_run_config = build_parallel_run_config(train_env, compute_target, node_count, process_count_per_node, run_invocation_timeout)

## Setup Pipeline output path

This ParallelRunStep is the main step in our pipeline. First, we set up the output directory and define the Pipeline's output name. The datastore that stores the pipeline's output data is Workspace's default datastore.

In [None]:
from azureml.pipeline.core import PipelineData

# Name of the pipeline output; reused later when fetching the training results.
training_output_name = "training_output"

# Output location of the training step, stored on the default datastore.
output_dir = PipelineData(name=training_output_name, 
                          datastore=datastore)

## Instantiate ParallelRunStep

We specify the following parameters:

**name**: We set a name for our ParallelRunStep.

**parallel_run_config**: We then pass the previously defined ParallelRunConfig.

**allow_reuse**: Indicates whether the step should reuse previous results when re-run with the same settings.

**inputs**: We are going to use the registered FileDataset that we fetched earlier in the Notebook. `inputs` points to a registered file dataset in AML studio that points to a path in the blob container. The number of files in that path determines the number of models that will be trained in the ParallelRunStep.

**output**: The output directory we just defined. A PipelineData object that corresponds to the output directory.

**models**: Zero or more model names already registered in the Azure Machine Learning model registry.

In [None]:
from azureml.pipeline.steps import ParallelRunStep

# Training step of the pipeline; allow_reuse is off, so a re-submission does
# not reuse the results of a previous run.
parallel_run_step = ParallelRunStep(
    name="many-models-training",
    parallel_run_config=parallel_run_config,
    allow_reuse = False,
    inputs=[filedst_5_models_input], # one model is trained per file in this dataset
    output=output_dir)

## Submit the pipeline to run

Next we submit our pipeline to run.

In [None]:
from azureml.pipeline.core import Pipeline
from azureml.widgets import RunDetails
from azureml.core.run import Run

# NOTE(review): `steps` receives a single step object rather than a list of
# steps; confirm the installed SDK accepts this form.
pipeline = Pipeline(workspace=ws, steps=parallel_run_step)
# Submit the pipeline to the experiment. This rebinds `run`, which the
# fairness upload cell also used.
run = experiment.submit(pipeline)
# Show the run details widget for the submitted pipeline run.
RunDetails(run).show()

In [None]:
# Block until the pipeline run finishes, streaming its output into the notebook.
run.wait_for_completion(show_output=True)

## Get list of AutoML runs along with registered model names and tags

The following code snippet will iterate through all the automl runs for the experiment and list the details.

In [None]:
from scripts.helper import get_training_output
import os
import pandas as pd

training_results_name = "training_results"
training_csv_file = "training.csv"

# Column names for the header-less, space-delimited results file
all_columns = [
    "Framework", "Dataset", "Run", "Status", "Model", "Tags",
    "StartTime", "EndTime", "ErrorType", "ErrorCode", "ErrorMessage",
]

# Fetch the results file written by the pipeline's training output
training_file = get_training_output(run, training_results_name, training_output_name)
df = pd.read_csv(training_file, delimiter=" ", header=None, names=all_columns)

# Keep a CSV copy with all columns
df.to_csv(training_csv_file)

# Display the summary columns only (tags and error details are left out)
summary_columns = ['Framework', 'Dataset', 'Run', 'Status', 'Model', 'StartTime', 'EndTime']
df[summary_columns]

## Publish the pipeline

Publish the pipeline for easy re-execution.

In [None]:
# Publish the training pipeline under a fixed name and version so that it can
# be re-executed without rebuilding it in the notebook.
published_pipeline = pipeline.publish(name = 'fd_train_many_models',
                                      description = 'Flight Delay - train many models',
                                      version = '1',
                                      continue_on_step_failure = False)