First, we fetch the Adult census dataset, split it into train and test sets, and save each split to a Parquet file.

In [1]:
import shap
from sklearn.model_selection import train_test_split


# Load the Adult census dataset that ships with shap (features X, labels y)
X, y = shap.datasets.adult()
print("Data fetched")

# Encode each label as the integer 0 or 1 and attach the labels to a copy of
# the features under the target column name
target_feature = "income"
y = [int(bool(label)) for label in y]
full_data = X.assign(**{target_feature: y})

# Hold out 4000 rows for testing, stratified on the target so both splits
# keep the same class balance; the fixed seed makes the split repeatable
data_train, data_test = train_test_split(
    full_data,
    stratify=full_data[target_feature],
    test_size=4000,
    random_state=96132,
)

# Write both splits as Parquet files, leaving the row index out of the files
print("Saving to files")
data_train.to_parquet("adult_train.parquet", index=False)
data_test.to_parquet("adult_test.parquet", index=False)

Data fetched
Saving to files


Now create an MLClient:

In [2]:
import os

# Coordinates of the Azure ML workspace this notebook talks to.
# Each value can be overridden with an environment variable so the notebook can
# target a different workspace without editing this cell; the literal is the
# fallback used when the variable is unset or empty.
#   AZURE_SUBSCRIPTION_ID    -> subscription_id
#   AZURE_RESOURCE_GROUP     -> resource_group
#   AZURE_ML_WORKSPACE_NAME  -> workspace_name
subscription_id = os.environ.get('AZURE_SUBSCRIPTION_ID') or '589c7ae9-223e-45e3-a191-98433e0821a9'
resource_group = os.environ.get('AZURE_RESOURCE_GROUP') or 'amlisdkv2-rg-1638957740'
workspace_name = os.environ.get('AZURE_ML_WORKSPACE_NAME') or 'amlisdkv21638957740'

In [3]:
from azure.ml import MLClient
from azure.identity import DefaultAzureCredential
# Client bound to the workspace identified by the subscription, resource group
# and workspace name defined earlier in the notebook. The credential is built
# with the shared token cache credential excluded.
ml_client = MLClient(
    subscription_id=subscription_id,
    resource_group_name=resource_group,
    workspace_name=workspace_name,
    credential=DefaultAzureCredential(
        exclude_shared_token_cache_credential=True
    ),
    logging_enable=True,
)

Next, upload the train and test Parquet files to the workspace, registering each one as a `Data` entity:

In [4]:
from azure.ml.entities import Data

# Describe the training split: a named, versioned Data entity that points at
# the local Parquet file written earlier in the notebook
train_dataset = Data(
    local_path="adult_train.parquet",
    name="Adult_Train_from_Notebook",
    version="1",
)

In [5]:
# Upload the local file and register the training dataset in the workspace.
# As the last expression of the cell, the returned Data entity is displayed.
ml_client.data.create_or_update(train_dataset)

Uploading adult_train.parquet: 100%|█████████████████████████████████████████████████| 158k/158k [00:00<00:00, 925kB/s]


Data({'is_anonymous': False, 'auto_increment_version': False, 'name': 'Adult_Train_from_Notebook', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/589c7ae9-223e-45e3-a191-98433e0821a9/resourceGroups/amlisdkv2-rg-1638957740/providers/Microsoft.MachineLearningServices/workspaces/amlisdkv21638957740/data/Adult_Train_from_Notebook/versions/1', 'base_path': './', 'creation_context': <azure.ml._restclient.v2021_03_01_preview.models._models_py3.SystemData object at 0x000001C99C78C2E0>, 'serialize': <msrest.serialization.Serializer object at 0x000001C99C784FA0>, 'version': '1', 'local_path': None, 'path': 'LocalUpload/3c08dc2a27945f6adbd2ef2dfe7ab079/adult_train.parquet', 'datastore': '/subscriptions/589c7ae9-223e-45e3-a191-98433e0821a9/resourceGroups/amlisdkv2-rg-1638957740/providers/Microsoft.MachineLearningServices/workspaces/amlisdkv21638957740/datastores/workspaceblobstore'})

In [6]:
# Describe the held-out test split as a named, versioned Data entity backed by
# the local Parquet file
test_dataset = Data(
    local_path="adult_test.parquet",
    name="Adult_Test_from_Notebook",
    version="1",
)

# Upload and register it; the returned Data entity is the cell's displayed value
ml_client.data.create_or_update(test_dataset)

Uploading adult_test.parquet: 100%|████████████████████████████████████████████████| 31.6k/31.6k [00:00<00:00, 430kB/s]


Data({'is_anonymous': False, 'auto_increment_version': False, 'name': 'Adult_Test_from_Notebook', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/589c7ae9-223e-45e3-a191-98433e0821a9/resourceGroups/amlisdkv2-rg-1638957740/providers/Microsoft.MachineLearningServices/workspaces/amlisdkv21638957740/data/Adult_Test_from_Notebook/versions/1', 'base_path': './', 'creation_context': <azure.ml._restclient.v2021_03_01_preview.models._models_py3.SystemData object at 0x000001C99C76EAC0>, 'serialize': <msrest.serialization.Serializer object at 0x000001C99C76E9D0>, 'version': '1', 'local_path': None, 'path': 'LocalUpload/0f2894e892201167f37ded36e2d2d0b2/adult_test.parquet', 'datastore': '/subscriptions/589c7ae9-223e-45e3-a191-98433e0821a9/resourceGroups/amlisdkv2-rg-1638957740/providers/Microsoft.MachineLearningServices/workspaces/amlisdkv21638957740/datastores/workspaceblobstore'})

# Creating the Model

To simplify the model creation process, we're going to use a pipeline.

First, the training script:

In [8]:
%%writefile training_script.py

import argparse
import os
import shutil
import tempfile


from azureml.core import Run

import mlflow
import mlflow.sklearn

import pandas as pd
from sklearn.linear_model import LogisticRegression

def parse_args():
    """Parse the training script's command-line options.

    Recognised options (all optional strings, ``None`` when omitted):
      --training_data       Path to training data
      --target_column_name  Name of target column
      --model_output        Path of output model

    Returns:
        argparse.Namespace with attributes ``training_data``,
        ``target_column_name`` and ``model_output``.
    """
    cli = argparse.ArgumentParser()

    # (flag, help text) for every option; each one takes a plain string value
    string_options = (
        ("--training_data", "Path to training data"),
        ("--target_column_name", "Name of target column"),
        ("--model_output", "Path of output model"),
    )
    for flag, description in string_options:
        cli.add_argument(flag, type=str, help=description)

    return cli.parse_args()


def _copy_dir_contents(src_dir, dst_dir):
    """Recursively copy every entry inside ``src_dir`` into ``dst_dir``.

    ``dst_dir`` is created if it does not exist yet. Sub-directories are
    walked explicitly rather than handed to ``shutil.copytree`` so that the
    copy also works when the destination directory already exists, without
    relying on the ``dirs_exist_ok`` option added in Python 3.8.

    Args:
        src_dir: Directory whose contents are copied.
        dst_dir: Directory that receives the copies.
    """
    os.makedirs(dst_dir, exist_ok=True)
    for entry_name in os.listdir(src_dir):
        src_path = os.path.join(src_dir, entry_name)
        dst_path = os.path.join(dst_dir, entry_name)
        print("  Copying: ", entry_name)
        if os.path.isdir(src_path):
            # copy2 only handles files; descend into directories ourselves
            _copy_dir_contents(src_path, dst_path)
        else:
            shutil.copy2(src=src_path, dst=dst_path)


def main(args):
    """Train a logistic regression model and write it out as an MLflow model.

    Steps:
      1. Point MLflow at the tracking URI and experiment of the current run.
      2. Read the training data (Parquet) and split off the target column.
      3. Fit a ``LogisticRegression`` (liblinear solver).
      4. Save the model with MLflow into a temporary directory, then copy the
         saved files into the output path.

    Args:
        args: Parsed command-line arguments providing ``training_data``,
            ``target_column_name`` and ``model_output``.
    """
    current_experiment = Run.get_context().experiment
    tracking_uri = current_experiment.workspace.get_mlflow_tracking_uri()
    print("tracking_uri: {0}".format(tracking_uri))
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(current_experiment.name)

    # Read in data
    print("Reading data")
    all_data = pd.read_parquet(args.training_data)

    print("Extracting X_train, y_train")
    print("all_data cols: {0}".format(all_data.columns))
    y_train = all_data[args.target_column_name]
    X_train = all_data.drop(labels=args.target_column_name, axis="columns")
    print("X_train cols: {0}".format(X_train.columns))

    print("Training model")
    model = LogisticRegression(solver="liblinear")
    model.fit(X_train, y_train)

    # Alternative kept for reference: register the model directly via MLflow
    # print("Registering via MLFlow")
    # mlflow.sklearn.log_model(sk_model=model, registered_model_name="lr_classifier_mlflow", artifact_path="some_path")

    # Save into a fresh temporary directory first (the save target is a
    # sub-directory that does not exist yet), then copy the result across
    with tempfile.TemporaryDirectory() as td:
        print("Saving model with MLFlow to temporary directory")
        tmp_output_dir = os.path.join(td, "my_model_dir")
        mlflow.sklearn.save_model(sk_model=model, path=tmp_output_dir)

        print("Copying MLFlow model to output path")
        # Handles nested directories and a not-yet-created output directory
        _copy_dir_contents(tmp_output_dir, args.model_output)


# Script entry point: only runs when the file is executed directly
if __name__ == "__main__":
    # Opening banner so this script's output stands out in the logs
    print("*" * 60)
    print("\n\n")

    # Read the command-line options and hand them to the training routine
    cli_args = parse_args()
    main(cli_args)

    # Closing banner
    print("*" * 60)
    print("\n\n")


Overwriting training_script.py


Now, we want to place this into a component:

In [17]:
from azure.ml.entities import Code, CommandComponent

# The script written out by the earlier %%writefile cell is the component's code
training_code = Code(local_path='training_script.py')

# Component interface: two inputs (a data path and the target column name)
# and one output path for the trained model
training_inputs = dict(
    training_data=dict(type='path'),
    target_column_name=dict(type='string'),
)
training_outputs = dict(
    model_output=dict(type='path'),
)

# The command passes each declared input/output to the script's matching
# command-line option through the ${{...}} placeholders
training_component = CommandComponent(
    name="MyTrainingComponent",
    display_name="Simple training component",
    version="1",
    environment="AML-RAI-Environment:1638957740",
    code=training_code,
    inputs=training_inputs,
    outputs=training_outputs,
    command=(
        "python training_script.py "
        "--training_data ${{inputs.training_data}} "
        "--target_column_name ${{inputs.target_column_name}} "
        "--model_output ${{outputs.model_output}}"
    ),
)

# Register the component; the returned entity is the cell's displayed value
ml_client.components.create_or_update(training_component)

CommandComponent({'auto_increment_version': False, 'is_anonymous': False, 'name': 'MyTrainingComponent', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/589c7ae9-223e-45e3-a191-98433e0821a9/resourceGroups/amlisdkv2-rg-1638957740/providers/Microsoft.MachineLearningServices/workspaces/amlisdkv21638957740/components/MyTrainingComponent/versions/1', 'base_path': None, 'creation_context': <azure.ml._restclient.v2021_10_01.models._models_py3.SystemData object at 0x000001C99CA2BB50>, 'serialize': <msrest.serialization.Serializer object at 0x000001C99CA38850>, 'command': 'python training_script.py --training_data ${{inputs.training_data}} --target_column_name ${{inputs.target_column_name}} --model_output ${{outputs.model_output}}', 'code': '/subscriptions/589c7ae9-223e-45e3-a191-98433e0821a9/resourceGroups/amlisdkv2-rg-1638957740/providers/Microsoft.MachineLearningServices/workspaces/amlisdkv21638957740/codes/8702f8b6-d966-4546-a9dd-db4133259936/versions/1', 'environme