First we fetch the data:

In [4]:
import pandas as pd
# Directory holding the raw CSV extracts on the authoring machine
DATA_DIR = 'C:\\fibberio\\sandbox'


def load_programmers_csv(csv_path):
    """Read one programmers CSV and remove the columns that are not used as features.

    Parameters
    ----------
    csv_path : str
        Location of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The file's contents without 'first_name', 'last_name', and without
        whichever column comes first once those two are gone.
    """
    raw = pd.read_csv(csv_path, skipinitialspace=True)
    without_names = raw.drop(['first_name', 'last_name'], axis=1)
    # Drop the first remaining column as well
    # (presumably a saved row index - TODO confirm against the CSV)
    return without_names.drop(columns=without_names.columns[0])


programmers_train_data = load_programmers_csv(DATA_DIR + '\\programmers-train.csv')
programmers_test_data = load_programmers_csv(DATA_DIR + '\\programmers-test.csv')

In [6]:
# Bare last expression: the notebook renders the cleaned test frame below
programmers_test_data

Unnamed: 0,score,style,YOE,IDE,Programming language,location,Number of github repos contributed to,Employer,OS,job title,age
0,8.0,spaces,16.0,Emacs,R,Antarctica,2.0,Snapchat,MacOS,Principal Engineer,32.0
1,3.0,tabs,9.0,pyCharm,Swift,Antarctica,2.0,Instagram,Linux,Distinguished Engineer,35.0
2,1.0,tabs,7.0,XCode,C#,Antarctica,0.0,Uber,MacOS,Senior Engineer,32.0
3,6.0,spaces,15.0,Visual Studio,R,Antarctica,0.0,Amazon,Linux,Principal Engineer,32.0
4,5.0,tabs,7.0,Eclipse,Java,Antarctica,0.0,Twitter,Windows,SWE 2,33.1
...,...,...,...,...,...,...,...,...,...,...,...
795,4.0,tabs,6.0,VSCode,Python,Antarctica,0.0,Amazon,Windows,SWE 1,30.7
796,1.0,tabs,6.0,Visual Studio,Python,Antarctica,0.0,Apple,Windows,SWE 1,31.0
797,3.0,tabs,7.0,XCode,Java,Antarctica,3.0,Uber,Windows,SWE 2,30.3
798,7.0,tabs,8.0,Emacs,Javascript,North America,1.0,Microsoft,Windows,SWE 2,30.2


In [7]:
from sklearn.model_selection import train_test_split

# Name of the column the regression model is trained to predict
target_column_name = "score"

# Aliases for the cleaned frames; the rest of the notebook uses these names
data_train, data_test = programmers_train_data, programmers_test_data

Now create an MLClient:

In [8]:
from azure.ml import MLClient
from azure.identity import DefaultAzureCredential
# Build the credential first, then create the client from the local workspace config
azure_credential = DefaultAzureCredential(exclude_shared_token_cache_credential=True)
ml_client = MLClient.from_config(
    credential=azure_credential,
    logging_enable=True,
)

Found the config file in: C:\RAI-vNext-Preview\config.json


In [9]:
# Write both frames to parquet files in the working directory
for frame, parquet_name in (
    (data_train, "programmers-train.parquet"),
    (data_test, "programmers-test.parquet"),
):
    frame.to_parquet(parquet_name)

Upload the datasets:

In [10]:
from azure.ml.entities import Dataset

# Describe the local training parquet file as a named dataset
train_dataset_spec = dict(
    name="Programmers_Train_from_Notebook",
    local_path='programmers-train.parquet',
)
train_dataset = Dataset(**train_dataset_spec)

# Last expression of the cell: create/update the dataset and show the returned entity
ml_client.datasets.create_or_update(train_dataset)

[32mUploading programmers-train.parquet[32m (< 1 MB): 100%|#######################| 19.5k/19.5k [00:00<00:00, 264kB/s][0m
[39m



Dataset({'paths': [<azure.ml._restclient.v2021_10_01.models._models_py3.UriReference object at 0x00000183C6C305E0>], 'is_anonymous': False, 'auto_increment_version': False, 'name': 'Programmers_Train_from_Notebook', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/fac34303-435d-4486-8c3f-7094d82a0b60/resourceGroups/RAIPM/providers/Microsoft.MachineLearningServices/workspaces/RAIPM2/datasets/Programmers_Train_from_Notebook/versions/5', 'base_path': './', 'creation_context': <azure.ml._restclient.v2021_10_01.models._models_py3.SystemData object at 0x00000183C6C30EB0>, 'serialize': <msrest.serialization.Serializer object at 0x00000183C6BF1D90>, 'version': '5', 'local_path': None})

In [11]:
# Describe the local test parquet file as a named dataset
test_dataset_spec = dict(
    name="Programmers_Test_from_Notebook",
    local_path='programmers-test.parquet',
)
test_dataset = Dataset(**test_dataset_spec)

# Last expression of the cell: create/update the dataset and show the returned entity
ml_client.datasets.create_or_update(test_dataset)

[32mUploading programmers-test.parquet[32m (< 1 MB): 100%|########################| 13.3k/13.3k [00:00<00:00, 323kB/s][0m
[39m



Dataset({'paths': [<azure.ml._restclient.v2021_10_01.models._models_py3.UriReference object at 0x00000183C6C30DC0>], 'is_anonymous': False, 'auto_increment_version': False, 'name': 'Programmers_Test_from_Notebook', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/fac34303-435d-4486-8c3f-7094d82a0b60/resourceGroups/RAIPM/providers/Microsoft.MachineLearningServices/workspaces/RAIPM2/datasets/Programmers_Test_from_Notebook/versions/5', 'base_path': './', 'creation_context': <azure.ml._restclient.v2021_10_01.models._models_py3.SystemData object at 0x00000183C6C30940>, 'serialize': <msrest.serialization.Serializer object at 0x00000183C6C30490>, 'version': '5', 'local_path': None})

# Creating the Model

To simplify the model creation process, we're going to use a pipeline.

Before we do anything else, we need to specify the version of the RAI components:

In [12]:
# Version suffix interpolated into the component references built later in
# this notebook (e.g. f"RegisterModel:{version_string}")
version_string = '10'

Now we can create the training script:

In [13]:
%%writefile training_script_reg.py

import argparse
import os
import shutil
import tempfile


from azureml.core import Run

import mlflow
import mlflow.sklearn

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split

def parse_args():
    """Parse the command-line arguments of the training script.

    Returns
    -------
    argparse.Namespace
        Namespace with string attributes ``training_data``,
        ``target_column_name`` and ``model_output``.

    Raises
    ------
    SystemExit
        Raised by argparse when any of the three arguments is missing.
    """
    parser = argparse.ArgumentParser()

    # All three are required: without them main() would receive None and
    # fail later with a far less helpful error.
    parser.add_argument("--training_data", type=str, required=True,
                        help="Path to training data")
    parser.add_argument("--target_column_name", type=str, required=True,
                        help="Name of target column")
    parser.add_argument("--model_output", type=str, required=True,
                        help="Path of output model")

    return parser.parse_args()

def create_regression_pipeline(X, y):
    """Build and fit a preprocessing + linear-regression pipeline.

    Numeric columns are median-imputed and standardised; object columns are
    imputed with the constant '?' and one-hot encoded. The transformed
    features feed a ``LinearRegression`` model.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : array-like
        Target values, passed straight to ``Pipeline.fit``.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The fitted pipeline (the return value of ``pipeline.fit(X, y)``).
    """
    # Select every numeric dtype. Matching only 'int64' left float columns in
    # neither list, so the ColumnTransformer dropped them without warning.
    pipe_cfg = {
        'num_cols': X.select_dtypes(include='number').columns.tolist(),
        'cat_cols': X.select_dtypes(include='object').columns.tolist(),
    }
    num_pipe = Pipeline([
        ('num_imputer', SimpleImputer(strategy='median')),
        ('num_scaler', StandardScaler())
    ])
    # NOTE(review): newer scikit-learn releases rename `sparse` to
    # `sparse_output`; left as-is for the pinned environment - confirm.
    cat_pipe = Pipeline([
        ('cat_imputer', SimpleImputer(strategy='constant', fill_value='?')),
        ('cat_encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
    ])
    feat_pipe = ColumnTransformer([
        ('num_pipe', num_pipe, pipe_cfg['num_cols']),
        ('cat_pipe', cat_pipe, pipe_cfg['cat_cols'])
    ])

    # Append the regressor to the preprocessing pipeline.
    # Now we have a full prediction pipeline.
    pipeline = Pipeline(steps=[('preprocessor', feat_pipe),
                               ('model', LinearRegression())])
    return pipeline.fit(X, y)

def main(args):
    """Train the regression pipeline and write it out as an MLflow model.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``training_data`` (read with ``pd.read_parquet``),
        ``target_column_name`` (column split off as the target) and
        ``model_output`` (directory that receives the saved model files).
    """
    # Point MLflow at the tracking server and experiment of the current run
    current_experiment = Run.get_context().experiment
    tracking_uri = current_experiment.workspace.get_mlflow_tracking_uri()
    print("tracking_uri: {0}".format(tracking_uri))
    mlflow.set_tracking_uri(tracking_uri)
    mlflow.set_experiment(current_experiment.name)

    # Read in data
    print("Reading data")
    all_data = pd.read_parquet(args.training_data)

    print("Extracting X_train, y_train")
    print("all_data cols: {0}".format(all_data.columns))
    y_train = all_data[args.target_column_name]
    X_train = all_data.drop(labels=args.target_column_name, axis="columns")
    print("X_train cols: {0}".format(X_train.columns))

    print("Training model")
    # The estimator can be changed to suit
    model = create_regression_pipeline(X_train, y_train)

    # shutil.copy2 fails if the destination directory is missing, so make
    # sure it exists (no-op when it is already there).
    os.makedirs(args.model_output, exist_ok=True)

    # Saving model with mlflow - leave this section unchanged
    with tempfile.TemporaryDirectory() as td:
        print("Saving model with MLFlow to temporary directory")
        tmp_output_dir = os.path.join(td, "my_model_dir")
        mlflow.sklearn.save_model(sk_model=model, path=tmp_output_dir)

        print("Copying MLFlow model to output path")
        for file_name in os.listdir(tmp_output_dir):
            print("  Copying: ", file_name)
            # As of Python 3.8, copytree will acquire dirs_exist_ok as
            # an option, removing the need for listdir
            shutil.copy2(src=os.path.join(tmp_output_dir, file_name), dst=os.path.join(args.model_output, file_name))


# Script entry point
if __name__ == "__main__":
    separator = "*" * 60
    spacer = "\n\n"

    # Mark the start of this script's output in the logs
    print(separator)
    print(spacer)

    # Parse the command line, then train and save the model
    args = parse_args()
    main(args)

    # Mark the end of this script's output in the logs
    print(separator)
    print(spacer)

Overwriting training_script_reg.py


Now, we want to place this into a component:

In [14]:
from azure.ml.entities import Code, CommandComponent

# The training script written by the previous cell is the component's code
training_code = Code(
    local_path='training_script_reg.py'
)

# Inputs and outputs declared here are referenced by name in the command below
training_inputs = {
    'training_data': {'type': 'path'},
    'target_column_name': {'type': 'string'},
}

training_outputs = {
    'model_output': {'type': 'path'},
}

training_component = CommandComponent(
    name="ProgrammersTestRegTrainingComponent",
    version="5",
    display_name="Simple reg training component",
    code=training_code,
    environment="AML-RAI-Environment:1",
    inputs=training_inputs,
    outputs=training_outputs,
    # Adjacent literals are joined by the parser; each ${{...}} placeholder
    # names one of the inputs/outputs declared above.
    command=(
        "python training_script_reg.py "
        "--training_data ${{inputs.training_data}} "
        "--target_column_name ${{inputs.target_column_name}} "
        "--model_output ${{outputs.model_output}}"
    ),
)

# Last expression of the cell: create/update the component and show the result
ml_client.components.create_or_update(training_component)

CommandComponent({'auto_increment_version': False, 'is_anonymous': False, 'name': 'ProgrammersTestRegTrainingComponent', 'description': None, 'tags': {}, 'properties': {}, 'id': '/subscriptions/fac34303-435d-4486-8c3f-7094d82a0b60/resourceGroups/RAIPM/providers/Microsoft.MachineLearningServices/workspaces/RAIPM2/components/ProgrammersTestRegTrainingComponent/versions/5', 'base_path': None, 'creation_context': <azure.ml._restclient.v2021_10_01.models._models_py3.SystemData object at 0x00000183C6C92CA0>, 'serialize': <msrest.serialization.Serializer object at 0x00000183C6C87250>, 'command': 'python training_script_reg.py --training_data ${{inputs.training_data}} --target_column_name ${{inputs.target_column_name}} --model_output ${{outputs.model_output}}', 'code': '/subscriptions/fac34303-435d-4486-8c3f-7094d82a0b60/resourceGroups/RAIPM/providers/Microsoft.MachineLearningServices/workspaces/RAIPM2/codes/f456aba5-fbec-43da-b0cb-5f4404c1945d/versions/1', 'environment_variables': {}, 'enviro

# Running a training pipeline
Now that we have a script which can train a model, we need to run it:

In [15]:
import time

from azure.ml.entities import JobInput, ComponentJob, PipelineJob

# Base name for the registered model
model_name = 'my_trained_reg_nb_model'

# Whole-second timestamp taken when this cell runs, used as the name suffix
model_name_suffix = int(time.time())

This is going to be a two-component pipeline. The first component is the one we created above, which will train our model. The second will register it in AzureML:

In [16]:
# The overall inputs for the pipeline: the target column name plus the two
# uploaded datasets, each pinned to version 5 and consumed in "download" mode.
pipeline_inputs = {
    'target_column_name': target_column_name,
    'my_training_data': JobInput(dataset="Programmers_Train_from_Notebook:5", mode="download"),
    'my_test_data': JobInput(dataset="Programmers_Test_from_Notebook:5", mode="download"),
}

# Specify the training job. Its inputs are bound to the pipeline-level
# inputs through ${{inputs.<name>}} references.
train_job_inputs = {
    'target_column_name': '${{inputs.target_column_name}}',
    'training_data': '${{inputs.my_training_data}}',
}
train_job_outputs = {
    'model_output': None,
}
# "<name>:<version>" reference to the training component created earlier
train_job = ComponentJob(
    component="ProgrammersTestRegTrainingComponent:5",
    inputs=train_job_inputs,
    outputs=train_job_outputs,
)

# Model registration job: takes the training job's output and registers it
# under the base name and suffix chosen earlier
register_job_inputs = dict(
    model_input_path='${{jobs.train-model-job.outputs.model_output}}',
    model_base_name=model_name,
    model_name_suffix=model_name_suffix,
)
register_job_outputs = dict.fromkeys(['model_info_output_path'])

register_component_ref = f"RegisterModel:{version_string}"
register_job = ComponentJob(
    component=register_component_ref,
    inputs=register_job_inputs,
    outputs=register_job_outputs,
)

With our jobs specified, assemble them into a pipeline:

In [17]:
# Two-step pipeline: train the model, then register it. The keys of `jobs`
# are the step names used in ${{jobs.<name>...}} references.
model_registration_pipeline_job = PipelineJob(
    experiment_name="Register_Reg_Model_From_Notebook_01",
    description="Create and register a model from a notebook",
    jobs={
        'train-model-job': train_job,
        'register-model-job': register_job,
    },
    inputs=pipeline_inputs,
    outputs=register_job_outputs,
    compute="rai-cluster",
)

And submit it:

In [18]:
from azure.ml.entities import PipelineJob

def submit_and_wait(ml_client, pipeline_job, poll_interval: float = 30) -> "PipelineJob":
    """Submit a pipeline job and block until it reaches a terminal status.

    Parameters
    ----------
    ml_client
        Client whose ``jobs.create_or_update`` and ``jobs.get`` are used to
        submit the job and poll its status.
    pipeline_job
        The job definition to submit.
    poll_interval : float, optional
        Seconds to sleep between status checks. Defaults to 30.

    Returns
    -------
    PipelineJob
        The job as last fetched, whose status is 'Completed'.

    Raises
    ------
    AssertionError
        If submission returns None, or if the job ends in any status other
        than 'Completed'.
    """
    terminal_statuses = ('Completed', 'Failed', 'Canceled', 'NotResponding')

    created_job = ml_client.jobs.create_or_update(pipeline_job)
    # Raised explicitly so the failure carries a message and is not stripped
    # when Python runs with assertions disabled.
    if created_job is None:
        raise AssertionError("Job submission returned no job")

    while created_job.status not in terminal_statuses:
        time.sleep(poll_interval)
        created_job = ml_client.jobs.get(created_job.name)
        print("Latest status : {0}".format(created_job.status))

    if created_job.status != 'Completed':
        raise AssertionError(
            "Job {0} finished with status {1}".format(created_job.name, created_job.status)
        )
    return created_job

In [19]:
# This is the actual submission: blocks until the train + register pipeline
# reaches a terminal status, and raises unless that status is 'Completed'

training_job = submit_and_wait(ml_client, model_registration_pipeline_job)

compute is not a known attribute of class <class 'azure.ml._restclient.v2021_10_01.models._models_py3.PipelineJob'> and will be ignored


Latest status : Running
Latest status : Completed


# Creating the RAI Insights
We have a registered model, and can now run a pipeline to create the RAI insights. First off, compute the name of the model we registered (this is not straightforward since the Register Model component is used in testing):

In [20]:
# Built from the same base name and suffix that were passed to the register job.
# NOTE(review): assumes the register component joins them with '_' and that
# this is version 1 of that model name - confirm against the component.
expected_model_id = f'{model_name}_{model_name_suffix}:1'

Now, we create the RAI pipeline itself. There are three 'component stages' in this pipeline:

1. Fetch the model
1. Construct an empty RAI dashboard
1. Run the RAI tool components

The job to fetch the registered model is:

In [21]:
# This step won't be necessary once models are a type within the pipeline graph

# Look the registered model up by its expected id
fetch_job_inputs = dict(model_id=expected_model_id)
fetch_job_outputs = dict.fromkeys(['model_info_output_path'])

fetch_component_ref = f"FetchRegisteredModel:{version_string}"
fetch_job = ComponentJob(
    component=fetch_component_ref,
    inputs=fetch_job_inputs,
    outputs=fetch_job_outputs,
)

With this registered model (and our datasets), we can create an empty RAI dashboard:

In [22]:
# Top-level RAI Insights constructor component

# The ${{inputs.*}} references resolve against the pipeline-level inputs, so
# the same pipeline_inputs object can be reused when assembling the pipeline
create_rai_inputs = dict(
    title='Run built from a Notebook',
    task_type='regression',
    model_info_path='${{jobs.fetch-model-job.outputs.model_info_output_path}}',
    train_dataset='${{inputs.my_training_data}}',
    test_dataset='${{inputs.my_test_data}}',
    target_column_name='${{inputs.target_column_name}}',
    categorical_column_names='["location", "style", "job title", "OS", "Employer", "IDE", "Programming language"]',
)
# Could theoretically redirect the datastore here
create_rai_outputs = dict.fromkeys(['rai_insights_dashboard'])

constructor_component_ref = f"RAIInsightsConstructor:{version_string}"
create_rai_job = ComponentJob(
    component=constructor_component_ref,
    inputs=create_rai_inputs,
    outputs=create_rai_outputs,
)

Now, create an instance of each of our RAI tools:

In [23]:
# Explanation tool, fed by the dashboard output of the constructor job
explain_inputs = dict(
    comment='Insert text here',
    rai_insights_dashboard='${{jobs.create-rai-job.outputs.rai_insights_dashboard}}',
)
explain_outputs = dict.fromkeys(['explanation'])

explain_component_ref = f"RAIInsightsExplanation:{version_string}"
explain_job = ComponentJob(
    component=explain_component_ref,
    inputs=explain_inputs,
    outputs=explain_outputs,
)

# Causal analysis tool; treatment features are passed as a JSON list string
causal_inputs = dict(
    rai_insights_dashboard='${{jobs.create-rai-job.outputs.rai_insights_dashboard}}',
    treatment_features='["Number of github repos contributed to", "YOE"]',
)
causal_outputs = dict.fromkeys(['causal'])

causal_component_ref = f"RAIInsightsCausal:{version_string}"
causal_job = ComponentJob(
    component=causal_component_ref,
    inputs=causal_inputs,
    outputs=causal_outputs,
)

# Counterfactual tool; both settings are passed as strings
counterfactual_inputs = dict(
    rai_insights_dashboard='${{jobs.create-rai-job.outputs.rai_insights_dashboard}}',
    total_CFs='10',
    desired_range='[5, 10]',
)
counterfactual_outputs = dict.fromkeys(['counterfactual'])

counterfactual_component_ref = f"RAIInsightsCounterfactual:{version_string}"
counterfactual_job = ComponentJob(
    component=counterfactual_component_ref,
    inputs=counterfactual_inputs,
    outputs=counterfactual_outputs,
)

# Error analysis tool; filter features are passed as a JSON list string
error_analysis_inputs = dict(
    rai_insights_dashboard='${{jobs.create-rai-job.outputs.rai_insights_dashboard}}',
    filter_features='["style", "Employer"]',
)
error_analysis_outputs = dict.fromkeys(['error_analysis'])

error_analysis_component_ref = f"RAIInsightsErrorAnalysis:{version_string}"
error_analysis_job = ComponentJob(
    component=error_analysis_component_ref,
    inputs=error_analysis_inputs,
    outputs=error_analysis_outputs,
)

Now the 'gather' component which assembles everything into an `RAIInsights` object, and computes the JSON for the UX:

In [24]:
# Gather component: takes the constructor output plus one output from each tool job
gather_inputs = dict(
    constructor='${{jobs.create-rai-job.outputs.rai_insights_dashboard}}',
    insight_1='${{jobs.explain-job.outputs.explanation}}',
    insight_2='${{jobs.causal-job.outputs.causal}}',
    insight_3='${{jobs.counterfactual-job.outputs.counterfactual}}',
    insight_4='${{jobs.error-analysis-job.outputs.error_analysis}}',
)
gather_outputs = dict.fromkeys(['dashboard', 'ux_json'])

gather_component_ref = f"RAIInsightsGather:{version_string}"
gather_job = ComponentJob(
    component=gather_component_ref,
    inputs=gather_inputs,
    outputs=gather_outputs,
)

Finally, the pipeline itself:

In [25]:
# Steps of the RAI Insights pipeline, keyed by the names that the
# ${{jobs.<name>...}} references in the job inputs rely on
insights_jobs = {
    'fetch-model-job': fetch_job,
    'create-rai-job': create_rai_job,
    'causal-job': causal_job,
    'counterfactual-job': counterfactual_job,
    'error-analysis-job': error_analysis_job,
    'explain-job': explain_job,
    'gather-job': gather_job,
}

# Pipeline to construct the RAI Insights
insights_pipeline_job = PipelineJob(
    experiment_name=f"Compute_Insights_from_Notebook_{version_string}",
    description="Python submitted Orange insights using fetched model",
    jobs=insights_jobs,
    inputs=pipeline_inputs,
    outputs=None,
    compute="rai-cluster",
)

And submit it:

In [26]:
# Submit the insights pipeline and block until it reaches a terminal status;
# submit_and_wait raises unless that status is 'Completed'
insights_job = submit_and_wait(ml_client, insights_pipeline_job)

compute is not a known attribute of class <class 'azure.ml._restclient.v2021_10_01.models._models_py3.PipelineJob'> and will be ignored


Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Running
Latest status : Completed


# Download and display the insights
Now we can download the insights we have computed. To start, we need to obtain the Run id of the 'gather-job' which ran as part of the previous pipeline. We have a helper for this, but the name of the experiment is required:

In [None]:
from azure_ml_rai import list_rai_insights

In [None]:
# Look up the insights recorded under the experiment the pipeline ran in
insights_experiment_name = insights_pipeline_job.experiment_name
run_list = list_rai_insights(ml_client, insights_experiment_name)

print(insights_experiment_name)
display(run_list)

We can use the mini SDK to download to a local directory:

In [None]:
from azure_ml_rai import download_rai_insights

# Local directory that receives the downloaded insights
download_dir = 'my_downloaded_insight'

# Download the first entry of the listing
first_insight_id = run_list[0]
download_rai_insights(ml_client, rai_insight_id=first_insight_id, path=download_dir)

And with everything downloaded, we can load the RAIInsights object and instantiate the dashboard:

In [None]:
from responsibleai import RAIInsights
from raiwidgets import ResponsibleAIDashboard

# Load the RAIInsights object from the directory the insights were downloaded to
rai_i = RAIInsights.load(download_dir)

# Last expression of the cell: the dashboard built from the loaded insights
ResponsibleAIDashboard(rai_i)

If for some reason we only need the JSON file holding the contents of `RAIInsights.get_data()`, we can download the other output port of the 'Gather' component:

In [None]:
from azure_ml_rai import download_rai_insights_ux

# Local directory that receives the UX JSON output
download_ux_dir = 'my_ux_insight'

# Download the UX output for the first entry of the listing
ux_insight_id = run_list[0]
download_rai_insights_ux(ml_client, rai_insight_id=ux_insight_id, path=download_ux_dir)