In [None]:
import os
import azureml.core
import pandas as pd
from azureml.core.runconfig import JarLibrary
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
from azureml.core.databricks import PyPiLibrary

from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.core import Workspace, Environment, Experiment, Datastore, Dataset, ScriptRunConfig
from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.steps import  PythonScriptStep
from azureml.train.hyperdrive import choice, loguniform

from sklearn.model_selection import train_test_split
from azureml.train.automl import AutoMLConfig

# Print the installed azureml-core SDK version so a run of this notebook can be
# tied to a specific SDK release.
print("SDK version:", azureml.core.VERSION) 

# Based on the following Azure ML (SDK v1) documentation and sample notebooks:
# https://learn.microsoft.com/en-us/azure/machine-learning/v1/how-to-use-automlstep-in-pipelines
# https://github.com/Azure/azureml-examples/blob/main/v1/python-sdk/tutorials/automl-with-azureml/regression/auto-ml-regression.ipynb
# https://github.com/Azure/azureml-examples/blob/main/v1/python-sdk/tutorials/automl-with-azureml/automl-nlp-multiclass/automl-nlp-text-classification-multiclass.ipynb
# https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-with-automated-machine-learning-step.ipynb 

In [None]:
# Connect to the workspace via Workspace.from_config() and echo its identifying
# details, one per line.
ws = Workspace.from_config()
workspace_details = (ws.name, ws.resource_group, ws.location, ws.subscription_id)
print('\n'.join(str(detail) for detail in workspace_details))


In [None]:
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.core.pipeline_output_dataset import PipelineOutputAbstractDataset

# The workspace's default datastore backs the registered datasets and the
# pipeline outputs defined later in this notebook.
def_blob_store = ws.get_default_datastore()
datastore_message = 'Datastore {} will be used'
print(datastore_message.format(def_blob_store.name))


In [None]:
# Directory holding the scripts run by the pipeline's PythonScriptSteps
# (test_model.py, register_model.py, deploy_model.py).
source_directory = "./project"

In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Name of the compute cluster used by the AutoML step and the model-test step.
cluster_name = "gpu-cluster"
# Look up the compute target with that name in the workspace.
# NOTE(review): nothing in this notebook creates the cluster, and the imported
# ComputeTargetException is never handled, so this presumably fails when
# "gpu-cluster" does not exist — confirm.
compute_target = ComputeTarget(workspace=ws, name=cluster_name)

In [None]:
# %%writefile conda_dependencies.yml
# 
# channels:
#   - pytorch
#   - anaconda
#   - conda-forge
# dependencies:
#   - python=3.7
#   - pip=21.1.2
#   - pip:
#       - azureml-core==1.44.0
#       - azureml-mlflow==1.44.0
#       - azureml-automl-core==1.44.0
#       - azureml-automl-dnn-nlp==1.44.0
#       - azureml-responsibleai==1.44.0
#       - azureml-automl-runtime==1.44.0
#       - azureml-train-automl-client==1.44.0
#       - azureml-train-automl-runtime==1.44.0
#       - horovod==0.21.3
#   - numpy~=1.18.5
#   - pandas~=1.1.5
#   - scikit-learn~=0.22.1
#   - pytorch==1.7.1

In [None]:
# Reuse the 'nlp-accelerator' environment when the workspace already lists it;
# otherwise clone it from "AzureML-AutoML-DNN-Text-GPU", add the extra pip
# packages and register the clone.
if 'nlp-accelerator' in ws.environments:
    env = Environment.get(workspace=ws, name="nlp-accelerator")
    print(f'got env {env}')
else:
    base_env = Environment.get(workspace=ws, name="AzureML-AutoML-DNN-Text-GPU")
    env = base_env.clone("nlp-accelerator")

    # Extend the cloned environment's conda dependencies with the pip packages
    # this notebook needs on top of the base image.
    conda_dep = env.python.conda_dependencies
    for pip_package in ('nvitop', 'azureml-train-automl==1.48.0'):
        conda_dep.add_pip_package(pip_package)
    env.python.conda_dependencies = conda_dep

    env.register(ws)
    print(f'registering new env {env}')


In [None]:
# to get larger datasets: http://jmcauley.ucsd.edu/data/amazon/

In [None]:
# Download the gzipped Automotive reviews file into the local data/ directory
# (wget's -P option sets the destination directory). The loading cell below
# reads it from data/reviews_Automotive_5.json.gz.
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Automotive_5.json.gz -P data/

In [None]:
import pandas as pd
import gzip

def parse(path):
  """Yield one record per non-blank line of a gzip-compressed review file.

  Each line is decoded as strict JSON first; if that fails it is parsed as a
  Python literal with ``ast.literal_eval``. This replaces a bare ``eval`` on
  downloaded data, which would execute arbitrary code embedded in the file.
  The file handle is closed when the generator finishes or is discarded.

  Args:
    path: Path to the ``.gz`` file containing one record per line.

  Yields:
    The decoded record for each line (a dict for the review files used here).

  Raises:
    ValueError / SyntaxError: if a line is neither valid JSON nor a plain
      Python literal.
  """
  # Local imports keep this cell self-contained.
  import ast
  import json

  with gzip.open(path, 'rt', encoding='utf-8') as lines:
    for line in lines:
      line = line.strip()
      if not line:
        # Tolerate blank lines (e.g. a trailing newline at end of file).
        continue
      try:
        yield json.loads(line)
      except json.JSONDecodeError:
        # Loosely formatted records (single-quoted Python dict syntax) are
        # still accepted, but only as literals — never as executable code.
        yield ast.literal_eval(line)

def getDF(path):
  """Load every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their position in the file (0, 1, 2, ...), which
  becomes the index of the returned frame; record keys become columns.
  """
  records_by_position = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records_by_position, orient='index')

# Load the downloaded review file into a DataFrame (one row per review record)
# and display its (rows, columns) shape.
pdf_main = getDF('data/reviews_Automotive_5.json.gz')
pdf_main.shape

In [None]:
# Derive a binary sentiment label from the 'overall' rating: ratings of 4 and
# above become 1, ratings below 3 become 0. Rows matching neither mask
# (rating 3) get no label and are left as missing values.
is_positive = pdf_main['overall'] >= 4
is_negative = pdf_main['overall'] < 3

pdf_main.loc[is_positive, 'sentiment'] = 1
pdf_main.loc[is_negative, 'sentiment'] = 0

pdf_main.head()

In [None]:
def generate_datasets(pdf_target_training, label = 'sentiment', random_state = None):
    """Split a labelled frame into stratified train / validation / test frames.

    The data is first split 80/20, then the 20% hold-out is split in half,
    giving roughly 80% train, 10% validation and 10% test. Both splits are
    stratified on the label column.

    Args:
        pdf_target_training: DataFrame holding the feature columns plus the
            ``label`` column.
        label: Name of the label column. It is used for stratification and as
            the name of the label column in the returned frames (previously
            the label was always re-attached as 'sentiment', regardless of
            this argument).
        random_state: Optional seed forwarded to both ``train_test_split``
            calls so the split is reproducible. The default ``None`` keeps the
            original unseeded shuffling.

    Returns:
        Tuple ``(pdf_X_train, pdf_X_val, pdf_X_test)``; each frame contains the
        feature columns followed by the ``label`` column.
    """
    features = pdf_target_training.drop(label, axis=1)
    targets = pdf_target_training[label]

    X_train, X_test_val, y_train, y_test_val = train_test_split(
        features, targets,
        stratify=targets,
        shuffle=True,
        test_size=0.20,
        random_state=random_state)

    X_val, X_test, y_val, y_test = train_test_split(
        X_test_val, y_test_val,
        stratify=y_test_val,
        shuffle=True,
        test_size=0.5,
        random_state=random_state)

    def _with_label(feature_frame, label_values):
        # Work on a copy so the frames returned by train_test_split are not
        # modified in place (avoids pandas' SettingWithCopyWarning).
        labelled = feature_frame.copy()
        labelled[label] = label_values
        return labelled

    pdf_X_train = _with_label(X_train, y_train)
    pdf_X_val = _with_label(X_val, y_val)
    pdf_X_test = _with_label(X_test, y_test)

    print(f'Total records for: "pdf_X_train": [{pdf_X_train.shape[0]}]')
    print(f'Total records for: "pdf_X_val": [{pdf_X_val.shape[0]}]')
    print(f'Total records for: "pdf_X_test": [{pdf_X_test.shape[0]}]')

    return pdf_X_train, pdf_X_val, pdf_X_test

In [None]:
# Keep only the review text and the label, drop rows with a missing value in
# either column (this removes the reviews that received no sentiment label),
# then split into train / validation / test frames.
pdf_train, pdf_val, pdf_test = generate_datasets(pdf_main[['reviewText', 'sentiment']].dropna(), 'sentiment')

In [None]:
# Register the three splits as tabular datasets; all of them are written under
# the 'nlp' path of the workspace's default datastore.
def_blob_store = ws.get_default_datastore()
nlp_target = (def_blob_store, 'nlp')

ds_train_set = Dataset.Tabular.register_pandas_dataframe(
    dataframe=pdf_train,
    target=nlp_target,
    name="train_set",
    description="Small amazon review for sentiment analysis [train set]",
)
ds_val_set = Dataset.Tabular.register_pandas_dataframe(
    dataframe=pdf_val,
    target=nlp_target,
    name="val_set",
    description="Small amazon review for sentiment analysis [val set]",
)
ds_test_set = Dataset.Tabular.register_pandas_dataframe(
    dataframe=pdf_test,
    target=nlp_target,
    name="test_set",
    description="Small amazon review for sentiment analysis [test set]",
)

## Add AutoML Step
Define the AutoML step together with its metrics and best-model outputs, so that results from AutoML steps can be compared.

In [None]:
from azureml.pipeline.core import TrainingOutput, PipelineData

# Pipeline outputs of the AutoML step: one carrying the run's metrics, one
# carrying the best model. Both are stored on the default datastore.
metrics_training_output = TrainingOutput(type='Metrics')
metrics_data = PipelineData(
    name='automl_metrics_data',
    datastore=def_blob_store,
    pipeline_output_name='metrics_output',
    training_output=metrics_training_output,
)

model_training_output = TrainingOutput(type='Model')
model_data = PipelineData(
    name='automl_best_model_data',
    datastore=def_blob_store,
    pipeline_output_name='model_output',
    training_output=model_training_output,
)

## AutoML Parameters


```json
{ 
    "experiment_timeout_minutes": 120,
    "primary_metric": "accuracy",
    "primary_metric" : "AUC_weighted",
    "iteration_timeout_minutes" : 10,
    "iterations" : 20,
    "experiment_timeout_hours" : 1,
    "max_concurrent_iterations": 1,
    "max_cores_per_iteration": -1,
    "enable_early_stopping": "True",
    "enable_dnn": "True",
    "blacklist_algos":["TensorFlowDNN","TensorFlowLinearRegressor"],
    "max_concurrent_iterations": 1,
    "enable_batch_run":"False",
    "enable_dnn": "true",
     "model_explainability" : "True"
}
```

In [None]:
# `logging` supplies the verbosity level below; it was previously used without
# being imported anywhere in the notebook, which raised NameError on a fresh
# kernel.
import logging

from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep

# Settings forwarded to AutoMLConfig as keyword arguments.
# NOTE(review): "enable_early_stopping" and "send_telemetry" are passed as the
# string "true" rather than a bool — confirm the SDK interprets them as intended.
automl_settings = {
    "verbosity": logging.INFO,
    "experiment_timeout_minutes": 240,
    "primary_metric": "AUC_weighted",
    "enable_early_stopping" : "true",
    #"ensemble_iterations" : 3,
    #"enable_stack_ensembling" : "true",
    #"enable_ensembling" : "true",
    #"save_mlflow": "true",
    #"max_cores_per_iteration": -1,
    #"max_concurrent_iterations": 3,
    "send_telemetry" : "true",
    #"experiment_timeout_minutes": 1440,
    #"iteration_timeout_minutes": 1440,
    "many_models": True,
    #"pipeline_fetch_max_batch_size": 15,
    #"iteration_timeout_minutes" : 30,
    #"iterations" : 5
}

# Label column produced by the sentiment-labelling cell.
target_column_name = "sentiment"

# AutoML text-classification configuration: trains on the registered train set,
# validates on the registered validation set, and runs on the GPU compute target.
automl_config = AutoMLConfig(
    task="text-classification",
    debug_log="automl_errors.log",
    compute_target=compute_target,
    training_data=ds_train_set,
    validation_data=ds_val_set,
    featurization='auto',
    label_column_name=target_column_name,
#    blocked_models=["TensorFlowDNN", "TensorFlowLinearRegressor"],
    **automl_settings
)

# Pipeline step wrapping the AutoML run. The default metrics/model outputs are
# disabled because the explicitly declared PipelineData outputs are passed in
# through `outputs` instead.
automl_step = AutoMLStep(name='AutoML_Classification',
    automl_config=automl_config,
    passthru_automl_config=False,
    outputs=[metrics_data, model_data],
    enable_default_model_output=False,
    enable_default_metrics_output=False,
    allow_reuse=True)



In [None]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# Look up the compute target named "cpu-cluster" in the workspace; it is used
# by the register-model and deploy-model steps.
cpu_compute = ComputeTarget(workspace=ws, name="cpu-cluster")

In [None]:
# Environment used by the steps that run on the CPU cluster (model registration
# and deployment). The trailing comment keeps an alternative environment name.
env_cpu = Environment.get(workspace=ws, name="AzureML-AutoML-DNN") #name="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu")

In [None]:
from azureml.core.runconfig import RunConfiguration

# Run configuration for the test step: uses the 'nlp-accelerator' environment.
rcfg = RunConfiguration()
rcfg.environment = env

# Command-line arguments handed to test_model.py: metric and column names, the
# registered test dataset, and the best-model output of the AutoML step.
test_model_arguments = [
    '--metric-name', 'AUC_weighted',
    '--target-name', 'sentiment',
    '--text-field-name', 'reviewText',
    '--test_dataset', ds_test_set.as_named_input('test_dataset'),
    '--model-data', model_data,
]

test_model_step = PythonScriptStep(
    name="Test_AutoML_Best_Model",
    script_name='test_model.py',
    source_directory=source_directory,
    compute_target=compute_target,
    arguments=test_model_arguments,
    inputs=[model_data],
    runconfig=rcfg,
    allow_reuse=False,
)

# Order the test step explicitly after the AutoML step.
test_model_step.run_after(automl_step)

In [None]:
from azureml.core.runconfig import RunConfiguration

# Run configuration for the registration step: uses the CPU environment.
rcfg = RunConfiguration()
rcfg.environment = env_cpu

# Command-line arguments handed to register_model.py.
register_model_arguments = [
    '--is-test', 0,
    '--test-run-id', '',
    '--metric-name', 'test_AUC_weighted',
    '--target-name', 'sentiment',
    '--model-name', 'sentiment_classifier',
]

register_model_step = PythonScriptStep(
    name="Register_Best_Model",
    script_name='register_model.py',
    source_directory=source_directory,
    compute_target=cpu_compute,
    arguments=register_model_arguments,
    runconfig=rcfg,
    allow_reuse=True,
)

# Registration runs only after the model-test step.
register_model_step.run_after(test_model_step)


In [None]:
# Run configuration for the deployment step: uses the CPU environment.
rcfg = RunConfiguration()
rcfg.environment = env_cpu

# Command-line arguments handed to deploy_model.py.
deploy_model_arguments = [
    '--endpoint-name', 'sentiment-endpoint-2',
    '--model-name', 'sentiment_classifier',
]

deploy_model_step = PythonScriptStep(
    name="Deploy_Latest_Model",
    script_name='deploy_model.py',
    source_directory=source_directory,
    compute_target=cpu_compute,
    arguments=deploy_model_arguments,
    runconfig=rcfg,
    allow_reuse=True,
)

# Deployment runs only after the model has been registered.
deploy_model_step.run_after(register_model_step)

In [None]:
# Experiment whose name the pipeline run is submitted under.
exp = Experiment(workspace=ws, name='transformer_automl')
# Only the final step is listed; the earlier steps were chained to it with
# run_after(). NOTE(review): presumably Pipeline pulls in those upstream steps
# through the dependency chain — confirm.
steps = [deploy_model_step]
pipeline = Pipeline(workspace=ws, steps=steps)


In [None]:
# Submit the pipeline for execution under the experiment's name.
pipeline.submit(exp.name) #, credential_passthrough=True)


In [None]:
from datetime import datetime

# Build a pipeline name stamped to the minute, so each publish gets its own name.
timenow = datetime.now().strftime('%Y-%m-%d-%H-%M')
pipeline_name = f"Sentiment-Classifier-{timenow}-Pipeline"
print(pipeline_name)

# Publish the pipeline, using the generated name as both name and description.
publish_kwargs = {"name": pipeline_name, "description": pipeline_name}
published_pipeline = pipeline.publish(**publish_kwargs)
print("Newly published pipeline id: {}".format(published_pipeline.id))