In [28]:
import os
import azureml.core
import pandas as pd
from azureml.core.runconfig import JarLibrary
from azureml.core.compute import ComputeTarget, DatabricksCompute
from azureml.exceptions import ComputeTargetException
from azureml.core.datastore import Datastore
from azureml.data.data_reference import DataReference
from azureml.core.databricks import PyPiLibrary

from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.core import Workspace, Environment, Experiment, Datastore, Dataset, ScriptRunConfig
from azureml.pipeline.core import Pipeline, PipelineData, TrainingOutput
from azureml.pipeline.steps import DatabricksStep, PythonScriptStep
from azureml.train.hyperdrive import choice, loguniform

from sklearn.model_selection import train_test_split
from azureml.train.automl import AutoMLConfig

# Check core SDK version number
print("SDK version:", azureml.core.VERSION)


SDK version: 1.48.0


In [29]:
ws = Workspace.from_config()
print(ws.name, ws.resource_group, ws.location, ws.subscription_id, sep = '\n')


nlp-workspace
openaml
eastus2
f9b97038-ed78-4a26-a1a7-51e81e75d867


In [30]:
from azureml.pipeline.core import PipelineParameter
from azureml.pipeline.core.pipeline_output_dataset import PipelineOutputAbstractDataset

def_blob_store = ws.get_default_datastore()
print('Datastore {} will be used'.format(def_blob_store.name))


Datastore workspaceblobstore will be used


In [31]:
source_directory = "./project"

In [32]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

cluster_name = "gpu-cluster"
compute_target = ComputeTarget(workspace=ws, name=cluster_name)

In [33]:
# %%writefile conda_dependencies.yml
# 
# channels:
#   - pytorch
#   - anaconda
#   - conda-forge
# dependencies:
#   - python=3.7
#   - pip=21.1.2
#   - pip:
#       - azureml-core==1.44.0
#       - azureml-mlflow==1.44.0
#       - azureml-automl-core==1.44.0
#       - azureml-automl-dnn-nlp==1.44.0
#       - azureml-responsibleai==1.44.0
#       - azureml-automl-runtime==1.44.0
#       - azureml-train-automl-client==1.44.0
#       - azureml-train-automl-runtime==1.44.0
#       - horovod==0.21.3
#   - numpy~=1.18.5
#   - pandas~=1.1.5
#   - scikit-learn~=0.22.1
#   - pytorch==1.7.1

In [34]:
if 'nlp-accelerator' not in ws.environments:
    base_env = Environment.get(workspace=ws, name="AzureML-AutoML-DNN-Text-GPU")
    env = base_env.clone("nlp-accelerator")

    conda_dep = env.python.conda_dependencies
    conda_dep.add_pip_package('nvitop')

    env.python.conda_dependencies = conda_dep

    env.register(ws)
else:
    env = Environment.get(workspace=ws, name="nlp-accelerator")


In [35]:
# to get larger datasets: http://jmcauley.ucsd.edu/data/amazon/

In [36]:
!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Automotive_5.json.gz -P data/

--2023-01-18 23:57:14--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Automotive_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4669048 (4.5M) [application/x-gzip]
Saving to: ‘data/reviews_Automotive_5.json.gz.69’


2023-01-18 23:57:17 (1.95 MB/s) - ‘data/reviews_Automotive_5.json.gz.69’ saved [4669048/4669048]



In [37]:
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

pdf_main = getDF('data/reviews_Automotive_5.json.gz')
pdf_main.shape

(20473, 9)

In [38]:
pdf_main.loc[pdf_main['overall'] >= 4, 'sentiment'] = 1
pdf_main.loc[pdf_main['overall'] < 3, 'sentiment'] = 0

pdf_main.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,sentiment
0,A3F73SC1LY51OO,B00002243X,Alan Montgomery,"[4, 4]",I needed a set of jumper cables for my new car...,5.0,Work Well - Should Have Bought Longer Ones,1313539200,"08 17, 2011",1.0
1,A20S66SKYXULG2,B00002243X,alphonse,"[1, 1]","These long cables work fine for my truck, but ...",4.0,Okay long cables,1315094400,"09 4, 2011",1.0
2,A2I8LFSN2IS5EO,B00002243X,Chris,"[0, 0]",Can't comment much on these since they have no...,5.0,Looks and feels heavy Duty,1374710400,"07 25, 2013",1.0
3,A3GT2EWQSO45ZG,B00002243X,DeusEx,"[19, 19]",I absolutley love Amazon!!! For the price of ...,5.0,Excellent choice for Jumper Cables!!!,1292889600,"12 21, 2010",1.0
4,A3ESWJPAVRPWB4,B00002243X,E. Hernandez,"[0, 0]",I purchased the 12' feet long cable set and th...,5.0,"Excellent, High Quality Starter Cables",1341360000,"07 4, 2012",1.0


In [39]:
def generate_datasets(pdf_target_training, label = 'sentiment'):
    X_train, X_test_val, y_train, y_test_val = train_test_split(pdf_target_training.drop(label, axis=1), pdf_target_training[label],
                                                        stratify=pdf_target_training[label],
                                                        shuffle=True,
                                                        test_size=0.20)

    X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val,
                                                        stratify=y_test_val,
                                                        shuffle=True,
                                                        test_size=0.5)
    pdf_X_train = X_train
    pdf_X_val = X_val
    pdf_X_test = X_test

    pdf_X_train['sentiment'] = y_train
    pdf_X_val['sentiment'] = y_val
    pdf_X_test['sentiment'] = y_test
    
    print(f'Total records for: "pdf_X_train": [{pdf_X_train.shape[0]}]')
    print(f'Total records for: "pdf_X_val": [{pdf_X_val.shape[0]}]')
    print(f'Total records for: "pdf_X_test": [{pdf_X_test.shape[0]}]')
    
    return pdf_X_train, pdf_X_val, pdf_X_test

In [40]:
pdf_train, pdf_val, pdf_test = generate_datasets(pdf_main[['reviewText', 'sentiment']].dropna(), 'sentiment')


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [41]:
def_blob_store = ws.get_default_datastore()

ds_train_set = Dataset.Tabular.register_pandas_dataframe(dataframe=pdf_train, target=(def_blob_store, 'nlp'), name="train_set", description="Small amazon review for sentiment analysis [train set]")
ds_val_set = Dataset.Tabular.register_pandas_dataframe(dataframe=pdf_val, target=(def_blob_store, 'nlp'), name="val_set", description="Small amazon review for sentiment analysis [val set]")
ds_test_set = Dataset.Tabular.register_pandas_dataframe(dataframe=pdf_test, target=(def_blob_store, 'nlp'), name="test_set", description="Small amazon review for sentiment analysis [test set]")

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to nlp/d2cb00f3-4bf6-43bb-b42c-e531686831e2/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to nlp/7c63efb6-63b2-4e4b-bd8f-2705255e5f1b/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to nlp/8a1efe33-5a0e-46bc-ba47-f46c7a448bb3/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [42]:
from azureml.core import ScriptRunConfig

args = [
        '--target-name', 'sentiment',
        '--training-dataset', ds_train_set.as_named_input('train_set'),
        '--val-dataset', ds_val_set.as_named_input('val_set'),
        '--test-dataset', ds_test_set.as_named_input('test_set'),
        '--text-field', 'reviewText',
        '--is-test', 1,
        '--is-final', 0,
        '--is-jump', 0,
        '--is-local', 0,
        '--evaluation-strategy', "epoch",
        '--collect-resource-utilization', 1, # 
        '--resource-utilization-interval', 5.0 # seconds
]

src = ScriptRunConfig(source_directory=source_directory,
                      script='train_transformer.py',
                      arguments=args,
                      compute_target=compute_target,
                      environment=env)




In [43]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, loguniform

ps = RandomParameterSampling(
    {
        # bert-base-cased model could fit into all NC series, but if you're interested in trying larger models, then you need to make sure the VM type can handle the size of the model
        '--base-checkpoint': choice("bert-base-cased"), #, "bert-base-cased"), # , "bert-large-cased", "microsoft/deberta-v3-small", "distilbert-base-uncased", "bert-base-uncased"),
        '--batch-size': choice(8),
        '--no-epochs': choice(4),
        '--learning-rate': choice(5.5e-5, 5e-5, 4.5e-5, 4e-5, 5.5e-5, 6e-5, 3.5e-5, 6.5e-5),
        '--warmup-steps': choice(0),
        '--weight-decay': choice(0.0),
        '--adam-beta1': choice(0.9),
        '--adam-beta2': choice(0.999),
        '--adam-epsilon': choice(1e-8)
    }
)

In [44]:
policy = BanditPolicy(evaluation_interval=5, slack_factor=0.1)
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='test_AUC_weighted',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=3)


In [45]:
from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep

hd_step_name='HyperDrive_Step'
hd_step = HyperDriveStep(
    name=hd_step_name,
    hyperdrive_config=hyperdrive_config,
    allow_reuse=True)


## Add AutoML Step 
Compare results from AutoML steps

In [46]:
from azureml.pipeline.core import TrainingOutput, PipelineData

metrics_data = PipelineData(name='metrics_data',
                            datastore=def_blob_store,
                            pipeline_output_name='metrics_output',
                            training_output=TrainingOutput(type='Metrics'))

model_data = PipelineData(name='best_model_data',
                          datastore=def_blob_store,
                          pipeline_output_name='model_output',
                          training_output=TrainingOutput(type='Model'))

## AutoML Parameters

```json
{ "experiment_timeout_minutes": 120,
    "primary_metric": "accuracy",
    "primary_metric" : "AUC_weighted",
    "iteration_timeout_minutes" : 10,
    "iterations" : 20,
    "experiment_timeout_hours" : 1,
    "max_concurrent_iterations": 1,
    "max_cores_per_iteration": -1,
    "enable_early_stopping": "True",
    "enable_dnn": "True",
    "blacklist_algos":["TensorFlowDNN","TensorFlowLinearRegressor"],
    "max_concurrent_iterations": 1,
    "enable_batch_run":"False",
    "enable_dnn": "true",
     "model_explainability" : "True"
}
```

In [47]:
from azureml.train.automl import AutoMLConfig
from azureml.pipeline.steps import AutoMLStep

automl_settings = {
    "verbosity": logging.INFO,
    "experiment_timeout_minutes": 240,
    "primary_metric": "AUC_weighted",
    "enable_early_stopping" : "true",
    #"ensemble_iterations" : 3,
    #"enable_stack_ensembling" : "true",
    #"enable_ensembling" : "true",
    "save_mlflow": "true",
    #"max_cores_per_iteration": -1,
    #"max_concurrent_iterations": 3,
    "send_telemetry" : "true",
    #"experiment_timeout_minutes": 1440,
    #"iteration_timeout_minutes": 1440,
    "many_models": True,
    #"pipeline_fetch_max_batch_size": 15,
    #"iteration_timeout_minutes" : 30,
    #"iterations" : 5 
}

target_column_name = "sentiment"

automl_config = AutoMLConfig(
    task="text-classification",
    debug_log="automl_errors.log",
    compute_target=compute_target,
    training_data=ds_train_set ,
    validation_data=ds_val_set ,
    featurization = 'auto',
    label_column_name=target_column_name,
#    blocked_models=["TensorFlowDNN", "TensorFlowLinearRegressor"],
    **automl_settings
)


automl_step = AutoMLStep(name='AutoML_Classification',
    automl_config=automl_config,
    passthru_automl_config=False,
    outputs=[metrics_data,model_data],
    enable_default_model_output=False,
    enable_default_metrics_output=False,
    allow_reuse=True)




### Test AutoML Trained Model

In [48]:
from azureml.core.runconfig import RunConfiguration

rcfg = RunConfiguration()
rcfg.environment = env

test_automl_step = PythonScriptStep(script_name='test_model.py',
                                       source_directory=source_directory,
                                       name="Test_AutoML_Best_Model",
                                       compute_target=compute_target,
                                       arguments=[
                                                  '--metric-name', 'AUC_weighted',
                                                  '--target-name', 'sentiment',
                                                  '--text-field-name', 'reviewText',
                                                  '--test_dataset', ds_test_set.as_named_input('test_dataset'),
                                                  '--model-data', model_data
                                                 
                                                 ],
                                       inputs=[ model_data],          
                                       allow_reuse=True,
                                       runconfig=rcfg)



test_automl_step.run_after(automl_step)

In [49]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# choose a name for your cluster
cpu_compute = ComputeTarget(workspace=ws, name="cpu-cluster")

In [50]:
env_cpu = Environment.get(workspace=ws, name="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu")

In [51]:
from azureml.core.runconfig import RunConfiguration

rcfg = RunConfiguration()
rcfg.environment = env_cpu

register_model_step = PythonScriptStep(script_name='register_model.py',
                                       source_directory=source_directory,
                                       name="Register_Best_Model",
                                       compute_target=cpu_compute,
                                       arguments=['--is-test', 0,
                                                  '--test-run-id', '',
                                                  '--metric-name', 'test_AUC_weighted',
                                                  '--target-name', 'sentiment',
                                                  '--model-name', 'sentiment_classifier'],
                                       allow_reuse=True,
                                       runconfig=rcfg)



register_model_step.run_after(test_automl_step)
register_model_step.run_after(hd_step)


In [52]:
rcfg = RunConfiguration()
rcfg.environment = env_cpu

deploy_model_step = PythonScriptStep(script_name='deploy_model.py',
                                       source_directory=source_directory,
                                       name="Deploy_Latest_Model",
                                       compute_target=cpu_compute,
                                       arguments=['--endpoint-name', 'sentiment-endpoint-2',
                                                  '--model-name', 'sentiment_classifier'],
                                       allow_reuse=True,
                                       runconfig=rcfg)

deploy_model_step.run_after(register_model_step)


In [53]:
exp = Experiment(workspace=ws, name='transformer_hp')
steps = [deploy_model_step]
pipeline = Pipeline(workspace=ws, steps=steps)


--- Logging error ---
Traceback (most recent call last):
  File "/anaconda/envs/azureml_py38/lib/python3.8/logging/__init__.py", line 1085, in emit
    self.flush()
  File "/anaconda/envs/azureml_py38/lib/python3.8/logging/__init__.py", line 1065, in flush
    self.stream.flush()
OSError: [Errno 9] Bad file descriptor
Call stack:
  File "/anaconda/envs/azureml_py38/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/anaconda/envs/azureml_py38/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/anaconda/envs/azureml_py38/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/anaconda/envs/azureml_py38/lib/python3.8/site-packages/traitlets/config/application.py", line 976, in launch_instance
    app.start()
  File "/anaconda/envs/azureml_py38/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 677, in start
    self.io_loop.start()
 

In [54]:
pipeline.submit(exp.name) #, credential_passthrough=True)


Created step Deploy_Latest_Model [06d961d9][cb5c9f91-4eb9-42d7-a807-1637a4a76ff4], (This step is eligible to reuse a previous run's output)
Created step Register_Best_Model [618e1472][7e7c047c-9676-4af5-bbb0-707b996b68ee], (This step is eligible to reuse a previous run's output)
Created step Test_AutoML_Best_Model [f8343973][768e3e9c-58d0-4c0d-bd79-dccccfd8afec], (This step is eligible to reuse a previous run's output)
Created step AutoML_Classification [e91c065a][0181d052-72f7-42df-a5f9-939fc1616a19], (This step will run and generate new outputs)Created step HyperDrive_Step [87b5dc07][57fe8577-3714-4a80-b850-e7ec29e7c6bd], (This step is eligible to reuse a previous run's output)

Submitted PipelineRun 96ffe0c1-0cd7-4169-835e-d0c708672071
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/96ffe0c1-0cd7-4169-835e-d0c708672071?wsid=/subscriptions/f9b97038-ed78-4a26-a1a7-51e81e75d867/resourcegroups/openaml/workspaces/nlp-workspace&tid=4460d6c7-3cdd-4d85-bda4-87c85c98af04


Experiment,Id,Type,Status,Details Page,Docs Page
transformer_hp,96ffe0c1-0cd7-4169-835e-d0c708672071,azureml.PipelineRun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [55]:
from datetime import datetime

timenow = datetime.now().strftime('%Y-%m-%d-%H-%M')

pipeline_name = f"Sentiment-Classifier-{timenow}-Pipeline"
print(pipeline_name)

published_pipeline = pipeline.publish(
    name=pipeline_name, 
    description=pipeline_name)
print("Newly published pipeline id: {}".format(published_pipeline.id))

Sentiment-Classifier-2023-01-18-23-57-Pipeline
Newly published pipeline id: 08465d7f-0b8f-4819-af8b-edbbce327556
