In [1]:
from azure.ai.ml import MLClient
from azure.identity import DefaultAzureCredential

# Enter details of your AML workspace
subscription_id = "dac8073e-1c2d-4a7d-a53b-c3655e291d58"
resource_group = "Learning"
workspace = "learningmain"

# get a handle to the workspace
ml_client = MLClient(
    DefaultAzureCredential(), subscription_id, resource_group, workspace
)

print(ml_client.subscription_id, ml_client.resource_group_name, ml_client.workspace_name, sep='\n')

dac8073e-1c2d-4a7d-a53b-c3655e291d58
Learning
learningmain


In [2]:
# to get larger datasets, visit: http://jmcauley.ucsd.edu/data/amazon/

!wget http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Automotive_5.json.gz -P data/

--2023-01-13 18:17:55--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Automotive_5.json.gz
Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4669048 (4.5M) [application/x-gzip]
Saving to: ‘data/reviews_Automotive_5.json.gz.16’


2023-01-13 18:17:57 (3.24 MB/s) - ‘data/reviews_Automotive_5.json.gz.16’ saved [4669048/4669048]



In [3]:
import pandas as pd
import gzip

def parse(path):
  g = gzip.open(path, 'rb')
  for l in g:
    yield eval(l)

def getDF(path):
  i = 0
  df = {}
  for d in parse(path):
    df[i] = d
    i += 1
  return pd.DataFrame.from_dict(df, orient='index')

pdf_main = getDF('data/reviews_Automotive_5.json.gz')
pdf_main.shape

(20473, 9)

In [4]:
pdf_main.loc[pdf_main['overall'] >= 4, 'sentiment'] = 1
pdf_main.loc[pdf_main['overall'] < 3, 'sentiment'] = 0

pdf_main.describe()

Unnamed: 0,overall,unixReviewTime,sentiment
count,20473.0,20473.0,19043.0
mean,4.471841,1365018000.0,0.939715
std,0.940337,36212660.0,0.23802
min,1.0,1121386000.0,0.0
25%,4.0,1354838000.0,1.0
50%,5.0,1373414000.0,1.0
75%,5.0,1390435000.0,1.0
max,5.0,1405901000.0,1.0


In [6]:
from sklearn.model_selection import train_test_split

def generate_datasets(pdf_target_training, label = 'sentiment'):
    X_train, X_test_val, y_train, y_test_val = train_test_split(pdf_target_training.drop(label, axis=1), pdf_target_training[label],
                                                        stratify=pdf_target_training[label],
                                                        shuffle=True,
                                                        test_size=0.20)

    X_val, X_test, y_val, y_test = train_test_split(X_test_val, y_test_val,
                                                        stratify=y_test_val,
                                                        shuffle=True,
                                                        test_size=0.5)
    pdf_X_train = X_train
    pdf_X_val = X_val
    pdf_X_test = X_test

    pdf_X_train['sentiment'] = y_train
    pdf_X_val['sentiment'] = y_val
    pdf_X_test['sentiment'] = y_test
    
    print(f'Total records for: "pdf_X_train": [{pdf_X_train.shape[0]}]')
    print(f'Total records for: "pdf_X_val": [{pdf_X_val.shape[0]}]')
    print(f'Total records for: "pdf_X_test": [{pdf_X_test.shape[0]}]')
    


    
    return pdf_X_train, pdf_X_val, pdf_X_test

In [8]:
pdf_train, pdf_val, pdf_test = generate_datasets(pdf_main[['reviewText', 'sentiment']].dropna(), 'sentiment')

pdf_train.to_csv('data/pdf_train.csv')
pdf_val.to_csv('data/pdf_val.csv')
pdf_test.to_csv('data/pdf_test.csv')


Total records for: "pdf_X_train": [15234]
Total records for: "pdf_X_val": [1904]
Total records for: "pdf_X_test": [1905]


In [11]:
from azure.ai.ml.entities import Data
from azure.ai.ml import Input
from azure.ai.ml.constants import AssetTypes

# === Note on path ===
# can be can be a local path or a cloud path. AzureML supports https://`, `abfss://`, `wasbs://` and `azureml://` URIs.
# Local paths are automatically uploaded to the default datastore in the cloud.
# More details on supported paths: https://docs.microsoft.com/azure/machine-learning/how-to-read-write-data-v2#supported-paths

def gen_input_data(url):
    inputs = {
        "input_data": Input(type=AssetTypes.URI_FILE, path=url)
    }

    return inputs


In [12]:
ds_train = gen_input_data('data/pdf_test.csv')
ds_val = gen_input_data('data/pdf_val.csv')
ds_test = gen_input_data('data/pdf_test.csv')


In [43]:
source_directory = "./project"

In [33]:
%%writefile environments/conda_dependencies.yml

channels:
  - pytorch
  - anaconda
  - conda-forge
dependencies:
  - python=3.7
  - pip=21.1.2
  - pip:
      - azure-ai-ml==1.2.0
      - mlflow== 1.26.1
      - azureml-mlflow==1.42.0
      - nvitop
  - numpy~=1.21.6
  - pandas~=1.1.5
  - shap=0.39.0
  - scikit-learn~=0.22.1
  - pytorch==1.7.1
name: nlp_training_environment

Overwriting environments/conda_dependencies.yml


In [41]:
ml_client.environments.get(name="nlp-accelerator-sdk-v2", version="latest_version")

ResourceNotFoundError: (UserError) No environment exists for name: nlp-accelerator-sdk-v2, version: latest_version, label: 
Code: UserError
Message: No environment exists for name: nlp-accelerator-sdk-v2, version: latest_version, label: 

In [43]:
from azure.ai.ml.entities import Environment

env_list = list(ml_client.environments.list(name="nlp-accelerator-sdk-v2")) # (name="nlp-accelerator-sdk-v2", version='3')
if len(env_list) > 0:
    env = env_list[0]
else:
    env = Environment(
        image="mcr.microsoft.com/azureml/openmpi4.1.0-cuda11.1-cudnn8-ubuntu20.04:latest",
        conda_file='environments/conda_dependencies.yml',
        name="nlp-accelerator-sdk-v2",
        description='This environment is curated to run NLP Transformer based models using AML SDK-v2 and native MLFlow integration'
    )

    ml_client.environments.create_or_update(env)

env

Environment({'is_anonymous': False, 'auto_increment_version': False, 'name': 'nlp-accelerator-sdk-v2', 'description': 'This environment is curated to run NLP Transformer based models using AML SDK-v2 and native MLFlow integration', 'tags': {}, 'properties': {}, 'id': '/subscriptions/dac8073e-1c2d-4a7d-a53b-c3655e291d58/resourceGroups/Learning/providers/Microsoft.MachineLearningServices/workspaces/learningmain/environments/nlp-accelerator-sdk-v2/versions/3', 'Resource__source_path': None, 'base_path': '/mnt/batch/tasks/shared/LS_root/mounts/clusters/hosarsha23/code/Users/hosarsha/nlp-aml', 'creation_context': <azure.ai.ml.entities._system_data.SystemData object at 0x7f6c52b3d570>, 'serialize': <msrest.serialization.Serializer object at 0x7f6c52b3de70>, 'version': '3', 'latest_version': None, 'conda_file': {'channels': ['pytorch', 'anaconda', 'conda-forge'], 'dependencies': ['python=3.7', 'pip=21.1.2', {'pip': ['azure-ai-ml==1.2.0', 'mlflow== 1.26.1', 'azureml-mlflow==1.42.0', 'nvitop']}

In [31]:
cluster_name = "cpu-cluster" # "a100-cluster"
compute_target = ml_client.compute.get(cluster_name)

In [53]:
pdf_train, pdf_val, pdf_test = generate_datasets(pdf_main[['reviewText', 'sentiment']].dropna(), 'sentiment')


Total records for: "pdf_X_train": [15234]
Total records for: "pdf_X_val": [1904]
Total records for: "pdf_X_test": [1905]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdf_X_val['sentiment'] = y_val
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pdf_X_test['sentiment'] = y_test


In [54]:
def_blob_store = ws.get_default_datastore()

ds_train_set = Dataset.Tabular.register_pandas_dataframe(dataframe=pdf_train, target=(def_blob_store, 'nlp'), name="train_set", description="Small amazon review for sentiment analysis [train set]")
ds_val_set = Dataset.Tabular.register_pandas_dataframe(dataframe=pdf_val, target=(def_blob_store, 'nlp'), name="val_set", description="Small amazon review for sentiment analysis [val set]")
ds_test_set = Dataset.Tabular.register_pandas_dataframe(dataframe=pdf_test, target=(def_blob_store, 'nlp'), name="test_set", description="Small amazon review for sentiment analysis [test set]")

Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to nlp/610a98c0-026d-4779-8ef4-9d68119149b6/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to nlp/b1370624-cf1e-4032-ab68-0706dcb4d433/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.
Validating arguments.
Arguments validated.
Successfully obtained datastore reference and path.
Uploading file to nlp/9e9e0881-90a4-4978-a280-30eae16fc993/
Successfully uploaded file to datastore.
Creating and registering a new dataset.
Successfully created and registered a new dataset.


In [55]:
from azureml.core import ScriptRunConfig

args = [
        '--target-name', 'sentiment',
        '--training-dataset', ds_train_set.as_named_input('train_set'),
        '--val-dataset', ds_val_set.as_named_input('val_set'),
        '--test-dataset', ds_test_set.as_named_input('test_set'),
        '--text-field', 'reviewText',
        '--is-test', 1,
        '--is-final', 0,
        '--is-jump', 0,
        '--is-local', 0,
        '--evaluation-strategy', "epoch",
        '--collect-resource-utilization', 1, # 
        '--resource-utilization-interval', 5.0 # seconds
]

src = ScriptRunConfig(source_directory=source_directory,
                      script='train_transformer.py',
                      arguments=args,
                      compute_target=compute_target,
                      environment=env)




In [56]:
from azureml.train.hyperdrive import RandomParameterSampling, BanditPolicy, HyperDriveConfig, PrimaryMetricGoal
from azureml.train.hyperdrive import choice, loguniform

ps = RandomParameterSampling(
    {
        # bert-base-cased model could fit into all NC series, but if you're interested in trying larger models, then you need to make sure the VM type can handle the size of the model
        '--base-checkpoint': choice("bert-base-cased"), #, "bert-base-cased"), # , "bert-large-cased", "microsoft/deberta-v3-small", "distilbert-base-uncased", "bert-base-uncased"),
        '--batch-size': choice(8),
        '--no-epochs': choice(4),
        '--learning-rate': choice(5.5e-5, 5e-5, 4.5e-5, 4e-5, 5.5e-5, 6e-5, 3.5e-5, 6.5e-5),
        '--warmup-steps': choice(0),
        '--weight-decay': choice(0.0),
        '--adam-beta1': choice(0.9),
        '--adam-beta2': choice(0.999),
        '--adam-epsilon': choice(1e-8)
    }
)


In [57]:
policy = BanditPolicy(evaluation_interval=5, slack_factor=0.1)
hyperdrive_config = HyperDriveConfig(run_config=src,
                                     hyperparameter_sampling=ps,
                                     policy=policy,
                                     primary_metric_name='eval_f1_weighted',
                                     primary_metric_goal=PrimaryMetricGoal.MAXIMIZE,
                                     max_total_runs=20,
                                     max_concurrent_runs=3)


In [58]:
from azureml.pipeline.steps import HyperDriveStep, HyperDriveStepRun, PythonScriptStep

hd_step_name='HyperDrive_Step'
hd_step = HyperDriveStep(
    name=hd_step_name,
    hyperdrive_config=hyperdrive_config,
    allow_reuse=True)


In [59]:
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException

# choose a name for your cluster
cpu_compute = ComputeTarget(workspace=ws, name="cpu-cluster")

In [60]:
env_cpu = Environment.get(workspace=ws, name="AzureML-sklearn-1.0-ubuntu20.04-py38-cpu")
base_env = Environment.get(workspace=ws, name="AzureML-AutoML-DNN-Text-GPU")
env = base_env.clone("nlp-accelerator")

conda_dep = env.python.conda_dependencies
conda_dep.add_pip_package('nvitop')

env.python.conda_dependencies = conda_dep

env.register(ws)

In [61]:
from azureml.core.runconfig import RunConfiguration

rcfg = RunConfiguration()
rcfg.environment = env_cpu

register_model_step = PythonScriptStep(script_name='register_model.py',
                                       source_directory=source_directory,
                                       name="Register_Best_Model",
                                       compute_target=cpu_compute,
                                       arguments=['--is-test', 0,
                                                  '--test-run-id', '',
                                                  '--metric-name', 'test_f1_weighted',
                                                  '--second-metric', 'test_f1',
                                                  '--target-name', 'sentiment',
                                                  '--model-name', 'sentiment_classifier'],
                                       allow_reuse=True,
                                       runconfig=rcfg)

# register_model_step.run_after(hd_step)


In [62]:
from azureml.core.runconfig import RunConfiguration


rcfg = RunConfiguration() # conda_dependencies=conda_dep)
rcfg.environment = env_cpu

deploy_model_step = PythonScriptStep(script_name='deploy_model.py',
                                       source_directory=source_directory,
                                       name="Deploy_Latest_Model",
                                       compute_target=cpu_compute,
                                       arguments=['--endpoint-name', 'sentiment-endpoint-2',
                                                  '--model-name', 'sentiment_classifier'],
                                       allow_reuse=True,
                                       runconfig=rcfg)

deploy_model_step.run_after(register_model_step)


In [63]:
exp = Experiment(workspace=ws, name='transformer_hp')
steps = [deploy_model_step]
pipeline = Pipeline(workspace=ws, steps=steps)


In [64]:
pipeline.submit(exp.name) # , credential_passthrough=True)


Created step Deploy_Latest_Model [29041d0f][86b676f8-33ab-4ef0-933f-cce4a7f83404], (This step will run and generate new outputs)Created step Register_Best_Model [ef3c94c2][acbc100f-c4b1-4791-94ef-821844982d35], (This step will run and generate new outputs)

Submitted PipelineRun ffd4451b-385b-4959-abcc-9c2cf408d66d
Link to Azure Machine Learning Portal: https://ml.azure.com/runs/ffd4451b-385b-4959-abcc-9c2cf408d66d?wsid=/subscriptions/dac8073e-1c2d-4a7d-a53b-c3655e291d58/resourcegroups/learning/workspaces/learningmain&tid=16b3c013-d300-468d-ac64-7eda0820b6d3


Experiment,Id,Type,Status,Details Page,Docs Page
transformer_hp,ffd4451b-385b-4959-abcc-9c2cf408d66d,azureml.PipelineRun,Preparing,Link to Azure Machine Learning studio,Link to Documentation


In [None]:
from datetime import datetime

timenow = datetime.now().strftime('%Y-%m-%d-%H-%M')

pipeline_name = f"Sentiment-Classifier-{timenow}-Pipeline"
print(pipeline_name)

# published_pipeline = pipeline.publish(
#     name=pipeline_name, 
#     description=pipeline_name)
# print("Newly published pipeline id: {}".format(published_pipeline.id))