*Copyright (c) Microsoft Corporation. All rights reserved.*

*Licensed under the MIT License.*

# Text Classification of MultiNLI Sentences using BERT with Azure ML Pipelines

In [26]:
import sys
sys.path.append("../../")
import os
import random
import shutil
import pandas as pd

from utils_nlp.bert.common import Language, Tokenizer
from utils_nlp.azureml import azureml_utils
from utils_nlp.dataset.multinli import get_generator

from sklearn.preprocessing import LabelEncoder
from azureml.core import Datastore, Experiment,  get_run
from azureml.core.conda_dependencies import CondaDependencies
from azureml.core.runconfig import RunConfiguration
from azureml.core.compute import ComputeTarget
from azureml.exceptions import ComputeTargetException
from azureml.data.data_reference import DataReference
from azureml.pipeline.steps import PythonScriptStep
from azureml.pipeline.core import Pipeline, PipelineData
from azureml.widgets import RunDetails
from azureml.train.dnn import PyTorch
from azureml.core.runconfig import MpiConfiguration
from azureml.pipeline.steps import EstimatorStep

## 0. Introduction

In this notebook, we fine-tune and evaluate a pretrained BERT model on a subset of the MultiNLI dataset by using Azure ML Pipelines.

In [27]:
LABEL_COL = "genre"
TEXT_COL = "sentence1"
DATA_FOLDER = "../../data/temp"
TRAIN_FOLDER = "../../data/temp/train"
TEST_FOLDER = "../../data/temp/test"
BERT_CACHE_DIR = "../../data/temp"
LANGUAGE = Language.ENGLISH
TO_LOWER = True
MAX_LEN = 150
BATCH_SIZE = 32
NUM_GPUS = 2
NUM_EPOCHS = 1
TRAIN_SIZE = 0.6
TEXT_COL = "sentence1"
ENCODED_LABEL_COL = "label"
TOKEN_COL = "tokens"
MASK_COL = "mask"
NUM_BATCHES = None
LABELS = ['telephone', 'government', 'travel', 'slate', 'fiction']

In this example we will use AzureML pipelines to execute our training pipelines. Each preprocessing step is included as a step in the pipeline. For a more detailed walkthrough of what pipelines are with a getting started guidelines check this [notebook](https://github.com/Azure/MachineLearningNotebooks/blob/master/how-to-use-azureml/machine-learning-pipelines/intro-to-pipelines/aml-pipelines-getting-started.ipynb). We start by doing some AzureML related setup below

### 0.1 Create a workspace

First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.

**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook

In [28]:
ws = azureml_utils.get_or_create_workspace(
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
    workspace_region="<WORKSPACE_REGION>",
)

Performing interactive authentication. Please follow the instructions on the terminal.




Interactive authentication successfully completed.


### 0.2 Setup experiment and logging

In [29]:
# Make a folder for the project
project_folder = "../../"

# Set up an experiment
experiment_name = "pipelines-tc"
experiment = Experiment(ws, experiment_name)

# Add logging to our experiment
run = experiment.start_logging()

### 0.3 Create a compute target

In [30]:
# choose your cluster
cluster_name = "pipelines-tc-12"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing compute target.")
except ComputeTargetException:
    print("Creating a new compute target...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC12", max_nodes=8
    )

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    compute_target.wait_for_completion(show_output=True)

# use get_status() to get a detailed status for the current AmlCompute.
print(compute_target.get_status().serialize())

Found existing compute target.
{'currentNodeCount': 2, 'targetNodeCount': 2, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 2, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-07-26T09:02:40.581000+00:00', 'errors': None, 'creationTime': '2019-07-25T04:16:20.598768+00:00', 'modifiedTime': '2019-07-25T04:16:36.486727+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 2, 'maxNodeCount': 10, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC12'}


## 1. Preprocessing

The pipeline is defined by a series of steps, the first being a PythonScriptStep which utilizes [DASK](https://dask.org/) to load dataframes in batches allowing us to load and preprocess different sets of data in parallel.

### 1.1 Read Dataset

In [31]:
train_batches = get_generator(DATA_FOLDER, "train", num_batches=NUM_BATCHES, batch_size=10e6)
test_batches = get_generator(DATA_FOLDER, "dev_matched", num_batches=NUM_BATCHES, batch_size=10e6)

### 1.2 Preprocess and Tokenize

In the classification task, use the first sentence only as the text input, and the corresponding genre as the label. Select the examples corresponding to one of the entailment labels (*neutral* in this case) to avoid duplicate rows, as the sentences are not unique, whereas the sentence pairs are.

Once filtered, we encode the labels. To do this, fit a label encoder with the know labels in a MNLI dataset.

In [32]:
if not os.path.exists(TRAIN_FOLDER):
    os.makedirs(TRAIN_FOLDER)
if not os.path.exists(TEST_FOLDER):
    os.makedirs(TEST_FOLDER)

labels = LABELS
label_encoder = LabelEncoder()
label_encoder.fit(labels)

num_train_batches = 0
for batch in train_batches:
    batch = batch[batch["gold_label"]=="neutral"]
    batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])
    batch.to_csv(TRAIN_FOLDER+"/batch{}.csv".format(str(num_train_batches)))
    num_train_batches += 1
    
num_test_batches = 0
for batch in test_batches:
    batch = batch[batch["gold_label"]=="neutral"]
    batch[ENCODED_LABEL_COL] = label_encoder.transform(batch[LABEL_COL])
    batch.to_csv(TEST_FOLDER+"/batch{}.csv".format(str(num_test_batches)))
    num_test_batches += 1

Once we have batches of data ready they are uploaded to the datastore.

In [33]:
ds = ws.get_default_datastore()
ds.upload(src_dir=TRAIN_FOLDER, target_path="mnli_data/train", overwrite=True, show_progress=False)
ds.upload(src_dir=TEST_FOLDER, target_path="mnli_data/test", overwrite=True, show_progress=False)

$AZUREML_DATAREFERENCE_21d2acf3bc184522a97d7f6233318c77

In [34]:
shutil.rmtree(TRAIN_FOLDER)
shutil.rmtree(TEST_FOLDER)

We can now parallely operate on each batch to tokenize the data and preprocess the tokens. To do this, we create a PythonScript step below.

In [35]:
%%writefile ../../utils_nlp/bert/preprocess.py
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.
import argparse
import logging
import os

import pandas as pd

from utils_nlp.bert.common import Language, Tokenizer

LABEL_COL = "genre"
TEXT_COL = "sentence1"
LANGUAGE = Language.ENGLISH
TO_LOWER = True
MAX_LEN = 150

logger = logging.getLogger(__name__)


def tokenize(df):
    """Tokenize the text documents and convert them to lists of tokens using the BERT tokenizer.
    Args:
        df(pd.Dataframe): Dataframe with training or test samples

    Returns:

        list: List of lists of tokens for train set.

    """
    tokenizer = Tokenizer(
        LANGUAGE, to_lower=TO_LOWER)
    tokens = tokenizer.tokenize(list(df[TEXT_COL]))

    return tokens


def preprocess(tokens):
    """ Preprocess method that does the following,
            Convert the tokens into token indices corresponding to the BERT tokenizer's vocabulary
            Add the special tokens [CLS] and [SEP] to mark the beginning and end of a sentence
            Pad or truncate the token lists to the specified max length
            Return mask lists that indicate paddings' positions
            Return token type id lists that indicate which sentence the tokens belong to (not needed
            for one-sequence classification)

    Args:
        tokens(pd.Dataframe): Dataframe with tokens for train set.

    Returns:
        list: List of lists of tokens for train or test set with special tokens added.
        list: Input mask.
    """
    tokenizer = Tokenizer(
        LANGUAGE, to_lower=TO_LOWER)
    tokens, mask, _ = tokenizer.preprocess_classification_tokens(
        tokens, MAX_LEN
    )

    return tokens, mask


parser = argparse.ArgumentParser()
parser.add_argument("--input_data", type=str, help="input data")
parser.add_argument("--output_data", type=str, help="Path to the output file.")

args = parser.parse_args()
input_data = args.input_data
output_data = args.output_data
output_dir = os.path.dirname(os.path.abspath(output_data))

if output_dir is not None:
    os.makedirs(output_dir, exist_ok=True)
    logger.info("%s created" % output_dir)

df = pd.read_csv(args.input_data)
tokens_array = tokenize(df)
tokens_array, mask_array = preprocess(tokens_array)

df['tokens'] = tokens_array
df['mask'] = mask_array

# Filter columns
cols = ['tokens', 'mask', 'label']
df = df[cols]
df.to_csv(output_data, header=False, index=False)
logger.info("Completed")

Overwriting ../../utils_nlp/bert/preprocess.py


Create a conda environment for the steps below.

In [36]:
conda_dependencies = CondaDependencies.create(
    conda_packages=[
        "numpy",
        "scikit-learn",
        "pandas",
    ],
    pip_packages=["azureml-sdk==1.0.43.*", 
                  "torch==1.1", 
                  "tqdm==4.31.1",
                 "pytorch-pretrained-bert>=0.6"],
    python_version="3.6.8",
)
run_config = RunConfiguration(conda_dependencies=conda_dependencies)
run_config.environment.docker.enabled = True

Then create the list of steps that use the preprocess.py created above. Add these steps into a pipeline and validate it to ensure there are no errors.

In [37]:
processed_train_files = []
processed_test_files = []
ds = ws.get_default_datastore()

for i in range(num_train_batches):
        input_data = DataReference(datastore=ds, 
                                   data_reference_name='train_batch_{}'.format(str(i)), 
                                   path_on_datastore='mnli_data/train/batch{}.csv'.format(str(i)),
                                   overwrite=False)

        output_data = PipelineData(name="train{}".format(str(i)), datastore=ds,
                       output_path_on_compute='mnli_data/processed_train/batch{}.csv'.format(str(i)))

        step = PythonScriptStep(
            name='preprocess_step_train_{}'.format(str(i)),
            arguments=["--input_data", input_data, "--output_data", output_data],
            script_name= "utils_nlp/bert/preprocess.py",
            inputs=[input_data],
            outputs=[output_data],
            source_directory=project_folder,
            compute_target=compute_target,
            runconfig=run_config,
            allow_reuse=False,
        )
        
        processed_train_files.append(output_data)         
            
for i in range(num_test_batches):
            input_data = DataReference(datastore=ds, 
                                       data_reference_name='test_batch_{}'.format(str(i)), 
                                       path_on_datastore='mnli_data/test/batch{}.csv'.format(str(i)),
                                       overwrite=False)
        
            output_data = PipelineData(name="test{}".format(str(i)), datastore=ds,
                        output_path_on_compute='mnli_data/processed_test/batch{}.csv'.format(str(i)))
            
            step = PythonScriptStep(
                name='preprocess_step_test_{}'.format(str(i)),
                arguments=["--input_data", input_data, "--output_data", output_data],
                script_name= "utils_nlp/bert/preprocess.py",
                inputs=[input_data],
                outputs=[output_data],
                source_directory=project_folder,
                compute_target=compute_target,
                runconfig=run_config,
                allow_reuse=False,
            )
            
            processed_test_files.append(output_data)
            
print(processed_train_files)
print(processed_test_files)

[$AZUREML_DATAREFERENCE_train0, $AZUREML_DATAREFERENCE_train1, $AZUREML_DATAREFERENCE_train2, $AZUREML_DATAREFERENCE_train3, $AZUREML_DATAREFERENCE_train4, $AZUREML_DATAREFERENCE_train5, $AZUREML_DATAREFERENCE_train6, $AZUREML_DATAREFERENCE_train7, $AZUREML_DATAREFERENCE_train8, $AZUREML_DATAREFERENCE_train9, $AZUREML_DATAREFERENCE_train10, $AZUREML_DATAREFERENCE_train11, $AZUREML_DATAREFERENCE_train12, $AZUREML_DATAREFERENCE_train13, $AZUREML_DATAREFERENCE_train14, $AZUREML_DATAREFERENCE_train15, $AZUREML_DATAREFERENCE_train16, $AZUREML_DATAREFERENCE_train17, $AZUREML_DATAREFERENCE_train18, $AZUREML_DATAREFERENCE_train19, $AZUREML_DATAREFERENCE_train20, $AZUREML_DATAREFERENCE_train21, $AZUREML_DATAREFERENCE_train22, $AZUREML_DATAREFERENCE_train23, $AZUREML_DATAREFERENCE_train24, $AZUREML_DATAREFERENCE_train25, $AZUREML_DATAREFERENCE_train26, $AZUREML_DATAREFERENCE_train27, $AZUREML_DATAREFERENCE_train28, $AZUREML_DATAREFERENCE_train29, $AZUREML_DATAREFERENCE_train30, $AZUREML_DATAREFE

In [38]:
#pipeline = Pipeline(workspace=ws, steps=steps)
#pipeline.validate()

In [39]:
#pipeline_run = Experiment(ws, 'TC-Preprocessing-BERT').submit(pipeline)
#RunDetails(pipeline_run).show()

In [40]:
# ToDo: Clean up local preprocess file

## 2. Train and Score

Once the data is processed and available on datastore, we  train the classifier using the training examples. This involves fine-tuning the BERT Transformer and learning a linear classification layer on top of that. After training is complete we score the performance of the model on the test dataset

The training is distributed and is done AzureML's capability to support distributed using MPI with horovod. 


### 2.1 Setup training script

In [41]:
%%writefile ../../utils_nlp/bert/train.py
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import os
import logging
import argparse
import json

from sklearn.metrics import classification_report

from utils_nlp.bert.common import Language
from utils_nlp.bert.sequence_classification_distributed import (
    BERTSequenceDistClassifier,
)
from utils_nlp.common.timer import Timer

BATCH_SIZE = 32
NUM_GPUS = 2
NUM_EPOCHS = 1
LABELS = ["telephone", "government", "travel", "slate", "fiction"]

logger = logging.getLogger(__name__)


parser = argparse.ArgumentParser()
parser.add_argument(
    "--train_files",
    nargs="+",
    default=[],
    help="List of file paths to all the files in train dataset.",
)

parser.add_argument(
    "--test_files",
    nargs="+",
    default=[],
    help="List of file paths to all the files in test dataset.",
)

parser.add_argument(
    "--result_file",
    type=str,
    help="Path to the result file containing confidence report",
)

args = parser.parse_args()
train_files = [file.strip() for file in args.train_files]
test_files = [file.strip() for file in args.test_files]
result_file = args.result_file

# Handle square brackets from train list
train_files[0] = train_files[0][1:]
train_files[len(train_files) - 1] = train_files[len(train_files) - 1][:-1]

# Handle square brackets from test list
test_files[0] = test_files[0][1:]
test_files[len(test_files) - 1] = test_files[len(test_files) - 1][:-1]

# Create result directory
result_dir = os.path.dirname(os.path.abspath(result_file))
if result_dir is not None:
    os.makedirs(result_dir, exist_ok=True)
    logger.info("%s created" % result_dir)

# Train
classifier = BERTSequenceDistClassifier(
    language=Language.ENGLISH, num_labels=len(LABELS)
)
with Timer() as t:
    classifier.fit(
        train_files,
        num_gpus=NUM_GPUS,
        num_epochs=NUM_EPOCHS,
        batch_size=BATCH_SIZE,
        verbose=True,
    )
logger.info("Training Time {}".format(t.interval / 3600))

# Predict
preds, labels_test = classifier.predict(
    test_files, num_gpus=NUM_GPUS, batch_size=BATCH_SIZE
)

results = classification_report(labels_test, preds, target_names=LABELS)
print(results)
with open(result_file, "w") as fp:
    fp.write(json.dumps(results))

Overwriting ../../utils_nlp/bert/train.py


### 2.2 Create a Pytorch Estimator

We create a Pytorch Estimator using AzureML SDK and additonally define an EstimatorStep to run it on AzureML pipelines.

In [42]:
ds = ws.get_default_datastore()

result_file = PipelineData(name="results", 
                 datastore=ds,
                 output_path_on_compute='mnli_data/results/result.json')

In [43]:
estimator = PyTorch(source_directory=project_folder,
                    compute_target=compute_target,
                    entry_script='utils_nlp/bert/train.py',
                    node_count=4,
                    distributed_training=MpiConfiguration(),
                    process_count_per_node=2,
                    use_gpu=True,
                    conda_packages=['scikit-learn=0.20.3', 'numpy>=1.16.0', 'pandas'],
                    pip_packages=["tqdm==4.31.1","pytorch-pretrained-bert>=0.6"]
                   )



In [44]:
inputs = processed_train_files + processed_test_files

est_step = EstimatorStep(name="Estimator-Train", 
                         estimator=estimator, 
                         estimator_entry_script_arguments=[
                             '--train_files',  str(processed_train_files),
                             '--test_files', str(processed_test_files),
                             '--result_file', result_file],
                         inputs = inputs,
                         outputs =[result_file],
                         runconfig_pipeline_params=None, 
                         compute_target=compute_target)

### 2.3 Submit the pipeline.

In [45]:
pipeline = Pipeline(workspace=ws, steps=[est_step])
#pipeline.validate()

In [46]:
pipeline_run = Experiment(ws, 'NLP-TC-BERT-distributed').submit(pipeline)

Created step Estimator-Train [2478e9a4][2ea6e3aa-e024-4553-a518-5436f90d0961], (This step will run and generate new outputs)
Created step preprocess_step_train_0 [47077bd1][ad10ca15-61ac-46e6-b58f-907b0ca3affe], (This step will run and generate new outputs)
Created step preprocess_step_train_1 [80f459ff][9cee1d8b-32b0-4f01-a0ad-69f79c306a0a], (This step will run and generate new outputs)
Created step preprocess_step_train_2 [d204911e][7d26964a-99a4-4046-8e32-41e113c58717], (This step will run and generate new outputs)
Created step preprocess_step_train_3 [587308c5][84783339-4613-4063-a460-8b9caf16792b], (This step will run and generate new outputs)
Created step preprocess_step_train_4 [80f9bf25][29eec2d0-e643-470d-94ad-1f5ac0932965], (This step will run and generate new outputs)
Created step preprocess_step_train_5 [eeecc0d5][0d9b727f-4dc7-4997-83f0-c4853c8760b5], (This step will run and generate new outputs)
Created step preprocess_step_train_6 [3feb594d][c1ec1660-1436-4ba7-9763-e9d64

Created data reference train_batch_7 for StepId [7b408b77][d2cb8ea4-4650-4143-85cc-3a50b2d92fb3], (Consumers of this data will generate new runs.)
Created data reference train_batch_8 for StepId [a23544df][0b8ae803-5f49-4361-9c27-156a4f69486b], (Consumers of this data will generate new runs.)
Created data reference train_batch_9 for StepId [06db1042][2d50bd14-2fb0-4742-8805-3bd1dc19c3c3], (Consumers of this data will generate new runs.)
Created data reference train_batch_10 for StepId [9d24583a][cba68341-2c4a-432e-a4bc-38af276684a6], (Consumers of this data will generate new runs.)
Created data reference train_batch_11 for StepId [f0a34518][f7250703-3bad-4db6-9c4a-e68c98565f0a], (Consumers of this data will generate new runs.)
Created data reference train_batch_12 for StepId [93055109][1e833559-7b27-422d-ad18-c7b8421af99a], (Consumers of this data will generate new runs.)
Created data reference train_batch_13 for StepId [84b936a5][c28e0d83-f13d-4bb4-9be5-d0da1d983202], (Consumers of th

In [47]:
RunDetails(pipeline_run).show()

_PipelineWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': True, 'log_level': 'INFO', '…

In [67]:
#If you would like to cancel the job for any reasons uncomment the code below.
#experiment = Experiment(ws, name='NLP-TC-BERT-distributed')
#run = get_run(experiment, '71d15ff2-2c71-4484-b074-6ca4c5c06acf')
#run.cancel()

In [68]:
# Wait for the run to finish (this might take an hour for MNLI dataset)
pipeline_run.wait_for_completion(show_output=True)

PipelineRunId: 9076e1c9-24e8-430c-9af8-a6c8cf69c69b
Link to Portal: https://mlworkspace.azure.ai/portal/subscriptions/15ae9cb6-95c1-483d-a0e3-b1a1a3b06324/resourceGroups/nlprg/providers/Microsoft.MachineLearningServices/workspaces/MAIDAPTest/experiments/NLP-TC-BERT-distributed/runs/9076e1c9-24e8-430c-9af8-a6c8cf69c69b

PipelineRun Execution Summary
PipelineRun Status: Finished
{'runId': '9076e1c9-24e8-430c-9af8-a6c8cf69c69b', 'status': 'Completed', 'startTimeUtc': '2019-07-26T19:37:55.489875Z', 'endTimeUtc': '2019-07-26T20:43:19.338839Z', 'properties': {'azureml.runsource': 'azureml.PipelineRun', 'runSource': None, 'runType': 'HTTP', 'azureml.parameters': '{}'}, 'logFiles': {'logs/azureml/stdoutlogs.txt': 'https://maidaptest3334372853.blob.core.windows.net/azureml/ExperimentRun/dcid.9076e1c9-24e8-430c-9af8-a6c8cf69c69b/logs/azureml/stdoutlogs.txt?sv=2018-03-28&sr=b&sig=uIftjnBZLpFMRrckf9fx3qHAHOTn%2B%2FqpHRhegULyDsw%3D&st=2019-07-26T21%3A48%3A04Z&se=2019-07-27T05%3A58%3A04Z&sp=r', 'log

'Finished'

In [76]:
#ds.as_mount()
result_file.as_download(".")

$AZUREML_DATAREFERENCE_.