In [14]:
%load_ext blackcellmagic

In [58]:
# Imports
import sys
sys.path.append("../../")
import json
from urllib.request import urlretrieve

#import utils
from utils_nlp.common.timer import Timer
from utils_nlp.azureml import azureml_utils

from azureml.core import Datastore, Experiment
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.train.dnn import PyTorch
from azureml.widgets import RunDetails

## BiDAF Settings

In [67]:
bidaf_settings = {
  "dataset_reader": {
    "type": "squad",
    "token_indexers": {
      "tokens": {
        "type": "single_id",
        "lowercase_tokens": True
      },
      "token_characters": {
        "type": "characters",
        "character_tokenizer": {
          "byte_encoding": "utf-8",
          "start_tokens": [259],
          "end_tokens": [260]
        },
        "min_padding_length": 5
      }
    }
  },
  "train_data_path": "https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json",
  "validation_data_path": "https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json",
  "evaluate_on_test": True,
  "model": {
    "type": "bidaf",
    "text_field_embedder": {
      "token_embedders": {
        "tokens": {
          "type": "embedding",
          "pretrained_file": "https://allennlp.s3.amazonaws.com/datasets/glove/glove.6B.100d.txt.gz",
          "embedding_dim": 100,
          "trainable": False
        },
        "token_characters": {
          "type": "character_encoding",
          "embedding": {
            "num_embeddings": 262,
            "embedding_dim": 16
          },
          "encoder": {
            "type": "cnn",
            "embedding_dim": 16,
            "num_filters": 100,
            "ngram_filter_sizes": [5]
          },
          "dropout": 0.2
        }
      }
    },
    "num_highway_layers": 2,
    "phrase_layer": {
      "type": "lstm",
      "bidirectional": True,
      "input_size": 200,
      "hidden_size": 100,
      "num_layers": 1
    },
    "similarity_function": {
      "type": "linear",
      "combination": "x,y,x*y",
      "tensor_1_dim": 200,
      "tensor_2_dim": 200
    },
    "modeling_layer": {
      "type": "lstm",
      "bidirectional": True,
      "input_size": 800,
      "hidden_size": 100,
      "num_layers": 2,
      "dropout": 0.2
    },
    "span_end_encoder": {
      "type": "lstm",
      "bidirectional": True,
      "input_size": 1400,
      "hidden_size": 100,
      "num_layers": 1
    },
    "dropout": 0.2
  },
  "iterator": {
    "type": "bucket",
    "sorting_keys": [["passage", "num_tokens"], ["question", "num_tokens"]],
    "batch_size": 40
  },

  "trainer": {
    "num_epochs": 1, #20
    "grad_norm": 5.0,
    "patience": 10,
    "validation_metric": "+em",
    "cuda_device": 0,
    "learning_rate_scheduler": {
      "type": "reduce_on_plateau",
      "factor": 0.5,
      "mode": "max",
      "patience": 2
    },
    "optimizer": {
      "type": "adam",
      "betas": [0.9, 0.9]
    }
  }
}

In [68]:
os.makedirs("squad", exist_ok=True)

with open("squad/bidaf_config.json", "w") as f:
    f.write(json.dumps(bidaf_settings))

## AzureML Setup

Now, we set up the necessary components for running this as an AzureML experiment
1. Create or link to an existing `Workspace`
2. Set up an `Experiment` with `logging`
3. Create or attach existing `AmlCompute`
4. Upload our data to a `Datastore`

### Link to or create a Workspace

First, go through the [Configuration](https://github.com/Azure/MachineLearningNotebooks/blob/master/configuration.ipynb) notebook to install the Azure Machine Learning Python SDK and create an Azure ML `Workspace`. This will create a config.json file containing the values needed below to create a workspace.

**Note**: you do not need to fill in these values if you have a config.json in the same folder as this notebook

In [2]:
ws = azureml_utils.get_or_create_workspace(
    subscription_id="<SUBSCRIPTION_ID>",
    resource_group="<RESOURCE_GROUP>",
    workspace_name="<WORKSPACE_NAME>",
    workspace_region="<WORKSPACE_REGION>",
)

Performing interactive authentication. Please follow the instructions on the terminal.


Note, we have launched a browser for you to login. For old experience with device code, use "az login --use-device-code"
You have logged in. Now let us find all the subscriptions to which you have access...


Interactive authentication successfully completed.


In [3]:
print(
    "Workspace name: " + ws.name,
    "Azure region: " + ws.location,
    "Subscription id: " + ws.subscription_id,
    "Resource group: " + ws.resource_group,
    sep="\n",
)

Workspace name: MAIDAPTest
Azure region: eastus2
Subscription id: 15ae9cb6-95c1-483d-a0e3-b1a1a3b06324
Resource group: nlprg


### Set up an Experiment and Logging

In [11]:
# Make a folder for the project
project_folder = "./bidaf-question-answering"
os.makedirs(project_folder, exist_ok=True)

# Set up an experiment
experiment_name = "bidaf-question-answering"
experiment = Experiment(ws, experiment_name)

# Add logging to our experiment
run = experiment.start_logging()

### Link AmlCompute Compute Target


We need to link a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) for training our model (see [compute options](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-set-up-training-targets#supported-compute-targets) for explanation of the different options). We will use an [AmlCompute](https://docs.microsoft.com/azure/machine-learning/service/how-to-set-up-training-targets#amlcompute) target and link to an existing target (if the cluster_name exists) or create a STANDARD_NC6 GPU cluster (autoscales from 0 to 4 nodes) in this example. Creating a new AmlComputes takes approximately 5 minutes. 

As with other Azure services, there are limits on certain resources (e.g. AmlCompute) associated with the Azure Machine Learning service. Please read [this article](https://docs.microsoft.com/en-us/azure/machine-learning/service/how-to-manage-quotas) on the default limits and how to request more quota.

In [12]:
# choose your cluster
cluster_name = "gpucluster"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print("Found existing compute target.")
except ComputeTargetException:
    print("Creating a new compute target...")
    compute_config = AmlCompute.provisioning_configuration(
        vm_size="STANDARD_NC6", max_nodes=4
    )

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    compute_target.wait_for_completion(show_output=True)

# use get_status() to get a detailed status for the current AmlCompute.
print(compute_target.get_status().serialize())

Found existing compute target.
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-06-26T01:16:30.014000+00:00', 'errors': None, 'creationTime': '2019-05-20T22:09:40.142683+00:00', 'modifiedTime': '2019-05-20T22:10:11.888950+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


### Upload Data to Datastore

This step uploads our local data to a `Datastore` so that the data is accessible from the remote compute target and creates a `DataReference` to point to the location of the data on the Datastore. A DataStore is backed either by a Azure File Storage (default option) or Azure Blob Storage ([how to decide between these options](https://docs.microsoft.com/en-us/azure/storage/common/storage-decide-blobs-files-disks)) and data is made accessible by mounting or copying data to the compute target. `ws.datastores` lists all options for datastores and `ds.account_name` gets the name of the datastore that can be used to find it in the Azure portal.

In [17]:
urlretrieve(
    "https://allennlp.s3.amazonaws.com/datasets/squad/squad-train-v1.1.json",
    filename="squad/squad_train.json",
)

urlretrieve(
    "https://allennlp.s3.amazonaws.com/datasets/squad/squad-dev-v1.1.json",
    filename="squad/squad_dev.json",
)

('squad/squad_dev.json', <http.client.HTTPMessage at 0x2c70027bb38>)

In [69]:
# Select a specific datastore or you can call ws.get_default_datastore()
datastore_name = "workspacefilestore"
ds = ws.datastores[datastore_name]

# Upload files in data folder to the datastore
ds.upload(
    src_dir="./squad",
    target_path="squad_data",
    overwrite=True,
    show_progress=True,
)

Uploading ./squad\bidaf_config.json
Uploading ./squad\squad_dev.json
Uploading ./squad\squad_train.json
Uploaded ./squad\bidaf_config.json, 1 files out of an estimated total of 3
Uploaded ./squad\squad_dev.json, 2 files out of an estimated total of 3
Uploaded ./squad\squad_train.json, 3 files out of an estimated total of 3


$AZUREML_DATAREFERENCE_0583283c61a1402cbdf6b6d13ee970e5

### Prepare Training Script

In [62]:
%%writefile $project_folder/train.py
import torch
import argparse
import os
import json
from allennlp.common import Params
from allennlp.commands.train import train_model

def load_params(folder, file):
    with open(os.path.join(folder, file)) as f:
        param_dict = json.load(f)
    return Params(param_dict)

def main():
    print("Torch version:", torch.__version__)
    # get command-line arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_folder', type=str, 
                        help='Folder where data is stored')
    parser.add_argument('--config_name', type=str, 
                        help='Name of json configuration file')
    args = parser.parse_args()
    squad_folder = os.path.join(args.data_folder, "squad_data")
    
    params = load_params(squad_folder, args.config_name)
    
    train_model(params,
           serialization_dir = os.path.join(squad_folder, "logs"),
           file_friendly_logging = True,
           recover = False,
           force = True)

if __name__ == "__main__":
    main()

Overwriting ./bidaf-question-answering/train.py


### Create a PyTorch Estimator
The Azure ML SDK's PyTorch estimator enables you to easily submit PyTorch training jobs for both single-node and distributed runs. For more information on the PyTorch estimator, refer [here](https://docs.microsoft.com/azure/machine-learning/service/how-to-train-pytorch).

In [70]:
script_params = {
    '--data_folder': ds.as_mount(),
    '--config_name': 'bidaf_config.json'}

estimator = PyTorch(source_directory=project_folder,
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script= "train.py",
                    use_gpu=True,
                    conda_dependencies_file_path="bidafenv.yml"
                   )

In [71]:
run = experiment.submit(estimator)
print(run)

Run(Experiment: bidaf-question-answering,
Id: bidaf-question-answering_1562078057_9d9e44ab,
Type: azureml.scriptrun,
Status: Starting)


In [72]:
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [73]:
ds.download(target_path="DownloadedLogs",
            prefix='squad_data',
            show_progress=True)

Downloading squad_data\logs\vocabulary\tokens.txt
Downloading squad_data\logs\vocabulary\non_padded_namespaces.txt
Downloading squad_data\logs\log\validation\events.out.tfevents.1562078333.3e6b26e42d944b088e5c10bc5811cb6d000000
Downloading squad_data\logs\log\train\events.out.tfevents.1562078333.3e6b26e42d944b088e5c10bc5811cb6d000000
Downloading squad_data\logs\training_state_epoch_0.th
Downloading squad_data\logs\stdout.log
Downloading squad_data\logs\stderr.log
Downloading squad_data\logs\model_state_epoch_0.th
Downloading squad_data\logs\model.tar.gz
Downloading squad_data\logs\metrics.json
Downloading squad_data\logs\metrics_epoch_0.json
Downloading squad_data\logs\best.th
Downloading squad_data\logs\config.json
Downloading squad_data\squad_train.json
Downloading squad_data\squad_dev.json
Downloading squad_data\bidaf_config.json
Downloaded squad_data\logs\log\validation\events.out.tfevents.1562078333.3e6b26e42d944b088e5c10bc5811cb6d000000, 1 files out of an estimated total of 21
Do

16

In [66]:
run.cancel()