In [1]:
import azureml.core
from azureml.core import Workspace, Datastore, Experiment
import azureml.data
from azureml.data.azure_storage_datastore import AzureFileDatastore, AzureBlobDatastore
from azureml.data.data_reference import DataReference
from azureml.train.dnn import TensorFlow
from azureml.core.compute import ComputeTarget, AmlCompute
from azureml.core.compute_target import ComputeTargetException
from azureml.widgets import RunDetails
import os

## Initialize workspace
Initialize a [Workspace](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#workspace) object from the existing workspace. `Workspace.from_config()` creates a workspace object from the details stored in `config.json`.

In [2]:
ws = Workspace.from_config()
print('Workspace name: ' + ws.name, 
      'Azure region: ' + ws.location, 
      'Subscription id: ' + ws.subscription_id, 
      'Resource group: ' + ws.resource_group, sep='\n')

Found the config file in: /home/dg595/CrashDetection/Azure/config.json
Workspace name: AccidentDetection
Azure region: eastus
Subscription id: e298a653-a33f-4b32-9ec2-2adfbd3b649d
Resource group: CCGroup7Resources


## Attach to existing DataStore

In [3]:
ds = Datastore.get(ws, datastore_name='dashcamstreetimages')

In [4]:
ds.as_mount()

$AZUREML_DATAREFERENCE_dashcamstreetimages

## Create or Attach existing AmlCompute
Create a [compute target](https://docs.microsoft.com/azure/machine-learning/service/concept-azure-machine-learning-architecture#compute-target) to train model

If we could not find the cluster with the given name, then we will create a new cluster here. We will create an `AmlCompute` cluster of `STANDARD_NC6` GPU VMs. This process is broken down into 3 steps:
1. create the configuration (this step is local and only takes a second)
2. create the cluster (this step will take about **20 seconds**)
3. provision the VMs to bring the cluster to the initial size (of 1 in this case). This step will take about **3-5 minutes** and is providing only sparse output in the process. Please make sure to wait until the call returns before moving to the next cell

In [5]:
cluster_name = "crash"

try:
    compute_target = ComputeTarget(workspace=ws, name=cluster_name)
    print('Found existing compute target')
except ComputeTargetException:
    print('Creating a new compute target...')
    compute_config = AmlCompute.provisioning_configuration(vm_size='STANDARD_NC6', 
                                                           max_nodes=4)

    # create the cluster
    compute_target = ComputeTarget.create(ws, cluster_name, compute_config)

    # can poll for a minimum number of nodes and for a specific timeout. 
    # if no min node count is provided it uses the scale settings for the cluster
    compute_target.wait_for_completion(show_output=True, min_node_count=None, timeout_in_minutes=20)

# use get_status() to get a detailed status for the current cluster. 
print(compute_target.get_status().serialize())

Found existing compute target
{'currentNodeCount': 0, 'targetNodeCount': 0, 'nodeStateCounts': {'preparingNodeCount': 0, 'runningNodeCount': 0, 'idleNodeCount': 0, 'unusableNodeCount': 0, 'leavingNodeCount': 0, 'preemptedNodeCount': 0}, 'allocationState': 'Steady', 'allocationStateTransitionTime': '2019-04-21T19:14:22.263000+00:00', 'errors': None, 'creationTime': '2019-04-18T23:11:51.292045+00:00', 'modifiedTime': '2019-04-18T23:12:07.921232+00:00', 'provisioningState': 'Succeeded', 'provisioningStateTransitionTime': None, 'scaleSettings': {'minNodeCount': 0, 'maxNodeCount': 4, 'nodeIdleTimeBeforeScaleDown': 'PT120S'}, 'vmPriority': 'Dedicated', 'vmSize': 'STANDARD_NC6'}


In [6]:
compute_targets = ws.compute_targets
for name, ct in compute_targets.items():
    print(name, ct.type, ct.provisioning_state)

crash AmlCompute Succeeded


## Create TensorFlow estimator & add Keras
Next, we construct an `azureml.train.dnn.TensorFlow` estimator object, use the `gpucluster` as compute target, and pass the mount-point of the datastore to the training code as a parameter.
The TensorFlow estimator is providing a simple way of launching a TensorFlow training job on a compute target. It will automatically provide a docker image that has TensorFlow installed. In this case, we add `keras` package (for the Keras framework obviously), and `matplotlib` package for plotting a "Loss vs. Accuracy" chart and record it in run history.

In [7]:
# Train using Tensorflow
from azureml.train.dnn import TensorFlow
exp = Experiment(workspace=ws, name='AccidentDetection')
script_params = {
    '--data_dir': ds.as_mount(),
    '--mode': 'train',
    '--gpu': 0,
}

tf_est = TensorFlow(source_directory='Anticipating-Accidents/',
                    script_params=script_params,
                    compute_target=compute_target,
                    entry_script='accident.py',
                    conda_packages=['opencv'],
                    pip_packages=['keras', 'matplotlib'],
                    use_gpu=True)

framework_version is not specified, defaulting to version 1.12.


## Submit job to run
Submit the estimator to the Azure ML experiment to kick off the execution.

In [8]:
run = exp.submit(tf_est)

Submitting /home/dg595/CrashDetection/Azure/Anticipating-Accidents directory for run. The size of the directory >= 25 MB, so it can take a few minutes.


### Monitor the Run
As the Run is executed, it will go through the following stages:
1. Preparing: A docker image is created matching the Python environment specified by the TensorFlow estimator and it will be uploaded to the workspace's Azure Container Registry. This step will only happen once for each Python environment -- the container will then be cached for subsequent runs. Creating and uploading the image takes about **5 minutes**. While the job is preparing, logs are streamed to the run history and can be viewed to monitor the progress of the image creation.

2. Scaling: If the compute needs to be scaled up (i.e. the AmlCompute cluster requires more nodes to execute the run than currently available), the cluster will attempt to scale up in order to make the required amount of nodes available. Scaling typically takes about **5 minutes**.

3. Running: All scripts in the script folder are uploaded to the compute target, data stores are mounted/copied and the `entry_script` is executed. While the job is running, stdout and the `./logs` folder are streamed to the run history and can be viewed to monitor the progress of the run.

4. Post-Processing: The `./outputs` folder of the run is copied over to the run history

There are multiple ways to check the progress of a running job. We can use a Jupyter notebook widget. 

**Note: The widget will automatically update ever 10-15 seconds, always showing you the most up-to-date information about the run**

In [9]:
from azureml.widgets import RunDetails
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'INFO', '…

In [15]:
run

Experiment,Id,Type,Status,Details Page,Docs Page
AccidentDetection,AccidentDetection_1556475402_29a7a716,azureml.scriptrun,Running,Link to Azure Portal,Link to Documentation


## Download the saved Model

In [22]:
# create a model folder in the current directory
os.makedirs('./model', exist_ok=True)

for f in run.get_file_names():
    if f.startswith('outputs/model'):
        output_file_path = os.path.join('./model', f.split('/')[-1])
        print('Downloading from {} to {} ...'.format(f, output_file_path))
        run.download_file(name=f, output_file_path=output_file_path)                  

Downloading from outputs/model/checkpoint to ./model/checkpoint ...
Downloading from outputs/model/final_model.data-00000-of-00001 to ./model/final_model.data-00000-of-00001 ...
Downloading from outputs/model/final_model.index to ./model/final_model.index ...
Downloading from outputs/model/final_model.meta to ./model/final_model.meta ...
Downloading from outputs/model/model-10.data-00000-of-00001 to ./model/model-10.data-00000-of-00001 ...
Downloading from outputs/model/model-10.index to ./model/model-10.index ...
Downloading from outputs/model/model-10.meta to ./model/model-10.meta ...
Downloading from outputs/model/model-15.data-00000-of-00001 to ./model/model-15.data-00000-of-00001 ...
Downloading from outputs/model/model-15.index to ./model/model-15.index ...
Downloading from outputs/model/model-15.meta to ./model/model-15.meta ...
Downloading from outputs/model/model-20.data-00000-of-00001 to ./model/model-20.data-00000-of-00001 ...
Downloading from outputs/model/model-20.index to

In [23]:
model = run.register_model(model_name='crash-detection', model_path='outputs')