# Running Dask on AzureML


In [5]:
import time

from azureml.core import Workspace, Experiment
from azureml.widgets import RunDetails
from azureml.core.runconfig import RunConfiguration, MpiConfiguration
from azureml.train.estimator import Estimator

## Starting the cluster

In [6]:
# Connect to the AzureML workspace via Workspace.from_config(); every later
# cell reaches compute targets, experiments and the datastore through `ws`.
ws = Workspace.from_config()
ws

Workspace.create(name='ncus-azureml', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copetersrg')

In [7]:
# Look up the compute target registered in the workspace under the name
# 'dask-cluster'; the Dask cluster job is submitted to it below.
ct = ws.compute_targets['dask-cluster']
ct

AmlCompute(workspace=Workspace.create(name='ncus-azureml', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='copetersrg'), name=dask-cluster, id=/subscriptions/6560575d-fa06-4e7d-95fb-f962e74efd7a/resourceGroups/copetersrg/providers/Microsoft.MachineLearningServices/workspaces/ncus-azureml/computes/dask-cluster, type=AmlCompute, provisioning_state=Succeeded, location=northcentralus, tags=None)

Start the Dask cluster using an Estimator with MpiConfiguration. Make sure the compute target is able to scale up to the number of nodes requested by `node_count` below (currently 50), or lower `node_count` to fit your cluster.

In [8]:
# Keyword options for the Estimator that launches the Dask cluster job.
# The default datastore of the workspace is handed to startDask.py through
# its --datastore argument, and MpiConfiguration() makes this a multi-node
# (MPI) job spanning `node_count` nodes.
estimator_options = dict(
    compute_target=ct,
    entry_script='startDask.py',
    conda_dependencies_file='environment.yml',
    script_params={'--datastore': ws.get_default_datastore()},
    node_count=50,
    distributed_training=MpiConfiguration(),
)
est = Estimator('dask', **estimator_options)

# Submit the job to the 'dask' experiment; `run` is the handle used by the
# rest of the notebook to poll status and read metrics.
run = Experiment(ws, 'dask').submit(est)

In [19]:
# Show the AzureML run widget for the submitted run.
RunDetails(run).show()

_UserRunWidget(widget_settings={'childWidgetDisplay': 'popup', 'send_telemetry': False, 'log_level': 'NOTSET',…

In [18]:
from IPython.display import clear_output, display

# States from which the run can no longer become 'Running'. Polling stops on
# any of them; waiting only for 'Canceled' would loop forever on a run that
# failed or already finished.
# NOTE(review): state names follow the AzureML Run.get_status documentation.
finished_states = ('Canceled', 'Failed', 'Completed')

print("waiting for scheduler node's ip")
status = run.get_status()
while status != 'Running' and status not in finished_states:
    print('.', end="")
    time.sleep(5)
    # One service call per poll; the same value drives the branches below.
    status = run.get_status()

clear_output()

if status == 'Canceled':
    print('Run was canceled')
elif status != 'Running':
    print(f'Run ended with status {status}')
else:
    # The 'headnode' metric is needed later to build the ssh port-forward
    # command.
    metrics = run.get_metrics()
    headnode = metrics['headnode']
    # A bare expression inside a branch is not rendered by the notebook, so
    # the metrics are displayed explicitly.
    display(metrics)

Run was canceled


In [14]:
# Query the current status string of the run.
run.get_status()

'Canceled'

## Establish the port-forwarding from Compute Instance to Dask Dashboard

In [None]:
# Print the ssh command that forwards three local ports through the head node:
#   8786 -> scheduler, 8788 -> dashboard (port 8787 on the head node),
#   9999 -> jupyter (port 8888 on the head node).
# The dashboard forward targets `headnode`, the address read from the run
# metrics; no `headnode_private_ip` is defined anywhere in this notebook, so
# referencing it raised NameError.
# NOTE(review): assumes `headnode` is the address the dashboard listens on — confirm.
print(f'ssh -f daskuser@{headnode} -L 8786:localhost:8786 -L 8788:{headnode}:8787 -L 9999:localhost:8888')

Make sure to leave the terminal tab open to keep the port-forward running

As you see, you are forwarding 3 ports 

1. 8786 is for the scheduler and will be used to connect the client to the cluster
2. 8788 is for the Bokeh app that shows the activity on the cluster (we are mapping to the local port 8788 to avoid a conflict with the RStudio Server running on the Notebook VM)
3. 9999 is for a jupyter instance running on the head node. You can connect to the scheduler from the jupyter running on your Notebook VM or from this jupyter instance on the head node.   

To access the Bokeh app, change the URL to your notebook VM by adding `-8788` right after the machine name. If you are running this notebook on a Notebook VM, then you can create the URLs by executing the next cell:

## Run some jobs on the cluster
If you are able to see the Bokeh app, it is time to use the cluster. Thanks to the port forward, the scheduler appears to the notebook VM at `tcp://localhost:8786`. You should see as many workers as the `node_count` requested when the cluster was started (50 in this notebook).

In [None]:
# Handle to the 'dask' experiment in the workspace (the experiment the
# cluster job was submitted to).
exp = Experiment(ws, 'dask')
exp

In [None]:
# Rebind `run` to the first run yielded by exp.get_runs().
# NOTE(review): presumably the most recent run — confirm get_runs() ordering.
# next() raises StopIteration when the experiment has no runs.
runs = exp.get_runs()
run = next(runs)
run

In [None]:
import dask
import dask.dataframe as dd

In [None]:
import matplotlib
import matplotlib.pyplot as plt

from datetime import datetime

%matplotlib inline

In [None]:
from dask.distributed import Client

# Connect to the scheduler through local port 8786 (the ssh port-forward
# printed earlier maps it to the head node).
c = Client('tcp://localhost:8786')
# restart() is called before any work is submitted — presumably to clear
# state left on the workers by earlier sessions.
c.restart()
c

In [None]:
# Read the 'datastore' metric logged on the run.
# NOTE(review): presumably the path under which the datastore is mounted on
# the cluster nodes, logged by the entry script — confirm.
ds = run.get_metrics()['datastore']
ds

In [None]:
# Base path handed to load_data(). Appending '' leaves the value unchanged;
# presumably a placeholder for a sub-path — confirm.
path = ds + ''
path

In [None]:
def load_data(path):
    """Build a dask dataframe over the ISD CSV files below ``path``.

    Reads every file matching ``<path>/datasets/isd/*data.csv`` and forces
    the ``usaf`` column to dtype ``object``.
    """
    csv_glob = path + '/datasets/isd/*data.csv'
    return dd.read_csv(csv_glob, dtype={'usaf': 'object'})

In [None]:
# NOTE(review): `NoaaIsd` is not imported in any visible cell, so on a fresh
# kernel this line raises NameError unless it is imported elsewhere —
# presumably `from azureml.opendatasets import NoaaIsd`; confirm.
# The resulting dataframe is only displayed; it is not assigned to a name.
NoaaIsd().to_dask_dataframe()

In [None]:
# Run load_data(path) as a delayed task instead of calling it in the notebook
# process — presumably because the datastore path only exists on the cluster
# nodes; confirm.
df = dask.delayed(load_data)(path).compute()

In [None]:
# Preview the first rows of the freshly loaded data.
df.head()

In [None]:
# Number of partitions produced by the CSV load.
df.npartitions

In [None]:
# Time a row count of the loaded data.
%time len(df)

In [None]:
# Parse the 'datetime' column as timestamps and truncate each one to the
# start of its day ('d' frequency), overwriting the column.
df['datetime'] = dd.to_datetime(df['datetime']).dt.floor('d')

In [None]:
# Redistribute the data over 150 partitions.
df = df.repartition(npartitions=150)

In [None]:
# Index the frame by the day-floored datetime values and keep the result in
# cluster memory via persist().
# NOTE(review): sorted=True declares the data already ordered by datetime —
# confirm this holds for the source files.
df = df.set_index(df.datetime, sorted=True).persist()

In [None]:
# Preview the first rows after re-indexing by datetime.
df.head()

In [None]:
# Time the row count again, now on the persisted, re-indexed frame.
%time len(df)

In [None]:
# Partition count after repartitioning and re-indexing.
df.npartitions

In [None]:
# NOTE(review): `df2` is not referenced again in the visible notebook, and
# `df` was already persisted when its index was set — confirm this is needed.
df2 = df.persist()

In [None]:
# Compute summary statistics on the cluster and bring the result back to the
# notebook.
df.describe().compute()

In [None]:
# Mean of each column per index value (the index holds the day-floored
# datetime), computed on the cluster; `means` is reused by the plotting cell.
means = df.groupby(df.index).mean().compute()
means.head()

In [None]:
# Drop the 'datetime' column; the same values are already carried by the index.
df = df.drop(['datetime'], axis=1)

In [None]:
# Inspect the index of the frame.
df.index

In [None]:
def write_data(path):
    """Write the notebook-level dask dataframe ``df`` to ``path`` as Parquet.

    ``df`` is read from the enclosing notebook namespace rather than being
    passed in. The function has no return statement, so it returns ``None``.
    """
    df.to_parquet(path)

In [None]:
# Run write_data as a delayed task, targeting '/dask/outputs/isd' under the
# datastore path. `a` receives write_data's return value, which is None.
a = dask.delayed(write_data)(ds+'/dask/outputs/isd').compute()

In [None]:
# Count the entries of the 'day' column per (month, year) of the datetime
# index; the result is indexed by month first, then year.
counts = df.groupby([df.index.month, df.index.year]).day.count().compute()

In [None]:
# Counts for each month (1..12) of the year 2015, in calendar order.
cs = [counts[month][2015] for month in range(1, 13)]
cs

In [None]:
# Apply the style once, before any figure exists, so that every figure —
# including the first one — is created with the dark background.
plt.style.use('dark_background')

for col in list(means.columns):
    # Explicit figure/axes instead of the implicit pyplot state.
    fig, ax = plt.subplots(figsize=(16, 8))
    means[col].plot(ax=ax, color='b')
    ax.set_title('Average of {}'.format(col))
    # NOTE(review): the upper limit is 1 Dec 2015, so data after that date is
    # cut off — confirm this is intended.
    ax.set_xlim([datetime(2015, 1, 1), datetime(2015, 12, 1)])
    ax.grid()

    # `fig` is still the current pyplot figure here, which is what
    # log_image(plot=plt) attaches to the run under the column's name.
    run.log_image(col, plot=plt)
    # Render the figure now; with the inline backend enabled earlier in this
    # notebook this also releases it, so figures do not pile up in memory.
    plt.show()

In [None]:
# Total memory footprint of the frame, index included, summed over all columns.
df.memory_usage(index=True, deep=True).sum().compute()

In [None]:
# Print a summary of the frame.
df.info()

See if the cluster works

In [None]:
import time
import numpy as np
from dask import delayed, visualize

def inc(x):
    """Return ``x + 1`` after a random pause.

    The pause length in seconds is the absolute value of a draw from a
    normal distribution with mean 5 and standard deviation 2.
    """
    pause_seconds = abs(np.random.normal(5, 2))
    time.sleep(pause_seconds)
    return x + 1

# One future per input 0..9. The submitted callable is `delayed(inc)`, not
# `inc` itself; later cells call .compute() on what the futures return.
fut = [c.submit(delayed(inc), i) for i in range(10)]

fut

In [None]:
# Block on every future in turn and print what it returned.
for i in fut:
    print(i.result())

In [None]:
def add_all(values):
    """Add up the items of ``values`` with ``+``, starting from ``0``.

    Deliberately not named ``sum``: defining a notebook-level ``sum`` would
    shadow the built-in of the same name for every later cell.
    """
    total = 0
    for value in values:
        total += value
    return total

# Collect what each earlier future returned, then submit one task that adds
# the collected values together on the cluster.
results = [f.result() for f in fut]

fut2 = c.submit(add_all, results)
fut2

In [None]:
# The future's result is itself computed here, i.e. it is expected to be a
# lazy dask object rather than a plain number.
fut2.result().compute()

In [None]:
# Draw the task graph behind the result.
visualize(fut2.result())

# Training on Large Datasets
(from https://github.com/dask/dask-tutorial)

Sometimes you'll want to train on a larger than memory dataset. `dask-ml` has implemented estimators that work well on dask arrays and dataframes that may be larger than your machine's RAM.

In [None]:
from dask.distributed import Client
import joblib
import dask.array as da
import dask.delayed
from sklearn.datasets import make_blobs
import numpy as np

We'll make a small (random) dataset locally using scikit-learn.

In [None]:
n_centers = 12
n_features = 20

X_small, y_small = make_blobs(
    n_samples=1000,
    centers=n_centers,
    n_features=n_features,
    random_state=0,
)

# Row k of `centers` is the per-feature mean of the samples labelled k.
centers = np.stack(
    [X_small[y_small == label].mean(axis=0) for label in range(n_centers)]
)

centers[:4]

The small dataset will be the template for our large random dataset.
We'll use `dask.delayed` to adapt `sklearn.datasets.make_blobs`, so that the actual dataset is being generated on our workers. 

In [None]:
n_samples_per_block = 200000
n_blocks = 500

# Every block has the same shape; it must be stated up front because the
# data itself is only generated lazily.
block_shape = (n_samples_per_block, n_features)
lazy_make_blobs = dask.delayed(make_blobs)

# One delayed make_blobs call per block, seeded with the block number; [0]
# keeps only the first element of make_blobs' return value (the samples).
delayeds = [
    lazy_make_blobs(
        n_samples=n_samples_per_block,
        centers=centers,
        n_features=n_features,
        random_state=seed,
    )[0]
    for seed in range(n_blocks)
]
arrays = [
    da.from_delayed(block, shape=block_shape, dtype='float64')
    for block in delayeds
]
X = da.concatenate(arrays)
X

In [None]:
# Check the size of the array in gigabytes (bytes / 1e9)
X.nbytes / 1e9

In [None]:
# Only run this on the cluster: persist() materializes the whole array in
# the workers' memory (see the size computed above).
X = X.persist()  

The algorithms implemented in Dask-ML are scalable. They handle larger-than-memory datasets just fine.

They follow the scikit-learn API, so if you're familiar with scikit-learn, you'll feel at home with Dask-ML.

In [None]:
from dask_ml.cluster import KMeans
# Dask-ML KMeans estimator; init_max_iter and oversampling_factor are set
# explicitly, everything else uses the library defaults.
clf = KMeans(init_max_iter=3, oversampling_factor=10)

In [None]:
# Fit the clustering model on the full array and time it.
%time clf.fit(X)

In [None]:
# Cluster labels assigned by the fitted model.
clf.labels_

In [None]:
# Materialize only the first ten labels.
clf.labels_[:10].compute()

## Shut cluster down
To shut the cluster down, cancel the job that runs the cluster. 

In [None]:
# Cancel every run of the 'dask' experiment that is currently 'Running'.
# A dedicated loop variable is used on purpose: iterating with `run` would
# leave the notebook-level `run` rebound to the last run yielded by
# get_runs(), whatever its status.
for active_run in ws.experiments['dask'].get_runs():
    if active_run.get_status() == "Running":
        print(f'cancelling run {active_run.id}')
        active_run.cancel()

### Just for convenience, get the latest running Run

In [None]:
# Bind `run` to the first run of the 'dask' experiment whose status is
# 'Running', or to None when there is none, so that `run` never silently ends
# up pointing at a run that is not running.
# NOTE(review): "latest" relies on get_runs() yielding newest runs first — confirm.
run = next(
    (candidate for candidate in ws.experiments['dask'].get_runs()
     if candidate.get_status() == "Running"),
    None,
)
if run is None:
    print('no running run found')
else:
    print(f'latest running run is {run.id}')