In [1]:
import os
import dask
import time
import joblib
import fsspec
import socket
import matplotlib

import pandas as pd
import dask.dataframe as dd
import matplotlib.pyplot as plt

from datetime import datetime
from dask.distributed import Client
from IPython.core.display import HTML
#from dask_ml.xgboost import XGBRegressor

from azureml.core import Model
from azureml.core import Run

%matplotlib inline

In [None]:
def update_packages():
    """Upgrade fsspec, dask-ml[complete] and adlfs with pip.

    pip is invoked as ``<current interpreter> -m pip`` so the packages are
    installed into the environment of the Python process that executes this
    function, rather than whichever ``pip`` executable happens to be first
    on PATH.

    Returns:
        int: pip's exit status (0 on success). Callers that ignored the
        previous ``None`` return value are unaffected.
    """
    # Local imports keep the helper self-contained.
    import subprocess
    import sys

    # An argument list (no shell) avoids any shell interpretation of the
    # square brackets in 'dask-ml[complete]'.
    return subprocess.call(
        [sys.executable, '-m', 'pip', 'install', '--upgrade',
         'fsspec', 'dask-ml[complete]', 'adlfs']
    )

In [None]:
# Wrap the installer once, then queue ten lazy invocations of it.
# Nothing is executed until the tasks are submitted to a scheduler.
lazy_update = dask.delayed(update_packages)
computes = [lazy_update() for _ in range(10)]
computes

In [None]:
# Client.run executes the function once on every currently connected worker,
# outside the task scheduler. Submitting ten delayed tasks instead gives no
# guarantee that each worker receives exactly one of them.
# NOTE: `c` is the Dask client created in the "Connect to cluster" section
# further down; that cell must have been run before this one.
c.run(update_packages)

In [2]:
# Obtain the Azure ML run context for the current process; the bare name on
# the last line renders the run summary as the cell output.
run = Run.get_context()
run

Experiment,Id,Type,Status,Details Page,Docs Page
rob-dask-demo,rob-dask-demo_1579670991_bc706a0e,azureml.scriptrun,Running,Link to Azure Machine Learning studio,Link to Documentation


### Connect to cluster

In [3]:
# The scheduler address is read from the "scheduler" metric logged on the run.
scheduler_address = f'{run.get_metrics()["scheduler"]}'
c = Client(scheduler_address)
c

0,1
Client  Scheduler: tcp://10.3.0.7:8786  Dashboard: http://10.3.0.7:8787/status,Cluster  Workers: 10  Cores: 160  Memory: 1.18 TB


In [4]:
# Look up the datastore registered as 'data4dask' on the workspace that owns
# this run's experiment.
dstore = run.experiment.workspace.datastores['data4dask']

In [5]:
# Credentials passed to fsspec / Dask readers and writers below.
STORAGE_OPTIONS = dict(
    account_name=dstore.account_name,
    account_key=dstore.account_key,
)

In [6]:
# fsspec protocol and storage container used to build data URLs of the form
# '<protocol>://<container>/<path>'.
protocol  = 'abfs'      # use 'adl' for Azure Data Lake Gen 1
container = 'datasets'  # only contains ISD, GFS is wip

In [7]:
# Filesystem handle scoped to `container`, authenticated with STORAGE_OPTIONS.
fs = fsspec.filesystem(protocol, container_name=container, **STORAGE_OPTIONS)

In [8]:
# List the entries directly under noaa/gfs in the container.
fs.ls('/noaa/gfs')

['noaa/gfs/GFSProcessed/', 'noaa/gfs/8c6ca145-43b7-4492-8222-30f0a138fe69']

In [9]:
# Collect the entries found under every year=*/month=* directory.
files = []
for month_dir in fs.glob('noaa/gfs/GFSProcessed/year=*/month=*'):
    files.extend(fs.ls(f'{month_dir}/'))
files[-5:]

['noaa/gfs/GFSProcessed/year=2019/month=9/day=5/',
 'noaa/gfs/GFSProcessed/year=2019/month=9/day=6/',
 'noaa/gfs/GFSProcessed/year=2019/month=9/day=7/',
 'noaa/gfs/GFSProcessed/year=2019/month=9/day=8/',
 'noaa/gfs/GFSProcessed/year=2019/month=9/day=9/']

In [None]:
df = dask.delayed(dd.read_parquet)(files, engine='pyarrow', storage_options=STORAGE_OPTIONS).compute()
%time df.head()

In [None]:
df = df.set_index(dd.to_datetime(df.datetime).dt.floor('d'), sorted=False)
df = df.persist() 
%time len(df)

In [None]:
# Time the row count a second time, now that `df` has been persisted.
%time len(df)

In [None]:
# Compute and time summary statistics over the whole frame.
%time df.describe().compute()

In [None]:
%time bites = df.memory_usage(index=True, deep=True).sum().compute()
print(f'Dataframe is: {round(bites/1e9, 2)}GB')

In [None]:
# Average every column per distinct index value and pull the result to the
# local process.
%time means = df.groupby(df.index).mean().compute()
means.head()

In [None]:
# One figure per averaged column, drawn through the explicit Axes interface.
for col in means.columns:
    fig, ax = plt.subplots(figsize=(16, 8))
    #plt.style.use('dark_background')
    means[col].plot(color='b', ax=ax)
    ax.set_title('Average of {}'.format(col))
    ax.set_xlim([datetime(2008, 1, 1), datetime(2018, 12, 31)])
    ax.grid()

    # optionally, log the image to the run
    run.log_image(f'mean_{col}', plot=plt)

You can see the images logged to the run in the studio.

In [None]:
# Display the run summary again (it carries the link to the studio page).
run

## Prepare data

In [None]:
## insert any Pandas-like Dask data prep code
# Rescale temperature with x * 9/5 + 32 (the Celsius-to-Fahrenheit formula).
# Re-running this cell applies the rescaling again.
df = df.assign(temperature=df['temperature'] * (9 / 5) + 32)

## Write data

**Important:** you will not have write access to this storage account; use your own storage account instead!

See the [API documentation for dask.dataframe.to_parquet](https://docs.dask.org/en/latest/dataframe-api.html#dask.dataframe.to_parquet) for additional settings. 

In [None]:
df = df.repartition(npartitions=df.npartitions*10)
%time dask.delayed(df.to_parquet)(f'abfs://outputs/noaa/isd_out.parquet', compression='lz4', storage_options=STORAGE_OPTIONS).compute()

![Write gif](media/write.gif)

## Train XGBoost model

In [None]:
# Re-read the source data so training starts from the unmodified frame.
# `files` holds container-relative paths at this point; without a protocol
# prefix dd.read_parquet treats them as local filesystem paths, so build
# '<protocol>://<container>/<path>' URLs (the same form the 2019 loading cell
# below uses). Paths that already carry a protocol are left untouched.
train_urls = [
    path if '://' in path else f'{protocol}://{container}/{path}'
    for path in files
]
df = dask.delayed(dd.read_parquet)(train_urls, engine='pyarrow', storage_options=STORAGE_OPTIONS).compute()

In [None]:
# Gather every entry under the ISD year=*/month=* directories, keep only the
# paths containing '2019', and turn them into fully-qualified URLs.
files = []
for month_dir in fs.glob('noaa/isd/year=*/month=*'):  # see https://github.com/dask/adlfs/issues/34
    files.extend(fs.ls(f'{month_dir}/'))
files = [f'{protocol}://{container}/{path}' for path in files if '2019' in path]

df2019 = dask.delayed(dd.read_parquet)(files, engine='pyarrow', storage_options=STORAGE_OPTIONS).compute()

In [None]:
# begin data prep
# Replace missing values with 0 in both the training and the 2019 frame.
df, df2019 = (frame.fillna(0) for frame in (df, df2019))

In [None]:
# Derive a numeric `month` column from the `datetime` column of each frame.
df = df.assign(month=df['datetime'].dt.month)
df2019 = df2019.assign(month=df2019['datetime'].dt.month)

In [None]:
# Keep every non-object column except the explicitly excluded ones.
excluded = ['version', 'datetime']
cols = [
    name
    for name, dtype in df.dtypes.items()
    if dtype != 'object' and name not in excluded
]
cols

In [None]:
# Features are every selected column except the target; the target is temperature.
feature_cols = [name for name in cols if name != 'temperature']
X = df[feature_cols].persist()
y = df['temperature'].persist()
# end data prep - persist intelligently per https://docs.dask.org/en/latest/best-practices.html

In [None]:
# Build a 16-estimator regressor and time the fit on the persisted X / y.
# NOTE(review): XGBRegressor is not imported anywhere in the visible notebook
# (the only import of it, in the first cell, is commented out), so this cell
# raises NameError on a fresh kernel -- confirm the intended import.
xgb = XGBRegressor(n_estimators=16)
%time xgb.fit(X, y)

In [None]:
# Predict on the training features and materialize the result locally.
%time y_pred = xgb.predict(X).compute()

In [None]:
# Root-mean-square error on the training set, evaluated in the local process.
residuals = y.to_dask_array().compute() - y_pred  # runs locally, distribute (?)
rmse = ((residuals**2).mean())**.5
print(f'Training RMSE: {round(rmse, 3)}')

## Evaluate model

In [None]:
# Same feature / target split as for training, applied to the 2019 frame.
feature_cols = [name for name in cols if name != 'temperature']
X_test = df2019[feature_cols].persist()
y_test = df2019['temperature'].persist()

In [None]:
# Predict on the 2019 features; this overwrites the training-set `y_pred`.
%time y_pred = xgb.predict(X_test).compute()

In [None]:
# Root-mean-square error on the 2019 data, evaluated in the local process.
residuals = y_test.to_dask_array().compute() - y_pred  # runs locally, distribute (?)
rmse = ((residuals**2).mean())**.5
print(f'Test RMSE: {round(rmse, 3)}')

## Register model

In [None]:
# Serialize the fitted model to a local file, then immediately load it back,
# rebinding `xgb` to the deserialized copy (a round-trip check of the file).
model_path = 'xgboost_noaa_isd.joblib.dat'
joblib.dump(xgb, model_path)
xgb = joblib.load(model_path)

In [None]:
# `ws` was never defined in this notebook; take the workspace from the current
# run, the same way the datastore lookup near the top does.
ws = run.experiment.workspace

# Register the serialized model file under the name 'xgboost-noaa-isd'.
model = Model.register(ws, model_path, 'xgboost-noaa-isd',
                       description='Dask XGBoost NOAA ISD temperature predictor',
                       model_framework='XGBoost')

## End the run

Closing the client and cancelling the run releases the compute; the cluster will then scale back down to 0 nodes.

In [None]:
# Disconnect the Dask client, then cancel the Azure ML run.
c.close()
run.cancel()

In [None]:
# Report the elapsed wall-clock time in minutes.
# NOTE(review): `t_start` is not assigned anywhere in the visible notebook, so
# this cell raises NameError on a fresh kernel -- a `t_start = time.time()`
# near the top appears to be missing; confirm.
t_end = time.time()
print(f'Total run time: {round((t_end-t_start)/60, 2)} minutes')