# Exploratory Data Analysis (EDA) at Scale with Dask

In [None]:
!pip install --upgrade "dask-cloudprovider[azure]"

In [1]:
from azureml.core import Workspace

# Obtain the Azure ML workspace handle via Workspace.from_config(); `ws` is
# reused later for the cluster and for the default datastore.
ws = Workspace.from_config()
# Bare last expression: shows the workspace as the cell output.
ws

Workspace.create(name='AzureML', subscription_id='6560575d-fa06-4e7d-95fb-f962e74efd7a', resource_group='cody-eastus-rg')

In [2]:
import git
from pathlib import Path

# Find the git repository that contains the current directory (searching
# parent directories) and keep its working-tree root as a Path.
repo = git.Repo(".", search_parent_directories=True)
prefix = Path(repo.working_tree_dir)
prefix

PosixPath('/Users/cody/code/azureml-examples')

In [3]:
from azureml.core import Environment
from dask.distributed import Client
from dask_cloudprovider import AzureMLCluster

# Conda specification for the cluster environment, resolved relative to the
# repository root found earlier.
conda_spec = prefix / "environments" / "dask.yml"
env = Environment.from_conda_specification("dask-tutorial", conda_spec)

# Settings forwarded to AzureMLCluster, collected in one place for tuning.
cluster_options = {
    "vm_size": "STANDARD_DS13_V2",
    "environment_definition": env,
    "initial_node_count": 30,
    "scheduler_idle_timeout": 1200,
}
cluster = AzureMLCluster(ws, **cluster_options)

# Connect a Dask client to the cluster; the bare `c` displays its summary.
c = Client(cluster)
c

.................................................



0,1
Client  Scheduler: tcp://localhost:9002  Dashboard: http://localhost:9001,Cluster  Workers: 1  Cores: 8  Memory: 59.08 GB


In [4]:
# Container and account used for the weather data read in the cells that follow.
container_name = "isdweatherdatacontainer"

# Only an account name is supplied here (no key).
storage_options = dict(account_name="azureopendatastorage")

In [5]:
from adlfs import AzureBlobFileSystem

# Filesystem object built from the storage options defined above; it is used
# below to glob for the parquet files.
fs = AzureBlobFileSystem(**storage_options)
# Bare last expression: shows the filesystem object as the cell output.
fs

<adlfs.spec.AzureBlobFileSystem at 0x7fb57aba3d90>

In [6]:
%%time
# Match every parquet file under the year=*/month=* folders of the container,
# then prefix each match with the az:// scheme.
pattern = f"{container_name}/ISDWeather/year=*/month=*/*.parquet"
files = [f"az://{path}" for path in fs.glob(pattern)]
len(files)

CPU times: user 10.7 s, sys: 572 ms, total: 11.3 s
Wall time: 4min 8s


1232

In [7]:
# Show the last five entries to sanity-check the az:// paths built above.
files[-5:]

['az://isdweatherdatacontainer/ISDWeather/year=2020/month=9/part-00003-tid-1578165671371548424-dff7e310-9776-4fe4-a52f-9ef0381fafb9-2463-9.c000.snappy.parquet',
 'az://isdweatherdatacontainer/ISDWeather/year=2020/month=9/part-00004-tid-1578165671371548424-dff7e310-9776-4fe4-a52f-9ef0381fafb9-2469-9.c000.snappy.parquet',
 'az://isdweatherdatacontainer/ISDWeather/year=2020/month=9/part-00005-tid-1578165671371548424-dff7e310-9776-4fe4-a52f-9ef0381fafb9-2466-9.c000.snappy.parquet',
 'az://isdweatherdatacontainer/ISDWeather/year=2020/month=9/part-00006-tid-1578165671371548424-dff7e310-9776-4fe4-a52f-9ef0381fafb9-2465-9.c000.snappy.parquet',
 'az://isdweatherdatacontainer/ISDWeather/year=2020/month=9/part-00007-tid-1578165671371548424-dff7e310-9776-4fe4-a52f-9ef0381fafb9-2470-9.c000.snappy.parquet']

In [8]:
import dask.dataframe as dd

In [9]:
%%time
# Build a Dask dataframe over all listed parquet files using the pyarrow engine.
# NOTE(review): the recorded output reports npartitions=1232, the same as the
# number of files, so `blocksize` does not appear to have changed the
# partitioning here — confirm against the dask version in use.
ddf = dd.read_parquet(
    files, storage_options=storage_options, engine="pyarrow", blocksize="1GB"
)
# Bare last expression: shows the dataframe structure (columns and dtypes).
ddf

CPU times: user 3.19 s, sys: 220 ms, total: 3.41 s
Wall time: 55.7 s


Unnamed: 0_level_0,usaf,wban,datetime,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,cloudCoverage,presentWeatherIndicator,pastWeatherIndicator,precipTime,precipDepth,snowDepth,stationName,countryOrRegion,p_k,year,day,version
npartitions=1232,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
,object,object,datetime64[ns],float64,float64,float64,int32,float64,float64,float64,object,int32,int32,float64,float64,float64,object,object,object,int32,int32,float64
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [10]:
# Reduce the dataframe to 128 partitions and call persist() on the result.
# `ddf` is rebound, so every later cell works on the repartitioned collection.
# NOTE(review): persist() presumably keeps the computed partitions on the
# cluster workers — see the dask documentation.
ddf = ddf.repartition(npartitions=128).persist()

In [11]:
%%time
# Total row count of the dataframe; the cell is timed with %%time.
len(ddf)

CPU times: user 680 ms, sys: 128 ms, total: 807 ms
Wall time: 6min 38s


1506802504

In [12]:
%%time
# Same row count again, so the wall time can be compared with the previous run.
len(ddf)

CPU times: user 210 ms, sys: 42.1 ms, total: 252 ms
Wall time: 1min 50s


1506802504

In [13]:
%%time
# Summary statistics (count, mean, std, min, quartiles, max) of the dataframe;
# .compute() runs the work and returns the table shown as the cell output.
ddf.describe().compute()

CPU times: user 4.07 s, sys: 192 ms, total: 4.27 s
Wall time: 4min 39s


Unnamed: 0,latitude,longitude,elevation,windAngle,windSpeed,temperature,seaLvlPressure,presentWeatherIndicator,pastWeatherIndicator,precipTime,precipDepth,snowDepth,year,day,version
count,1506802000.0,1506803000.0,1506803000.0,1210044000.0,1256867000.0,1478103000.0,539482200.0,136875100.0,55707110.0,320423600.0,320423600.0,8952745.0,1506803000.0,1506803000.0,1506803000.0
mean,37.37169,-39.29356,396.7938,165.1766,3.433093,12.11181,1014.643,29.96899,4.229705,9.667262,1149.292,17.4847,2014.203,15.70473,1.0
std,21.67534,78.60413,629.9284,114.4986,2.86114,12.44856,9.340864,29.37689,3.017572,20.83517,3175.199,32.89746,3.637312,8.816625,0.0
min,-90.0,-179.999,-388.0,0.0,0.0,-91.3,860.0,0.0,0.0,0.0,0.0,0.0,2008.0,1.0,1.0
25%,34.047,-95.266,44.0,80.0,2.0,8.8,1011.5,10.0,2.0,1.0,0.0,1.0,2011.0,8.0,1.0
50%,41.616,-71.15,200.0,190.0,3.1,16.0,1016.9,50.0,5.0,1.0,0.0,14.0,2014.0,16.0,1.0
75%,49.133,17.45,548.0,280.0,5.7,27.4,1025.8,71.0,8.0,12.0,9999.0,62.0,2017.0,24.0,1.0
max,87.333,999.999,9999.0,360.0,90.0,61.7,1090.0,99.0,9.0,99.0,9999.0,999.0,2020.0,31.0,1.0


In [14]:
%%time
# Sum the per-column memory usage (index included, deep=True) and report it
# in units of 1e9 bytes, rounded to two decimals.
total_bytes = ddf.memory_usage(index=True, deep=True).sum().compute()
gbs = round(total_bytes / 1e9, 2)
print(f"ddf is {gbs} GBs")

ddf is 741.86 GBs
CPU times: user 325 ms, sys: 65.8 ms, total: 391 ms
Wall time: 2min 36s


## EDA

In [15]:
%%time
# Index the dataframe by the `datetime` column floored to the day. The data is
# declared unsorted (sorted=False) and the re-indexed result is persisted.
day_index = dd.to_datetime(ddf.datetime).dt.floor("d")
ddf = ddf.set_index(day_index, sorted=False).persist()

CPU times: user 777 ms, sys: 84.3 ms, total: 861 ms
Wall time: 2min 39s


In [None]:
%%time
# Row count of the re-indexed dataframe; the cell is timed with %%time.
len(ddf)

In [None]:
%%time
# Same row count again, so the wall time can be compared with the previous run.
len(ddf)

In [None]:
%%time
# Mean longitude, latitude and year for each index value, computed into a
# local result for plotting.
coord_cols = ["longitude", "latitude", "year"]
places = ddf.groupby(ddf.index)[coord_cols].mean().compute()

In [None]:
import matplotlib.pyplot as plt

# Scatter of the averaged coordinates, coloured by the averaged year, drawn
# through an explicit figure/axes pair.
fig, ax = plt.subplots(figsize=(16, 16))
points = ax.scatter(places.longitude, places.latitude, c=places.year)
ax.set_title("Lat/long")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.grid()
fig.colorbar(points, ax=ax)

In [None]:
# Same scatter as the full view, restricted to a longitude/latitude window.
fig, ax = plt.subplots(figsize=(16, 16))
points = ax.scatter(places.longitude, places.latitude, c=places.year)
ax.set_title("Lat/long")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.set_xlim([-50, -30])  # zoom in
ax.set_ylim([35, 40])  # zoom in
ax.grid()
fig.colorbar(points, ax=ax)

In [None]:
%%time
# Average every numeric column for each index value.
# The dataframe also holds object-typed columns (e.g. stationName,
# countryOrRegion) and a datetime column that are not meant to be averaged.
# Select the numeric columns explicitly rather than relying on the groupby to
# drop the others silently: recent pandas versions raise a TypeError instead.
numeric_ddf = ddf.select_dtypes(include="number")
means = numeric_ddf.groupby(numeric_ddf.index).mean().compute()
means.head()

In [None]:
from datetime import datetime

# One line chart per averaged column, each on its own figure, with the x-axis
# limited to 2008-01-01 through 2021-01-01.
for col in means.columns:
    fig, ax = plt.subplots(figsize=(16, 8))
    # plt.style.use('dark_background')
    means[col].plot(ax=ax, color="b")
    ax.set_title("Average of {}".format(col))
    ax.set_xlim([datetime(2008, 1, 1), datetime(2021, 1, 1)])
    ax.grid()

## Process and persist 

In [None]:
# Rescale the temperature column with x * 9/5 + 32.
# NOTE(review): this is the Celsius-to-Fahrenheit formula, so the column is
# presumably in degrees Celsius beforehand — confirm.
# The column is overwritten from its own values, so re-running this cell
# applies the conversion a second time.
ddf["temperature"] = ddf["temperature"] * (9 / 5) + 32

In [None]:
# Default datastore of the workspace; its container name, account name and
# account key are read in the following cell.
ds = ws.get_default_datastore()

In [None]:
# Point the storage settings at the workspace's default datastore. This
# replaces the earlier values of `container_name` and `storage_options`.
container_name = ds.container_name

# The account key is taken from the datastore object, not hardcoded here.
storage_options = dict(
    account_name=ds.account_name,
    account_key=ds.account_key,
)

In [None]:
%%time
# Export is disabled by default; uncomment to write ddf as CSV to the datastore container.
# ddf.to_csv(f"az://{container_name}/data/dask/isd", storage_options=storage_options)