# Exploratory Data Analysis (EDA) at Scale with Dask

In [None]:
!pip install --upgrade "dask-cloudprovider[azure]"

In [None]:
from azureml.core import Workspace

# Build the Azure ML workspace handle from configuration; `ws` is later passed
# to AzureMLCluster when the Dask cluster is created.
ws = Workspace.from_config()
ws  # last expression of the cell: displayed as the cell output

In [None]:
from pathlib import Path

import git

# Find the git repository that contains the current directory (walking up
# through parent directories) and keep its working-tree root as `prefix`, the
# base for repo-relative paths used later in the notebook.
repo = git.Repo(".", search_parent_directories=True)
prefix = Path(repo.working_tree_dir)

In [None]:
from azureml.core import Environment
from dask.distributed import Client
# NOTE(review): newer dask-cloudprovider releases appear to expose cluster
# managers from provider-specific subpackages — confirm this top-level import
# still resolves for the version installed above.
from dask_cloudprovider import AzureMLCluster

# Conda specification for the worker environment, located relative to the
# repository root.
conda_spec = prefix / "environments" / "dask.yml"
env = Environment.from_conda_specification("dask-tutorial", conda_spec)

# Start the cluster in the workspace with the VM size, node count and
# scheduler idle timeout given here.
cluster = AzureMLCluster(
    ws,
    environment_definition=env,
    vm_size="STANDARD_DS13_V2",
    initial_node_count=20,
    scheduler_idle_timeout=1200,
)

# Attach a distributed client to the cluster and display it as the cell output.
c = Client(cluster)
c

In [None]:
# Blob container holding the dataset; used as the first component of the glob
# pattern when listing the parquet files.
container_name = "isdweatherdatacontainer"

# Options forwarded both to the adlfs filesystem and to dask's read_parquet.
storage_options = dict(account_name="azureopendatastorage")

In [None]:
from adlfs import AzureBlobFileSystem

# Filesystem handle built from `storage_options`; used afterwards to glob the
# parquet file paths inside the container.
fs = AzureBlobFileSystem(**storage_options)
fs  # last expression of the cell: displayed as the cell output

In [None]:
%%time
files = fs.glob(f"{container_name}/ISDWeather/year=*/month=*/*.parquet")
files = [f"az://{file}" for file in files]
len(files)

In [None]:
# Peek at the last five entries of the file list.
files[-5:]

In [None]:
import dask.dataframe as dd

In [None]:
%%time
# Build one Dask DataFrame over all listed parquet files, reusing the same
# storage options as the filesystem and the pyarrow engine; %%time reports how
# long the call takes.
ddf = dd.read_parquet(files, storage_options=storage_options, engine="pyarrow")
ddf  # last expression of the cell: displayed as the cell output

In [None]:
# Rebind `ddf` to its persisted version so the following cells operate on the
# persisted collection.
ddf = ddf.persist()

In [None]:
%%time
# Total number of rows in the dataframe; %%time reports how long the count takes.
len(ddf)

In [None]:
%%time
# Row count again; presumably repeated on purpose to compare timings between
# consecutive runs on the persisted dataframe — confirm intent.
len(ddf)

In [None]:
%%time
# Summary statistics from describe(), materialized with compute() and shown as
# the cell output.
ddf.describe().compute()

In [None]:
%%time
gbs = ddf.memory_usage(index=True, deep=True).sum().compute() // 1e9
print(f"ddf is {gbs} GBs")