# Introduction to Dask

In [None]:
!pip install --upgrade dask distributed bokeh adlfs fsspec fastparquet pyarrow python-snappy lz4

In [None]:
from azureml.core import Workspace

# Build the Azure ML Workspace handle via Workspace.from_config(); the bare
# `ws` on the last line makes the notebook render it as the cell output.
# NOTE(review): from_config() presumably reads a local config.json — confirm.
ws = Workspace.from_config()
ws

In [None]:
import git
from pathlib import Path

# resolve the repository root by searching upward from the current directory
prefix = Path(git.Repo(".", search_parent_directories=True).working_tree_dir)

# location of the iris CSV inside the repository
data_path = prefix / "data" / "raw" / "iris" / "iris.csv"

In [None]:
from dask.distributed import Client

# Create a dask.distributed Client with default arguments and show it as the
# cell output.
# NOTE(review): with no scheduler address this presumably starts a local
# cluster — confirm against the Dask distributed docs.
c = Client()
c

## Pandas and Dask on local data

In [None]:
import pandas as pd
import dask.dataframe as dd

In [None]:
%%time
# Load the CSV at data_path into a pandas DataFrame (timed by %%time).
df = pd.read_csv(data_path)

In [None]:
%%time
# Open the same file through dask.dataframe.
# NOTE(review): Dask builds the frame lazily, so this timing presumably does
# not include parsing the whole file — confirm.
ddf = dd.read_csv(data_path)

In [None]:
%%time
# Summary statistics computed by pandas on the in-memory frame.
df.describe()

In [None]:
%%time
# Same summary through Dask; .compute() runs the work and returns the
# concrete result that the notebook displays.
ddf.describe().compute()

## Read data from Azure

In [None]:
# Take the workspace's default datastore and upload the local iris CSV under
# datasets/iris, replacing any existing copy (overwrite=True).
ds = ws.get_default_datastore()
ds.upload_files([str(data_path)], target_path="datasets/iris", overwrite=True)

In [None]:
# container behind the default datastore
container_name = ds.container_name

# account credentials passed along as storage_options when reading from it
storage_options = dict(account_name=ds.account_name, account_key=ds.account_key)

In [None]:
%%time
# Read the uploaded CSV through an az:// URL built from container_name,
# passing the account credentials in storage_options. This rebinds ddf,
# replacing the frame that was read from the local file.
ddf = dd.read_csv(
    f"az://{container_name}/datasets/iris/iris.csv",
    storage_options=storage_options,
)

In [None]:
%%time
# Summary statistics again, now on the frame read from az:// storage.
ddf.describe().compute()

## Convert to Pandas

In [None]:
%%time
# Materialize the Dask frame and bind the result to df, replacing the
# DataFrame loaded earlier with pd.read_csv.
df = ddf.compute()

In [None]:
%%time
# Summary statistics on the converted frame.
df.describe()

## Bigger data

In [None]:
# container and storage account used for the larger weather dataset
container_name = "isdweatherdatacontainer"
storage_options = dict(account_name="azureopendatastorage")

In [None]:
from adlfs import AzureBlobFileSystem

# Build an adlfs filesystem object from storage_options and show it as the
# cell output.
# NOTE(review): no account key is supplied in the preceding cell, so this
# presumably relies on anonymous access — confirm.
fs = AzureBlobFileSystem(**storage_options)
fs

In [None]:
%%time
# List the entries under ISDWeather/year=2020 in the container.
fs.ls(f"{container_name}/ISDWeather/year=2020")

In [None]:
# Gather the parquet files under year=2020/month=2 and prefix each path with
# az:// in a single pass, then show the last five URLs.
files = [
    f"az://{blob_path}"
    for blob_path in fs.glob(f"{container_name}/ISDWeather/year=2020/month=2/*.parquet")
]
files[-5:]

In [None]:
%%time
# Open the listed parquet files as one Dask DataFrame and display it.
# `blocksize` is the current name for the target partition size in
# read_parquet; the older `chunksize` keyword is deprecated, and this notebook
# upgrades Dask to the latest release in its first cell.
ddf = dd.read_parquet(files, storage_options=storage_options, blocksize="20MB")
ddf

In [None]:
# Rebind ddf to the collection returned by persist().
# NOTE(review): presumably this keeps the computed partitions in cluster
# memory so later operations avoid re-reading the files — confirm.
ddf = ddf.persist()

In [None]:
%%time
# Row count of the frame.
len(ddf)

In [None]:
%%time
# Same row count repeated.
# NOTE(review): presumably here to compare timing with the previous cell — confirm.
len(ddf)

In [None]:
%%time
# Summary statistics over the persisted frame.
ddf.describe().compute()

In [None]:
%%time
# Sum the per-column memory usage (index included, deep=True), divide by 1e9
# to express it in GB, and round to two decimals before printing.
gbs = round(ddf.memory_usage(index=True, deep=True).sum().compute() / 1e9, 2)
print(f"ddf is {gbs} GBs")