# Exploratory Data Science (EDS) at scale with Dask

In [None]:
!pip install --upgrade "dask-cloudprovider[azure]" dask-lightgbm lightgbm lz4

In [None]:
from azureml.core import Workspace

# Build the Azure ML workspace handle from the saved workspace configuration.
# Later cells use `ws` to create the Dask cluster and to look up the default
# datastore.
ws = Workspace.from_config()
ws  # bare expression so the notebook displays the workspace

In [None]:
import git
from pathlib import Path

# Root directory of the enclosing git repository, found by searching parent
# directories starting from the current working directory. Repo-relative
# paths (e.g. the conda environment file) are resolved against `prefix`.
prefix = Path(git.Repo(".", search_parent_directories=True).working_tree_dir)
prefix

In [None]:
from azureml.core import Environment
from dask.distributed import Client
from dask_cloudprovider import AzureMLCluster

# Environment for the cluster nodes, built from the conda specification at
# <repo root>/environments/dask.yml.
env = Environment.from_conda_specification(
    name="dask-tutorial",
    file_path=prefix / "environments" / "dask.yml",
)

# Start the Dask cluster on Azure ML compute in workspace `ws`.
cluster = AzureMLCluster(
    ws,
    environment_definition=env,
    vm_size="STANDARD_DS5_V2",
    initial_node_count=60,
    scheduler_idle_timeout=1200,
)

# Connect a client to the cluster; the bare `c` displays the client summary.
c = Client(cluster)
c

In [None]:
# Connection options for the source storage account. Only the account name
# is supplied (no key or other credential).
storage_options = {
    "account_name": "azureopendatastorage",
}

# Blob container that is globbed for the ISD weather parquet files.
container_name = "isdweatherdatacontainer"

In [None]:
from adlfs import AzureBlobFileSystem

# Filesystem object over the storage account named in `storage_options`;
# used below to glob for the parquet files.
fs = AzureBlobFileSystem(**storage_options)
fs

In [None]:
%%time
# Glob every parquet file under the year=*/month=* folders of the container
# and prefix each match with the "az://" protocol.
files = [
    f"az://{blob_path}"
    for blob_path in fs.glob(
        f"{container_name}/ISDWeather/year=*/month=*/*.parquet"
    )
]
len(files)

In [None]:
# Show the last five paths as a quick sanity check of the listing.
files[-5:]

In [None]:
%%time
import dask.dataframe as dd

# Read settings: target partition count, parquet engine and block size.
npartitions = 256
engine = "pyarrow"
blocksize = "1GB"

# Read all listed parquet files into one Dask dataframe, then rebalance it
# into `npartitions` partitions.
# NOTE(review): whether `read_parquet` honours `blocksize` depends on the
# installed dask version and engine; confirm against the pinned environment.
ddf = dd.read_parquet(
    files, storage_options=storage_options, engine=engine, blocksize=blocksize
).repartition(npartitions=npartitions)
# Index the rows by calendar day (`datetime` floored to "d"). `sorted=False`
# means the data is not assumed to be pre-sorted on that key. `persist()`
# starts the computation on the cluster and keeps the result there, so the
# cells below reuse it instead of re-reading the files.
ddf = ddf.set_index(
    dd.to_datetime(ddf.datetime).dt.floor("d"), sorted=False
).persist()
ddf

In [None]:
%%time
# Total number of rows across all partitions.
len(ddf)

In [None]:
%%time
# Same row count again. NOTE(review): presumably repeated to compare the
# `%%time` result once the persisted data is already in memory; confirm.
len(ddf)

In [None]:
%%time
# Total memory footprint of the dataframe (index included, deep accounting),
# converted from bytes to decimal gigabytes (divide by 1e9), rounded to 2 places.
gbs = round(ddf.memory_usage(index=True, deep=True).sum().compute() / 1e9, 2)
print(f"ddf is {gbs} GBs")

## Exploratory data analysis (EDA)

In [None]:
%%time
# Summary statistics for the dataframe, computed on the cluster and returned
# to the notebook.
ddf.describe().compute()

In [None]:
%%time
# Mean longitude, latitude and year per index value, pulled back to the
# notebook for plotting.
# NOTE(review): the index was set to the day-floored `datetime`, so these are
# per-day averages over all rows, not per-station positions. Confirm that this
# is the intended grouping key for a variable named `places`.
places = (
    ddf.groupby(ddf.index)[["longitude", "latitude", "year"]].mean().compute()
)

In [None]:
import matplotlib.pyplot as plt

# Scatter of the averaged coordinates in `places`, coloured by averaged year.
plt.figure(figsize=(16, 16))
plt.scatter(x=places.longitude, y=places.latitude, c=places.year)

# Axis decorations.
plt.grid()
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Lat/long")

# Colour scale for the year values; kept last so it is the cell's output.
plt.colorbar()

In [None]:
# Same scatter as the previous figure, restricted to a smaller window.
plt.figure(figsize=(16, 16))
plt.scatter(x=places.longitude, y=places.latitude, c=places.year)

# Zoom in: longitude -50..-30, latitude 35..40.
plt.xlim((-50, -30))
plt.ylim((35, 40))

# Axis decorations.
plt.grid()
plt.xlabel("Longitude")
plt.ylabel("Latitude")
plt.title("Lat/long")

# Colour scale for the year values; kept last so it is the cell's output.
plt.colorbar()

In [None]:
%%time
# Mean of each column per index value (the index is the day-floored
# `datetime`), computed on the cluster and returned to the notebook.
means = ddf.groupby(ddf.index).mean().compute()
means.head()

In [None]:
from datetime import datetime

# Draw one wide figure per column of `means`, each showing that column's
# averaged values as a blue line.
for col in means.columns:
    plt.figure(figsize=(16, 8))
    means[col].plot(color="b")
    plt.title(f"Average of {col}")
    # Pin the x-axis to 2008-01-01 .. 2021-01-01 so every chart shares the
    # same time range and the charts are directly comparable.
    plt.xlim([datetime(2008, 1, 1), datetime(2021, 1, 1)])
    plt.grid()

## Process and persist 

In [None]:
# Rescale `temperature` with x * 9/5 + 32 (the Celsius -> Fahrenheit formula;
# assumes the source column is in Celsius — TODO confirm).
# This overwrites the column in place, so re-running the cell applies the
# conversion a second time.
ddf["temperature"] = ddf["temperature"] * (9 / 5) + 32

In [None]:
# Default datastore of the workspace; its container and credentials are read
# in the next cell.
ds = ws.get_default_datastore()

In [None]:
# Re-point `container_name` and `storage_options` at the workspace's default
# datastore. This overwrites the values that were used to read the source data.
container_name = ds.container_name

# The account key is a secret: avoid displaying `storage_options` in cell
# output that may be saved with the notebook.
storage_options = {
    "account_name": ds.account_name,
    "account_key": ds.account_key,
}

In [None]:
%%time
# Disabled by default: uncomment the next line to write the processed
# dataframe as CSV into the datastore container. The path must be an f-string
# so that `{container_name}` is interpolated rather than written literally.
# ddf.to_csv(f"az://{container_name}/data/dask/isd", storage_options=storage_options)

## Prepare data and train LightGBM model

In [None]:
# Replace missing values with 0 in every column of the frame.
# NOTE(review): this includes `temperature`, which is used as the training
# target below; confirm that 0-filling the target is intended.
ddf = ddf.fillna(0)

In [None]:
# Feature columns: every column whose dtype is not "object", except the
# target ("temperature") and the "version" / "datetime" columns.
cols = [
    name
    for name, dtype in ddf.dtypes.items()
    if dtype != "object" and name not in ("version", "datetime", "temperature")
]
cols

In [None]:
# Feature matrix and target, persisted separately so that training and
# prediction reuse the materialised data.
X = ddf[cols].persist()
y = ddf.temperature.persist()

In [None]:
%%time
from dask_lightgbm import LGBMRegressor

# LightGBM settings.
# The number of boosting rounds is set once, through `num_iterations`. Do not
# also pass `n_estimators`: it is an alias of `num_iterations`, and supplying
# both with different values makes the effective tree count ambiguous and
# triggers an alias warning. The value 31 is expressed as `num_leaves`
# (LightGBM's default leaf count), which is what that number corresponds to.
params = {
    "num_leaves": 31,
    "num_iterations": 100,
    "learning_rate": 0.01,
}

# Fit the distributed regressor on the persisted features and target.
lgbm = LGBMRegressor(**params)
lgbm.fit(X, y)

In [None]:
%%time
# Predict on the same data the model was trained on; the score computed
# below is therefore a training error, not a held-out one.
y_pred = lgbm.predict(X)

In [None]:
%%time
# Root-mean-squared error between target and prediction:
# sqrt(mean((y - y_pred) ** 2)). `y` is converted to a dask array before it
# is combined with `y_pred`.
rmse = ((((y.to_dask_array() - y_pred) ** 2).mean()) ** 0.5).compute()
print(f"Training RMSE: {round(rmse, 3)}")

## Close Cluster and Client 

In [None]:
# Best-effort teardown. The client is disconnected before the cluster is shut
# down, and each close is guarded on its own so a failure in one cannot skip
# the other. `except Exception` (rather than a bare `except`) lets
# KeyboardInterrupt and SystemExit propagate, and any error is reported
# instead of being silently discarded.
try:
    c.close()
except Exception as exc:
    print(f"Ignoring error while closing the Dask client: {exc!r}")

try:
    cluster.close()
except Exception as exc:
    print(f"Ignoring error while closing the AzureML cluster: {exc!r}")