# Dask for beginners Cheat Sheets sample code

(c) 2020 NVIDIA, Blazing SQL

Distributed under Apache License 2.0

### Imports

In [None]:
import cudf
import numpy as np
import dask_cudf
from dask_cuda import LocalCUDACluster
from dask.distributed import Client
from dask.delayed import delayed
import dask.distributed

### Sample DataFrame

In [None]:
# Ten-row sample frame that the examples below turn into a dask_cudf
# collection. Columns mix integer, float, datetime, single-letter, category,
# word and free-text values; three rows deliberately carry a None
# (one each in 'datetime', 'word' and 'string').
df = cudf.DataFrame(
    [
          (39, 6.88, np.datetime64('2020-10-08T12:12:01'), 'C', 'D', 'data'
            , 'RAPIDS.ai is a suite of open-source libraries that allow you to run your end to end data science and analytics pipelines on GPUs.')
        , (11, 4.21, None,                                 'A', 'D', 'cuDF'
            , 'cuDF is a Python GPU DataFrame (built on the Apache Arrow columnar memory format)')
        , (31, 4.71, np.datetime64('2020-10-10T09:26:43'), 'U', 'D', 'memory'
            , 'cuDF allows for loading, joining, aggregating, filtering, and otherwise manipulating tabular data using a DataFrame style API.')
        , (40, 0.93, np.datetime64('2020-10-11T17:10:00'), 'P', 'B', 'tabular'
            , '''If your workflow is fast enough on a single GPU or your data comfortably fits in memory on 
                 a single GPU, you would want to use cuDF.''')
        , (33, 9.26, np.datetime64('2020-10-15T10:58:02'), 'O', 'D', 'parallel'
            , '''If you want to distribute your workflow across multiple GPUs or have more data than you can fit 
                 in memory on a single GPU you would want to use Dask-cuDF''')
        , (42, 4.21, np.datetime64('2020-10-01T10:02:23'), 'U', 'C', 'GPUs'
            , 'BlazingSQL provides a high-performance distributed SQL engine in Python')
        , (36, 3.01, np.datetime64('2020-09-30T14:36:26'), 'T', 'D', None
            , 'BlazingSQL is built on the RAPIDS GPU data science ecosystem')
        , (38, 6.44, np.datetime64('2020-10-10T08:34:36'), 'X', 'B', 'csv'
            , 'BlazingSQL lets you ETL raw data directly into GPU memory as a GPU DataFrame (GDF)')
        , (17, 5.28, np.datetime64('2020-10-09T08:34:40'), 'P', 'D', 'dataframes'
            , 'Dask is a flexible library for parallel computing in Python')
        , (10, 8.28, np.datetime64('2020-10-03T03:31:21'), 'W', 'B', 'python'
            , None)
    ]
    , columns = ['number', 'float_number', 'datetime', 'letter', 'category', 'word', 'string']
)

## Cluster and client setup

In [None]:
# Start a local cluster with one single-threaded worker restricted to GPU 0.
# The worker is configured with RMM managed memory and a 20GB RMM pool.
cluster = LocalCUDACluster(
    n_workers=1,
    threads_per_worker=1,
    CUDA_VISIBLE_DEVICES="0",
    rmm_managed_memory=True,
    rmm_pool_size="20GB",
)

# Attach a client to the cluster; the bare name on the last line leaves the
# client object as the cell's value.
client = Client(cluster)
client

# DataFrame

#### dask_cudf.from_cudf

In [None]:
# Turn the cuDF frame into a dask_cudf collection, asking for two partitions.
ddf = dask_cudf.from_cudf(df, npartitions=2)
# Peek at the first rows of the collection.
ddf.head()

In [None]:
# Same conversion, but sized by rows per partition (chunksize=2) instead of
# by partition count. This rebinds `ddf` for the cells that follow.
ddf = dask_cudf.from_cudf(df, chunksize=2)
# Show how many partitions that request produced.
ddf.npartitions

#### dask_cudf.DataFrame.map_partitions

In [None]:
def process_frame(df):
    """Return a copy of ``df`` with an extra ``num_inc`` column.

    ``num_inc`` holds ``df['number'] + 10``. The partition handed in is left
    untouched: a function given to ``map_partitions`` should not mutate its
    input, because the same partition object may be reused by other tasks.

    Args:
        df: A DataFrame partition that has a ``number`` column.

    Returns:
        A new DataFrame with all original columns plus ``num_inc``.
    """
    # assign() builds a new frame instead of writing into the caller's one.
    return df.assign(num_inc=df['number'] + 10)
    
# Apply process_frame to every partition and materialise the combined result.
ddf.map_partitions(process_frame).compute()

In [None]:
def multiply(a, b, mult):
    """Write the element-wise product of ``a`` and ``b`` into ``mult``.

    Args:
        a: First input sequence.
        b: Second input sequence, walked in lockstep with ``a``.
        mult: Output buffer; position ``i`` receives ``a[i] * b[i]``.
    """
    for row, (left, right) in enumerate(zip(a, b)):
        mult[row] = left * right

def process_frame_mul(df):
    """Return the row-wise product of ``number`` and ``float_number``.

    Runs the ``multiply`` kernel through ``apply_rows``, feeding ``number`` as
    argument ``a`` and ``float_number`` as argument ``b``, and collecting the
    output in a float64 column named ``mult``.

    Args:
        df: A frame exposing ``apply_rows`` with ``number`` and
            ``float_number`` columns.

    Returns:
        The ``mult`` column of the frame produced by ``apply_rows``.
    """
    with_product = df.apply_rows(
        multiply,
        incols={'number': 'a', 'float_number': 'b'},
        outcols={'mult': np.float64},
        kwargs={},
    )
    return with_product['mult']

# Run the multiplication helper on each partition and preview the first rows.
ddf.map_partitions(process_frame_mul).head()

In [None]:
def divide(a, div, b):
    """Write each element of ``a`` divided by the scalar ``b`` into ``div``.

    Args:
        a: Input sequence of numerators.
        div: Output buffer; position ``i`` receives ``a[i] / b``.
        b: Divisor applied to every element.
    """
    for row, numerator in enumerate(a):
        div[row] = numerator / b

def process_frame_div(df, col_a, val_divide):
    """Return column ``col_a`` of ``df`` divided by ``val_divide``.

    Runs the ``divide`` kernel through ``apply_rows``: ``col_a`` is bound to
    the kernel's ``a`` argument, ``val_divide`` is passed as its ``b``
    keyword, and the output lands in a float64 column named ``div``.

    Args:
        df: A frame exposing ``apply_rows``.
        col_a: Name of the column to divide.
        val_divide: Divisor passed to the kernel.

    Returns:
        The ``div`` column of the frame produced by ``apply_rows``.
    """
    input_mapping = {col_a: 'a'}
    output_spec = {'div': np.float64}
    kernel_kwargs = {'b': val_divide}

    divided = df.apply_rows(
        divide,
        incols=input_mapping,
        outcols=output_spec,
        kwargs=kernel_kwargs,
    )
    return divided['div']

# Arguments listed after the function are forwarded by map_partitions to
# process_frame_div, so each call picks its own column and divisor.
ddf['div_number'] = ddf.map_partitions(process_frame_div, 'number', 10.0)
ddf['div_float']  = ddf.map_partitions(process_frame_div, 'float_number', 5.0)

# Preview the collection with the two new columns attached.
ddf.head()

In [None]:
# Same two columns as the previous cell, this time binding the extra
# arguments inside a lambda instead of forwarding them through map_partitions.
ddf['div_number'] = ddf.map_partitions(lambda df: process_frame_div(df, 'number', 10.0))
ddf['div_float']  = ddf.map_partitions(lambda df: process_frame_div(df, 'float_number', 5.0))

#### dask_cudf.DataFrame.compute

In [None]:
# Evaluate the collection by name only; compute() is not called in this cell.
ddf

In [None]:
# Execute the task graph and return the concrete result to the client.
ddf.compute()

#### client.compute

In [None]:
# Hand the collection to the client for computation and keep the returned
# handle; the result is fetched in the next cell via .result().
computation = client.compute(ddf)

In [None]:
# Retrieve the computed frame from the handle and preview it.
computation.result().head()

In [None]:
# Same submission with graph optimisation requested and a worker restriction.
# NOTE(review): workers='0' is presumably meant to select the single worker
# started above — confirm it matches that worker's name or address.
computation = client.compute(ddf, optimize_graph=True, workers='0')
computation.result().head()

#### dask_cudf.DataFrame.persist

In [None]:
# persist() returns a new collection. The return value is not assigned here,
# so the name `ddf` keeps pointing at the original collection.
ddf.persist()

# Delayed

#### dask.delayed.delayed

In [None]:
import cupy as cp

def delayed_task(n):
    """Build an ``n``-row frame of random values plus a scaled copy.

    Args:
        n: Number of rows to generate.

    Returns:
        A cudf.DataFrame with a ``random`` column drawn from
        ``cp.random.rand(n)`` and a ``rand_scaled`` column equal to three
        times ``random``.
    """
    samples = cp.random.rand(n)
    frame = cudf.DataFrame({'random': samples})
    frame['rand_scaled'] = frame['random'] * 3
    return frame

# Wrap the plain function with delayed() at call time to create two lazy
# tasks of ten rows each, then submit both to the cluster.
tasks = [delayed(delayed_task)(10) for _ in range(2)]
computation = client.compute(tasks, optimize_graph=True)
# Leave the returned handles as the cell's value.
computation

In [None]:
import cupy as cp

@delayed
def delayed_task(n):
    """Build an ``n``-row frame of random values plus a scaled copy.

    The ``@delayed`` decorator wraps the function, so the submission code
    calls ``delayed_task(10)`` directly without wrapping it again.

    Args:
        n: Number of rows to generate.

    Returns:
        A cudf.DataFrame with a ``random`` column drawn from
        ``cp.random.rand(n)`` and a ``rand_scaled`` column equal to three
        times ``random``.
    """
    samples = cp.random.rand(n)
    frame = cudf.DataFrame({'random': samples})
    frame['rand_scaled'] = frame['random'] * 3
    return frame

# The function is already decorated with @delayed, so calling it directly
# creates the two lazy tasks; submit both to the cluster.
tasks = [delayed_task(10) for _ in range(2)]
computation = client.compute(tasks, optimize_graph=True)
# Leave the returned handles as the cell's value.
computation

In [None]:
# Fetch the frame behind every handle, stack them into one cuDF frame and
# preview the first rows.
cudf.concat([future.result() for future in computation]).head()

#### dask_cudf.DataFrame.to_delayed

In [None]:
def process_frame_delayed(df):
    """Return the ``number`` column of ``df`` with ten added to every value.

    Args:
        df: Any object supporting ``df['number']`` whose result supports
            ``+ 10`` (a frame partition or a delayed wrapper around one).

    Returns:
        The result of ``df['number'] + 10``.
    """
    shifted = df['number'] + 10
    return shifted
    
# Split the collection into one delayed object per partition, apply the
# helper to each, and reassemble the pieces into a dask_cudf collection.
ddf_delayed_add = dask_cudf.from_delayed(
    [process_frame_delayed(partition) for partition in ddf.to_delayed()]
)

In [None]:
# Execute the reassembled collection and return the concrete result.
ddf_delayed_add.compute()

#### dask_cudf.from_delayed

In [None]:
def process_frame_delayed(df, divide):
    """Return ``(df['number'] + 10) / divide``.

    Args:
        df: Any object supporting ``df['number']`` whose result supports
            ``+ 10`` and division (a frame partition or a delayed wrapper).
        divide: Divisor applied after the addition.

    Returns:
        The shifted ``number`` values divided by ``divide``.
    """
    return (df['number'] + 10) / divide
    
# Apply the two-argument helper (divisor 10.0) to each delayed partition and
# rebuild a dask_cudf collection from the results.
ddf_delayed_div = dask_cudf.from_delayed(
    [process_frame_delayed(partition, 10.0) for partition in ddf.to_delayed()]
)

# Preview the first rows of the rebuilt collection.
ddf_delayed_div.head()

# Futures

#### client.persist

In [None]:
# Ask the client to persist the collection on the cluster. The returned
# collection is not assigned to a name in this cell.
client.persist(ddf)

#### client.submit

In [None]:
def first_computation(df):
    """Return the ``number`` column of ``df`` with ten added to every value.

    Args:
        df: Any object supporting ``df['number']`` whose result supports
            ``+ 10``.

    Returns:
        The result of ``df['number'] + 10``.
    """
    shifted = df['number'] + 10
    return shifted

def second_computation(result):
    """Return ``result`` divided by 10.0.

    Args:
        result: Any value supporting true division by a float.

    Returns:
        ``result / 10.0``.
    """
    scaled = result / 10.0
    return scaled

# Submit first_computation with the dask_cudf collection as its argument,
# then chain second_computation onto the first submission's future.
computation_1 = client.submit(first_computation, ddf)
computation_2 = client.submit(second_computation, computation_1)

In [None]:
# Evaluate the future by name only; its result is not requested here.
computation_1

In [None]:
# result() fetches what the chained task returned; compute() is then called
# on that returned object.
computation_2.result().compute()

#### dask.distributed.wait

In [None]:
# Submit the delayed tasks, then block until all of them have finished.
computation = client.compute(tasks, optimize_graph=True)
dask.distributed.wait(computation)

# This object only gets created once all computations are finished.
results = dask_cudf.from_delayed(computation)
results.head()

#### dask.distributed.as_completed

In [None]:
# Submit the delayed tasks again for the as_completed example.
computation = client.compute(tasks, optimize_graph=True)

# Iterate over the futures via as_completed and print each one's result.
for part in dask.distributed.as_completed(computation):
    print(part.result())

#### Future.result

In [None]:
def first_computation(df):
    """Return the ``number`` column of ``df`` with ten added to every value.

    Args:
        df: Any object supporting ``df['number']`` whose result supports
            ``+ 10``.

    Returns:
        The result of ``df['number'] + 10``.
    """
    number_plus_ten = df['number'] + 10
    return number_plus_ten

# Submit first_computation with the collection as its argument and keep the
# future for the Future.* examples that follow.
computation_1 = client.submit(first_computation, ddf)

In [None]:
# result() fetches what the task returned; compute() is then called on it.
computation_1.result().compute()

#### Future.done

In [None]:
# Print the value of done() for the future.
print(computation_1.done())

#### client.gather

In [None]:
# Gather the future's result through the client, then call compute() on it.
client.gather(computation_1).compute()

#### client.scatter

In [None]:
# Gather and compute the result on the client side first ...
data = client.gather(computation_1).compute()
# ... then scatter that object back to the cluster, keeping the return value.
distributed = client.scatter(data)

#### client.cancel

In [None]:
# Cancel the future.
computation_1.cancel()