# "Big Pandas" - Dask from the Inside
## Part 5 - Dask graphs
### PyData Berlin tutorial, 30 June 2017
### Stephen Simmons


This part looks at what a Dask DataFrame actually is (a lazily evaluated dependency graph)
and how these graphs get executed.

In [None]:
# Complete set of Python 3.6 imports used for these examples

# Standard modules
import io
import logging
import lzma
import multiprocessing
import os
import ssl
import sys
import time
import urllib.request
import zipfile

# Third-party modules
import fastparquet      # Needs python-snappy and llvmlite
import graphviz         # To visualize Dask graphs 
import numpy as np
import pandas as pd
import psutil           # Memory stats
import dask
import dask.dataframe as dd
import bokeh.io         # For Dask profile graphs
import seaborn as sns   # For colormaps

# Display the value of every top-level expression in a cell, not only the last
# one. The cells below rely on this to show several attributes per cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

# Cap displayed tables at 20 rows / 20 columns and allow 300-character-wide
# output so that tables are not wrapped
pd.options.display.max_rows = 20
pd.options.display.max_columns = 20
pd.options.display.width = 300

# Show matplotlib and bokeh graphs inline in Jupyter notebook
%matplotlib inline
bokeh.io.output_notebook()

# Record the interpreter and library versions used for this run
print(sys.version)
np.__version__, pd.__version__, dask.__version__

# So what exactly is a Dask DataFrame?

In [None]:
# Tiny 5x3 pandas frame (columns a, b, c) used as the running example
# in the cells that follow.
df = pd.DataFrame(
    [
        [1, 2, 3],
        [4, 5, 6],
        [7, 8, 9],
        [10, 11, 12],
        [13, 14, 15],
    ],
    columns=['a', 'b', 'c'],
)
print(df)

In [None]:
# Wrap the pandas frame in a Dask DataFrame held in a single partition,
# then inspect the pieces that make up the Dask collection.
ddf = dd.from_pandas(df, npartitions=1)
print(ddf)
ddf.divisions       # index values at the partition boundaries
print(ddf._meta)    # empty pandas frame carrying the column names and dtypes
ddf._name           # name used in the graph keys of this collection
ddf.dask            # the underlying task graph
ddf.visualize()     # draw the task graph (needs graphviz)

In [None]:
# Same pandas frame, now split across two partitions; inspect how the
# divisions and the task graph change.
ddf = dd.from_pandas(df, npartitions=2)
print(ddf)
ddf.divisions       # index values at the partition boundaries
print(ddf._meta)    # empty pandas frame carrying the column names and dtypes
ddf._name           # name used in the graph keys of this collection
ddf.dask            # the underlying task graph
ddf.visualize()     # draw the task graph (needs graphviz)

In [None]:
# head() with compute=False stays lazy: instead of returning pandas rows it
# returns another Dask collection whose graph can be inspected.
# npartitions=2 lets head() draw its rows from the first two partitions.
ddf = dd.from_pandas(df, npartitions=2).head(n=2, npartitions=2, compute=False)
print(ddf)
ddf.divisions       # index values at the partition boundaries
print(ddf._meta)    # empty pandas frame carrying the column names and dtypes
ddf._name           # name used in the graph keys of this collection
ddf.dask            # the underlying task graph, now including the head() step
ddf.visualize()     # draw the task graph (needs graphviz)

In [None]:
# Graph keys that identify this collection's output partitions
ddf._keys()

In [None]:
# Execute the graph and materialize the result
ddf.compute()

In [None]:
# Chain a second lazy head() onto `ddf` (itself the lazy head() result built
# in the earlier cells) and inspect the graph that results.
ddf2 = ddf.head(1, compute=False)
ddf2.visualize()    # draw the task graph (needs graphviz)
ddf2._name          # name used in the graph keys of this collection
ddf2.dask           # the underlying task graph

In [None]:
# Graph keys that identify the output partitions of ddf2
ddf2._keys()

In [None]:
# IPython `??` introspection: show the source code of the _keys method
ddf2._keys??

In [None]:
# Partition by row count this time: chunksize=3 requests about 3 rows per
# partition instead of fixing the number of partitions.
ddf = dd.from_pandas(df, chunksize=3)
print(ddf)

In [None]:
print(ddf._meta)    # empty pandas frame carrying the column names and dtypes
ddf.npartitions     # how many partitions the chunksize above produced
ddf.divisions       # index values at the partition boundaries
ddf.visualize()     # draw the task graph (needs graphviz)

In [None]:
# The task graph behind the collection
ddf.dask

In [None]:
# Dump the task graph one entry at a time: the key on its own line,
# followed by the task stored under it, indented by two spaces.
for k, v in ddf.dask.items():
    print(f'{k!r}\n  {v!r}')
    


In [None]:
# Elementwise add followed by a sum. The expression is only built here
# (there is no .compute()), so the cell shows the lazy Dask result.
(ddf+1).sum()

In [None]:
# Lazy head() with the default npartitions: rows are taken from the first
# partition only. Inspect the resulting graph.
task = ddf.head(n=2, compute=False)
task.dask
task.visualize()

In [None]:
# npartitions=-1 makes head() draw on all partitions; compare this graph
# with the one above.
ddf.head(2, npartitions=-1, compute=False).visualize()

In [None]:
# Draw the task graph for the lazy "add 1, then sum" expression
(ddf + 1).sum().visualize()

In [None]:
# For comparison: head() on the plain pandas frame returns the rows immediately
df.head(n=2)

In [None]:
# Lazy head() that may take its rows from the first two partitions
task = ddf.head(n=2, npartitions=2, compute=False)

In [None]:
# Draw the task graph for the lazy head() (needs graphviz)
task.visualize()

In [None]:
# The task graph itself
task.dask

In [None]:
# Graph keys that identify the output partitions of the lazy head()
task._keys()

In [None]:
# Class docstring of the pandas DataFrame ...
print(pd.DataFrame.__doc__)

In [None]:
# ... and of the Dask DataFrame, for comparison
print(dd.DataFrame.__doc__)

In [None]:
# Show the documentation for dd.from_pandas (parameters such as npartitions
# and chunksize). Calling it with no arguments would only raise a TypeError.
print(dd.from_pandas.__doc__)

In [None]:
# Rebuild the Dask frame with about 2 rows per partition and define a lazy
# boolean-mask filter keeping the rows where column `a` is greater than 2.
ddf = dd.from_pandas(df, chunksize=2)
task = ddf[ddf.a>2]

In [None]:
# Execute the filter and materialize the result
task.compute()

In [None]:
# Draw the task graph of the filter (needs graphviz)
task.visualize()

In [None]:
# Class docstring of the Dask DataFrame, as a reminder of the attributes
# inspected in the next cells
print(dd.DataFrame.__doc__)

In [None]:
# Empty pandas frame carrying the column names and dtypes of the filtered result
task._meta

In [None]:
# Number of partitions and the index values at their boundaries
task.npartitions
task.divisions

In [None]:
# Name used in the graph keys of the filtered collection
task._name

In [None]:
# The full task graph behind the filter
task.dask

In [None]:
# Graph entry stored under the key (name, 0), i.e. the task for partition 0
task.dask[(task._name,0)]

In [None]:
# Graph entry stored under the key (name, 1), i.e. the task for partition 1
task.dask[(task._name,1)]

In [None]:
# IPython `??` introspection: show the source code of the compute method
task.compute??

In [None]:
# NOTE(review): `task2` is not assigned in any cell visible in this notebook,
# so this cell raises NameError as written. Confirm what it was meant to be
# (possibly a further operation built on `task`).
task2.compute()

In [None]:
# Draw the task graph of `task2` (NOTE(review): `task2` is undefined, see above)
task2.visualize()

In [None]:
# Graph entry stored under the key (name, 0) of `task2`
# (NOTE(review): `task2` is undefined, see above)
task2.dask[(task2._name,0)]