In [1]:
#Documentation: http://dask.pydata.org/en/latest/
#Installation: pip install dask
#Where to find data: https://www.quora.com/Where-can-I-find-large-datasets-open-to-the-public
# A real introduction to Dask: https://www.youtube.com/watch?v=5Md_sSsN51k
import time
import pandas as pd
import dask.dataframe as dd

In [2]:
# Baseline: read the accident CSV with pandas and total FATALS per STATE,
# timing the read and the aggregation together.
start = time.perf_counter()  # monotonic clock, unaffected by wall-clock adjustments
df = pd.read_csv("data/accident.csv")

# Name the aggregation as a string. Passing the builtin `sum` is deprecated in
# recent pandas (FutureWarning) because it is silently swapped for the groupby
# sum today and will be called as a plain Python callable in the future.
result = df.groupby(["STATE"]).agg({"FATALS": "sum"})
print("took", time.perf_counter() - start, "seconds")

took 0.34474897384643555 seconds


In [3]:
# Same per-STATE total of FATALS as the pandas cell, but through dask.dataframe.
# The aggregation only runs when .compute() is called, so the read and the
# computation are both inside the timed region.
start = time.perf_counter()  # monotonic clock, unaffected by wall-clock adjustments
dask_f = dd.read_csv("data/accident.csv")

# Use the string name of the reduction (the documented form for dask's agg)
# instead of the Python builtin `sum`.
result = dask_f.groupby(["STATE"]).agg({"FATALS": "sum"}).compute()
print("took", time.perf_counter() - start, "seconds")


took 0.36710095405578613 seconds


In [4]:
# Decode the numeric WEATHER / WEATHER1 / WEATHER2 codes of the pandas frame
# into readable labels and time the three mappings plus a frequency count.
start = time.perf_counter()  # monotonic clock, unaffected by wall-clock adjustments
weather = {0: 'No Additional Atmospheric Conditions', 1: 'Clear',
           2: 'Rain', 3: 'Sleet, Hail',
           4: 'Snow', 5: 'Fog, Smog, Smoke', 6: 'Severe Crosswinds',
           7: 'Blowing Sand, Soil, Dirt',
           8: 'Other', 10: 'Cloudy', 11: 'Blowing Snow',
           12: 'Freezing Rain or Drizzle',
           98: 'Not Reported', 99: 'Unknown'}

# Series.map with a dict is the idiomatic lookup. A code that is missing from
# `weather` (or a NaN) becomes NaN in the label column instead of aborting the
# whole cell with a KeyError, as the per-element lambda did.
for code_col in ("WEATHER", "WEATHER1", "WEATHER2"):
    df[code_col.lower()] = df[code_col].map(weather)

# Keep the counts: a bare expression in the middle of a cell is never displayed.
weather_counts = df["weather"].value_counts()
print("took", time.perf_counter() - start, "seconds")

# Last expression of the cell, so the notebook renders the label frequencies.
weather_counts

took 0.0431818962097168 seconds


In [5]:
# Decode the numeric WEATHER / WEATHER1 / WEATHER2 codes of the dask frame into
# readable labels and time the three mappings plus a computed frequency count.
start = time.perf_counter()  # monotonic clock, unaffected by wall-clock adjustments
weather = {0: 'No Additional Atmospheric Conditions', 1: 'Clear',
           2: 'Rain', 3: 'Sleet, Hail',
           4: 'Snow', 5: 'Fog, Smog, Smoke', 6: 'Severe Crosswinds',
           7: 'Blowing Sand, Soil, Dirt',
           8: 'Other', 10: 'Cloudy', 11: 'Blowing Snow',
           12: 'Freezing Rain or Drizzle',
           98: 'Not Reported', 99: 'Unknown'}

# Tell dask the name and dtype of each mapped column up front via `meta`, so it
# does not have to guess the output type. That guess is presumably what
# triggered the warning this cell used to hide with a process-wide
# warnings.filterwarnings('ignore'), which also silenced every unrelated
# warning for the rest of the kernel session.
# A code missing from `weather` maps to NaN rather than raising KeyError.
for code_col in ("WEATHER", "WEATHER1", "WEATHER2"):
    dask_f[code_col.lower()] = dask_f[code_col].map(weather, meta=(code_col, "object"))

# Keep the counts: a bare expression in the middle of a cell is never displayed.
weather_counts = dask_f["weather"].value_counts().compute()
print("took", time.perf_counter() - start, "seconds")

# Last expression of the cell, so the notebook renders the label frequencies.
weather_counts

took 0.4163849353790283 seconds


In [7]:
#https://distributed.readthedocs.io/en/latest/quickstart.html
#https://distributed.readthedocs.io/en/latest/client.html#dask
# Create a distributed Client with default arguments and keep a handle to it
# in `client`.
# NOTE(review): per the quickstart linked above, a no-argument Client presumably
# starts a local scheduler plus workers and becomes the default scheduler for
# later .compute() calls -- confirm against the installed version.
# NOTE(review): the saved execution counts show this cell (7) was run after the
# dask.array cell that follows it (6); re-run the notebook top to bottom to
# confirm it still behaves the same in file order.
# NOTE(review): nothing visible in this notebook closes the client.
from distributed import Client
client = Client()


In [6]:
import dask.array as da
from dask.dot import dot_graph
from dask.diagnostics import Profiler
# Build a 10000 x 1000 random dask array split into ten 1000 x 1000 row blocks,
# factor it with QR and multiply the factors back together. Everything up to
# here is lazy; the work happens in .compute() below.
a = da.random.random(size=(10000, 1000), chunks=(1000, 1000))
q, r = da.linalg.qr(a)
a2 = q.dot(r)

# dask.diagnostics.Profiler only observes the single-machine schedulers. When
# the notebook is run top to bottom a distributed Client has already been
# created and would otherwise handle this compute, leaving the profiler with
# nothing recorded -- so pin the computation to the local threaded scheduler.
with Profiler() as prof:
    out = a2.compute(scheduler="threads")

# Render once profiling has finished; the trailing semicolon keeps the notebook
# from also echoing the returned figure object.
prof.visualize();

