In [1]:
import utils
import pandas as pd
import dask.dataframe as dd

dask_df = dd.read_csv('../datasets/yellow_tripdata_2015-01.csv')
pandas_df = pd.read_csv('../datasets/yellow_tripdata_2015-01.csv')
# Trigger some computatoin before timing anything because read_csv() with Dask is very fast (so, either it's actually
# fast or we need to trigger a computation)
print(dask_df.head())

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2015-01-15 19:05:39   2015-01-15 19:23:42                1   
1         1  2015-01-10 20:33:38   2015-01-10 20:53:28                1   
2         1  2015-01-10 20:33:38   2015-01-10 20:43:41                1   
3         1  2015-01-10 20:33:39   2015-01-10 20:35:31                1   
4         1  2015-01-10 20:33:39   2015-01-10 20:52:58                1   

   trip_distance  pickup_longitude  pickup_latitude  RateCodeID  \
0           1.59        -73.993896        40.750111           1   
1           3.30        -74.001648        40.724243           1   
2           1.80        -73.963341        40.802788           1   
3           0.50        -74.009087        40.713818           1   
4           3.00        -73.971176        40.762428           1   

  store_and_fwd_flag  dropoff_longitude  dropoff_latitude  payment_type  \
0                  N         -73.974785         40.750618             1

## Example 1 - Math Ops Series to Series

In [2]:
%%time_cell
x = dask_df['pickup_longitude'] + dask_df['pickup_latitude']
print(x.compute())

0        -33.243786
1        -33.277405
2        -33.160553
3        -33.295269
4        -33.208748
            ...    
411302   -33.165771
411303   -33.254559
411304   -33.229774
411305   -33.261082
411306   -33.193951
Length: 12748986, dtype: float64


In [3]:
dask_time = _TIMED_CELL
print(f"Dask time: {dask_time:.1f}s")

Dask time: 3.8s


In [4]:
%%time_cell
y = pandas_df['pickup_longitude'] + pandas_df['pickup_latitude']
print(y)

0          -33.243786
1          -33.277405
2          -33.160553
3          -33.295269
4          -33.208748
              ...    
12748981   -33.165771
12748982   -33.254559
12748983   -33.229774
12748984   -33.261082
12748985   -33.193951
Length: 12748986, dtype: float64


In [5]:
pandas_time = _TIMED_CELL
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [6]:
slowdown = dask_time / pandas_time
utils.print_md(f"### Dask is {slowdown:.1f}x slower.")

### Dask is 136.8x slower.

## Example 2 - Math Ops Series to Constant

In [7]:
%%time_cell
x = dask_df['pickup_longitude'] * 2
print(x.compute())

0        -147.987793
1        -148.003296
2        -147.926682
3        -148.018173
4        -147.942352
             ...    
411302   -147.903976
411303   -147.965485
411304   -147.958649
411305   -147.999130
411306   -147.920700
Name: pickup_longitude, Length: 12748986, dtype: float64


In [8]:
dask_time = _TIMED_CELL
print(f"Dask time: {dask_time:.1f}s")

Dask time: 3.8s


In [9]:
%%time_cell
y = pandas_df['pickup_longitude'] * 2
print(y)

0          -147.987793
1          -148.003296
2          -147.926682
3          -148.018173
4          -147.942352
               ...    
12748981   -147.903976
12748982   -147.965485
12748983   -147.958649
12748984   -147.999130
12748985   -147.920700
Name: pickup_longitude, Length: 12748986, dtype: float64


In [10]:
pandas_time = _TIMED_CELL
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [11]:
slowdown = dask_time / pandas_time
utils.print_md(f"### Dask is {slowdown:.1f}x slower.")

### Dask is 158.1x slower.

## Example 3 - Compare Series to Series

In [12]:
%%time_cell
x = dask_df['pickup_longitude'] < dask_df['pickup_latitude']
assert x.compute().any()

In [13]:
dask_time = _TIMED_CELL
print(f"Dask time: {dask_time:.1f}s")

Dask time: 3.8s


In [14]:
%%time_cell
y = pandas_df['pickup_longitude'] < pandas_df['pickup_latitude']
assert y.any()

In [15]:
pandas_time = _TIMED_CELL
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [16]:
slowdown = dask_time / pandas_time
utils.print_md(f"### Dask is {slowdown:.1f}x slower.")

### Dask is 181.4x slower.

## Example 4 - Compare Series to Constant

In [17]:
%%time_cell
x = dask_df['pickup_longitude'] < 2.3
assert x.compute().any()

In [18]:
dask_time = _TIMED_CELL
print(f"Dask time: {dask_time:.1f}s")

Dask time: 3.7s


In [19]:
%%time_cell
y = pandas_df['pickup_longitude'] < 2.3
print(y)

0           True
1           True
2           True
3           True
4           True
            ... 
12748981    True
12748982    True
12748983    True
12748984    True
12748985    True
Name: pickup_longitude, Length: 12748986, dtype: bool


In [20]:
pandas_time = _TIMED_CELL
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [21]:
slowdown = dask_time / pandas_time
utils.print_md(f"### Dask is {slowdown:.1f}x slower.")

### Dask is 295.8x slower.

## Example 5 - Unary Reductions 1

In [22]:
%%time_cell
x = dask_df['pickup_longitude'].std()
print(x.compute())

10.12510359297291


In [23]:
dask_time = _TIMED_CELL
print(f"Dask time: {dask_time:.1f}s")

Dask time: 3.7s


In [24]:
%%time_cell
y = pandas_df['pickup_longitude'].std()
print(y)

10.125103592972902


In [25]:
pandas_time = _TIMED_CELL
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.1s


In [26]:
slowdown = dask_time / pandas_time
utils.print_md(f"### Dask is {slowdown:.1f}x slower.")

### Dask is 33.5x slower.

## Example 6 - Unary Reductions 2

In [27]:
%%time_cell
x = dask_df["tpep_pickup_datetime"].unique()
# Tigger computation, Dask is lazy.
print(x.compute())

0          2015-01-15 19:05:39
1          2015-01-10 20:33:38
2          2015-01-10 20:33:39
3          2015-01-10 20:33:40
4          2015-01-10 20:33:41
                  ...         
2438279    2015-01-26 10:54:04
2438280    2015-01-02 13:33:23
2438281    2015-01-10 18:16:49
2438282    2015-01-26 06:45:05
2438283    2015-01-09 15:04:28
Name: tpep_pickup_datetime, Length: 2438284, dtype: object


In [28]:
dask_time = _TIMED_CELL
print(f"Dask time: {dask_time:.1f}s")

Dask time: 8.1s


In [29]:
%%time_cell
y = pandas_df["tpep_pickup_datetime"].unique()
print(y)

['2015-01-15 19:05:39' '2015-01-10 20:33:38' '2015-01-10 20:33:39' ...
 '2015-01-10 18:16:49' '2015-01-26 06:45:05' '2015-01-09 15:04:28']


In [30]:
pandas_time = _TIMED_CELL
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 1.7s


In [31]:
slowdown = dask_time / pandas_time
utils.print_md(f"### Dask is {slowdown:.1f}x slower.")

### Dask is 4.8x slower.