In [1]:
import utils
%include preamble.py

2022-12-25 01:59:59,381	INFO worker.py:1528 -- Started a local Ray instance.


In [2]:
# Load the benchmark CSV through the project helper, which hands back two frames.
# Presumably one is a Modin DataFrame and the other a pandas DataFrame over the same
# data (inferred from the names) — confirm in utils.get_two_dfs.
modin_df, pandas_df = utils.get_two_dfs("../../datasets/yellow_tripdata_2015-01.csv")

## Example 1 - Math Ops Series to Series

In [3]:
%%time_cell
# Modin side of Example 1: element-wise sum of two columns.
# NOTE(review): %%time_cell is a project-defined magic; presumably it times this whole
# cell (print included) and leaves the elapsed seconds in _TIMED_CELL — confirm.
x = modin_df['pickup_longitude'] + modin_df['pickup_latitude']
# Trigger computation. Modin is lazy.
print(x)

0          -33.243786
1          -33.277405
2          -33.160553
3          -33.295269
4          -33.208748
              ...    
12748981   -33.165771
12748982   -33.254559
12748983   -33.229774
12748984   -33.261082
12748985   -33.193951
Length: 12748986, dtype: float64


In [4]:
# Keep the Modin measurement under its own name; _TIMED_CELL is read again after the
# pandas cell, so it is presumably overwritten by every %%time_cell run.
modin_time = _TIMED_CELL
print(f"Modin time: {modin_time:.1f}s")

Modin time: 1.5s


In [5]:
%%time_cell
# pandas side of Example 1: the same column sum and the same print as the Modin cell,
# so both timed cells contain equivalent statements.
y = pandas_df['pickup_longitude'] + pandas_df['pickup_latitude']
print(y)

0          -33.243786
1          -33.277405
2          -33.160553
3          -33.295269
4          -33.208748
              ...    
12748981   -33.165771
12748982   -33.254559
12748983   -33.229774
12748984   -33.261082
12748985   -33.193951
Length: 12748986, dtype: float64


In [6]:
# Keep the pandas measurement taken by the preceding %%time_cell run.
pandas_time = _TIMED_CELL
# NOTE(review): .1f rounds any timing below 0.05s to "0.0s"; consider a finer format
# (e.g. .3f) so the ratio computed from this value can be checked by eye.
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [7]:
# Ratio of the Modin timing to the pandas timing; values above 1 mean Modin took longer.
slowdown = modin_time / pandas_time
# utils.print_md presumably renders the string as Markdown (a level-3 heading) — confirm.
utils.print_md(f"### Modin is {slowdown:.1f}x slower.")

### Modin is 55.1x slower.

## Example 2 - Math Ops Series to Constant

In [8]:
%%time_cell
# Modin side of Example 2: multiply one column by a scalar constant.
# NOTE(review): the print below sits inside the timed cell, so (assuming %%time_cell
# times the whole cell) its cost is part of the measurement — confirm.
x = modin_df['pickup_longitude'] * 2
# Trigger computation. Modin is lazy.
print(x)

0          -147.987793
1          -148.003296
2          -147.926682
3          -148.018173
4          -147.942352
               ...    
12748981   -147.903976
12748982   -147.965485
12748983   -147.958649
12748984   -147.999130
12748985   -147.920700
Name: pickup_longitude, Length: 12748986, dtype: float64


In [9]:
# Rebind modin_time to the Example 2 measurement (replaces the Example 1 value).
modin_time = _TIMED_CELL
print(f"Modin time: {modin_time:.1f}s")

Modin time: 0.3s


In [10]:
%%time_cell
# pandas side of Example 2: same scalar multiplication and print as the Modin cell.
y = pandas_df['pickup_longitude'] * 2
print(y)

0          -147.987793
1          -148.003296
2          -147.926682
3          -148.018173
4          -147.942352
               ...    
12748981   -147.903976
12748982   -147.965485
12748983   -147.958649
12748984   -147.999130
12748985   -147.920700
Name: pickup_longitude, Length: 12748986, dtype: float64


In [11]:
# Rebind pandas_time to the Example 2 measurement.
pandas_time = _TIMED_CELL
# NOTE(review): .1f prints "0.0s" for anything under 0.05s; a finer format such as .3f
# would keep the displayed value consistent with the slowdown ratio derived from it.
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [12]:
# Modin-over-pandas timing ratio for Example 2, using the two values bound just above.
slowdown = modin_time / pandas_time
utils.print_md(f"### Modin is {slowdown:.1f}x slower.")

### Modin is 16.5x slower.

## Example 3 - Compare Series to Series

In [13]:
%%time_cell
# Modin side of Example 3: element-wise comparison between two columns.
x = modin_df['pickup_longitude'] < modin_df['pickup_latitude']
# Trigger computation. Modin is lazy.
# .any() forces a result without printing the Series; the assert fails if no row is True.
assert x.any()

In [14]:
# Rebind modin_time to the Example 3 measurement.
modin_time = _TIMED_CELL
print(f"Modin time: {modin_time:.1f}s")

Modin time: 0.7s


In [15]:
%%time_cell
# pandas side of Example 3: same comparison and the same .any() check as the Modin cell.
y = pandas_df['pickup_longitude'] < pandas_df['pickup_latitude']
assert y.any()

In [16]:
# Rebind pandas_time to the Example 3 measurement.
pandas_time = _TIMED_CELL
# NOTE(review): with .1f, timings under 0.05s are shown as "0.0s"; consider .3f.
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [17]:
# Modin-over-pandas timing ratio for Example 3.
slowdown = modin_time / pandas_time
utils.print_md(f"### Modin is {slowdown:.1f}x slower.")

### Modin is 30.7x slower.

## Example 4 - Compare Series to Constant

In [18]:
%%time_cell
# Modin side of Example 4: compare one column against the scalar 2.3.
x = modin_df['pickup_longitude'] < 2.3
# Trigger computation. Modin is lazy.
# The .any() reduction forces a result; the assert fails if no row is True.
assert x.any()

In [19]:
# Rebind modin_time to the Example 4 measurement.
modin_time = _TIMED_CELL
print(f"Modin time: {modin_time:.1f}s")

Modin time: 0.3s


In [20]:
%%time_cell
y = pandas_df['pickup_longitude'] < 2.3
print(y)

0           True
1           True
2           True
3           True
4           True
            ... 
12748981    True
12748982    True
12748983    True
12748984    True
12748985    True
Name: pickup_longitude, Length: 12748986, dtype: bool


In [21]:
# Rebind pandas_time to the Example 4 measurement.
pandas_time = _TIMED_CELL
# NOTE(review): .1f collapses sub-0.05s timings to "0.0s"; a finer format (e.g. .3f)
# would make the printed value usable alongside the slowdown ratio.
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.0s


In [22]:
# Modin-over-pandas timing ratio for Example 4.
slowdown = modin_time / pandas_time
utils.print_md(f"### Modin is {slowdown:.1f}x slower.")

### Modin is 20.4x slower.

## Example 5 - Unary Reductions 1

In [23]:
%%time_cell
# Modin side of Example 5: reduce one column with .std().
x = modin_df['pickup_longitude'].std()
# Trigger computation. Modin is lazy.
print(x)

10.125103592972902


In [24]:
# Rebind modin_time to the Example 5 measurement.
modin_time = _TIMED_CELL
print(f"Modin time: {modin_time:.1f}s")

Modin time: 0.6s


In [25]:
%%time_cell
# pandas side of Example 5: same .std() reduction and print as the Modin cell.
y = pandas_df['pickup_longitude'].std()
print(y)

10.125103592972902


In [26]:
# Rebind pandas_time to the Example 5 measurement.
pandas_time = _TIMED_CELL
# NOTE(review): one decimal place is coarse for timings near 0.1s; the slowdown ratio
# below uses the unrounded value, so it need not match a ratio of the printed numbers.
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 0.1s


In [27]:
# Modin-over-pandas timing ratio for Example 5.
slowdown = modin_time / pandas_time
utils.print_md(f"### Modin is {slowdown:.1f}x slower.")

### Modin is 5.3x slower.

## Example 6 - Unary Reductions 2

In [28]:
%%time_cell
# Modin side of Example 6: collect the distinct values of one column.
x = modin_df["tpep_pickup_datetime"].unique()
# Trigger computation. Modin is lazy.
# Only the first ten values are printed to keep the output short.
print(x[:10])

['2015-01-15 19:05:39' '2015-01-10 20:33:38' '2015-01-10 20:33:39'
 '2015-01-10 20:33:40' '2015-01-10 20:33:41' '2015-01-15 19:05:40'
 '2015-01-15 19:05:41' '2015-01-15 19:05:42' '2015-01-15 19:05:43'
 '2015-01-15 19:05:44']


In [29]:
# Rebind modin_time to the Example 6 measurement.
modin_time = _TIMED_CELL
print(f"Modin time: {modin_time:.1f}s")

Modin time: 4.5s


In [30]:
%%time_cell
# pandas side of Example 6: same .unique() call and ten-value print as the Modin cell.
y = pandas_df["tpep_pickup_datetime"].unique()
print(y[:10])

['2015-01-15 19:05:39' '2015-01-10 20:33:38' '2015-01-10 20:33:39'
 '2015-01-10 20:33:40' '2015-01-10 20:33:41' '2015-01-15 19:05:40'
 '2015-01-15 19:05:41' '2015-01-15 19:05:42' '2015-01-15 19:05:43'
 '2015-01-15 19:05:44']


In [31]:
# Rebind pandas_time to the Example 6 measurement.
pandas_time = _TIMED_CELL
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 1.7s


In [32]:
# Modin-over-pandas timing ratio for Example 6.
slowdown = modin_time / pandas_time
utils.print_md(f"### Modin is {slowdown:.1f}x slower.")

### Modin is 2.6x slower.