`sort_values()` seems to be faster in Modin even though it says it defaults to the Pandas implementation.

In [1]:
import utils
import pandas as pd
import dask.dataframe as dd

# Single source of truth for the input file, so both frameworks are guaranteed
# to be benchmarked against exactly the same data.
DATASET_PATH = '../datasets/yellow_tripdata_2015-01.csv'

# dd.read_csv() is lazy: it builds a task graph (after sampling the start of the
# file to infer dtypes) and does not parse the whole CSV at this point.
dask_df = dd.read_csv(DATASET_PATH)
# pd.read_csv() is eager: the whole file is parsed into memory here.
pandas_df = pd.read_csv(DATASET_PATH)
# Trigger some computation before timing anything, because read_csv() with Dask
# returns very quickly (it only defines the work; nothing is fully loaded yet).
# NOTE(review): by default head() computes only the first partition, so this is
# a sanity check of the data rather than a full warm-up of the Dask frame.
print(dask_df.head())

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2015-01-15 19:05:39   2015-01-15 19:23:42                1   
1         1  2015-01-10 20:33:38   2015-01-10 20:53:28                1   
2         1  2015-01-10 20:33:38   2015-01-10 20:43:41                1   
3         1  2015-01-10 20:33:39   2015-01-10 20:35:31                1   
4         1  2015-01-10 20:33:39   2015-01-10 20:52:58                1   

   trip_distance  pickup_longitude  pickup_latitude  RateCodeID  \
0           1.59        -73.993896        40.750111           1   
1           3.30        -74.001648        40.724243           1   
2           1.80        -73.963341        40.802788           1   
3           0.50        -74.009087        40.713818           1   
4           3.00        -73.971176        40.762428           1   

  store_and_fwd_flag  dropoff_longitude  dropoff_latitude  payment_type  \
0                  N         -73.974785         40.750618             1

In [2]:
%%time_cell
# The Dask DataFrame doesn't seem to have value_counts(), so it seems we need to do it for each individual Series.
# This is a bit unfair because this loop could be done in parallel, but on the other hand, if it is that simple,
# then why doesn't Dask do it?
# NOTE(review): every iteration issues its own .compute(), i.e. one independent Dask computation per column;
# presumably the CSV is re-read for each column - TODO confirm, and consider a single combined compute.
for col in dask_df.columns:
  print(dask_df[col].value_counts().compute())

2    6647797
1    6101189
Name: VendorID, dtype: int64
2015-01-05 09:39:49    94
2015-01-12 19:02:27    41
2015-01-12 18:55:17    39
2015-01-14 16:13:52    32
2015-01-14 13:20:13    29
                       ..
2015-01-11 21:33:54     1
2015-01-17 06:26:23     1
2015-01-17 06:26:22     1
2015-01-27 20:14:30     1
2015-01-28 04:38:42     1
Name: tpep_pickup_datetime, Length: 2438284, dtype: int64
2015-01-02 00:00:00    142
2015-01-04 00:00:00    133
2015-02-01 00:00:00    132
2015-01-03 00:00:00    129
2015-01-11 00:00:00    119
                      ... 
2015-01-16 02:08:54      1
2015-01-16 02:08:56      1
2015-01-16 02:09:07      1
2015-01-16 02:09:09      1
2016-02-02 16:30:52      1
Name: tpep_dropoff_datetime, Length: 2442163, dtype: int64
1    8993870
2    1814594
5     697645
3     528486
6     454568
4     253228
0       6565
9         11
8         10
7          9
Name: passenger_count, dtype: int64
0.90           304042
0.80           300724
1.00           298316
1.10         

In [3]:
# Snapshot the duration of the Dask cell under a stable name for the final comparison.
# _TIMED_CELL is defined outside this view - presumably set by the %%time_cell magic of the
# previously executed cell (TODO confirm in utils). This cell must therefore run right after
# the timed Dask cell, before another timed cell overwrites the value.
dask_time = _TIMED_CELL
# Reported with an "s" suffix, i.e. the value is assumed to be in seconds.
print(f"Dask time: {dask_time:.1f}s")

Dask time: 121.5s


In [4]:
%%time_cell
# Time the pandas counterpart of the Dask value_counts() cell.
# NOTE(review): DataFrame.value_counts() counts distinct *rows* (combinations of all columns), whereas the
# Dask cell counts the values of each column separately - the two timings are not measuring the same work.
print(pandas_df.value_counts())

VendorID  tpep_pickup_datetime  tpep_dropoff_datetime  passenger_count  trip_distance  pickup_longitude  pickup_latitude  RateCodeID  store_and_fwd_flag  dropoff_longitude  dropoff_latitude  payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  improvement_surcharge  total_amount
2         2015-01-05 09:39:49   2015-01-05 09:39:52    1                0.00            0.000000         0.000000         1           N                    0.000000          0.000000          2             2.5          0.0    0.5      0.00        0.0           0.3                    3.30            92
          2015-01-12 19:02:27   2015-01-12 19:04:07    1                0.00            0.000000         0.000000         1           N                    0.000000          0.000000          2             2.1          0.0    0.5      0.00        0.0           0.3                    2.90            33
          2015-01-12 18:55:17   2015-01-12 18:56:27    1                0.00            0.000000    

In [5]:
# Snapshot the duration of the pandas cell under a stable name for the final comparison.
# _TIMED_CELL is defined outside this view - presumably set by the %%time_cell magic of the
# previously executed cell (TODO confirm in utils), so this must run right after the timed pandas cell.
pandas_time = _TIMED_CELL
# Reported with an "s" suffix, i.e. the value is assumed to be in seconds.
print(f"Pandas time: {pandas_time:.1f}s")

Pandas time: 40.0s


In [6]:
# Ratio of the two measured durations: > 1 means Dask took longer than pandas,
# < 1 means Dask was quicker.
slowdown = dask_time / pandas_time
if slowdown >= 1:
    utils.print_md(f"### Dask is {slowdown:.1f}x slower.")
else:
    # A ratio below 1 would read as e.g. "0.5x slower", which is misleading;
    # report the inverse ratio as a speed-up instead. The zero guard only
    # protects against a degenerate 0-second measurement.
    speedup = pandas_time / dask_time if dask_time else float("inf")
    utils.print_md(f"### Dask is {speedup:.1f}x faster.")

### Dask is 3.0x slower.