In [1]:
import os
import time
import modin.pandas as pd

def read(path='/data/taxi/trips_xaa.csv'):
    """Load a header-less taxi-trips CSV into a DataFrame.

    Parameters
    ----------
    path : str, optional
        Location of the CSV file. A leading ``~`` is expanded to the user's
        home directory. Defaults to the file this notebook was written for.

    Returns
    -------
    DataFrame
        One column per entry in the name list below (51 columns), with
        ``pickup_datetime`` and ``dropoff_datetime`` parsed as datetimes.
    """
    # The file carries no header row, so the column names are supplied here
    # in file order.
    columns_names = [
        "trip_id", "vendor_id", "pickup_datetime", "dropoff_datetime", "store_and_fwd_flag",
        "rate_code_id", "pickup_longitude", "pickup_latitude", "dropoff_longitude", "dropoff_latitude",
        "passenger_count", "trip_distance", "fare_amount", "extra", "mta_tax", "tip_amount",
        "tolls_amount", "ehail_fee", "improvement_surcharge", "total_amount", "payment_type",
        "trip_type", "pickup", "dropoff", "cab_type", "precipitation", "snow_depth", "snowfall",
        "max_temperature", "min_temperature", "average_wind_speed", "pickup_nyct2010_gid",
        "pickup_ctlabel", "pickup_borocode", "pickup_boroname", "pickup_ct2010",
        "pickup_boroct2010", "pickup_cdeligibil", "pickup_ntacode", "pickup_ntaname", "pickup_puma",
        "dropoff_nyct2010_gid", "dropoff_ctlabel", "dropoff_borocode", "dropoff_boroname",
        "dropoff_ct2010", "dropoff_boroct2010", "dropoff_cdeligibil", "dropoff_ntacode",
        "dropoff_ntaname", "dropoff_puma",
    ]
    parse_dates = ["pickup_datetime", "dropoff_datetime"]
    return pd.read_csv(
        os.path.expanduser(path),
        names=columns_names,
        header=None,
        parse_dates=parse_dates,
    )

def q1(df):
    """Query 1: number of trips for each ``cab_type``.

    Returns a Series indexed by ``cab_type`` holding the per-group count of
    the ``cab_type`` column.
    """
    by_cab = df.groupby("cab_type")
    cab_counts = by_cab["cab_type"].count()
    return cab_counts

def q2(df):
    """Query 2: mean ``total_amount`` for each ``passenger_count``.

    Returns a DataFrame with exactly two columns, ``passenger_count`` and
    ``total_amount``, one row per distinct passenger count.
    """
    # Narrow to the two columns that matter *before* grouping. Averaging the
    # whole frame and discarding the rest afterwards is wasted work, and it
    # fails outright on pandas versions where mean() no longer silently skips
    # non-numeric columns.
    fares = df[["passenger_count", "total_amount"]]
    return fares.groupby("passenger_count", as_index=False).mean()

def q3(df):
    """Query 3: trip counts per (pickup year, passenger_count).

    Returns a DataFrame with columns ``pickup_datetime`` (holding the pickup
    year), ``passenger_count`` and ``0`` (the group size).

    The input frame is not modified, so the function can be called
    repeatedly on the same ``df``.
    """
    # Work on a private two-column copy instead of overwriting the caller's
    # ``pickup_datetime`` column with integers.
    trips = df[["pickup_datetime", "passenger_count"]].copy()
    trips["pickup_datetime"] = trips["pickup_datetime"].dt.year
    return trips.groupby(["pickup_datetime", "passenger_count"]).size().reset_index()


def q4(df):
    """Query 4: trip counts per (passenger_count, pickup year, whole-number distance).

    Returns a DataFrame with columns ``passenger_count``, ``pickup_datetime``
    (holding the pickup year), ``trip_distance`` (cast to int64) and ``0``
    (the group size), ordered by year ascending and then by group size
    descending.

    The input frame is not modified, so the function can be called
    repeatedly on the same ``df``.
    """
    # Work on a private three-column copy instead of overwriting the caller's
    # ``pickup_datetime`` and ``trip_distance`` columns.
    trips = df[["passenger_count", "pickup_datetime", "trip_distance"]].copy()
    trips["pickup_datetime"] = trips["pickup_datetime"].dt.year
    trips["trip_distance"] = trips["trip_distance"].astype("int64")

    # size() yields an unnamed Series, so after reset_index() the count
    # column is labelled with the integer 0; that label is used for sorting.
    counts = (
        trips.groupby(["passenger_count", "pickup_datetime", "trip_distance"])
        .size()
        .reset_index()
    )
    return counts.sort_values(by=["pickup_datetime", 0], ascending=[True, False])

def measure(name, func, *args, **kw):
    """Run ``func(*args, **kw)``, print how long it took, and return its result.

    Parameters
    ----------
    name : str
        Label printed in front of the elapsed time.
    func : callable
        The function to time.
    *args, **kw
        Forwarded to ``func`` unchanged.

    Returns
    -------
    Whatever ``func`` returns. If ``func`` raises, the exception propagates
    and nothing is printed.
    """
    # perf_counter() is monotonic, so the interval cannot be skewed (or go
    # negative) when the system clock is adjusted mid-run.
    started = time.perf_counter()
    res = func(*args, **kw)
    elapsed = time.perf_counter() - started
    print(f'{name}: {elapsed} sec')
    return res


In [4]:
# Build and display a tiny one-column frame through the `pd` alias
# (presumably a quick check that the DataFrame backend is up — confirm).
pd.DataFrame({1: [2, 3]})



Unnamed: 0,1
0,2
1,3


In [5]:
# Load the trips CSV once; `measure` prints the elapsed time and hands back
# the frame, which the query cells below reuse.
df = measure('Reading', read)

Reading: 22.889394283294678 sec


In [6]:
# Time Q1: row count per cab_type.
measure('Q1', q1, df)

Q1: 2.5625734329223633 sec


6020391

In [7]:
# Time Q2: mean total_amount per passenger_count.
measure('Q2', q2, df)

Q2: 15.341039180755615 sec


Unnamed: 0,passenger_count,total_amount
0,0,7.778554
1,1,14.010305
2,2,14.759865
3,3,14.874173
4,4,14.961926
5,5,14.179034
6,6,15.187849
7,7,16.16792
8,8,19.421102
9,9,27.913659


In [8]:
# Time Q3 on a copy, so the shared `df` stays untouched for later cells
# whatever the query does to its argument.
measure('Q3', q3, df.copy())

Q3: 3.3864006996154785 sec


Unnamed: 0,pickup_datetime,passenger_count,0
0,2013,0,441
1,2013,1,958867
2,2013,2,93204
3,2013,3,27774
4,2013,4,11356
5,2013,5,104758
6,2013,6,14346
7,2013,7,30
8,2013,8,18
9,2013,9,17


In [9]:
# Time Q4 on a copy, so the shared `df` stays untouched whatever the query
# does to its argument.
measure('Q4', q4, df.copy())



Q4: 6.033255577087402 sec


Unnamed: 0,passenger_count,pickup_datetime,trip_distance,0
51,1,2013,1,274599
50,1,2013,0,210759
52,1,2013,2,152696
53,1,2013,3,96007
54,1,2013,4,62700
...,...,...,...,...
664,8,2014,10,1
665,8,2014,35,1
674,9,2014,5,1
676,9,2014,7,1
