# NYC Taxi benchmark
## The workflow consists of 4 queries against the NYC taxi dataset

### The goal is to measure the total execution time: [Workflow execution cell](#execution_cell)

### Datasets links:
### 1) Dataset for measuring time: `https://modin-datasets.s3.amazonaws.com/taxi/trips_xaa.csv.gz`
### 2) Smaller dataset for quick workflow testing: `https://modin-datasets.s3.amazonaws.com/trips_data.csv.gz`

In [1]:
import logging
from flytekit.loggers import logger



In [None]:
# Raise the flytekit logger threshold to WARNING (logging.WARN is only an
# alias for the same value).
logger.setLevel(level=logging.WARNING)
# Call the method so the cell displays the numeric effective level instead of
# the bound-method object.
logger.getEffectiveLevel()

In [2]:
import pandas as pd
import typing
from flytekit import Resources, task, workflow, ExecutionParameters, FlyteContextManager, FlyteContext
from flytekit.types.file import FlyteFile
import flytekit
from flytekit.core.context_manager import ExecutionState
from flytekit.core.data_persistence import FileAccessProvider

In [3]:
# Run-count constant; presumably the number of benchmark repetitions, although
# no visible cell reads it -- TODO confirm.
N_RUNS = 5

# Column names assigned to the CSV when it is loaded, in file order.
cols = [
    # trip details
    "trip_id",
    "vendor_id",
    "pickup_datetime",
    "dropoff_datetime",
    "store_and_fwd_flag",
    "rate_code_id",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
    "passenger_count",
    "trip_distance",
    # fares and payment
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "ehail_fee",
    "improvement_surcharge",
    "total_amount",
    "payment_type",
    "trip_type",
    "pickup",
    "dropoff",
    "cab_type",
    # weather
    "precipitation",
    "snow_depth",
    "snowfall",
    "max_temperature",
    "min_temperature",
    "average_wind_speed",
    # pickup location attributes
    "pickup_nyct2010_gid",
    "pickup_ctlabel",
    "pickup_borocode",
    "pickup_boroname",
    "pickup_ct2010",
    "pickup_boroct2010",
    "pickup_cdeligibil",
    "pickup_ntacode",
    "pickup_ntaname",
    "pickup_puma",
    # dropoff location attributes
    "dropoff_nyct2010_gid",
    "dropoff_ctlabel",
    "dropoff_borocode",
    "dropoff_boroname",
    "dropoff_ct2010",
    "dropoff_boroct2010",
    "dropoff_cdeligibil",
    "dropoff_ntacode",
    "dropoff_ntaname",
    "dropoff_puma",
]

# Columns that pandas should parse as datetimes while reading the CSV.
parse_dates = [
    "pickup_datetime",
    "dropoff_datetime",
]

In [5]:
@task
def get_taxi_dataset_task(
    data: FlyteFile[typing.TypeVar("csv")],
    compression: str,
    names: typing.List[str],
    parse_dates: typing.List[str]
) -> typing.NamedTuple("OutputsBC", get_taxi_output=pd.DataFrame):
    """Read the taxi CSV file into a pandas DataFrame.

    Args:
        data: CSV file to read.
        compression: Compression scheme forwarded to ``pandas.read_csv``.
        names: Column names to assign to the CSV columns.
        parse_dates: Names of the columns to parse as datetimes.

    Returns:
        The DataFrame produced by ``pandas.read_csv``, exposed as the
        ``get_taxi_output`` field of the task's named output.
    """
    # Use the ``names`` argument rather than the notebook-level ``cols`` global,
    # so the task honours its declared input instead of silently ignoring it.
    return pd.read_csv(data, compression=compression, names=names, parse_dates=parse_dates)

In [6]:
@task
def taxi_q1_task(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Query 1: count the non-null ``trip_id`` values for each ``cab_type``.

    Args:
        df: Taxi trips frame containing ``cab_type`` and ``trip_id`` columns.

    Returns:
        A single-column (``trip_id``) DataFrame indexed by ``cab_type``.
    """
    # Count every column per group, then keep only the trip_id counts.
    counts_by_cab = df.groupby(["cab_type"]).count()
    trip_counts = counts_by_cab["trip_id"]
    # Wrap the resulting Series so the task returns a DataFrame.
    return pd.DataFrame(trip_counts)

In [7]:
@task
def taxi_q2_task(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Query 2: mean ``total_amount`` for each ``passenger_count``.

    Args:
        df: Taxi trips frame containing ``passenger_count`` and
            ``total_amount`` columns.

    Returns:
        A DataFrame with the columns ``passenger_count`` and ``total_amount``,
        one row per distinct passenger count.
    """
    # numeric_only=True is required on pandas >= 2.0, where the default changed
    # and averaging the string/datetime columns raises TypeError. It restricts
    # the aggregation to numeric columns, as older pandas did implicitly.
    means = df.groupby("passenger_count", as_index=False).mean(numeric_only=True)
    return means[["passenger_count", "total_amount"]]

In [8]:
@task
def taxi_q3_task(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Query 3: number of rows per (``passenger_count``, ``pickup_datetime``).

    Args:
        df: Taxi trips frame containing ``passenger_count`` and
            ``pickup_datetime`` columns.

    Returns:
        A DataFrame with the two grouping columns plus the group size, whose
        column label is the string ``"0"``.
    """
    group_keys = ["passenger_count", "pickup_datetime"]
    group_sizes = df.groupby(group_keys).size()
    result = group_sizes.reset_index()
    # reset_index() labels the size column with the integer 0; cast all labels
    # to str (presumably so the frame can be serialized -- TODO confirm).
    result.columns = result.columns.astype(str)
    return result

In [9]:
@task
def taxi_q4_task(
    df: pd.DataFrame
) -> pd.DataFrame:
    """Query 4: trip counts per passenger count, pickup year and whole distance.

    Args:
        df: Taxi trips frame containing ``passenger_count``,
            ``pickup_datetime`` (datetime dtype) and ``trip_distance`` columns.

    Returns:
        A DataFrame of group sizes, sorted by pickup year ascending and then by
        group size descending; the size column is labelled ``"0"``.
    """
    # Project the three grouping columns: the pickup timestamp is reduced to
    # its year and the distance is cast to int64.
    projected = pd.DataFrame({
        "passenger_count": df["passenger_count"],
        "pickup_datetime": df["pickup_datetime"].dt.year,
        "trip_distance": df["trip_distance"].astype("int64"),
    })
    group_keys = ["passenger_count", "pickup_datetime", "trip_distance"]
    group_sizes = projected.groupby(group_keys).size().reset_index()
    # The size column is still labelled with the integer 0 at this point.
    ordered = group_sizes.sort_values(
        by=["pickup_datetime", 0],
        ascending=[True, False],
    )
    # Cast all column labels to str, turning the integer label 0 into "0".
    ordered.columns = ordered.columns.astype(str)
    return ordered

In [12]:
@workflow
def taxi_wf(
    dataset: FlyteFile["csv"] = "https://modin-datasets.s3.amazonaws.com/taxi/trips_xaa.csv.gz",
    compression: str = 'gzip'
) -> (
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame,
    pd.DataFrame
):
    """Load the taxi dataset and run the four benchmark queries on it.

    Args:
        dataset: CSV file to load; defaults to the ``trips_xaa.csv.gz`` dataset.
        compression: Compression scheme of ``dataset``.

    Returns:
        The results of queries 1 to 4, in that order.
    """
    # The loader task has a single named output; take it by position.
    loaded = get_taxi_dataset_task(
        data=dataset,
        compression=compression,
        names=cols,
        parse_dates=parse_dates,
    )
    trips = loaded[0]
    # Every query reads the same loaded frame.
    q1_result = taxi_q1_task(df=trips)
    q2_result = taxi_q2_task(df=trips)
    q3_result = taxi_q3_task(df=trips)
    q4_result = taxi_q4_task(df=trips)
    return q1_result, q2_result, q3_result, q4_result

## <a id='execution_cell'>Workflow execution</a>

In [13]:
%%time

taxi_wf()

CPU times: user 6min 9s, sys: 3min 15s, total: 9min 24s
Wall time: 7min 46s


DefaultNamedTupleOutput(o0=           trip_id
cab_type          
green     20000000, o1=   passenger_count  total_amount
0                0      9.841277
1                1     14.417759
2                2     15.336595
3                3     15.534873
4                4     15.496616
5                5     14.466787
6                6     15.234298
7                7     20.582484
8                8     16.502798
9                9     30.553729, o2=          passenger_count     pickup_datetime  0
0                       0 2013-08-14 12:07:00  1
1                       0 2013-08-14 12:37:00  1
2                       0 2013-08-15 00:00:00  1
3                       0 2013-09-17 13:30:00  1
4                       0 2013-09-25 17:32:46  1
...                   ...                 ... ..
16319462                9 2015-02-16 13:12:45  1
16319463                9 2015-02-20 02:41:14  1
16319464                9 2015-02-20 14:23:40  1
16319465                9 2015-02-23 12:28:49  1
163194