In [2]:
import pyarrow
import datetime
from pyarrow import csv
import pyarrow.parquet as pq
import pandas as pd

## Load data

In [16]:
# Column labels handed to the CSV reader, in file order.
names = [
    "Duration",
    "Start Date",
    "End Date",
    "Start station number",
    "Start station",
    "End station number",
    "End station",
    "Bike number",
    "Member Type",
]

# Read the CSV through pyarrow, convert to pandas, then add day-level
# columns next to the full timestamps. The day columns are coarser than the
# timestamps, which lowers their cardinality.
df = (
    csv.read_csv(
        "./data/combined.csv",
        read_options=csv.ReadOptions(column_names=names),
    )
    .to_pandas()
    .assign(
        **{
            # Entries are evaluated in order, so each "... Day" entry sees
            # the already-converted "... Date" column.
            "Start Date": lambda frame: pd.to_datetime(frame["Start Date"]),
            "Start Day": lambda frame: frame["Start Date"].dt.date,
            "End Date": lambda frame: pd.to_datetime(frame["End Date"]),
            "End Day": lambda frame: frame["End Date"].dt.date,
        }
    )
)


In [4]:
# Preview the first two rows to check the parsed columns.
df.head(n=2)

Unnamed: 0,Duration,Start Date,End Date,Start station number,Start station,End station number,End station,Bike number,Member Type,Start Day,End Day
0,2389,2015-01-01 00:02:44,2015-01-01 00:42:33,31271,Constitution Ave & 2nd St NW/DOL,31254,15th & K St NW,W01140,Casual,2015-01-01,2015-01-01
1,2394,2015-01-01 00:02:46,2015-01-01 00:42:41,31271,Constitution Ave & 2nd St NW/DOL,31254,15th & K St NW,W00612,Casual,2015-01-01,2015-01-01


## Write Table

In [5]:
# Build the Arrow table that the Parquet writers operate on.
df_table = pyarrow.Table.from_pandas(df=df)

### Unpartitioned

In [11]:
# Write the whole table to a single snappy-compressed Parquet file.
pq.write_table(table=df_table, where="df.parquet", compression="snappy")

### Partitioned

In [7]:
# Write the table as a directory dataset partitioned first by "Start Day"
# and then by "End Day".
# NOTE(review): re-running this cell against an existing `df_partitioned`
# directory may add files rather than replace them - confirm against the
# installed pyarrow version.
pq.write_to_dataset(
    table=df_table,
    root_path="df_partitioned",
    partition_cols=["Start Day", "End Day"],
    use_legacy_dataset=False,
)

## Read Table

In [37]:
# Read the single-file Parquet output back into a pandas DataFrame.
single_file_dataset = pq.ParquetDataset("df.parquet", use_legacy_dataset=False)
df = single_file_dataset.read_pandas().to_pandas()

In [12]:
# Read the single-file Parquet output, keeping only rows whose "Start Day"
# equals 2015-01-01.
start_day_filter = [("Start Day", "=", datetime.date(2015, 1, 1))]
single_file_filtered = pq.ParquetDataset(
    "df.parquet",
    filters=start_day_filter,
    use_legacy_dataset=False,
)
df_filtered = single_file_filtered.read_pandas().to_pandas()

### Partitioned

In [47]:
# Read the full partitioned dataset directory back into a pandas DataFrame.
partitioned_dataset = pq.ParquetDataset("df_partitioned", use_legacy_dataset=False)
df = partitioned_dataset.read_pandas().to_pandas()

In [8]:
# Read the partitioned dataset, keeping only rows whose "Start Day" is
# "2015-01-01".
# NOTE(review): the filter value is a string here, whereas the single-file
# read uses a `datetime.date` - presumably because partition values come
# from directory names; confirm against pyarrow's partition type inference.
partition_day_filter = [("Start Day", "=", "2015-01-01")]
partitioned_filtered = pq.ParquetDataset(
    "df_partitioned",
    filters=partition_day_filter,
    use_legacy_dataset=False,
)
df_filtered = partitioned_filtered.read_pandas().to_pandas()