In [88]:
import shutil
from datetime import datetime, timedelta

import numpy as np
import pandas as pd

from large_parquet_loader import ParquetReader

# Create Dummy Data
Create a large DataFrame and save it as a partitioned Parquet dataset.

In [89]:
# Number of distinct values to generate for each partition column.
partitions_size = {
    "part": 3,
    "date": 5,
    "store": 20,
}
# Random value columns per row, and rows generated per partition combination.
n_col, n_row_per_partition = 10, 10_000

In [90]:
# Seeded generator so the dummy data is identical on every "Restart & Run All".
rng = np.random.default_rng(42)
current_date = datetime.now().date()
base_columns = [f"col_{i}" for i in range(n_col)]
df_lst = []

# One block of uniform random rows per (part, date, store) combination; the
# partition keys are attached as constant columns on each block.
for i in range(partitions_size["part"]):
    for j in range(partitions_size["date"]):
        for k in range(partitions_size["store"]):
            current = pd.DataFrame(rng.random(size=(n_row_per_partition, n_col)), columns=base_columns)
            current["part"] = str(i)
            current["date"] = current_date + timedelta(days=j)
            current["store"] = str(k)
            df_lst.append(current)

df = pd.concat(df_lst, ignore_index=True)
# ignore_index=True yields a fresh 0..n-1 index, so the first four rows are labels 0-3.
df.head(4)

Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,part,date,store
0,0.873173,0.257509,0.27984,0.434004,0.483767,0.741774,0.520325,0.094757,0.455784,0.38285,0,2024-12-02,0
1,0.743606,0.676129,0.017887,0.882693,0.985512,0.911625,0.61634,0.261969,0.837505,0.377142,0,2024-12-02,0
2,0.95406,0.889566,0.46019,0.378309,0.59266,0.282878,0.394865,0.097175,0.150502,0.124858,0,2024-12-02,0
3,0.413414,0.771646,0.680224,0.45259,0.103873,0.194113,0.300798,0.129074,0.561838,0.413664,0,2024-12-02,0


In [91]:
# With the pyarrow engine, every partitioned write adds new, uniquely named files
# under the target directory, so re-running this cell would stack duplicate rows on
# top of the previous run's output. Start from a clean directory to stay idempotent.
shutil.rmtree("sample.parquet", ignore_errors=True)
df.to_parquet("sample.parquet", partition_cols=["part", "date", "store"], compression=None, index=False)

# Read Data
Example of how to read several partitions of the data.

In [92]:
# Partition selection handed to ParquetReader: one entry per partition column.
partitions = {
        "part": ["*"], # Asterisk (*) indicates that all of the partitions at this level should be read
        "date": [datetime.now().date() + timedelta(days=i) for i in range(3)],  # today and the next two days
        "store": 0
}
pr = ParquetReader("sample.parquet")
# Bind the result to its own name so the full frame `df` built earlier is not
# replaced and this cell can be re-run on its own.
df_subset = pr.load_parquet(partitions)
print(df_subset["part"].unique())
# Positional preview: does not depend on what index the loader returns.
df_subset.head(4)

['0' '1' '2']


Unnamed: 0,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9,part,date,store
0,0.873173,0.257509,0.27984,0.434004,0.483767,0.741774,0.520325,0.094757,0.455784,0.38285,0,2024-12-02,0
1,0.743606,0.676129,0.017887,0.882693,0.985512,0.911625,0.61634,0.261969,0.837505,0.377142,0,2024-12-02,0
2,0.95406,0.889566,0.46019,0.378309,0.59266,0.282878,0.394865,0.097175,0.150502,0.124858,0,2024-12-02,0
3,0.413414,0.771646,0.680224,0.45259,0.103873,0.194113,0.300798,0.129074,0.561838,0.413664,0,2024-12-02,0


# Time Comparisons

In [93]:
import timeit
import pyarrow.dataset as ds
import pyarrow.parquet as pq
from fastparquet import ParquetFile

In [94]:
def read_parquet_manual():
    """Benchmark target: load a partition subset of sample.parquet via ParquetReader.

    Requests parts 1 and 2, today plus the next two days, and store 0.
    The loaded result is discarded; only the read itself matters for timing.
    """
    wanted_dates = []
    for offset in range(3):
        wanted_dates.append(datetime.now().date() + timedelta(days=offset))

    selection = {
        "part": [1, 2],
        "date": wanted_dates,
        "store": 0,
    }
    reader = ParquetReader("sample.parquet")
    reader.load_parquet(selection)

def read_parquet_pyarrow_dataset():
    """Benchmark target: read the same subset through pyarrow.dataset.

    Opens sample.parquet as a hive-partitioned dataset, filters on
    part in [1, 2], date in (today .. today + 2 days, as "%Y-%m-%d" strings)
    and store in [0], then converts to pandas. The result is discarded.
    """
    hive_dataset = ds.dataset("sample.parquet", format="parquet", partitioning="hive")

    date_strings = [
        (datetime.now().date() + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(3)
    ]
    part_match = ds.field("part").isin([1, 2])
    date_match = ds.field("date").isin(date_strings)
    store_match = ds.field("store").isin([0])

    table = hive_dataset.to_table(filter=part_match & date_match & store_match)
    table.to_pandas()

def read_parquet_pyarrow_parquet():
    """Benchmark target: read the same subset through pyarrow.parquet.read_table.

    Filters on part in [1, 2], date in (today .. today + 2 days, as
    "%Y-%m-%d" strings) and store in [0], then converts to pandas.
    The result is discarded.
    """
    date_strings = [
        (datetime.now().date() + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(3)
    ]
    predicate = [
        ("part", "in", [1, 2]),
        ("date", "in", date_strings),
        ("store", "in", [0]),
    ]
    table = pq.read_table(source="sample.parquet", filters=predicate)
    table.to_pandas()

def read_parquet_fastparquet():
    """Benchmark target: read the same subset through fastparquet.

    Filters on part in [1, 2], date in (today .. today + 2 days, as
    "%Y-%m-%d" strings) and store in [0] while converting to pandas.
    The result is discarded.
    """
    date_strings = [
        (datetime.now().date() + timedelta(days=offset)).strftime("%Y-%m-%d")
        for offset in range(3)
    ]
    predicate = [
        ("part", "in", [1, 2]),
        ("date", "in", date_strings),
        ("store", "in", [0]),
    ]
    ParquetFile("sample.parquet").to_pandas(filters=predicate)

In [95]:
# Each figure is the value returned by timeit.timeit: the total seconds spent on
# 1000 consecutive calls of the reader, not a per-call average.

# Custom ParquetReader
t1 = timeit.timeit(stmt=read_parquet_manual, number=1000)
print(f"Manual: {t1}")

# pyarrow.dataset with a filter expression
t2 = timeit.timeit(stmt=read_parquet_pyarrow_dataset, number=1000)
print(f"PyArrow (dataset): {t2}")

# pyarrow.parquet.read_table with filters
t3 = timeit.timeit(stmt=read_parquet_pyarrow_parquet, number=1000)
print(f"PyArrow (parquet): {t3}")

# fastparquet ParquetFile.to_pandas with filters
t4 = timeit.timeit(stmt=read_parquet_fastparquet, number=1000)
print(f"Fastparquet: {t4}")

Manual: 14.770857100000285
PyArrow (dataset): 31.565202200000385
PyArrow (parquet): 35.061277199999495
Fastparquet: 100.1803774
