[Reference](https://medium.com/munchy-bytes/are-you-using-parquet-with-pandas-in-the-right-way-595c9ee7112)

In [1]:
import pandas as pd
import numpy as np

parquet_file_path = "test_data.parquet"

# Number of rows to generate
num_rows = 10**8 # 100M

# Seeded generator: without a fixed seed the `value` column differs on every
# run, so none of the outputs derived from it could be reproduced.
seed = 42
rng = np.random.default_rng(seed)

# Generate the DataFrame:
#   user_id - sequential ids 0 .. num_rows - 1 (sorted, so each row group
#             ends up covering its own contiguous id range)
#   value   - random integers in [-10000, 10000] (upper bound 10001 is exclusive)
data = {
    "user_id": np.arange(num_rows),
    "value": rng.integers(-10000, 10001, size=num_rows),
}
df = pd.DataFrame(data)

# Write the result to a Parquet file with 20 row groups (5M records per row group)
df.to_parquet(parquet_file_path, index=False, row_group_size=5 * 10**6)

In [2]:
import pyarrow.parquet as pq

parquet_file = pq.ParquetFile(parquet_file_path)

# Report the statistics stored for the first column (index 0, `user_id`) in
# every row group of the file.
# The row count is read from the row-group metadata itself:
# `statistics.num_values` only counts non-null values, so it would under-report
# the number of rows for any column that contains nulls.
for i in range(parquet_file.metadata.num_row_groups):
    row_group = parquet_file.metadata.row_group(i)
    user_id_col_stats = row_group.column(0).statistics
    print(f"row group: {i}, num of rows: {row_group.num_rows}, min: {user_id_col_stats.min}, max: {user_id_col_stats.max}")

row group: 0, num of rows: 5000000, min: 0, max: 4999999
row group: 1, num of rows: 5000000, min: 5000000, max: 9999999
row group: 2, num of rows: 5000000, min: 10000000, max: 14999999
row group: 3, num of rows: 5000000, min: 15000000, max: 19999999
row group: 4, num of rows: 5000000, min: 20000000, max: 24999999
row group: 5, num of rows: 5000000, min: 25000000, max: 29999999
row group: 6, num of rows: 5000000, min: 30000000, max: 34999999
row group: 7, num of rows: 5000000, min: 35000000, max: 39999999
row group: 8, num of rows: 5000000, min: 40000000, max: 44999999
row group: 9, num of rows: 5000000, min: 45000000, max: 49999999
row group: 10, num of rows: 5000000, min: 50000000, max: 54999999
row group: 11, num of rows: 5000000, min: 55000000, max: 59999999
row group: 12, num of rows: 5000000, min: 60000000, max: 64999999
row group: 13, num of rows: 5000000, min: 65000000, max: 69999999
row group: 14, num of rows: 5000000, min: 70000000, max: 74999999
row group: 15, num of rows: 50

In [3]:
%%time

pd.read_parquet(parquet_file_path).query("user_id == 8767068")

CPU times: user 4.34 s, sys: 3.68 s, total: 8.02 s
Wall time: 7.71 s


Unnamed: 0,user_id,value
8767068,8767068,-3617


In [4]:
%%time

pd.read_parquet(parquet_file_path, filters=[("user_id", "=", 8767068)]).to_dict()

CPU times: user 212 ms, sys: 161 ms, total: 372 ms
Wall time: 343 ms


{'user_id': {0: 8767068}, 'value': {0: -3617}}

In [5]:
# Report the statistics stored for the second column (index 1, `value`) in
# every row group. Relies on `parquet_file` opened in an earlier cell.
# The row count comes from the row-group metadata; `statistics.num_values`
# counts non-null values only and is therefore not a row count in general.
for i in range(parquet_file.metadata.num_row_groups):
    row_group = parquet_file.metadata.row_group(i)
    value_col_stats = row_group.column(1).statistics
    print(f"row group: {i}, num of rows: {row_group.num_rows}, min: {value_col_stats.min}, max: {value_col_stats.max}")

row group: 0, num of rows: 5000000, min: -10000, max: 10000
row group: 1, num of rows: 5000000, min: -10000, max: 10000
row group: 2, num of rows: 5000000, min: -10000, max: 10000
row group: 3, num of rows: 5000000, min: -10000, max: 10000
row group: 4, num of rows: 5000000, min: -10000, max: 10000
row group: 5, num of rows: 5000000, min: -10000, max: 10000
row group: 6, num of rows: 5000000, min: -10000, max: 10000
row group: 7, num of rows: 5000000, min: -10000, max: 10000
row group: 8, num of rows: 5000000, min: -10000, max: 10000
row group: 9, num of rows: 5000000, min: -10000, max: 10000
row group: 10, num of rows: 5000000, min: -10000, max: 10000
row group: 11, num of rows: 5000000, min: -10000, max: 10000
row group: 12, num of rows: 5000000, min: -10000, max: 10000
row group: 13, num of rows: 5000000, min: -10000, max: 10000
row group: 14, num of rows: 5000000, min: -10000, max: 10000
row group: 15, num of rows: 5000000, min: -10000, max: 10000
row group: 16, num of rows: 500000

In [7]:
%%time
# apply the filter on the dataframe
pd.read_parquet(parquet_file_path).query("value == 6666").count().to_dict()

%%time
# the filter to pyarrow
pd.read_parquet(parquet_file_path, filters=[("value", "=", 6666)]).count().to_dict()

UsageError: Line magic function `%%time` not found.


In [8]:
parquet_file_path = "another_test.parquet"

# Set the number of rows for the DataFrame
num_rows = 10**8 # 100M

# Seeded generator so the data (and the sums computed from it) is reproducible
rng = np.random.default_rng(42)

# Generate random data for the second column (uniform floats in [0, 1))
second_column_data = rng.random(num_rows)

# Create a mask to set roughly 50% of the first column to missing.
# NaN is used as the missing marker instead of None: None would force
# np.where to build an object-dtype array holding 100M Python objects, while
# NaN keeps the column a compact float64 array. With the pyarrow engine, NaN
# values in a float column are written to Parquet as nulls, so the file
# still contains a DOUBLE column with nulls.
mask = rng.random(num_rows) < 0.5
first_column_data = np.where(mask, np.nan, rng.random(num_rows))

# Create the DataFrame
data = {"Column1": first_column_data, "Column2": second_column_data}
df = pd.DataFrame(data)

# Write the result to a Parquet file with 20 row groups (5M records per row group)
df.to_parquet(parquet_file_path, index=False, row_group_size=5 * 10**6)

In [9]:
%%time

pd.read_parquet(parquet_file_path).sum().to_dict()

CPU times: user 3.68 s, sys: 3.7 s, total: 7.38 s
Wall time: 5.65 s


{'Column1': 24993955.54717727, 'Column2': 49998063.70082336}

In [10]:
%%time

pd.read_parquet(parquet_file_path).dropna(subset=["Column1"]).sum().to_dict()


CPU times: user 4.48 s, sys: 6.07 s, total: 10.6 s
Wall time: 11.2 s


{'Column1': 24993955.547177102, 'Column2': 24993659.50756679}

In [11]:
%%time

# Column1 type is DOUBLE, so max value is 2**53
pd.read_parquet(parquet_file_path, filters=[("Column1", "<=", 2**53)]).sum().to_dict()


CPU times: user 4.66 s, sys: 2.63 s, total: 7.29 s
Wall time: 15.3 s


{'Column1': 24993955.547177102, 'Column2': 24993659.50756679}