In [1]:
import pyarrow as pa
import pyarrow.parquet as pq

parquet_file = pq.ParquetFile("../data/corpus/cleaned_msmarco-docs.parquet")
arrow_table = parquet_file.read()
print("Loaded Arrow Table")

pq.write_table(
    arrow_table,
    "../data/corpus/dataset.parquet",
    row_group_size=10000
)

print(f"Original number of row groups: {parquet_file.num_row_groups}")

new_parquet_file = pq.ParquetFile("../data/corpus/dataset.parquet")
print(f"New number of row groups: {new_parquet_file.num_row_groups}")


Loaded Arrow Table
Original number of row groups: 108
New number of row groups: 288


In [12]:
import pyarrow.parquet as pq
import polars as pl

parquet_file_path = "../data/corpus/dataset.parquet"  # Path to your Parquet file
lookup_file_path = "../data/train/docid_lookup.parquet"  # Path to save the lookup dataset
row_group_size = 10000  # Row group size used in the Parquet file

# Load the Parquet file
parquet_file = pq.ParquetFile(parquet_file_path)

# Initialize the lookup data
lookup_data = []

# Iterate over each row group
for row_group_index in range(parquet_file.num_row_groups):
    # Read the current row group
    row_group = parquet_file.read_row_group(row_group_index, columns=["docid"])
    
    # Convert to Polars DataFrame
    row_group_df = pl.from_arrow(row_group)
    
    # Add row group index to each docid
    lookup_data.append(
        row_group_df.with_columns(pl.lit(row_group_index).alias("row_group"))
    )
    print(f"Added row group {row_group_index} to lookup data")
# Concatenate all row group lookup data into a single Polars DataFrame
lookup_df = pl.concat(lookup_data)

# Save the lookup dataset as a Parquet file
lookup_df.write_parquet(lookup_file_path)

print(f"Lookup dataset created and saved to {lookup_file_path}")


Added row group 0 to lookup data
Added row group 1 to lookup data
Added row group 2 to lookup data
Added row group 3 to lookup data
Added row group 4 to lookup data
Added row group 5 to lookup data
Added row group 6 to lookup data
Added row group 7 to lookup data
Added row group 8 to lookup data
Added row group 9 to lookup data
Added row group 10 to lookup data
Added row group 11 to lookup data
Added row group 12 to lookup data
Added row group 13 to lookup data
Added row group 14 to lookup data
Added row group 15 to lookup data
Added row group 16 to lookup data
Added row group 17 to lookup data
Added row group 18 to lookup data
Added row group 19 to lookup data
Added row group 20 to lookup data
Added row group 21 to lookup data
Added row group 22 to lookup data
Added row group 23 to lookup data
Added row group 24 to lookup data
Added row group 25 to lookup data
Added row group 26 to lookup data
Added row group 27 to lookup data
Added row group 28 to lookup data
Added row group 29 to lo

In [14]:
import pyarrow.parquet as pq
import polars as pl

# Load the lookup dataset
lookup_df = pl.read_parquet("../data/train/docid_lookup.parquet")

# Load the Parquet file
parquet_file = pq.ParquetFile("../data/corpus/dataset.parquet")

# Function to retrieve a document by docid
def get_document_by_docid(docid: str):
    # Find the row group for the given docid
    row_group = lookup_df.filter(pl.col("docid") == docid).select("row_group").to_series(0)
    
    if row_group.is_empty():
        raise ValueError(f"Document with docid '{docid}' not found.")
    
    row_group_index = row_group[0]
    row_group_data = parquet_file.read_row_group(row_group_index).to_pandas()
    
    row_group_df = pl.from_pandas(row_group_data)
    document = row_group_df.filter(pl.col("docid") == docid)
    return document

docid = "D12123"
try:
    document = get_document_by_docid(docid)
    print(document)
except ValueError as e:
    print(e)


shape: (1, 3)
┌────────┬─────────────────────────────────┬─────────────────────────────────┐
│ docid  ┆ title                           ┆ body                            │
│ ---    ┆ ---                             ┆ ---                             │
│ str    ┆ str                             ┆ str                             │
╞════════╪═════════════════════════════════╪═════════════════════════════════╡
│ D12123 ┆ is there an age of accountabil… ┆ is there an age of accountabil… │
└────────┴─────────────────────────────────┴─────────────────────────────────┘
