# Explore Parquet metadata

In this notebook, we use the pyarrow API to explore the metadata of different Parquet files.

In [3]:
import pyarrow.parquet as pq
from pathlib import Path

In [4]:
# The data folder is resolved relative to the parent of the current working directory,
# so the paths below depend on where the notebook kernel was started.
data_path = Path.cwd().parent / "data"

# as_posix() turns each Path into a plain string with forward slashes
fr_immo_raw_path = (data_path / "fr_immo_transactions.parquet").as_posix()
fr_immo_valid_path = (data_path / "fr_immo_transactions_valid_ts.parquet").as_posix()
fr_immo_custom_metadata_path = (data_path / "fr_immo_transactions_custom_metadata.parquet").as_posix()
# print one resolved path as a quick check of the data folder location
print(fr_immo_raw_path)

/home/pliu/git/ParquetDuckDB/data/fr_immo_transactions.parquet


In [5]:
# Path (as a string) of the "fr_immo_partition_by_department" entry inside the data folder
# NOTE(review): the variable name says "year" while the path says "department" — confirm which is intended
fr_immo_year_partition = (data_path / "fr_immo_partition_by_department").as_posix()


In [6]:
def show_parquet_schema(file_path: str):
    """
    Print the schema of a parquet file.

    :param file_path: path of the parquet file to inspect
    :return: None, the schema is printed to stdout
    """
    # open the file with pyarrow and print the schema it exposes
    print(pq.ParquetFile(file_path).schema)


In [7]:
def show_parquet_custom_metadata(file_path:str):
    """
    Print the key/value metadata attached to the Arrow schema of a parquet file.

    Only the schema is read (``pq.read_schema``), so the row data does not have
    to be loaded into memory just to look at the metadata.

    :param file_path: path of the parquet file to inspect
    :return: None, each metadata entry is printed to stdout as ``key: value``
    """
    # read the schema only; the table content is not needed to get the metadata
    arrow_schema = pq.read_schema(file_path)

    # schema.metadata is None when no key/value metadata is set on the schema
    metadata = arrow_schema.metadata or {}

    print("\nTable & Column Metadata:")
    if not metadata:
        print("No custom metadata available")
        return

    # keys and values are stored as bytes; decode them for display and replace
    # undecodable bytes instead of failing on a non UTF-8 value
    for raw_key, raw_value in metadata.items():
        key = raw_key.decode(errors="replace")
        value = raw_value.decode(errors="replace")
        print(f"{key}: {value}")

In [8]:
def show_parquet_file_metadata(file_path: str):
    """
    Print the file-level metadata summary of a parquet file.

    :param file_path: path of the parquet file to inspect
    :return: None, the metadata summary is printed to stdout
    """
    # open the file with pyarrow and print its metadata object
    print(pq.ParquetFile(file_path).metadata)

In [9]:
def show_parquet_detail_metadata(file_path: str):
    """
    Print the detailed metadata of a parquet file.

    The function walks over every row group, then over every column chunk
    of that row group, and prints size, encoding and statistics information.

    :param file_path: path of the parquet file to inspect
    :return: None, all details are printed to stdout
    """
    handle = pq.ParquetFile(file_path)

    group_count = handle.num_row_groups
    print(f"Number of row groups: {group_count}")

    for group_idx in range(group_count):
        # row-group level information
        group_meta = handle.metadata.row_group(group_idx)
        print(f"\nRow Group {group_idx}:")
        print(f"  Number of Rows: {group_meta.num_rows}")
        print(f"  Total Byte Size: {group_meta.total_byte_size}")

        for col_idx in range(group_meta.num_columns):
            # column-chunk level information
            chunk = group_meta.column(col_idx)
            print(f"  Column {col_idx}: {chunk.path_in_schema}")
            print(f"    Encoding Algo: {chunk.encodings}")
            print(f"    Encoded Bytes: {chunk.total_compressed_size}")

            stats = chunk.statistics
            if not stats:
                # nothing more to report for a chunk without statistics
                print("    No statistics available")
                continue

            print(f"    Min: {stats.min}")
            print(f"    Max: {stats.max}")
            print(f"    Null Count: {stats.null_count}")
            print(f"    Distinct Count: {stats.distinct_count}")

In [10]:
# Print the parquet schema of the file at fr_immo_raw_path
show_parquet_schema(fr_immo_raw_path)

<pyarrow._parquet.ParquetSchema object at 0x7e58f84f87c0>
required group field_id=-1 schema {
  optional int32 field_id=-1 id_transaction;
  optional int64 field_id=-1 date_transaction (Timestamp(isAdjustedToUTC=false, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional double field_id=-1 prix;
  optional binary field_id=-1 departement (String);
  optional int32 field_id=-1 id_ville;
  optional binary field_id=-1 ville (String);
  optional int32 field_id=-1 code_postal;
  optional binary field_id=-1 adresse (String);
  optional binary field_id=-1 type_batiment (String);
  optional int32 field_id=-1 n_pieces;
  optional int32 field_id=-1 surface_habitable;
  optional double field_id=-1 latitude;
  optional double field_id=-1 longitude;
}



In [11]:
# Print the file-level metadata summary of the file at fr_immo_raw_path
show_parquet_file_metadata(fr_immo_raw_path)

<pyarrow._parquet.FileMetaData object at 0x7e58f84f5440>
  created_by: parquet-cpp-arrow version 19.0.0
  num_columns: 13
  num_rows: 9141573
  num_row_groups: 9
  format_version: 2.6
  serialized_size: 18182


In [13]:
# Print the schema key/value metadata of the file at fr_immo_raw_path
show_parquet_custom_metadata(fr_immo_raw_path)


Table & Column Metadata:
pandas: {"index_columns": [{"kind": "range", "name": null, "start": 0, "stop": 9141573, "step": 1}], "column_indexes": [{"name": null, "field_name": null, "pandas_type": "unicode", "numpy_type": "object", "metadata": {"encoding": "UTF-8"}}], "columns": [{"name": "id_transaction", "field_name": "id_transaction", "pandas_type": "int32", "numpy_type": "int32", "metadata": null}, {"name": "date_transaction", "field_name": "date_transaction", "pandas_type": "datetime", "numpy_type": "datetime64[ns]", "metadata": null}, {"name": "prix", "field_name": "prix", "pandas_type": "float64", "numpy_type": "float64", "metadata": null}, {"name": "departement", "field_name": "departement", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, {"name": "id_ville", "field_name": "id_ville", "pandas_type": "int32", "numpy_type": "int32", "metadata": null}, {"name": "ville", "field_name": "ville", "pandas_type": "unicode", "numpy_type": "object", "metadata": null}, 

In [13]:
# Print the schema key/value metadata of the file at fr_immo_valid_path
show_parquet_custom_metadata(fr_immo_valid_path)


Table & Column Metadata:
org.apache.spark.version: 3.5.3
org.apache.spark.sql.parquet.row.metadata: {"type":"struct","fields":[{"name":"id_transaction","type":"integer","nullable":true,"metadata":{}},{"name":"date_transaction","type":"timestamp","nullable":true,"metadata":{}},{"name":"prix","type":"double","nullable":true,"metadata":{}},{"name":"departement","type":"string","nullable":true,"metadata":{}},{"name":"id_ville","type":"integer","nullable":true,"metadata":{}},{"name":"ville","type":"string","nullable":true,"metadata":{}},{"name":"code_postal","type":"integer","nullable":true,"metadata":{}},{"name":"adresse","type":"string","nullable":true,"metadata":{}},{"name":"type_batiment","type":"string","nullable":true,"metadata":{}},{"name":"n_pieces","type":"integer","nullable":true,"metadata":{}},{"name":"surface_habitable","type":"integer","nullable":true,"metadata":{}},{"name":"latitude","type":"double","nullable":true,"metadata":{}},{"name":"longitude","type":"double","nullable"

In [17]:
# Print the schema key/value metadata of the file at fr_immo_custom_metadata_path
show_parquet_custom_metadata(fr_immo_custom_metadata_path)


Table & Column Metadata:
owner: Pengfei
organization: CASD
data_version: 1.0.0
row.metadata: {"type": "struct", "fields": [{"name": "id_transaction", "type": "integer", "metadata": {"description": "Id de transaction (add by pengfei)"}}, {"name": "date_transaction", "type": "timestamp", "metadata": {"description": "Date de transaction (add by pengfei)"}}, {"name": "prix", "type": "double", "metadata": {"description": "Le prix de bien en euros (add by pengfei)"}}, {"name": "departement", "type": "string", "metadata": {"description": "bla bla (add by pengfei)"}}, {"name": "id_ville", "type": "integer", "metadata": {"description": "bla bla (add by pengfei)"}}, {"name": "ville", "type": "string", "metadata": {"description": "bla bla (add by pengfei)"}}, {"name": "code_postal", "type": "integer", "metadata": {"description": "bla bla (add by pengfei)"}}, {"name": "adresse", "type": "string", "metadata": {"description": "bla bla (add by pengfei)"}}, {"name": "type_batiment", "type": "string",

In [16]:
# Print row-group and column-chunk details of the file at fr_immo_raw_path
# NOTE(review): the saved output of this cell looks like a custom-metadata dict rather than
# row-group details, so it is probably stale — re-run the cell to confirm
show_parquet_detail_metadata(fr_immo_raw_path)

{'owner': 'Pengfei', 'organization': 'CASD', 'data_version': '1.0.0'}


In [9]:
# Build the path of one partition file from data_path instead of hard-coding an
# absolute, machine-specific location, so the cell also runs on other checkouts.
v2_example = (
    data_path
    / "fr_immo_transactions_multi_partition"
    / "departement=92"
    / "type_batiment=Appartement"
    / "part-00000-a73fc87e-c032-4529-9bcc-e6195eca35ba.c000.snappy.parquet"
).as_posix()
# Print the file-level metadata summary of that partition file
show_parquet_file_metadata(v2_example)

<pyarrow._parquet.FileMetaData object at 0x7b44f6ba93a0>
  created_by: parquet-mr version 1.13.1 (build db4183109d5b734ec5930d870cdae161e408ddba)
  num_columns: 11
  num_rows: 202414
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 2453


In [12]:
# Print row-group and column-chunk details of the file at v2_example
show_parquet_detail_metadata(v2_example)

Number of row groups: 1

Row Group 0:
  Number of Rows: 202414
  Total Byte Size: 3895976
  Column 0: id_transaction
    Encoding Algo: ('DELTA_BINARY_PACKED',)
    Encoded Bytes: 410467
    Min: 13681429
    Max: 14652289
    Null Count: 0
    Distinct Count: None
  Column 1: date_transaction
    Encoding Algo: ('PLAIN', 'RLE_DICTIONARY')
    Encoded Bytes: 47981
    No statistics available
  Column 2: prix
    Encoding Algo: ('PLAIN', 'RLE_DICTIONARY')
    Encoded Bytes: 575278
    Min: 0.0
    Max: 139480320.0
    Null Count: 0
    Distinct Count: None
  Column 3: id_ville
    Encoding Algo: ('PLAIN', 'RLE_DICTIONARY')
    Encoded Bytes: 152810
    Min: 2
    Max: 78
    Null Count: 0
    Distinct Count: None
  Column 4: ville
    Encoding Algo: ('PLAIN', 'RLE_DICTIONARY')
    Encoded Bytes: 153164
    Min: ANTONY
    Max: VILLENEUVE-LA-GARENNE
    Null Count: 0
    Distinct Count: None
  Column 5: code_postal
    Encoding Algo: ('PLAIN', 'RLE_DICTIONARY')
    Encoded Bytes: 152822


In [14]:
# Print the file-level metadata summary of the file at fr_immo_valid_path
show_parquet_file_metadata(fr_immo_valid_path)

<pyarrow._parquet.FileMetaData object at 0x7e58fc55e0c0>
  created_by: parquet-mr version 1.13.1 (build db4183109d5b734ec5930d870cdae161e408ddba)
  num_columns: 13
  num_rows: 9141573
  num_row_groups: 3
  format_version: 1.0
  serialized_size: 6199
