In [2]:
import pyarrow.parquet as pq
from pathlib import Path

In [3]:
# The data directory sits next to the notebook's working directory (../data).
data_path = Path.cwd().parent.joinpath("data")

# Both input files are kept as POSIX-style strings rather than Path objects.
fr_immo_raw_path = data_path.joinpath("fr_immo_transactions.parquet").as_posix()
fr_immo_valid_path = data_path.joinpath("fr_immo_transactions_valid_ts.parquet").as_posix()

# Show the resolved location of the raw file as a quick sanity check.
print(fr_immo_raw_path)

/home/pliu/git/ParquetDuckDB/data/fr_immo_transactions.parquet


In [4]:
# NOTE(review): the variable name says "year" while the directory is named
# "..._by_department" -- confirm which one is intended before renaming.
fr_immo_year_partition = data_path.joinpath("fr_immo_partition_by_department").as_posix()


In [5]:
def show_parquet_schema(file_path:str) -> None:
    """
    Print the schema of a parquet file.

    The file is opened through a context manager so the underlying file
    handle is released as soon as the schema has been printed.

    :param file_path: path of the parquet file to inspect
    :return: None, the schema is written to stdout
    """
    # open the parquet file; leaving the with-block closes it again
    with pq.ParquetFile(file_path) as parquet_file:
        # print the parquet schema of the file
        print(parquet_file.schema)


In [6]:
def show_parquet_summery_metadata(file_path:str) -> None:
    """
    Print the file-level metadata summary of a parquet file.

    The file is opened through a context manager so the underlying file
    handle is released as soon as the metadata has been printed.

    :param file_path: path of the parquet file to inspect
    :return: None, the metadata summary is written to stdout
    """
    # open the parquet file; leaving the with-block closes it again
    with pq.ParquetFile(file_path) as parquet_file:
        # print the metadata summary
        print(parquet_file.metadata)

In [7]:
def show_parquet_detail_metadata(file_path:str) -> None:
    """
    Print the detailed metadata of a parquet file.

    It iterates over the row groups, then over the column chunks of each
    row group. For every column chunk it prints the compressed and
    uncompressed sizes and, when the chunk carries statistics, its
    min / max / null count / distinct count.

    :param file_path: path of the parquet file to inspect
    :return: None, everything is written to stdout
    """
    # open the parquet file; leaving the with-block closes it again
    with pq.ParquetFile(file_path) as parquet_file:
        # fetch the file metadata once instead of on every loop iteration
        file_metadata = parquet_file.metadata

        # Get number of row groups
        num_row_groups = parquet_file.num_row_groups
        print(f"Number of row groups: {num_row_groups}")

        # Iterate through row groups
        for i in range(num_row_groups):
            row_group_metadata = file_metadata.row_group(i)
            print(f"\nRow Group {i}:")
            print(f"  Number of Rows: {row_group_metadata.num_rows}")
            print(f"  Total Byte Size: {row_group_metadata.total_byte_size}")

            # Get column chunk metadata within the row group
            for col in range(row_group_metadata.num_columns):
                col_metadata = row_group_metadata.column(col)
                print(f"  Column {col}: {col_metadata.path_in_schema}")
                # the "Encoded Bytes" label reports total_compressed_size
                print(f"    Encoded Bytes: {col_metadata.total_compressed_size}")
                print(f"    Uncompressed Bytes: {col_metadata.total_uncompressed_size}")
                col_stats = col_metadata.statistics
                # a column chunk may have no statistics at all
                if col_stats is not None:
                    print(f"    Min: {col_stats.min}")
                    print(f"    Max: {col_stats.max}")
                    print(f"    Null Count: {col_stats.null_count}")
                    print(f"    Distinct Count: {col_stats.distinct_count}")
                else:
                    print("    No statistics available")

In [8]:
# Print the parquet schema of the raw transactions file.
show_parquet_schema(fr_immo_raw_path)

<pyarrow._parquet.ParquetSchema object at 0x7ace38111280>
required group field_id=-1 schema {
  optional int32 field_id=-1 id_transaction;
  optional int64 field_id=-1 date_transaction (Timestamp(isAdjustedToUTC=false, timeUnit=nanoseconds, is_from_converted_type=false, force_set_converted_type=false));
  optional double field_id=-1 prix;
  optional binary field_id=-1 departement (String);
  optional int32 field_id=-1 id_ville;
  optional binary field_id=-1 ville (String);
  optional int32 field_id=-1 code_postal;
  optional binary field_id=-1 adresse (String);
  optional binary field_id=-1 type_batiment (String);
  optional int32 field_id=-1 n_pieces;
  optional int32 field_id=-1 surface_habitable;
  optional double field_id=-1 latitude;
  optional double field_id=-1 longitude;
}



In [9]:
# Print the file-level metadata summary of the raw transactions file.
show_parquet_summery_metadata(fr_immo_raw_path)

<pyarrow._parquet.FileMetaData object at 0x7ace3836a570>
  created_by: parquet-cpp-arrow version 19.0.0
  num_columns: 13
  num_rows: 9141573
  num_row_groups: 9
  format_version: 2.6
  serialized_size: 18182


In [10]:
# Print the same summary for fr_immo_transactions_valid_ts.parquet, to compare it with the raw file.
show_parquet_summery_metadata(fr_immo_valid_path)

<pyarrow._parquet.FileMetaData object at 0x7ace321a2ac0>
  created_by: parquet-mr version 1.13.1 (build db4183109d5b734ec5930d870cdae161e408ddba)
  num_columns: 13
  num_rows: 9141573
  num_row_groups: 3
  format_version: 1.0
  serialized_size: 6199


In [9]:
# Print sizes and statistics for every row group and column chunk of the raw file.
show_parquet_detail_metadata(fr_immo_raw_path)

Number of row groups: 9

Row Group 0:
  Number of Rows: 1048576
  Total Byte Size: 54792718
  Column 0: id_transaction
    Encoded Bytes: 4785153
    Uncompressed Bytes: 4784914
    Min: 1
    Max: 1985836
    Null Count: 0
    Distinct Count: None
  Column 1: date_transaction
    Encoded Bytes: 206029
    Uncompressed Bytes: 259738
    Min: 2014-01-02 00:00:00
    Max: 2024-06-29 00:00:00
    Null Count: 0
    Distinct Count: None
  Column 2: prix
    Encoded Bytes: 2591646
    Uncompressed Bytes: 2819362
    Min: -0.0
    Max: 641855000.0
    Null Count: 0
    Distinct Count: None
  Column 3: departement
    Encoded Bytes: 184
    Uncompressed Bytes: 200
    Min: 01
    Max: 14
    Null Count: 0
    Distinct Count: None
  Column 4: id_ville
    Encoded Bytes: 1290642
    Uncompressed Bytes: 1316179
    Min: 1
    Max: 834
    Null Count: 0
    Distinct Count: None
  Column 5: ville
    Encoded Bytes: 1574708
    Uncompressed Bytes: 1703662
    Min: ABBECOURT
    Max: YZEURE
    Null 

In [13]:
# One part file of the dataset partitioned by departement and type_batiment.
# The path is built from data_path rather than an absolute home-directory
# path, so the cell also runs on other machines and checkouts.
v2_example = (
    data_path
    / "fr_immo_transactions_multi_partition"
    / "departement=92"
    / "type_batiment=Appartement"
    / "part-00000-a73fc87e-c032-4529-9bcc-e6195eca35ba.c000.snappy.parquet"
).as_posix()
show_parquet_summery_metadata(v2_example)

<pyarrow._parquet.FileMetaData object at 0x7ace321a3060>
  created_by: parquet-mr version 1.13.1 (build db4183109d5b734ec5930d870cdae161e408ddba)
  num_columns: 11
  num_rows: 202414
  num_row_groups: 1
  format_version: 1.0
  serialized_size: 2453


In [14]:
# Print row-group and column-chunk detail for the same partition file.
show_parquet_detail_metadata(v2_example)

Number of row groups: 1

Row Group 0:
  Number of Rows: 202414
  Total Byte Size: 3895976
  Column 0: id_transaction
    Encoded Bytes: 410467
    Uncompressed Bytes: 410393
    Min: 13681429
    Max: 14652289
    Null Count: 0
    Distinct Count: None
  Column 1: date_transaction
    Encoded Bytes: 47981
    Uncompressed Bytes: 51004
    No statistics available
  Column 2: prix
    Encoded Bytes: 575278
    Uncompressed Bytes: 685839
    Min: 0.0
    Max: 139480320.0
    Null Count: 0
    Distinct Count: None
  Column 3: id_ville
    Encoded Bytes: 152810
    Uncompressed Bytes: 152751
    Min: 2
    Max: 78
    Null Count: 0
    Distinct Count: None
  Column 4: ville
    Encoded Bytes: 153164
    Uncompressed Bytes: 153173
    Min: ANTONY
    Max: VILLENEUVE-LA-GARENNE
    Null Count: 0
    Distinct Count: None
  Column 5: code_postal
    Encoded Bytes: 152822
    Uncompressed Bytes: 152763
    Min: 75015
    Max: 92800
    Null Count: 0
    Distinct Count: None
  Column 6: adresse
 

In [28]:
# NOTE(review): sf_fire_path is not defined in any cell visible here -- confirm
# it is assigned before this cell when running from a fresh kernel.
# NOTE(review): sf_fire_file is not used by the cells visible here -- verify it is still needed.
sf_fire_file = pq.ParquetFile(sf_fire_path)

In [5]:
# Load the source parquet file as an Arrow table.
sf_fire_table = pq.read_table(sf_fire_path)

# Write it back under data_path with snappy compression, row groups of
# 500000 rows and a data page size setting of 65536.
out_path = data_path / "sf_fire_snappy.parquet"
pq.write_table(
    sf_fire_table,
    out_path,
    compression="snappy",
    row_group_size=500000,
    data_page_size=65536,
)

In [6]:
# String path of the re-written snappy file, in the form the show_* helpers are called with.
sf_fire_new_path = data_path.joinpath("sf_fire_snappy.parquet").as_posix()

In [12]:
# Print the file-level metadata summary of the re-written snappy file.
show_parquet_summery_metadata(sf_fire_new_path)

<pyarrow._parquet.FileMetaData object at 0x7900c9b964d0>
  created_by: parquet-cpp-arrow version 19.0.0
  num_columns: 34
  num_rows: 5500519
  num_row_groups: 12
  format_version: 2.6
  serialized_size: 54665


In [13]:
# Print row-group and column-chunk detail for the re-written snappy file.
show_parquet_detail_metadata(sf_fire_new_path)

Number of row groups: 12

Row Group 0:
  Number of Rows: 500000
  Total Byte Size: 116609411
  Column 0: CallNumber
    Encoded Bytes: 2133805
    Uncompressed Bytes: 2133636
    Min: 1030101
    Max: 210630242
    Null Count: 0
    Distinct Count: None
  Column 1: UnitID
    Encoded Bytes: 558947
    Uncompressed Bytes: 560278
    Min: 50
    Max: VAN9
    Null Count: 0
    Distinct Count: None
  Column 2: IncidentNumber
    Encoded Bytes: 2133804
    Uncompressed Bytes: 2133636
    Min: 30612
    Max: 21027766
    Null Count: 0
    Distinct Count: None
  Column 3: CallType
    Encoded Bytes: 247854
    Uncompressed Bytes: 304484
    Min: Administrative
    Max: Watercraft in Distress
    Null Count: 0
    Distinct Count: None
  Column 4: CallDate
    Encoded Bytes: 731920
    Uncompressed Bytes: 804797
    Min: 01/01/2001
    Max: 12/31/2020
    Null Count: 0
    Distinct Count: None
  Column 5: WatchDate
    Encoded Bytes: 732926
    Uncompressed Bytes: 804992
    Min: 01/01/2001
  

   id_transaction date_transaction      prix departement  id_ville  \
0          141653       2014-01-02  197000.0          01       427   
1          141970       2014-01-02  157500.0          01       451   
2          139240       2014-01-02  112000.0          01       365   
3          146016       2014-01-02  173020.0          01       202   
4          145911       2014-01-03   88000.0          01       283   

                  ville  code_postal                      adresse  \
0               TREVOUX         1600           6346 MTE DES LILAS   
1                VIRIAT         1440       1369 RTE DE STRASBOURG   
2  SAINT-JEAN-SUR-VEYLE         1290   5174  SAINT JEAN SUR VEYLE   
3               LAGNIEU         1150  21 GR GRANDE RUE DE BULLIEZ   
4               OYONNAX         1100          29B RUE DE LA FORGE   

  type_batiment  n_pieces  surface_habitable   latitude  longitude  
0   Appartement         4                 84  45.942301   4.770694  
1        Maison         4 

In [27]:
# Print the (rows, columns) shape of the DataFrame.
# NOTE(review): pdf is created in a cell that is not visible here -- confirm it is defined before this cell.
print(pdf.shape)

(9141573, 13)


In [28]:
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None".
pdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9141573 entries, 0 to 9141572
Data columns (total 13 columns):
 #   Column             Dtype         
---  ------             -----         
 0   id_transaction     int32         
 1   date_transaction   datetime64[ns]
 2   prix               float64       
 3   departement        object        
 4   id_ville           int32         
 5   ville              object        
 6   code_postal        int32         
 7   adresse            object        
 8   type_batiment      object        
 9   n_pieces           int32         
 10  surface_habitable  int32         
 11  latitude           float64       
 12  longitude          float64       
dtypes: datetime64[ns](1), float64(3), int32(5), object(4)
memory usage: 732.3+ MB
None
