In [4]:
import duckdb
from pathlib import Path

# 1. Reading Parquet file metadata with DuckDB

DuckDB lets us query several kinds of metadata stored in a Parquet file. Below are some examples.

In [5]:
# Open a DuckDB connection backed by an in-memory database
conn = duckdb.connect(
    database=':memory:',
    read_only=False,
)

In [6]:
# Locate the "data" folder under the parent of the current working directory
data_path = Path.cwd().parent.joinpath("data")
# The path is interpolated into SQL text by the queries in this notebook, so keep it as a POSIX-style string
fr_immo_raw_path = data_path.joinpath("fr_immo_transactions.parquet").as_posix()

## 1.1 Get detailed metadata of the parquet file

In [11]:
# Query the detailed metadata of the parquet file and preview the first rows
result = conn.execute(
    f"SELECT * FROM parquet_metadata('{fr_immo_raw_path}');"
).fetchdf()
print("Parquet file metadata : ", result.head(5), sep="\n")

Résultat de la requête : 
                                           file_name  row_group_id  \
0  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...             0   
1  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...             0   
2  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...             0   
3  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...             0   
4  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...             0   

   row_group_num_rows  row_group_num_columns  row_group_bytes  column_id  \
0             1048576                     13         54792718          0   
1             1048576                     13         54792718          1   
2             1048576                     13         54792718          2   
3             1048576                     13         54792718          3   
4             1048576                     13         54792718          4   

   file_offset  num_values    path_in_schema        type  ...  \
0            0     1048576    i

## 1.2. Fetch the column names and column types

The **DESCRIBE** statement returns the schema of the parquet file as DuckDB sees it (column names, column types, etc.).

In [13]:
# DESCRIBE lists the column names and column types of the SELECT over the parquet file
query = f"DESCRIBE SELECT * FROM '{fr_immo_raw_path}';"
result = conn.execute(query).fetchdf()
# Label and preview are emitted on separate lines
print("Parquet schema : ", result.head(5), sep="\n")

Parquet schema : 
        column_name   column_type null   key default extra
0    id_transaction       INTEGER  YES  None    None  None
1  date_transaction  TIMESTAMP_NS  YES  None    None  None
2              prix        DOUBLE  YES  None    None  None
3       departement       VARCHAR  YES  None    None  None
4          id_ville       INTEGER  YES  None    None  None


## 1.3. Fetch the internal schema of a Parquet file

The `parquet_schema` function queries the internal schema stored in the metadata of the Parquet file itself. Compared with `DESCRIBE`, it exposes more detailed, Parquet-level information (physical type, repetition type, converted type, etc.).


In [14]:
# parquet_schema exposes the schema as stored inside the parquet file metadata
query = f"SELECT * FROM parquet_schema('{fr_immo_raw_path}');"
result = conn.execute(query).fetchdf()
# Label and preview are emitted on separate lines
print("Parquet schema : ", result.head(5), sep="\n")

Parquet schema : 
                                           file_name              name  \
0  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...            schema   
1  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...    id_transaction   
2  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...  date_transaction   
3  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...              prix   
4  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...       departement   

         type type_length repetition_type  num_children converted_type  scale  \
0        None        None        REQUIRED          13.0           None    NaN   
1       INT32        None        OPTIONAL           NaN           None    NaN   
2       INT64        None        OPTIONAL           NaN           None    NaN   
3      DOUBLE        None        OPTIONAL           NaN           None    NaN   
4  BYTE_ARRAY        None        OPTIONAL           NaN           UTF8    NaN   

   precision  field_id                            

## 1.4. Parquet File Metadata

The parquet_file_metadata function can be used to query file-level metadata such as the format version and the encryption algorithm used:

In [15]:
# parquet_file_metadata returns file-level metadata
# (e.g. created_by, num_rows, num_row_groups, format_version)
query = f"SELECT * FROM parquet_file_metadata('{fr_immo_raw_path}');"
result = conn.execute(query).fetchdf()
# The label now names what is printed: file-level metadata, not the schema
print("Parquet file-level metadata : ")
print(result.head(5))

Parquet schema : 
                                           file_name  \
0  /home/pliu/git/ParquetDuckDB/data/fr_immo_tran...   

                         created_by  num_rows  num_row_groups  format_version  \
0  parquet-cpp-arrow version 19.0.0   9141573               9               2   

  encryption_algorithm footer_signing_key_metadata  
0                 None                        None  


## 2. Use duckdb to read partitioned parquet files

As we know, the data files of a partitioned parquet dataset do not contain the partition columns themselves. For example, none of the parquet files in the folder `departement=01` contains the column `departement`; its value is encoded in the folder name.

We therefore need to tell DuckDB how to read the partitioned parquet files correctly.

In [19]:
# Root folders of the partitioned datasets, kept as POSIX-style strings
# because they are interpolated into SQL text by the queries below.
fr_immo_single_partition_path = data_path.joinpath("fr_immo_transactions_dep_partition").as_posix()
fr_immo_multi_partition_path = data_path.joinpath("fr_immo_transactions_multi_partition").as_posix()

In [20]:
# Read the single-level partitioned dataset with hive_partitioning=true,
# so the partition column is included in the result.
# NOTE(review): SELECT * loads the whole dataset into pandas only to preview
# 5 rows; consider adding a LIMIT to the query if memory becomes a concern.
query = f"SELECT * FROM read_parquet('{fr_immo_single_partition_path}/*/*.parquet', hive_partitioning=true);"
result = conn.execute(query).fetchdf()
# The label now names what is printed: data rows, not the schema
print("Partitioned parquet data (hive_partitioning=true) : ")
print(result.head(5))

Parquet schema : 
   id_transaction           date_transaction      prix  id_ville  \
0          141653 2013-12-31 09:25:51.804819  197000.0       427   
1          141970 2013-12-31 09:25:51.804819  157500.0       451   
2          139240 2013-12-31 09:25:51.804819  112000.0       365   
3          146016 2013-12-31 09:25:51.804819  173020.0       202   
4          145911 2014-01-01 09:25:43.165683   88000.0       283   

                  ville  code_postal                      adresse  \
0               TREVOUX         1600           6346 MTE DES LILAS   
1                VIRIAT         1440       1369 RTE DE STRASBOURG   
2  SAINT-JEAN-SUR-VEYLE         1290   5174  SAINT JEAN SUR VEYLE   
3               LAGNIEU         1150  21 GR GRANDE RUE DE BULLIEZ   
4               OYONNAX         1100          29B RUE DE LA FORGE   

  type_batiment  n_pieces  surface_habitable   latitude  longitude departement  
0   Appartement         4                 84  45.942301   4.770694          0

In [21]:
# Read the single-level partitioned dataset with hive_partitioning=false,
# so the partition column is NOT added to the result.
# NOTE(review): SELECT * loads the whole dataset into pandas only to preview
# 5 rows; consider adding a LIMIT to the query if memory becomes a concern.
query = f"SELECT * FROM read_parquet('{fr_immo_single_partition_path}/*/*.parquet', hive_partitioning=false);"
result = conn.execute(query).fetchdf()
# The label now names what is printed: data rows, not the schema
print("Partitioned parquet data (hive_partitioning=false) : ")
print(result.head(5))

Parquet schema : 
   id_transaction           date_transaction      prix  id_ville  \
0          141653 2013-12-31 09:25:51.804819  197000.0       427   
1          141970 2013-12-31 09:25:51.804819  157500.0       451   
2          139240 2013-12-31 09:25:51.804819  112000.0       365   
3          146016 2013-12-31 09:25:51.804819  173020.0       202   
4          145911 2014-01-01 09:25:43.165683   88000.0       283   

                  ville  code_postal                      adresse  \
0               TREVOUX         1600           6346 MTE DES LILAS   
1                VIRIAT         1440       1369 RTE DE STRASBOURG   
2  SAINT-JEAN-SUR-VEYLE         1290   5174  SAINT JEAN SUR VEYLE   
3               LAGNIEU         1150  21 GR GRANDE RUE DE BULLIEZ   
4               OYONNAX         1100          29B RUE DE LA FORGE   

  type_batiment  n_pieces  surface_habitable   latitude  longitude  
0   Appartement         4                 84  45.942301   4.770694  
1        Maison     

> Notice that the result of the second query (`hive_partitioning=false`) does not contain the column **departement**.

In [17]:
# Read the two-level partitioned dataset with hive_partitioning=true,
# so both partition columns are included in the result.
# NOTE(review): SELECT * loads the whole dataset into pandas only to preview
# 5 rows; consider adding a LIMIT to the query if memory becomes a concern.
query = f"SELECT * FROM read_parquet('{fr_immo_multi_partition_path}/*/*/*.parquet', hive_partitioning=true);"
result = conn.execute(query).fetchdf()
# The label now names what is printed: data rows, not the schema
print("Multi-level partitioned parquet data (hive_partitioning=true) : ")
print(result.head(5))

Parquet schema : 
   id_transaction           date_transaction      prix  id_ville  \
0          141653 2013-12-31 09:25:51.804819  197000.0       427   
1          145911 2014-01-01 09:25:43.165683   88000.0       283   
2          145399 2014-01-01 09:25:43.165683   68000.0        53   
3          143192 2014-01-01 09:25:43.165683  156750.0        53   
4          146426 2014-01-01 09:25:43.165683  163640.0       376   

                      ville  code_postal                adresse  n_pieces  \
0                   TREVOUX         1600     6346 MTE DES LILAS         4   
1                   OYONNAX         1100    29B RUE DE LA FORGE         3   
2           BOURG-EN-BRESSE         1000  10Z RUE EDGAR  QUINET         2   
3           BOURG-EN-BRESSE         1000    15 RUE DE MONTHOLON         3   
4  SAINT-MAURICE-DE-BEYNOST         1700         40 AV DES ILES         3   

   surface_habitable   latitude  longitude departement type_batiment  
0                 84  45.942301   4.770

In [22]:
# Read the two-level partitioned dataset with hive_partitioning=false,
# so the partition columns are NOT added to the result.
# NOTE(review): SELECT * loads the whole dataset into pandas only to preview
# 5 rows; consider adding a LIMIT to the query if memory becomes a concern.
query = f"SELECT * FROM read_parquet('{fr_immo_multi_partition_path}/*/*/*.parquet', hive_partitioning=false);"
result = conn.execute(query).fetchdf()
# The label now names what is printed: data rows, not the schema
print("Multi-level partitioned parquet data (hive_partitioning=false) : ")
print(result.head(5))

Parquet schema : 
   id_transaction           date_transaction      prix  id_ville  \
0          141653 2013-12-31 09:25:51.804819  197000.0       427   
1          145911 2014-01-01 09:25:43.165683   88000.0       283   
2          145399 2014-01-01 09:25:43.165683   68000.0        53   
3          143192 2014-01-01 09:25:43.165683  156750.0        53   
4          146426 2014-01-01 09:25:43.165683  163640.0       376   

                      ville  code_postal                adresse  n_pieces  \
0                   TREVOUX         1600     6346 MTE DES LILAS         4   
1                   OYONNAX         1100    29B RUE DE LA FORGE         3   
2           BOURG-EN-BRESSE         1000  10Z RUE EDGAR  QUINET         2   
3           BOURG-EN-BRESSE         1000    15 RUE DE MONTHOLON         3   
4  SAINT-MAURICE-DE-BEYNOST         1700         40 AV DES ILES         3   

   surface_habitable   latitude  longitude  
0                 84  45.942301   4.770694  
1                104

> Notice that the result of the second query (`hive_partitioning=false`) does not contain the columns **departement** and **type_batiment**.