In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pathlib import Path

In [2]:
# Local Spark session for this notebook ("local[4]" master).
# nanosAsLong is enabled so that Parquet columns stored as
# INT64 TIMESTAMP(NANOS) can be loaded (they come back as long values).
spark = (
    SparkSession.builder
    .master("local[4]")
    .appName("ReadWriteParquet")
    .config("spark.sql.legacy.parquet.nanosAsLong", "true")
    .getOrCreate()
)


25/01/27 16:51:23 WARN Utils: Your hostname, pliu-ubuntu24 resolves to a loopback address: 127.0.1.1; using 192.168.30.128 instead (on interface ens33)
25/01/27 16:51:23 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/27 16:51:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/27 16:51:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# configuration

In [3]:
# Input files live in a "data" folder that sits next to the current working directory.
data_path = Path.cwd().parent.joinpath("data")

# File locations are kept as POSIX-style strings, which is what the Spark readers receive.
fr_immo_raw_path = data_path.joinpath("fr_immo_transactions.parquet").as_posix()
fr_immo_valid_path = data_path.joinpath("fr_immo_transactions_valid_ts.parquet").as_posix()
fr_immo_csv_path = data_path.joinpath("fr_immo_transactions.csv").as_posix()

# Name of the transaction-date column inspected in the timestamp section.
date_col_name = "date_transaction"


## Performance test csv vs parquet

In this part, we compare the query performance of CSV and Parquet on the same dataset, using three operations:

1. row_counts
2. group_by
3. filter

In [14]:
# Parquet-backed DataFrame used on the Parquet side of the benchmarks.
fr_immo_parquet_df = spark.read.format("parquet").load(fr_immo_valid_path)

In [15]:
%%time
# Benchmark 1 (Parquet): time a full row count of the Parquet-backed DataFrame.
fr_immo_parquet_df.count()

CPU times: user 6.42 ms, sys: 3.65 ms, total: 10.1 ms
Wall time: 879 ms


9141573

In [16]:
# Print the column names and types Spark reports for the Parquet-backed DataFrame.
fr_immo_parquet_df.printSchema()

root
 |-- id_transaction: integer (nullable = true)
 |-- date_transaction: timestamp (nullable = true)
 |-- prix: double (nullable = true)
 |-- departement: string (nullable = true)
 |-- id_ville: integer (nullable = true)
 |-- ville: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- adresse: string (nullable = true)
 |-- type_batiment: string (nullable = true)
 |-- n_pieces: integer (nullable = true)
 |-- surface_habitable: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [18]:
# CSV-backed DataFrame for the benchmarks: the first line is treated as the
# header and the column types are inferred from the data.
fr_immo_csv_df = spark.read.csv(
    fr_immo_csv_path,
    header=True,
    inferSchema=True,
)


                                                                                

In [19]:
# Print the schema inferred for the CSV-backed DataFrame
# (compare it with the schema printed for the Parquet-backed DataFrame).
fr_immo_csv_df.printSchema()

root
 |-- id_transaction: integer (nullable = true)
 |-- date_transaction: timestamp (nullable = true)
 |-- prix: double (nullable = true)
 |-- departement: integer (nullable = true)
 |-- id_ville: integer (nullable = true)
 |-- ville: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- adresse: string (nullable = true)
 |-- type_batiment: string (nullable = true)
 |-- n_pieces: integer (nullable = true)
 |-- surface_habitable: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [20]:
%%time
# Benchmark 1 (CSV): time a full row count of the CSV-backed DataFrame.
fr_immo_csv_df.count()



CPU times: user 6.33 ms, sys: 2.47 ms, total: 8.8 ms
Wall time: 1.36 s


                                                                                

9141573

In [21]:
# Column used as the grouping key in the group-by benchmark cells.
batiment_typ_col_name = "type_batiment"

In [22]:
%%time
fr_immo_parquet_df.groupby(batiment_typ_col_name).count().show()

+-------------+-------+
|type_batiment|  count|
+-------------+-------+
|  Appartement|4079137|
|       Maison|5062436|
+-------------+-------+

CPU times: user 3.88 ms, sys: 551 μs, total: 4.43 ms
Wall time: 679 ms


In [23]:
%%time
fr_immo_csv_df.groupby(batiment_typ_col_name).count().show()



+-------------+-------+
|type_batiment|  count|
+-------------+-------+
|  Appartement|4079137|
|       Maison|5062436|
+-------------+-------+

CPU times: user 16.6 ms, sys: 6.98 ms, total: 23.5 ms
Wall time: 6.17 s


                                                                                

In [28]:
code_postal_montrouge = 92120

total_transaction_montrouge = fr_immo_parquet_df.filter(col("code_postal") == code_postal_montrouge).count()
print(f"Total transaction montrouge: {total_transaction_montrouge}")

Total transaction montrouge: 7361


In [30]:
total_transaction_montrouge = fr_immo_csv_df.filter(col("code_postal")== code_postal_montrouge).count()
print(f"Total transaction montrouge: {total_transaction_montrouge}")



Total transaction montrouge: 7361


                                                                                

## Compatibility problems

Because many different libraries can write Parquet files, a file written by one library is not always read the same way (or at all) by another. This section shows one such compatibility problem.

### Timestamp data type

The default Timestamp data type implementation in **pyarrow/pandas is INT64 (TIMESTAMP(NANOS))**.

The default Timestamp data type implementation in **spark is INT64 (TIMESTAMP(MICROS)) or INT96 (NANOS)**.

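INT96 (NANOS) is deprecated. Newer Spark versions also cannot map INT64 (TIMESTAMP(NANOS)) to a Spark timestamp. With `config("spark.sql.legacy.parquet.nanosAsLong", "true")` set on the Spark session, Spark reads such columns as plain `long` values (nanoseconds since the epoch) instead of failing, as the example below shows.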

In the example below, we use PySpark to read a Parquet file that was generated with pandas/pyarrow. Depending on your Spark version and configuration, you may receive the following error message:

```java
org.apache.spark.sql.AnalysisException: Illegal Parquet type: INT64 (TIMESTAMP(NANOS,false)).
```

In [11]:
# Load the raw Parquet file (the one whose timestamps are stored in
# nanoseconds) and print how many rows it contains.
fr_immo_raw_df = spark.read.format("parquet").load(fr_immo_raw_path)
print(fr_immo_raw_df.count())


9141573


In [19]:
# Preview the first 5 values of the date column as read from the raw file.
fr_immo_raw_df.select(date_col_name).show(n=5)

+-------------------+
|   date_transaction|
+-------------------+
|1388620800000000000|
|1388620800000000000|
|1388620800000000000|
|1388620800000000000|
|1388707200000000000|
+-------------------+
only showing top 5 rows



In [15]:
# Print the schema of the raw file to see which type Spark assigned to the date column.
fr_immo_raw_df.printSchema()

root
 |-- id_transaction: integer (nullable = true)
 |-- date_transaction: long (nullable = true)
 |-- prix: double (nullable = true)
 |-- departement: string (nullable = true)
 |-- id_ville: integer (nullable = true)
 |-- ville: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- adresse: string (nullable = true)
 |-- type_batiment: string (nullable = true)
 |-- n_pieces: integer (nullable = true)
 |-- surface_habitable: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [20]:
# Load the file with valid timestamps and preview the first 5 values of the
# same date column for comparison with the raw file.
fr_immo_valid_df = spark.read.format("parquet").load(fr_immo_valid_path)
fr_immo_valid_df.select(date_col_name).show(n=5)

+--------------------+
|    date_transaction|
+--------------------+
|2013-12-31 10:25:...|
|2013-12-31 10:25:...|
|2013-12-31 10:25:...|
|2013-12-31 10:25:...|
|2014-01-01 10:25:...|
+--------------------+
only showing top 5 rows

