In [15]:
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col, year
from pathlib import Path

In [2]:
spark=SparkSession.builder.master("local[4]") \
                  .appName("ReadWriteParquet") \
                  .config("spark.sql.legacy.parquet.nanosAsLong", "true") \
                  .getOrCreate()


25/01/28 09:55:12 WARN Utils: Your hostname, pliu-ubuntu24 resolves to a loopback address: 127.0.1.1; using 192.168.30.128 instead (on interface ens33)
25/01/28 09:55:12 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/01/28 09:55:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/01/28 09:55:16 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
25/01/28 09:55:16 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.


# configuration

In [3]:
data_path = Path.cwd().parent / "data"

fr_immo_raw_path = (data_path / "fr_immo_transactions.parquet").as_posix()
fr_immo_valid_path = (data_path / "fr_immo_transactions_valid_ts.parquet").as_posix()
fr_immo_csv_path = (data_path / "fr_immo_transactions.csv").as_posix()

date_col_name = "date_transaction"


## Performance test csv vs parquet

In this part, we will test the query performance between csv and parquet.

1. row_counts
2. group_by
3. filter

In [4]:
%%time
fr_immo_parquet_df = spark.read.parquet(fr_immo_valid_path)

                                                                                

CPU times: user 5.42 ms, sys: 4.46 ms, total: 9.88 ms
Wall time: 3.6 s


In [15]:
%%time
fr_immo_parquet_df.count()

CPU times: user 6.42 ms, sys: 3.65 ms, total: 10.1 ms
Wall time: 879 ms


9141573

In [16]:
fr_immo_parquet_df.printSchema()

root
 |-- id_transaction: integer (nullable = true)
 |-- date_transaction: timestamp (nullable = true)
 |-- prix: double (nullable = true)
 |-- departement: string (nullable = true)
 |-- id_ville: integer (nullable = true)
 |-- ville: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- adresse: string (nullable = true)
 |-- type_batiment: string (nullable = true)
 |-- n_pieces: integer (nullable = true)
 |-- surface_habitable: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [7]:
%%time
fr_immo_csv_df = spark.read.option("header",True).option("inferSchema", True).csv(fr_immo_csv_path)




CPU times: user 23.9 ms, sys: 7.73 ms, total: 31.6 ms
Wall time: 24.6 s


                                                                                

In [19]:
fr_immo_csv_df.printSchema()

root
 |-- id_transaction: integer (nullable = true)
 |-- date_transaction: timestamp (nullable = true)
 |-- prix: double (nullable = true)
 |-- departement: integer (nullable = true)
 |-- id_ville: integer (nullable = true)
 |-- ville: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- adresse: string (nullable = true)
 |-- type_batiment: string (nullable = true)
 |-- n_pieces: integer (nullable = true)
 |-- surface_habitable: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [20]:
%%time
fr_immo_csv_df.count()



CPU times: user 6.33 ms, sys: 2.47 ms, total: 8.8 ms
Wall time: 1.36 s


                                                                                

9141573

In [21]:
batiment_typ_col_name = "type_batiment"

In [22]:
%%time
fr_immo_parquet_df.groupby(batiment_typ_col_name).count().show()

+-------------+-------+
|type_batiment|  count|
+-------------+-------+
|  Appartement|4079137|
|       Maison|5062436|
+-------------+-------+

CPU times: user 3.88 ms, sys: 551 μs, total: 4.43 ms
Wall time: 679 ms


In [23]:
%%time
fr_immo_csv_df.groupby(batiment_typ_col_name).count().show()



+-------------+-------+
|type_batiment|  count|
+-------------+-------+
|  Appartement|4079137|
|       Maison|5062436|
+-------------+-------+

CPU times: user 16.6 ms, sys: 6.98 ms, total: 23.5 ms
Wall time: 6.17 s


                                                                                

In [6]:
%%time
target_year =2023
code_postal_montrouge = 92120

total_transaction_montrouge = fr_immo_parquet_df.filter((col("code_postal") == code_postal_montrouge) & (year(col("date_transaction"))==target_year)).count()
print(f"Total transaction montrouge: {total_transaction_montrouge}")



Total transaction montrouge: 656
CPU times: user 6.35 ms, sys: 3.6 ms, total: 9.95 ms
Wall time: 2.45 s


                                                                                

In [8]:
%%time
total_transaction_montrouge = fr_immo_csv_df.filter((col("code_postal") == code_postal_montrouge) & (year(col("date_transaction"))==target_year)).count()
print(f"Total transaction montrouge: {total_transaction_montrouge}")



Total transaction montrouge: 656
CPU times: user 18.6 ms, sys: 3.16 ms, total: 21.7 ms
Wall time: 10.7 s


                                                                                

In [9]:
%%time
distinct_city = fr_immo_parquet_df.select("ville").distinct()
print(f"distinct city count: {distinct_city.count()}")
distinct_city.show(5)

                                                                                

distinct city count: 32320
+-------------------+
|              ville|
+-------------------+
|              CESSY|
|DOMPIERRE-SUR-VEYLE|
|       GRAND-CORENT|
|            CONFORT|
|LA NEUVILLE BOSMONT|
+-------------------+
only showing top 5 rows

CPU times: user 9.48 ms, sys: 3.57 ms, total: 13.1 ms
Wall time: 2.47 s


                                                                                

In [12]:
%%time
distinct_rows = fr_immo_parquet_df.distinct()
print(f"distinct rows count: {distinct_rows.count()}")
distinct_rows.show(5)

25/01/28 10:07:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:07:53 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:07:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:07:54 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:07:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:07:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:07:56 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:07:57 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:07:58 WARN RowBasedKeyValueBatch: Calling spill() on

distinct rows count: 9141573


25/01/28 10:08:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:08:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:08:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:08:26 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:08:28 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:08:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:08:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:08:29 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:08:30 WARN RowBasedKeyValueBatch: Calling spill() on

+--------------+--------------------+---------+-----------+--------+-------------------+-----------+--------------------+-------------+--------+-----------------+----------------+----------------+
|id_transaction|    date_transaction|     prix|departement|id_ville|              ville|code_postal|             adresse|type_batiment|n_pieces|surface_habitable|        latitude|       longitude|
+--------------+--------------------+---------+-----------+--------+-------------------+-----------+--------------------+-------------+--------+-----------------+----------------+----------------+
|        143401|2014-01-11 10:24:...|265886.29|         01|     354|   ST-GENIS-POUILLY|       1630|      FIN DE POUILLY|  Appartement|       3|               62|46.2518053832376|6.02741119562863|
|        140101|2014-01-14 10:23:...| 213000.0|         01|     173|                GEX|       1170|341 RUE MARIUS CADOZ|  Appartement|       2|               57|46.3302948271118|6.06187724874993|
|        138801

                                                                                

In [11]:
%%time
distinct_city = fr_immo_csv_df.select("ville").distinct()
print(f"distinct city count: {distinct_city.count()}")
distinct_city.show(5)

                                                                                

distinct city count: 32320




+-------------------+
|              ville|
+-------------------+
|              CESSY|
|DOMPIERRE-SUR-VEYLE|
|       GRAND-CORENT|
|            CONFORT|
|LA NEUVILLE BOSMONT|
+-------------------+
only showing top 5 rows

CPU times: user 31.4 ms, sys: 6.81 ms, total: 38.2 ms
Wall time: 10.6 s


                                                                                

In [13]:
%%time
distinct_rows = fr_immo_csv_df.distinct()
print(f"distinct rows count: {distinct_rows.count()}")
distinct_rows.show(5)

25/01/28 10:09:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:09:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:09:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:09:41 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:09:42 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:06 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:06 WARN RowBasedKeyValueBatch: Calling spill() on

distinct rows count: 9141573


25/01/28 10:10:36 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:37 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:38 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:47 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:48 WARN RowBasedKeyValueBatch: Calling spill() on RowBasedKeyValueBatch. Will not spill but return 0.
25/01/28 10:10:49 WARN RowBasedKeyValueBatch: Calling spill() on

+--------------+--------------------+--------+-----------+--------+------------+-----------+--------------------+-------------+--------+-----------------+----------------+----------------+
|id_transaction|    date_transaction|    prix|departement|id_ville|       ville|code_postal|             adresse|type_batiment|n_pieces|surface_habitable|        latitude|       longitude|
+--------------+--------------------+--------+-----------+--------+------------+-----------+--------------------+-------------+--------+-----------------+----------------+----------------+
|        143595|2014-01-06 10:24:...|206800.0|          1|     446|  VILLENEUVE|       1480|10 ALL DES MARGUE...|       Maison|       5|              100| 46.017215697925|4.83236118261167|
|        146931|2014-02-24 10:17:...| 50000.0|          1|     430|    VARAMBON|       1160|      5154  VARAMBON|       Maison|       2|               56|46.0400470273731|5.31682925744396|
|        146921|2014-02-25 10:17:...|236000.0|         

                                                                                

In [16]:

import time


def random_batch(df:DataFrame,fmt:str):
    start = time.time()
    result=df.sample(False, 0.05).collect()
    stats="{},{},{}".format(fmt, "random_batch", time.time() - start)
    print(stats)

In [17]:
random_batch(fr_immo_parquet_df,"parquet")

                                                                                

parquet,random_batch,9.600819110870361


In [18]:
random_batch(fr_immo_csv_df,"csv")

                                                                                

csv,random_batch,20.17941665649414


## Compatibility problems

As there are many libraries that can write parquet files, there are some compatibility problems.

### Timestamp data type

The default Timestamp data type implementation in **pyarrow/pandas is INT64 (TIMESTAMP(NANOS))**.

The default Timestamp data type implementation in **spark is INT64 (TIMESTAMP(MICROS)) or INT96 (NANOS)**.

INT96 (NANOS) is deprecated in the newer spark version. You can still activate in Spark environments with `config("spark.sql.legacy.parquet.nanosAsLong", "true")`

In the below example, we use pyspark to read a parquet file which is generated by using the pandas/pyarrow. You may receive an error message (based on your spark version)

```java
org.apache.spark.sql.AnalysisException: Illegal Parquet type: INT64 (TIMESTAMP(NANOS,false)).
```

In [11]:
fr_immo_raw_df = spark.read.parquet(fr_immo_raw_path)
print(fr_immo_raw_df.count())


9141573


In [19]:
fr_immo_raw_df.select([date_col_name]).show(5)

+-------------------+
|   date_transaction|
+-------------------+
|1388620800000000000|
|1388620800000000000|
|1388620800000000000|
|1388620800000000000|
|1388707200000000000|
+-------------------+
only showing top 5 rows



In [15]:
fr_immo_raw_df.printSchema()

root
 |-- id_transaction: integer (nullable = true)
 |-- date_transaction: long (nullable = true)
 |-- prix: double (nullable = true)
 |-- departement: string (nullable = true)
 |-- id_ville: integer (nullable = true)
 |-- ville: string (nullable = true)
 |-- code_postal: integer (nullable = true)
 |-- adresse: string (nullable = true)
 |-- type_batiment: string (nullable = true)
 |-- n_pieces: integer (nullable = true)
 |-- surface_habitable: integer (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)



In [20]:
fr_immo_valid_df = spark.read.parquet(fr_immo_valid_path)
fr_immo_valid_df.select([date_col_name]).show(5)

+--------------------+
|    date_transaction|
+--------------------+
|2013-12-31 10:25:...|
|2013-12-31 10:25:...|
|2013-12-31 10:25:...|
|2013-12-31 10:25:...|
|2014-01-01 10:25:...|
+--------------------+
only showing top 5 rows

