In [106]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [107]:
# Créer une session Spark locale.
builder: SparkSession.Builder = SparkSession.builder
spark = builder.master("local").appName("01_exploration_spark").getOrCreate()

In [None]:
# Charger `air_quality_raw.csv` en DataFrame Spark.
air_quality = (
    spark.read.option("header", "true")
    .option("inferSchema", "true")
    .option("sep", ",")
    .csv("data/air_quality_raw.csv")
)

In [109]:
# Afficher le schéma inféré et identifier les problèmes de typage.
air_quality.printSchema()

# Problèmes de typages :
# timestamp : string -> datetime
# value : string -> float

root
 |-- station_id: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- pollutant: string (nullable = true)
 |-- value: string (nullable = true)
 |-- unit: string (nullable = true)



In [110]:
# Calculer des statistiques descriptives par polluant.
df_summary = air_quality.groupBy("pollutant").agg(
    F.count("*").alias("count"),
    F.avg("value").alias("mean"),
    F.std("value").alias("stddev"),
    F.min("value").alias("min"),
    F.max("value").alias("max"),
    F.percentile("value", [0.25, 0.5, 0.75]).alias("quantiles"),
)
df_summary = df_summary.withColumn(
    "Q1",
    F.col("quantiles")[0],
).withColumn(
    "Q2",
    F.col("quantiles")[1],
).withColumn(
    "Q3",
    F.col("quantiles")[2],
).drop("quantiles")
df_summary.show(truncate=False)

+---------+------+------------------+------------------+---+----+-----+-----+------+
|pollutant|count |mean              |stddev            |min|max |Q1   |Q2   |Q3    |
+---------+------+------------------+------------------+---+----+-----+-----+------+
|PM10     |205130|70.21267468823174 |335.66700132196706|---|null|23.65|35.25|50.06 |
|NO2      |205106|78.76389141237802 |340.0682280909494 |---|null|28.3 |42.41|60.23 |
|O3       |205242|108.20850580284663|334.0956554017962 |---|null|47.15|70.46|100.24|
|CO       |205083|33.244708332372745|338.5496147920897 |---|null|0.47 |0.71 |1.0   |
|PM2.5    |205215|55.09891935232638 |335.55145568365253|---|null|14.16|21.17|30.12 |
|SO2      |205175|41.00714578488003 |344.27247159941027|---|null|4.7  |7.04 |10.05 |
+---------+------+------------------+------------------+---+----+-----+-----+------+



In [None]:
# Compter les valeurs nulles par colonne.
df_null = air_quality.select(
    [
        F.count(F.when((F.col(c).isNull()) | (F.col(c) == "") | (F.lower(F.col(c)) == "null"), c)).alias(c + "_null")
        for c in air_quality.columns
    ]
)
df_null.show()

+---------------+--------------+--------------+----------+---------+
|station_id_null|timestamp_null|pollutant_null|value_null|unit_null|
+---------------+--------------+--------------+----------+---------+
|              0|             0|             0|      1538|        0|
+---------------+--------------+--------------+----------+---------+



In [112]:
# Identifier les stations avec le plus d'enregistrements.
top10_station = air_quality.groupBy("station_id").count().orderBy(F.col("count").desc()).limit(10)
top10_station.show()

+----------+-----+
|station_id|count|
+----------+-----+
|    ST0032|26264|
|    ST0012|26244|
|    ST0028|26241|
|    ST0003|26239|
|    ST0029|26235|
|    ST0024|26235|
|    ST0020|26235|
|    ST0037|26233|
|    ST0023|26233|
|    ST0042|26224|
+----------+-----+



In [113]:
spark.stop()