# Étape 1 - Exploration et chargement Spark

**Objectif** : Charger et explorer les données de qualité de l'air avec PySpark.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType
import os

# Input data locations, resolved relative to the current working directory
DATA_DIR = "../data"
AIR_QUALITY_PATH, STATIONS_PATH, WEATHER_PATH = (
    os.path.join(DATA_DIR, csv_name)
    for csv_name in ("air_quality_raw.csv", "stations.csv", "weather_raw.csv")
)

## 1.1 Création de la session Spark

In [None]:
# Build (or reuse) a Spark session running in local mode ("local[*]")
spark = (
    SparkSession.builder
    .appName("TP Qualite Air - Exploration")
    .master("local[*]")
    .config("spark.driver.memory", "4g")
    .config("spark.sql.shuffle.partitions", "8")
    .getOrCreate()
)

# Lower log verbosity so that only warnings and errors are reported
spark_context = spark.sparkContext
spark_context.setLogLevel("WARN")

print(f"Spark version: {spark.version}")
print(f"Spark UI: {spark_context.uiWebUrl}")

## 1.2 Chargement des données de qualité de l'air

In [None]:
# Read the air-quality CSV: the first row holds the column names and the
# column types are inferred from the data
air_reader = spark.read.options(header="true", inferSchema="true")
df_air_raw = air_reader.csv(AIR_QUALITY_PATH)

row_total = df_air_raw.count()
column_total = len(df_air_raw.columns)
print(f"Nombre de lignes: {row_total:,}")
print(f"Nombre de colonnes: {column_total}")

In [None]:
# Display the schema that Spark inferred while reading the CSV
print("Schema infere:")
df_air_raw.printSchema()

In [None]:
# Preview the first 10 rows without truncating long cell values
df_air_raw.show(10, truncate=False)

## 1.3 Identification des problèmes de typage

In [None]:
# The 'value' column is typed as string because it contains decimal commas
# and textual entries. List the entries that do not look like a plain number
# (optional minus sign, digits, optional '.' or ',' followed by digits).
# Null values are not reported here: the negated match evaluates to NULL for
# them, and only rows where the condition is true are kept.
looks_numeric = F.col("value").rlike("^-?[0-9]+[.,]?[0-9]*$")
df_non_numeric = df_air_raw.where(~looks_numeric)

non_numeric_count = df_non_numeric.count()
print(f"Nombre de valeurs non numeriques: {non_numeric_count:,}")
df_non_numeric.select("value").distinct().show()

In [None]:
# Rows whose value uses a comma as the decimal separator
has_comma = F.col("value").contains(",")
df_with_comma = df_air_raw.where(has_comma)

print(f"Nombre de valeurs avec virgule: {df_with_comma.count():,}")
df_with_comma.select("value").show(n=5)

In [None]:
# Show up to 20 distinct timestamp values to reveal the formats in use
print("Exemples de formats de timestamp:")
df_air_raw.select("timestamp").distinct().show(20, truncate=False)

## 1.4 Statistiques descriptives par polluant

In [None]:
# Add 'value_clean': the raw value with decimal commas turned into dots,
# cast to double.
# NOTE(review): this relies on the cast yielding null for unparseable text;
# if ANSI mode is enabled in the session the cast may raise instead - confirm.
value_with_dot = F.regexp_replace(F.col("value"), ",", ".")
df_air_numeric = df_air_raw.withColumn("value_clean", value_with_dot.cast("double"))

# Per-pollutant summary, restricted to rows whose value converted to a number
convertible_rows = df_air_numeric.where(F.col("value_clean").isNotNull())
stats_by_pollutant = (
    convertible_rows
    .groupBy("pollutant")
    .agg(
        F.count("*").alias("count"),
        F.round(F.mean("value_clean"), 2).alias("mean"),
        F.round(F.stddev("value_clean"), 2).alias("stddev"),
        F.round(F.min("value_clean"), 2).alias("min"),
        F.round(F.max("value_clean"), 2).alias("max"),
        F.round(F.expr("percentile(value_clean, 0.5)"), 2).alias("median"),
    )
    .orderBy("pollutant")
)

print("Statistiques par polluant:")
stats_by_pollutant.show()

In [None]:
# Flag outlier values, broken down by pollutant
clean_value = F.col("value_clean")

print("Valeurs negatives:")
negative_rows = df_air_numeric.where(clean_value < 0)
negative_rows.groupBy("pollutant").count().show()

print("\nValeurs > 1000 ug/m3:")
extreme_rows = df_air_numeric.where(clean_value > 1000)
extreme_rows.groupBy("pollutant").count().show()

## 1.5 Comptage des valeurs nulles par colonne

In [None]:
# For each column, count the cells that are null or an empty string.
# when() without otherwise() yields null when the condition is not met and
# count() skips nulls, so each aggregate counts only the matching rows.
missing_counters = []
for column_name in df_air_raw.columns:
    is_missing = F.col(column_name).isNull() | (F.col(column_name) == "")
    missing_counters.append(
        F.count(F.when(is_missing, column_name)).alias(column_name)
    )
null_counts = df_air_raw.select(missing_counters)

print("Nombre de valeurs nulles/vides par colonne:")
null_counts.show()

In [None]:
# Completeness rate: share of cells that are neither null nor empty, per column
total_rows = df_air_raw.count()
print(f"\nTaux de completude (sur {total_rows:,} lignes):")

# Count the null/empty cells of every column in a single aggregation job
# instead of launching one filter + count job per column.
missing_by_column = df_air_raw.agg(*[
    F.count(F.when(F.col(c).isNull() | (F.col(c) == ""), 1)).alias(c)
    for c in df_air_raw.columns
]).first()

for col_name in df_air_raw.columns:
    null_count = missing_by_column[col_name]
    if total_rows:
        completude = (1 - null_count / total_rows) * 100
        print(f"  {col_name}: {completude:.2f}%")
    else:
        # No rows at all: the rate is undefined, avoid dividing by zero
        print(f"  {col_name}: n/a")

## 1.6 Stations avec le plus d'enregistrements

In [None]:
# Load the station reference table (header row, inferred column types)
stations_reader = spark.read.options(header="true", inferSchema="true")
df_stations = stations_reader.csv(STATIONS_PATH)

df_stations.show(n=10)

In [None]:
# Number of records per station, busiest first
records_by_station = (
    df_air_raw.groupBy("station_id")
    .count()
    .orderBy(F.desc("count"))
)

# Enrich the counts with the station metadata. A join does not guarantee the
# row order of its inputs, so the sort is applied again AFTER the join;
# otherwise show(10) is not guaranteed to display the 10 busiest stations.
# 'station_id' breaks ties so that the output is deterministic.
records_with_info = (
    records_by_station
    .join(df_stations, on="station_id", how="left")
    .select("station_id", "station_name", "city", "station_type", "count")
    .orderBy(F.desc("count"), "station_id")
)

print("Top 10 stations avec le plus d'enregistrements:")
records_with_info.show(10)

In [None]:
# Attach a city to every record through its station, then count per city.
# With the left join, records whose station_id has no match in the station
# table are kept and grouped under a null city.
station_city = df_stations.select("station_id", "city")
records_by_city = (
    df_air_raw
    .join(station_city, on="station_id", how="left")
    .groupBy("city")
    .count()
    .orderBy(F.col("count").desc())
)

print("Repartition des enregistrements par ville:")
records_by_city.show()

## 1.7 Synthèse des problèmes de qualité identifiés

In [None]:
# Summary of the data-quality problems identified in this notebook
value_col = F.col("value")
clean_col = F.col("value_clean")


def _count_if(condition, name):
    """Return an aggregate counting the rows where `condition` is true."""
    return F.count(F.when(condition, 1)).alias(name)


# df_air_numeric is df_air_raw plus the 'value_clean' column, so all the
# per-row indicators can be computed in one pass over the data instead of
# one full scan per indicator.
quality = df_air_numeric.agg(
    F.count("*").alias("total"),
    _count_if(~value_col.rlike("^-?[0-9]+[.,]?[0-9]*$"), "non_numeric"),
    _count_if(value_col.contains(","), "with_comma"),
    _count_if(clean_col < 0, "negative"),
    _count_if(clean_col > 1000, "outliers"),
).first()

total = quality["total"]

# Values that do not look like a number
non_numeric = quality["non_numeric"]

# Values using a decimal comma
with_comma = quality["with_comma"]

# Negative values (after conversion)
negative = quality["negative"]

# Outliers above 1000
outliers = quality["outliers"]

# Duplicates: rows sharing the same (station, timestamp, pollutant) key
duplicates = total - df_air_raw.dropDuplicates(["station_id", "timestamp", "pollutant"]).count()


def _pct(part):
    """Return `part` as a percentage of the total (0.0 for an empty dataset)."""
    return part / total * 100 if total else 0.0


print(f"Total enregistrements: {total:,}")
print()
print("Problemes identifies:")
print(f"  - Valeurs non numeriques: {non_numeric:,} ({_pct(non_numeric):.2f}%)")
print(f"  - Valeurs avec virgule decimale: {with_comma:,} ({_pct(with_comma):.2f}%)")
print(f"  - Valeurs negatives: {negative:,} ({_pct(negative):.2f}%)")
print(f"  - Valeurs aberrantes (>1000): {outliers:,} ({_pct(outliers):.2f}%)")
print(f"  - Doublons: {duplicates:,} ({_pct(duplicates):.2f}%)")
# NOTE(review): the number of date formats is hard-coded, not computed here
print("  - Formats de dates multiples: 4 formats differents detectes")

In [None]:
# Stop the Spark session
spark.stop()