In [2]:
# Version minimaliste
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

import os

# getOrCreate() returns the active session when one already exists (e.g. when
# the cell is re-run) and only builds a new one otherwise.
spark = SparkSession.builder.appName("GeoAnalysis_Simple").getOrCreate()

# Single root for every dataset path. Spark addresses it through the file://
# scheme while os.makedirs needs the bare filesystem path, so both forms are
# derived from LOCAL_BASE instead of being hard-coded independently.
LOCAL_BASE = "/tmp"
BASE_PATH = f"file://{LOCAL_BASE}"
SILVER_NOAA = f"{BASE_PATH}/silver/noaa/"
GOLD_GEO = f"{BASE_PATH}/gold/geography/"

# Make sure the local output directory exists before anything is written.
os.makedirs(f"{LOCAL_BASE}/gold/geography", exist_ok=True)

# Width of a latitude band, in degrees, and the number of standard deviations
# above the band/month mean beyond which a temperature is flagged.
LAT_BAND_DEGREES = 5
SIGMA_THRESHOLD = 3

# Load the input dataset.
df = spark.read.parquet(SILVER_NOAA)

# Latitude band: each row is labelled with the lower bound of the
# LAT_BAND_DEGREES-wide band its latitude falls into (floor, so negative
# latitudes round towards the south, e.g. -2.5 -> -5).
df = df.withColumn(
    "lat_band",
    (F.floor(F.col("latitude") / LAT_BAND_DEGREES) * LAT_BAND_DEGREES).cast("int"),
)

# Derive the month once so that the aggregation and the join below are
# guaranteed to use the same key expression.
df_with_month = df.withColumn("month", F.month("date"))

# Mean and standard deviation of the temperature per latitude band and month.
stats_lat = df_with_month.groupBy("lat_band", "month").agg(
    F.mean("temp_c").alias("mean_temp_lat"),
    F.stddev("temp_c").alias("std_temp_lat"),
)

# Attach the band/month statistics to every row. A left join keeps rows
# without matching statistics (e.g. NULL keys); their stats columns are NULL.
df2 = df_with_month.join(stats_lat, on=["lat_band", "month"], how="left")

# Anomaly flag. When temp_c or either statistic is NULL the comparison is
# NULL, which when() does not treat as a match, so otherwise() yields False.
# NOTE(review): only the warm side is tested (temp_c > mean + k * std);
# temperatures far *below* the band mean are never flagged. Confirm whether
# a symmetric test on abs(temp_c - mean) was intended.
upper_bound = F.col("mean_temp_lat") + SIGMA_THRESHOLD * F.col("std_temp_lat")
df_geo_anom = df2.withColumn(
    "is_lat_incoherent",
    F.when(F.col("temp_c") > upper_bound, True).otherwise(False),
)

# Year column used as the partition key of the output.
df_geo_anom = df_geo_anom.withColumn("year", F.year("date"))

# Save. GOLD_GEO already ends with "/", so strip it before appending the
# sub-directory to avoid a double slash in the output path.
output_path = GOLD_GEO.rstrip("/") + "/lat_incoherent"
df_geo_anom.write.mode("overwrite").partitionBy("year").parquet(output_path)

print("✅ Analyse géographique sauvegardée!")

✅ Analyse géographique sauvegardée!
