In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
# HDFS locations: SILVER_NOAA is the Parquet dataset read below; GOLD_GEO is
# the prefix under which this script writes its outputs.
# NOTE: the host "namenode" must be resolvable from the Spark driver. A
# recorded run of this cell failed with
# `java.net.UnknownHostException: namenode`, i.e. the hostname could not be
# resolved from where the notebook was executed.
SILVER_NOAA = "hdfs://namenode:8020/silver/noaa/"
GOLD_GEO = "hdfs://namenode:8020/gold/geography/"

# Reuse an already-running session if there is one, otherwise start a new one.
spark = (
    SparkSession.builder
    .appName("GeoAnalysis")
    .getOrCreate()
)

# Load the input observations as a DataFrame.
df = spark.read.parquet(SILVER_NOAA)

# Bucket each row into a 5-degree latitude band labelled by its lower edge
# (e.g. 12.3 -> 10, -2.5 -> -5).
band_lower_edge = F.floor(F.col("latitude") / 5) * 5
df = df.withColumn("lat_band", band_lower_edge.cast("int"))

# Calendar month of the observation; used both as a grouping key and as a
# join key below.
obs_month = F.month("date")

# Mean and sample standard deviation of temp_c for every (lat_band, month)
# combination found in the data.
stats_lat = (
    df.groupBy("lat_band", obs_month.alias("month"))
    .agg(
        F.mean("temp_c").alias("mean_temp_lat"),
        F.stddev("temp_c").alias("std_temp_lat"),
    )
)

# Attach the band/month statistics to every observation. The left join keeps
# rows whose keys are null; those rows get null statistics.
df2 = (
    df.withColumn("month", obs_month)
    .join(stats_lat, on=["lat_band", "month"], how="left")
)

# Flag readings that are implausibly warm for their latitude band and month:
# temp_c more than 3 standard deviations above the band/month mean.
# When the comparison is null (missing temp_c or missing statistics) the flag
# falls through to False.
# NOTE(review): only the upper tail is tested, so unusually cold readings are
# never flagged - confirm this is intended.
upper_3_sigma = F.col("mean_temp_lat") + 3 * F.col("std_temp_lat")
df_geo_anom = df2.withColumn(
    "is_lat_incoherent",
    F.when(F.col("temp_c") > upper_3_sigma, True).otherwise(False),
)

# A band/month is 'stable' when its historical standard deviation is below
# 1.5; a null standard deviation counts as not stable.
stable_flag = F.when(F.col("std_temp_lat") < 1.5, True).otherwise(False)

# Per band/month stability table (kept for any later use of `stability`).
stability = stats_lat.withColumn("is_stable", stable_flag)

# df2 already carries std_temp_lat from stats_lat, so is_stable is derived in
# place instead of joining `stability` back onto df2 (which cost an extra
# shuffle and self-joined two frames with shared lineage). Rows whose join
# keys are null never matched stats_lat; they keep a null is_stable, exactly
# as the left join produced.
has_band_month = F.col("lat_band").isNotNull() & F.col("month").isNotNull()
df_geo = df2.withColumn("is_stable", F.when(has_band_month, stable_flag))

# Extreme reading in a historically stable zone: the band/month is stable and
# temp_c is more than 2 standard deviations above the band/month mean.
upper_2_sigma = F.col("mean_temp_lat") + 2 * F.col("std_temp_lat")
df_geo = df_geo.withColumn(
    "extreme_in_stable",
    F.when(
        (F.col("is_stable") == True) & (F.col("temp_c") > upper_2_sigma),  # noqa: E712 - Column comparison, not a Python bool
        True,
    ).otherwise(False),
)

# Save results.
# GOLD_GEO may end with "/", so strip it before appending sub-paths to avoid
# a doubled slash in the output location.
gold_root = GOLD_GEO.rstrip("/")

# DataFrameWriter.partitionBy() accepts column *names*, not Column
# expressions, so the year has to exist as a real column before writing.
(
    df_geo_anom.withColumn("year", F.year("date"))
    .write.mode("overwrite")
    .partitionBy("year")
    .parquet(gold_root + "/lat_incoherent")
)
df_geo.write.mode("overwrite").parquet(gold_root + "/extreme_in_stable")
print("Geographic analyses saved")

IllegalArgumentException: java.net.UnknownHostException: namenode