In [0]:
import pyspark.sql.functions as F
from pyspark.sql.types import IntegerType

# 1. Standardize and Cast
# Load the raw bronze geolocation table.
df = spark.read.table("golden_360.bronze.geolocation")

# Standardization happens before aggregation so that whichever label the
# aggregation picks is already trimmed and upper-cased.
# The zip-code prefix is cast without an alias; the aggregation step refers
# to it by its original column name.
cleaned_df = df.select(
    F.col("geolocation_zip_code_prefix").cast(IntegerType()),
    *[
        F.upper(F.trim(F.col(label))).alias(label)
        for label in ("geolocation_city", "geolocation_state")
    ],
    "geolocation_lat",
    "geolocation_lng",
)

# 2. Group By and Aggregate
# Collapse to one row per zip-code prefix: coordinates are averaged and the
# city/state labels take the first non-null value encountered in the group.
# NOTE: F.first depends on row order, which is not guaranteed after a shuffle,
# so the label chosen for a prefix with several spellings may vary between runs.
deduplicated_df = cleaned_df.groupBy("geolocation_zip_code_prefix").agg(
    F.avg("geolocation_lat").alias("geolocation_lat"),
    F.avg("geolocation_lng").alias("geolocation_lng"),
    # ignorenulls=True: without it, a group whose first-seen row has a NULL
    # label would yield NULL here and the whole prefix would be discarded by
    # the NULL-dropping validation step, even if other rows carry a valid label.
    F.first("geolocation_city", ignorenulls=True).alias("geolocation_city"),
    F.first("geolocation_state", ignorenulls=True).alias("geolocation_state"),
)

# 3. Validate: keep only rows in which every column is populated
final_df = deduplicated_df.na.drop(how="any")

# Persist the validated result as a Delta table, replacing any existing
# contents of the silver geolocation table.
final_df.write.saveAsTable(
    "golden_360.silver.geolocation",
    format="delta",
    mode="overwrite",
)
