In [0]:
# Load the Delta table stored under /mnt/silver/data/; every dimension and
# the fact table below are derived from this DataFrame.
df_silver = (
    spark.read
    .format("delta")
    .load("/mnt/silver/data/")
)

In [0]:
from pyspark.sql.functions import col, monotonically_increasing_id

# Define dimension tables
from pyspark.sql.functions import xxhash64


def _with_surrogate_key(df, key_cols, id_col):
    """Deduplicate ``df`` on ``key_cols`` and append a deterministic key column.

    The key is a 64-bit hash (bigint, the same type monotonically_increasing_id
    produces) of the natural-key columns. Unlike monotonically_increasing_id,
    the value depends only on the row's natural key, so the same dimension
    member gets the same id on every run and on every re-evaluation of this
    lazy DataFrame. That keeps the ids joined into the fact table equal to the
    ids stored in the gold dimension tables, which are merged insert-only.

    NOTE(review): gold tables that were already populated with ids from
    monotonically_increasing_id need a one-time rebuild to adopt these keys.

    Args:
        df: DataFrame containing at least ``key_cols``.
        key_cols: names of the columns that form the natural key.
        id_col: name of the surrogate-key column to append.

    Returns:
        ``df`` with one row per distinct natural key plus the ``id_col`` column.
    """
    hash_inputs = []
    for name in key_cols:
        # Spark's hash functions skip NULL inputs, so (1, NULL) and (NULL, 1)
        # would otherwise hash identically. Feeding an explicit is-null flag
        # ahead of each value keeps the column position significant.
        hash_inputs.append(col(name).isNull())
        hash_inputs.append(col(name))
    return df.dropDuplicates(key_cols).withColumn(id_col, xxhash64(*hash_inputs))


# One row per city.
dim_location = _with_surrogate_key(
    df_silver.select(
        "longitude",
        "latitude",
        "sea_level",
        "grnd_level",
        "country",
        "timezone",
        "city_id",
        "city_name",
    ),
    ["city_id"],
    "location_id",
)

# One row per distinct (condition, description, icon) combination.
dim_conditions = _with_surrogate_key(
    df_silver.select(
        "weather_condition",
        "weather_description",
        "weather_icon",
    ),
    ["weather_condition", "weather_description", "weather_icon"],
    "conditions_id",
)

# One row per distinct timestamp.
dim_time = _with_surrogate_key(
    df_silver.select(
        "timestamp",
        "sunrise_time",
        "sunset_time",
    ),
    ["timestamp"],
    "time_id",
)

# One row per distinct (base, sys_type, sys_id, cod) combination.
dim_collection = _with_surrogate_key(
    df_silver.select(
        "base",
        "sys_type",
        "sys_id",
        "cod",
    ),
    ["base", "sys_type", "sys_id", "cod"],
    "collection_id",
)

In [0]:
from pyspark.sql.functions import current_date


# Build the fact table: resolve every silver row to its dimension keys and keep
# the measurements.
#
# The joins use null-safe equality (eqNullSafe). The dimensions are built with
# dropDuplicates, which keeps a row for NULL natural-key values, but a plain
# `==` never matches NULL to NULL, so a silver row with a NULL in any key
# column would end up with a NULL foreign key instead of its dimension row.
fact_weather = (
    df_silver.alias("f")
    .join(
        dim_collection.alias("s"),
        col("f.base").eqNullSafe(col("s.base"))
        & col("f.sys_type").eqNullSafe(col("s.sys_type"))
        & col("f.sys_id").eqNullSafe(col("s.sys_id"))
        & col("f.cod").eqNullSafe(col("s.cod")),
        "left",
    )
    .join(
        dim_conditions.alias("c"),
        col("f.weather_condition").eqNullSafe(col("c.weather_condition"))
        & col("f.weather_description").eqNullSafe(col("c.weather_description"))
        & col("f.weather_icon").eqNullSafe(col("c.weather_icon")),
        "left",
    )
    .join(
        dim_time.alias("t"),
        col("f.timestamp").eqNullSafe(col("t.timestamp")),
        "left",
    )
    .join(
        dim_location.alias("l"),
        col("f.city_id").eqNullSafe(col("l.city_id")),
        "left",
    )
    .select(
        # Foreign keys into the four dimensions.
        col("l.location_id"),
        col("c.conditions_id"),
        col("s.collection_id"),
        col("t.time_id"),
        # Measurements taken straight from the silver row.
        col("f.temperature"),
        col("f.feels_like"),
        col("f.temp_min"),
        col("f.temp_max"),
        col("f.pressure"),
        col("f.humidity"),
        col("f.wind_speed"),
        col("f.wind_degree"),
        col("f.wind_gust"),
        col("f.cloudiness"),
        col("f.visibility"),
    )
    # NOTE(review): monotonically_increasing_id is only unique within this one
    # DataFrame; because the fact table is written in append mode, weather_id
    # values can repeat across runs. Confirm whether a globally unique key is
    # required.
    .withColumn("weather_id", monotonically_increasing_id())
    # Date on which this load ran (not the observation date).
    .withColumn("date", current_date())
)

In [0]:
from delta.tables import DeltaTable

# Create the `gold` schema if it does not exist yet, so the gold.* tables
# written below have a schema to be registered in.
spark.sql("CREATE SCHEMA IF NOT EXISTS gold")

def create_or_append_delta_table(df, table_name, path, merge_condition):
    """Create a Delta table from ``df``, or insert the rows it does not have yet.

    If ``table_name`` is not in the catalog, ``df`` is saved as a new Delta
    table at ``path``. Otherwise ``df`` is merged into the existing table:
    source rows that satisfy ``merge_condition`` against a target row are left
    alone and every other source row is inserted. Existing rows are never
    updated or deleted.

    Args:
        df: DataFrame holding the rows to write.
        table_name: catalog name of the table, e.g. ``"gold.dim_time"``.
        path: storage location used when the table is first created.
        merge_condition: SQL predicate relating the source alias ``s`` to the
            target alias ``t``.
    """
    if spark.catalog.tableExists(table_name):
        target = DeltaTable.forName(spark, table_name).alias("t")
        insert_missing = target.merge(df.alias("s"), merge_condition).whenNotMatchedInsertAll()
        insert_missing.execute()
        return

    # First load: the table does not exist yet, so create it at `path`.
    df.write.format("delta").option("path", path).saveAsTable(table_name)

# Write the dimensions. The merge conditions use the null-safe operator `<=>`:
# with a plain `=`, a dimension row that has NULL in any key column never
# matches its already-stored copy and would be inserted again on every run.

# dim_location
create_or_append_delta_table(
    dim_location,
    "gold.dim_location",
    "/mnt/gold/weather/dim_location",
    "s.city_id <=> t.city_id"
)

# dim_conditions
create_or_append_delta_table(
    dim_conditions,
    "gold.dim_conditions",
    "/mnt/gold/weather/dim_conditions",
    "s.weather_condition <=> t.weather_condition AND s.weather_description <=> t.weather_description AND s.weather_icon <=> t.weather_icon"
)

# dim_time
create_or_append_delta_table(
    dim_time,
    "gold.dim_time",
    "/mnt/gold/weather/dim_time",
    "t.timestamp <=> s.timestamp"
)

# dim_collection
create_or_append_delta_table(
    dim_collection,
    "gold.dim_collection",
    "/mnt/gold/weather/dim_collection",
    "s.base <=> t.base AND s.sys_type <=> t.sys_type AND s.sys_id <=> t.sys_id AND s.cod <=> t.cod"
)

# fact_weather (append only): each run adds its rows to gold.fact_weather,
# stored at /mnt/gold/weather/fact_weather; nothing is merged or deduplicated.
fact_weather.write.saveAsTable(
    "gold.fact_weather",
    format="delta",
    mode="append",
    path="/mnt/gold/weather/fact_weather",
)