In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window



# Catalog and schema names used to build the three-part (catalog.schema.table)
# identifiers of the tables this notebook reads.
catalog = "fingrid_test_workspace"
silver_schema = "fingrid_silver"
gold_schema = "fingrid_gold"

# Source table in the silver schema.
table_name_silver = "electricity_consumption"
silver_table = f"{catalog}.{silver_schema}.{table_name_silver}"

# Date dimension in the gold schema.
date_table_name = "dim_date"
table_date = f"{catalog}.{gold_schema}.{date_table_name}"

# Time dimension in the gold schema.
time_table_name = "dim_time"
table_time = f"{catalog}.{gold_schema}.{time_table_name}"

# Read the silver consumption table and drop the columns that are not carried
# forward by this notebook.
df = (
    spark.read.format("delta")
    .table(silver_table)
    .drop("refresh_timestamp", "dataset_id", "uom", "read_ts")
)

# Order matters: the *_date columns are derived while start_time / end_time
# still hold their original values; only afterwards are start_time / end_time
# overwritten with the first 8 characters of the part following "T".
# NOTE(review): splitting on "T" presumes start_time / end_time arrive as
# ISO-8601 strings (e.g. "...T12:15:00...") - confirm against the silver schema.
df = (
    df.withColumn("start_date", F.col("start_time").cast("date"))
    .withColumn("end_date", F.col("end_time").cast("date"))
    .withColumn("start_time", F.split(F.col("start_time"), "T").getItem(1).substr(1, 8))
    .withColumn("end_time", F.split(F.col("end_time"), "T").getItem(1).substr(1, 8))
)

def _attach_dim_key(fact, dim, fact_col, dim_col, key_col, key_alias):
    """Replace a natural-key column of `fact` with the matching dimension key.

    Left-joins `fact` (aliased "f") to `dim` (aliased "d") on
    ``f.<fact_col> == d.<dim_col>``, keeps every fact column, adds
    ``d.<key_col>`` under the name `key_alias`, and finally drops `fact_col`.
    Because the join is a left join, fact rows without a dimension match are
    kept and receive a null `key_alias`.

    Args:
        fact: DataFrame holding the natural-key column.
        dim: dimension DataFrame to look the key up in.
        fact_col: name of the natural-key column in `fact`.
        dim_col: name of the column in `dim` that `fact_col` is compared with.
        key_col: name of the key column in `dim` to bring over.
        key_alias: name the key column gets in the result.

    Returns:
        A new DataFrame with `key_alias` added and `fact_col` removed.
    """
    matched = fact.alias("f").join(
        dim.alias("d"),
        F.col(f"f.{fact_col}") == F.col(f"d.{dim_col}"),
        "left",
    )
    return matched.select("f.*", F.col(f"d.{key_col}").alias(key_alias)).drop(fact_col)


# Date dimension: resolve the start and end dates to date_id values.
df_date = spark.read.format("delta").table(table_date)
df = _attach_dim_key(df, df_date, "start_date", "date", "date_id", "start_date_id")
df = _attach_dim_key(df, df_date, "end_date", "date", "date_id", "end_date_id")

# Time dimension: resolve the start and end times to time_quarter_id values.
df_time = spark.read.format("delta").table(table_time)
df = _attach_dim_key(df, df_time, "start_time", "time_15min", "time_quarter_id", "start_time_id")
df = _attach_dim_key(df, df_time, "end_time", "time_15min", "time_quarter_id", "end_time_id")



customer_table_name = "dim_customer"
table_customer = f"{catalog}.{gold_schema}.{customer_table_name}"
df_customer = spark.read.format("delta").table(table_customer)

# Look up customerID: a fact row matches a customer row only when all three
# attributes (customer_type, time_series_type, res) are equal. The left join
# keeps unmatched fact rows, which end up with a null customerID.
# NOTE(review): plain `==` is used, so rows where any of the three attributes
# is null will presumably never match - confirm whether that is intended.
fact = df.alias("f")
customers = df_customer.alias("d")
same_customer = (
    (F.col("f.customer_type") == F.col("d.customer_type"))
    & (F.col("f.time_series_type") == F.col("d.time_series_type"))
    & (F.col("f.res") == F.col("d.res"))
)
df = (
    fact.join(customers, same_customer, "left")
    .select("f.*", F.col("d.customerID").alias("customerID"))
    # The three natural-key attributes are replaced by customerID.
    .drop("customer_type", "time_series_type", "res")
)

# Cast the two measure columns to their target types and stamp every row with
# the time of this load.
df = (
    df.withColumn("additional_value", F.col("additional_value").cast("double"))
    .withColumn("count", F.col("count").cast("bigint"))
    .withColumn("refresh_timestamp", F.current_timestamp())
)

# Overwrite the Delta files at the external gold location with the new result.
external_path = "abfss://gold@fingridtest.dfs.core.windows.net/gold/fact_consumption/"
df.write.format("delta").mode("overwrite").option("mergeSchema", "true").save(external_path)

# Show what was actually persisted. Calling df.display() here would start a
# second Spark job that re-runs every join above and re-evaluates
# current_timestamp(), so the refresh_timestamp on screen would not be the one
# that was written. Reading the Delta files back avoids both problems.
spark.read.format("delta").load(external_path).display()


In [0]:
%sql
-- Register the Delta files at the gold fact_consumption location (the path the
-- Python cell above writes to) as a table. IF NOT EXISTS leaves an already
-- registered table untouched.
CREATE TABLE IF NOT EXISTS
  fingrid_test_workspace.fingrid_gold.fact_consumption
  USING DELTA
  LOCATION 'abfss://gold@fingridtest.dfs.core.windows.net/gold/fact_consumption/'


In [0]:
%sql
-- Inspect the registered fact table.
SELECT *
FROM fingrid_test_workspace.fingrid_gold.fact_consumption