In [0]:
import pyspark.sql.functions as F
from pyspark.sql.functions import row_number, lit
from pyspark.sql.window import Window



# --- Configuration: three-part table names (catalog.schema.table) ---
catalog = "fingrid_test_workspace"
silver_schema = "fingrid_silver"
gold_schema = "fingrid_gold"

# Bare table names: forecast inputs and the two dimension tables.
table_name_solar = "solar_forecast"
table_name_wind = "wind_forecast"
date_table_name = "dim_date"
time_table_name = "dim_time"

# Fully qualified names; forecasts are read from the silver schema,
# dimensions from the gold schema.
table_solar = f"{catalog}.{silver_schema}.{table_name_solar}"
table_wind = f"{catalog}.{silver_schema}.{table_name_wind}"
table_date = f"{catalog}.{gold_schema}.{date_table_name}"
table_time = f"{catalog}.{gold_schema}.{time_table_name}"

def _clock_part(column_name):
    """Return the text after the "T" in ``column_name``, cut to its first 8 characters.

    Presumably this yields an HH:MM:SS string from an ISO-8601 timestamp string;
    confirm against the silver table schema.
    """
    return F.split(F.col(column_name), "T").getItem(1).substr(1, 8)


# Stack the solar and wind forecasts into one frame.
# NOTE(review): union() pairs columns by position, not by name - confirm both
# silver tables expose their columns in the same order.
solar_df = spark.read.format("delta").table(table_solar)
wind_df = spark.read.format("delta").table(table_wind)
df = solar_df.union(wind_df).drop("refresh_timestamp")

# Derive the date columns first, while start_time/end_time still hold the
# original values; only then overwrite them with their time-of-day part.
df = (
    df
    .withColumn("start_date", F.col("start_time").cast("date"))
    .withColumn("end_date", F.col("end_time").cast("date"))
    .withColumn("start_time", _clock_part("start_time"))
    .withColumn("end_time", _clock_part("end_time"))
)

def _attach_dim_id(fact_df, dim_df, fact_key, dim_key, dim_id_col, output_col):
    """Left-join ``fact_df`` to ``dim_df`` and replace a key column with the dimension id.

    Args:
        fact_df: DataFrame that contains the ``fact_key`` column.
        dim_df: Dimension DataFrame that contains ``dim_key`` and ``dim_id_col``.
        fact_key: Column of ``fact_df`` to match on; it is dropped from the result.
        dim_key: Column of ``dim_df`` that ``fact_key`` is compared with for equality.
        dim_id_col: Column of ``dim_df`` to carry over into the result.
        output_col: Name given to ``dim_id_col`` in the result.

    Returns:
        A DataFrame with every column of ``fact_df`` except ``fact_key``,
        followed by ``output_col``. The join is a left join, so rows without a
        matching dimension row are kept with a null ``output_col``.
    """
    match = F.col(f"f.{fact_key}") == F.col(f"d.{dim_key}")
    return (
        fact_df.alias("f")
        .join(dim_df.alias("d"), match, "left")
        .select("f.*", F.col(f"d.{dim_id_col}").alias(output_col))
        .drop(fact_key)
    )


# Dimension tables used for the lookups below.
df_date = spark.read.format("delta").table(table_date)
df_time = spark.read.format("delta").table(table_time)

# NOTE(review): if a dimension key value occurs more than once in dim_date or
# dim_time, the left join will multiply fact rows - confirm the keys are unique.

# get start date id
df = _attach_dim_id(df, df_date, "start_date", "date", "date_id", "start_date_id")

# get end date id
df = _attach_dim_id(df, df_date, "end_date", "date", "date_id", "end_date_id")

# get start time id
df = _attach_dim_id(df, df_time, "start_time", "time_15min", "time_quarter_id", "start_time_id")

# get end time id
df = _attach_dim_id(df, df_time, "end_time", "time_15min", "time_quarter_id", "end_time_id")

# Target location of the Delta files for the fact table.
external_path = "abfss://gold@fingridtest.dfs.core.windows.net/gold/fact_forecast/"

# Rename the dataset id column and stamp the rows with the load time.
df = (
    df
    .withColumnRenamed("dataset_id", "source_dataset_id")
    .withColumn("refresh_timestamp", F.current_timestamp())
)

# Replace whatever is stored at the target path with the new result.
(
    df.write
    .format("delta")
    .mode("overwrite")
    .option("mergeSchema", "true")
    .save(external_path)
)

# Show the result in the notebook.
df.display()

In [0]:
%sql
-- Register the Delta files at the gold location as a catalog table.
-- IF NOT EXISTS makes this a no-op when the table is already registered.
CREATE TABLE IF NOT EXISTS fingrid_test_workspace.fingrid_gold.fact_forecast
  USING DELTA
  LOCATION 'abfss://gold@fingridtest.dfs.core.windows.net/gold/fact_forecast/'


In [0]:
%sql
-- Inspect the registered fact table.
SELECT *
FROM fingrid_test_workspace.fingrid_gold.fact_forecast