# fingrid_test_workspace.fingrid_silver.solar_forecast

In [0]:
import pyspark.sql.functions as F
from delta.tables import *
import datetime
from datetime import datetime
import pytz

# Incremental bronze -> silver load for the solar forecast dataset.
#
# Reads the bronze rows ingested after the watermark stored in the control
# table, flattens their JSON payload, merges the records into the silver
# table and finally advances the watermark.

catalog = "fingrid_test_workspace"

# raw data
bronze_schema = "fingrid_bronze"
raw_table_name = "solar_forecast_raw"
raw_table = ".".join([catalog, bronze_schema, raw_table_name])


# silver data
silver_schema = "fingrid_silver"
silver_table_name = "solar_forecast"
silver_table = ".".join([catalog, silver_schema, silver_table_name])

# control table: this job reads max(bronze_ingestion_timestamp) from it as the
# watermark and writes the newest processed ingestion_timestamp back to it
control_table = "fingrid_test_workspace.fingrid_load_control.load_control"

# source_dataset_id value that selects this dataset's row in the control table
source_dataset_id = "248"

# columns that identify one record in the silver table (the merge key)
key_columns = ["dataset_id", "start_time", "end_time"]

# only used to stamp the progress messages
local_tz = pytz.timezone("Europe/Helsinki")

control_table_exists = spark.catalog.tableExists(control_table)

# Watermark: stays None when the control table is missing or holds no
# timestamp for this dataset; in both cases the whole bronze table is loaded.
current_bronze_ingestion_timestamp = None
if control_table_exists:
    current_bronze_ingestion_timestamp = (
        spark.read.table(control_table)
        .where(F.col("source_dataset_id") == source_dataset_id)
        .select(F.max("bronze_ingestion_timestamp"))
        .collect()[0][0]
    )

    # current_bronze_ingestion_timestamp = "2025-01-01T00:00:00.000+00:00"

    print("current_bronze_ingestion_timestamp:",current_bronze_ingestion_timestamp)

df = spark.read.format("delta").table(raw_table)
if current_bronze_ingestion_timestamp is not None:
    # Comparing against None would evaluate to NULL for every row and silently
    # filter everything out, so the filter is only applied to a real watermark.
    df = df.where(F.col("ingestion_timestamp") > current_bronze_ingestion_timestamp)

# One aggregation returns both the row count and the new watermark candidate
# instead of running count(), isEmpty() and max() as three separate jobs.
row_count, max_bronze_ingestion_timestamp = df.agg(
    F.count(F.lit(1)),
    F.max(F.col("ingestion_timestamp")),
).first()

print("Total rows(pages): ", row_count)
print("read data done", datetime.now(tz=local_tz))

if row_count == 0:
    print ("No data")
else:
    print("max_bronze_ingestion_timestamp:",max_bronze_ingestion_timestamp)

    print ("start to transform data: ", datetime.now(tz=local_tz))

    # Each bronze row stores a JSON array of records in `data`; explode it to
    # one row per record and keep the ingestion time for de-duplication.
    df_flat = df.select(
        F.col("ingestion_timestamp"),
        F.explode(
            F.from_json(
                F.col("data"),
                "ARRAY<STRUCT<datasetId: STRING, startTime: TIMESTAMP, endTime: TIMESTAMP, value: DOUBLE>>"
            )
        ).alias("record")
    )
    df_records = df_flat.select(
        F.col("ingestion_timestamp"),
        F.col("record.datasetId").alias("dataset_id"),
        F.col("record.startTime").alias("start_time"),
        F.col("record.endTime").alias("end_time"),
        F.col("record.value").alias("value")
    )

    # Delta MERGE fails when several source rows match the same target row, so
    # keep exactly one row per key: the one from the newest ingestion.
    # max() over a struct orders by its first field (ingestion_timestamp).
    df_result = (
        df_records.groupBy(*key_columns)
        .agg(F.max(F.struct("ingestion_timestamp", "value")).alias("latest"))
        .select(*key_columns, F.col("latest.value").alias("value"))
        .withColumn("refresh_timestamp", F.current_timestamp())
    )

    print ("start to insert or update data: ", datetime.now(tz=local_tz))

    # update data to table
    merge_condition = " and ".join(
        f"df_existing.{column} = updates.{column}" for column in key_columns
    )
    df_existing_silver_table = DeltaTable.forName( sparkSession=spark, tableOrViewName=silver_table)
    df_existing_silver_table.alias('df_existing') \
        .merge(df_result.alias('updates'), merge_condition) \
        .whenMatchedUpdate(set={
            "value": "updates.value",
            "refresh_timestamp": "updates.refresh_timestamp"
        }
        ) \
        .whenNotMatchedInsertAll()\
        .execute()

    if control_table_exists:
        # NOTE(review): update() only touches an existing row. If the control
        # table has no row for this source_dataset_id the watermark is not
        # stored and the next run reloads the full bronze table - confirm the
        # row is seeded elsewhere.
        df_control_table = DeltaTable.forName( sparkSession=spark, tableOrViewName=control_table)
        df_control_table.alias('control_table').update(
            condition=F.col('source_dataset_id') == source_dataset_id,
            set={'bronze_ingestion_timestamp': F.to_timestamp(F.lit(max_bronze_ingestion_timestamp))}
        )

        print("update bronze_ingestion_timestamp:",max_bronze_ingestion_timestamp)
    else:
        # The original code opened the control table unconditionally here and
        # crashed on this path; without the table there is nothing to update.
        print("control table not found, watermark not stored:", control_table)

    print ("insert and update done: ", datetime.now(tz=local_tz))


#  fingrid_test_workspace.fingrid_silver.wind_forecast

In [0]:
import pyspark.sql.functions as F
from delta.tables import *
import datetime
from datetime import datetime
import pytz

# Incremental bronze -> silver load for the wind forecast dataset.
#
# Reads the bronze rows ingested after the watermark stored in the control
# table, flattens their JSON payload, merges the records into the silver
# table and finally advances the watermark.

catalog = "fingrid_test_workspace"

# raw data
bronze_schema = "fingrid_bronze"
raw_table_name = "wind_forecast_raw"
raw_table = ".".join([catalog, bronze_schema, raw_table_name])


# silver data
silver_schema = "fingrid_silver"
silver_table_name = "wind_forecast"
silver_table = ".".join([catalog, silver_schema, silver_table_name])

# control table: this job reads max(bronze_ingestion_timestamp) from it as the
# watermark and writes the newest processed ingestion_timestamp back to it
control_table = "fingrid_test_workspace.fingrid_load_control.load_control"

# source_dataset_id value that selects this dataset's row in the control table
source_dataset_id = "245"

# columns that identify one record in the silver table (the merge key)
key_columns = ["dataset_id", "start_time", "end_time"]

# only used to stamp the progress messages
local_tz = pytz.timezone("Europe/Helsinki")

control_table_exists = spark.catalog.tableExists(control_table)

# Watermark: stays None when the control table is missing or holds no
# timestamp for this dataset; in both cases the whole bronze table is loaded.
current_bronze_ingestion_timestamp = None
if control_table_exists:
    current_bronze_ingestion_timestamp = (
        spark.read.table(control_table)
        .where(F.col("source_dataset_id") == source_dataset_id)
        .select(F.max("bronze_ingestion_timestamp"))
        .collect()[0][0]
    )

    # current_bronze_ingestion_timestamp = "2025-01-01T00:00:00.000+00:00"

    print("current_bronze_ingestion_timestamp:",current_bronze_ingestion_timestamp)

df = spark.read.format("delta").table(raw_table)
if current_bronze_ingestion_timestamp is not None:
    # Comparing against None would evaluate to NULL for every row and silently
    # filter everything out, so the filter is only applied to a real watermark.
    df = df.where(F.col("ingestion_timestamp") > current_bronze_ingestion_timestamp)

# One aggregation returns both the row count and the new watermark candidate
# instead of running count(), isEmpty() and max() as three separate jobs.
row_count, max_bronze_ingestion_timestamp = df.agg(
    F.count(F.lit(1)),
    F.max(F.col("ingestion_timestamp")),
).first()

print("Total rows(pages): ", row_count)
print("read data done", datetime.now(tz=local_tz))

if row_count == 0:
    print ("No data")
else:
    print("max_bronze_ingestion_timestamp:",max_bronze_ingestion_timestamp)

    print ("start to transform data: ", datetime.now(tz=local_tz))

    # Each bronze row stores a JSON array of records in `data`; explode it to
    # one row per record and keep the ingestion time for de-duplication.
    df_flat = df.select(
        F.col("ingestion_timestamp"),
        F.explode(
            F.from_json(
                F.col("data"),
                "ARRAY<STRUCT<datasetId: STRING, startTime: TIMESTAMP, endTime: TIMESTAMP, value: DOUBLE>>"
            )
        ).alias("record")
    )
    df_records = df_flat.select(
        F.col("ingestion_timestamp"),
        F.col("record.datasetId").alias("dataset_id"),
        F.col("record.startTime").alias("start_time"),
        F.col("record.endTime").alias("end_time"),
        F.col("record.value").alias("value")
    )

    # Delta MERGE fails when several source rows match the same target row, so
    # keep exactly one row per key: the one from the newest ingestion.
    # max() over a struct orders by its first field (ingestion_timestamp).
    df_result = (
        df_records.groupBy(*key_columns)
        .agg(F.max(F.struct("ingestion_timestamp", "value")).alias("latest"))
        .select(*key_columns, F.col("latest.value").alias("value"))
        .withColumn("refresh_timestamp", F.current_timestamp())
    )

    print ("start to insert or update data: ", datetime.now(tz=local_tz))

    # update data to table
    merge_condition = " and ".join(
        f"df_existing.{column} = updates.{column}" for column in key_columns
    )
    df_existing_silver_table = DeltaTable.forName( sparkSession=spark, tableOrViewName=silver_table)
    df_existing_silver_table.alias('df_existing') \
        .merge(df_result.alias('updates'), merge_condition) \
        .whenMatchedUpdate(set={
            "value": "updates.value",
            "refresh_timestamp": "updates.refresh_timestamp"
        }
        ) \
        .whenNotMatchedInsertAll()\
        .execute()

    if control_table_exists:
        # NOTE(review): update() only touches an existing row. If the control
        # table has no row for this source_dataset_id the watermark is not
        # stored and the next run reloads the full bronze table - confirm the
        # row is seeded elsewhere.
        df_control_table = DeltaTable.forName( sparkSession=spark, tableOrViewName=control_table)
        df_control_table.alias('control_table').update(
            condition=F.col('source_dataset_id') == source_dataset_id,
            set={'bronze_ingestion_timestamp': F.to_timestamp(F.lit(max_bronze_ingestion_timestamp))}
        )

        print("update bronze_ingestion_timestamp:",max_bronze_ingestion_timestamp)
    else:
        # The original code opened the control table unconditionally here and
        # crashed on this path; without the table there is nothing to update.
        print("control table not found, watermark not stored:", control_table)

    print ("insert and update done: ", datetime.now(tz=local_tz))


# fingrid_test_workspace.fingrid_silver.electricity_consumption


In [0]:
# Flatten the JSON payload of `df` into one row per record.
# NOTE(review): this cell uses `df` and `F` without defining them, so it
# depends on an earlier cell having been run - confirm which one is intended.

# DDL schema of the `data` column: an array of records, each carrying a
# nested `additionalJson` struct next to the four top-level fields.
schema = """
ARRAY<STRUCT<
    datasetId: STRING,
    startTime: TIMESTAMP,
    endTime: TIMESTAMP,
    value: DOUBLE,
    additionalJson: STRUCT<
        Count: STRING,
        CustomerType: STRING,
        ReadTS: TIMESTAMP,
        Res: STRING,
        TimeSeriesType: STRING,
        Uom: STRING,
        Value: STRING
    >
>>
"""

# (field path inside a record, output column name), in output order
_record_fields = [
    ("datasetId", "dataset_id"),
    ("startTime", "start_time"),
    ("endTime", "end_time"),
    ("value", "value"),
    ("additionalJson.Count", "count"),
    ("additionalJson.CustomerType", "customer_type"),
    ("additionalJson.ReadTS", "read_ts"),
    ("additionalJson.Res", "res"),
    ("additionalJson.TimeSeriesType", "time_series_type"),
    ("additionalJson.Uom", "uom"),
    ("additionalJson.Value", "additional_value"),
]

# Parse the JSON string, then turn every array element into its own row
parsed_records = F.from_json(F.col("data"), schema)
df_flat = df.select(F.explode(parsed_records).alias("record"))

# Project the top-level and nested fields to flat, snake_case columns
df_result = df_flat.select(
    *[F.col(f"record.{path}").alias(name) for path, name in _record_fields]
)


In [0]:
import pyspark.sql.functions as F
from delta.tables import *
import datetime
from datetime import datetime
import pytz

# Incremental bronze -> silver load for the electricity consumption dataset.
#
# Reads the bronze rows ingested after the watermark stored in the control
# table, flattens their JSON payload (including the nested additionalJson
# struct), merges the records into the silver table and finally advances the
# watermark.

catalog = "fingrid_test_workspace"

# raw data
bronze_schema = "fingrid_bronze"
raw_table_name = "electricity_consumption_raw"
raw_table = ".".join([catalog, bronze_schema, raw_table_name])


# silver data
silver_schema = "fingrid_silver"
silver_table_name = "electricity_consumption"
silver_table = ".".join([catalog, silver_schema, silver_table_name])

# control table: this job reads max(bronze_ingestion_timestamp) from it as the
# watermark and writes the newest processed ingestion_timestamp back to it
control_table = "fingrid_test_workspace.fingrid_load_control.load_control"

# source_dataset_id value that selects this dataset's row in the control table
source_dataset_id = "358"

# columns that identify one record in the silver table (the merge key)
key_columns = ["dataset_id", "start_time", "end_time", "customer_type"]

# non-key payload columns that are overwritten on a match
value_columns = [
    "value",
    "count",
    "read_ts",
    "res",
    "time_series_type",
    "uom",
    "additional_value",
]

# column order of the frame handed to the merge
output_columns = [
    "dataset_id",
    "start_time",
    "end_time",
    "value",
    "count",
    "customer_type",
    "read_ts",
    "res",
    "time_series_type",
    "uom",
    "additional_value",
]

# only used to stamp the progress messages
local_tz = pytz.timezone("Europe/Helsinki")

control_table_exists = spark.catalog.tableExists(control_table)

# Watermark: stays None when the control table is missing or holds no
# timestamp for this dataset; in both cases the whole bronze table is loaded.
current_bronze_ingestion_timestamp = None
if control_table_exists:
    current_bronze_ingestion_timestamp = (
        spark.read.table(control_table)
        .where(F.col("source_dataset_id") == source_dataset_id)
        .select(F.max("bronze_ingestion_timestamp"))
        .collect()[0][0]
    )

    # current_bronze_ingestion_timestamp = "2025-01-01T00:00:00.000+00:00"

    print("current_bronze_ingestion_timestamp:",current_bronze_ingestion_timestamp)

df = spark.read.format("delta").table(raw_table)
if current_bronze_ingestion_timestamp is not None:
    # Comparing against None would evaluate to NULL for every row and silently
    # filter everything out, so the filter is only applied to a real watermark.
    df = df.where(F.col("ingestion_timestamp") > current_bronze_ingestion_timestamp)

# One aggregation returns both the row count and the new watermark candidate
# instead of running count(), isEmpty() and max() as three separate jobs.
row_count, max_bronze_ingestion_timestamp = df.agg(
    F.count(F.lit(1)),
    F.max(F.col("ingestion_timestamp")),
).first()

print("Total rows(pages): ", row_count)
print("read data done", datetime.now(tz=local_tz))

if row_count == 0:
    print ("No data")
else:
    print("max_bronze_ingestion_timestamp:",max_bronze_ingestion_timestamp)


    print ("start to transform data: ", datetime.now(tz=local_tz))

    # Define the full schema including additionalJson
    schema = """
    ARRAY<STRUCT<
        datasetId: STRING,
        startTime: TIMESTAMP,
        endTime: TIMESTAMP,
        value: DOUBLE,
        additionalJson: STRUCT<
            Count: STRING,
            CustomerType: STRING,
            ReadTS: TIMESTAMP,
            Res: STRING,
            TimeSeriesType: STRING,
            Uom: STRING,
            Value: STRING
        >
    >>
    """

    # Explode the array with proper schema; the ingestion time is kept so the
    # newest version of a record can be chosen below
    df_flat = df.select(
        F.col("ingestion_timestamp"),
        F.explode(F.from_json(F.col("data"), schema)).alias("record")
    )

    # Extract all fields including nested ones
    df_records = df_flat.select(
        F.col("ingestion_timestamp"),
        F.col("record.datasetId").alias("dataset_id"),
        F.col("record.startTime").alias("start_time"),
        F.col("record.endTime").alias("end_time"),
        F.col("record.value").alias("value"),
        F.col("record.additionalJson.Count").alias("count"),
        F.col("record.additionalJson.CustomerType").alias("customer_type"),
        F.col("record.additionalJson.ReadTS").alias("read_ts"),
        F.col("record.additionalJson.Res").alias("res"),
        F.col("record.additionalJson.TimeSeriesType").alias("time_series_type"),
        F.col("record.additionalJson.Uom").alias("uom"),
        F.col("record.additionalJson.Value").alias("additional_value")
    )

    # Delta MERGE fails when several source rows match the same target row, so
    # keep exactly one row per key: the one from the newest ingestion.
    # max() over a struct orders by its first field (ingestion_timestamp) and
    # returns the payload of a single source row, never a mix of rows.
    df_result = (
        df_records.groupBy(*key_columns)
        .agg(F.max(F.struct("ingestion_timestamp", *value_columns)).alias("latest"))
        .select(
            *[
                F.col(column) if column in key_columns
                else F.col(f"latest.{column}").alias(column)
                for column in output_columns
            ]
        )
        .withColumn("refresh_timestamp", F.current_timestamp())
    )


    print ("start to insert or update data: ", datetime.now(tz=local_tz))

    # update data to table
    # customer_type is compared with the null-safe <=> operator: with a plain
    # "=" a record whose customer_type is NULL never matches its existing row
    # and would be inserted again on every run.
    merge_condition = (
        "df_existing.dataset_id = updates.dataset_id"
        " and df_existing.start_time = updates.start_time"
        " and df_existing.end_time = updates.end_time"
        " and df_existing.customer_type <=> updates.customer_type"
    )
    df_existing_silver_table = DeltaTable.forName( sparkSession=spark, tableOrViewName=silver_table)
    df_existing_silver_table.alias('df_existing') \
        .merge(df_result.alias('updates'), merge_condition) \
        .whenMatchedUpdateAll() \
        .whenNotMatchedInsertAll()\
        .execute()

    if control_table_exists:
        # NOTE(review): update() only touches an existing row. If the control
        # table has no row for this source_dataset_id the watermark is not
        # stored and the next run reloads the full bronze table - confirm the
        # row is seeded elsewhere.
        df_control_table = DeltaTable.forName( sparkSession=spark, tableOrViewName=control_table)
        df_control_table.alias('control_table').update(
            condition=F.col('source_dataset_id') == source_dataset_id,
            set={'bronze_ingestion_timestamp': F.to_timestamp(F.lit(max_bronze_ingestion_timestamp))}
        )

        print("update bronze_ingestion_timestamp:",max_bronze_ingestion_timestamp)
    else:
        # The original code opened the control table unconditionally here and
        # crashed on this path; without the table there is nothing to update.
        print("control table not found, watermark not stored:", control_table)

    print ("insert and update done: ", datetime.now(tz=local_tz))
