In [None]:
# ADLS Connectivity for Portfolio Project
# Note: In production, use Service Principals or Managed Identities instead of Access Keys.
# The storage account key is read from the ADLS_ACCESS_KEY environment variable so the
# secret never has to be pasted into (and committed with) the notebook source.
import os

adls_access_key = os.environ.get("ADLS_ACCESS_KEY")
if not adls_access_key:
    # Fail fast with an actionable message instead of a late authentication error on first read.
    raise RuntimeError(
        "ADLS_ACCESS_KEY is not set: export the storage account access key in the "
        "environment before running this notebook."
    )

# Register the account key on the Hadoop configuration for the storage account
# that the abfss:// paths in this notebook point to.
spark.sparkContext._jsc.hadoopConfiguration().set(
    "fs.azure.account.key.stenergyplatformadls.dfs.core.windows.net",
    adls_access_key,
)

## 1. Import Libraries and Dependencies
Loading standard PySpark SQL functions for data manipulation, windowing, and schema management.

In [None]:
from pyspark.sql.functions import col, when, coalesce, last, monotonically_increasing_id, lit, to_date, year, month, regexp_extract, expr, round, concat
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType, DoubleType
from datetime import datetime

## 2. Ingest Silver Layer Data
Loading the cleaned datasets from the Silver container. This includes both ET 5.1 (Generation) and ET 5.2 (Supply & Demand) tables.

In [None]:
# Read the four Silver-layer Delta tables: ET 5.1 and ET 5.2, each at annual and
# quarterly granularity. The format is passed to load() directly.
df_et51_silver_annual = spark.read.load(
    "abfss://silver@stenergyplatformadls.dfs.core.windows.net/energy_generation_trends/annual/",
    format="delta",
)
df_et51_silver_quarter = spark.read.load(
    "abfss://silver@stenergyplatformadls.dfs.core.windows.net/energy_generation_trends/quarter/",
    format="delta",
)
df_et52_silver_annual = spark.read.load(
    "abfss://silver@stenergyplatformadls.dfs.core.windows.net/energy_trends_supply_demand/annual/",
    format="delta",
)
df_et52_silver_quarter = spark.read.load(
    "abfss://silver@stenergyplatformadls.dfs.core.windows.net/energy_trends_supply_demand/quarter/",
    format="delta",
)

## 3. Curate ET 5.1: Electricity Generation by Fuel
Transforming wide-format annual and quarterly generation data into a long-format fact table.

In [None]:
# 3a. Curate ET 5.1 Annual Dataset
# Map the source table name to a standardized metric type. There is no otherwise()
# branch, so a table name matching none of the patterns yields a null metric_type.
df_et51_silver_annual = (
    df_et51_silver_annual
    .withColumn(
        "metric_type",
        when(col("table_name").contains("fuel used"), "fuel_used")
        .when(col("table_name").contains("generated"), "electricity_generated")
        .when(col("table_name").contains("supplied"), "electricity_supplied")
    )
)

# Year columns are the ones whose name consists only of digits.
value_columns = [c for c in df_et51_silver_annual.columns if c.isdigit()]
if not value_columns:
    # stack(0, ) is not valid SQL; surface the real problem instead of a parser error.
    raise ValueError("No year columns found in the ET 5.1 annual silver table; nothing to unpivot.")

# One (year label, value) pair per year column. Every value is cast to double inside
# stack() because stack() requires all expressions in the same output position to share a type.
annual_stack_args = ", ".join(f"'{c}', cast(`{c}` as double)" for c in value_columns)

# Unpivot the year columns into rows and emit the same column types as the quarterly
# frame (year: int, quarter: string, value: double), so the later unionByName does not
# depend on implicit type coercion and `year` is a numeric partition column.
# NOTE(review): if the gold table was already written with a differently typed `year`,
# the overwrite may need the Delta `overwriteSchema` option - confirm before deploying.
df_et51_gold_annual = (
    df_et51_silver_annual
    .select("fuel", "generator_type", "metric_type", "unit", "ingestion_date", lit("annual").alias("period_type"), *value_columns)
    .selectExpr(
        "fuel", "generator_type", "metric_type", "unit", "ingestion_date", "period_type",
        f"stack({len(value_columns)}, {annual_stack_args}) as (year, value)",
    )
    # Annual rows carry no quarter; type the null explicitly instead of leaving it untyped.
    .withColumn("quarter", lit(None).cast("string"))
    .select(
        "generator_type", "fuel", "metric_type", "unit", "period_type",
        col("year").cast("int").alias("year"), "quarter", "value", "ingestion_date",
    )
)

In [None]:
# 3b. Curate ET 5.1 Quarterly Dataset
# Wide quarterly columns (e.g., quarter_1_2023) are unpivoted into one row per
# period, with the quarter and year parsed out of the column name.

# Standardized metric label derived from the source table name (null when nothing matches).
metric_type_expr = (
    when(col("table_name").contains("fuel used"), "fuel_used")
    .when(col("table_name").contains("generated"), "electricity_generated")
    .when(col("table_name").contains("supplied"), "electricity_supplied")
)
df_et51_silver_quarter = df_et51_silver_quarter.withColumn("metric_type", metric_type_expr)

# Build the stack() argument list: a quoted column label followed by the column itself.
quarter_cols = [name for name in df_et51_silver_quarter.columns if name.startswith("quarter_")]
stack_expr = ", ".join(f"'{name}', `{name}`" for name in quarter_cols)
unpivot_quarters = expr(
    "stack({}, {}) as (period_col, value)".format(len(quarter_cols), stack_expr)
)

df_et51_quarter_final = (
    df_et51_silver_quarter
    .select(
        "generator_type",
        "fuel",
        "metric_type",
        "unit",
        "ingestion_date",
        lit("quarterly").alias("period_type"),
        unpivot_quarters,
    )
    # Periods without a value are dropped.
    .where(col("value").isNotNull())
    # "quarter_1_2023" -> quarter "Q1", year 2023.
    .withColumn("quarter", concat(lit("Q"), regexp_extract("period_col", r"quarter_(\d)_", 1)))
    .withColumn("year", regexp_extract("period_col", r"_(\d{4})$", 1).cast("int"))
)

# Final column order for the gold table; value is stored as double.
df_et51_quarter_gold = df_et51_quarter_final.select(
    "generator_type",
    "fuel",
    "metric_type",
    "unit",
    "period_type",
    "year",
    "quarter",
    col("value").cast("double"),
    "ingestion_date",
)

## 4. Consolidate ET 5.1 Gold Table
Unifying annual and quarterly views into a single fact table partitioned by year for query optimization.

In [None]:
# Combine the annual and quarterly frames by column name, then overwrite the
# Delta fact table at the gold path, partitioned by year.
df_et51_gold = df_et51_gold_annual.unionByName(df_et51_quarter_gold)
(
    df_et51_gold.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("year")
    .save("abfss://gold@stenergyplatformadls.dfs.core.windows.net/energy_trends/fact_electricity_by_fuel")
)

## 5. Curate ET 5.2: Supply and Demand Components
The ET 5.2 table tracks electricity supply, imports, exports, and demand across the UK. Below, we unpivot both the annual and quarterly data into a long-format Gold table, standardizing the supply and demand metrics into a single longitudinal dataset.

In [None]:
# 5a. Curate ET 5.2 Annual Dataset
# Year columns are those whose name is made up of digits only.
value_columns_52_ann = [name for name in df_et52_silver_annual.columns if name.isdigit()]

# stack() arguments: a quoted year label followed by the matching column, per year.
stack_args_52_ann = ",".join(f"'{name}', `{name}`" for name in value_columns_52_ann)

# Narrow to the descriptive columns plus the year columns and tag the period type.
et52_annual_wide = df_et52_silver_annual.select(
    "supply_demand_components",
    "unit",
    "ingestion_date",
    lit("annual").alias("period_type"),
    *value_columns_52_ann,
)

# Unpivot: one output row per (component, year).
et52_annual_long = et52_annual_wide.selectExpr(
    "supply_demand_components",
    "unit",
    "ingestion_date",
    "period_type",
    f"stack({len(value_columns_52_ann)}, {stack_args_52_ann}) as (year, value)",
)

# Annual rows have no quarter; year becomes an int and value a double rounded to 2 decimals.
df_et52_gold_annual = (
    et52_annual_long
    .withColumn("quarter", lit(None))
    .select(
        "supply_demand_components",
        "unit",
        "period_type",
        col("year").cast("int"),
        "quarter",
        round(col("value").cast("double"), 2).alias("value"),
        "ingestion_date",
    )
)

In [None]:
# 5b. Curate ET 5.2 Quarterly Dataset
# Quarterly columns are identified by their "_q1".."_q4" suffix and unpivoted with stack().
quarter_suffixes = ("_q1", "_q2", "_q3", "_q4")
quarter_cols_52 = [name for name in df_et52_silver_quarter.columns if name.endswith(quarter_suffixes)]
stack_expr_52 = ", ".join(f"'{name}', `{name}`" for name in quarter_cols_52)
unpivot_quarters_52 = expr(
    "stack({}, {}) as (period_col, value)".format(len(quarter_cols_52), stack_expr_52)
)

df_et52_gold_quarter = (
    df_et52_silver_quarter
    .select(
        "supply_demand_components",
        "unit",
        "ingestion_date",
        lit("quarterly").alias("period_type"),
        unpivot_quarters_52,
    )
    # Periods without a value are dropped.
    .where(col("value").isNotNull())
    # The trailing "_q<digit>" gives the quarter; the leading four digits give the year.
    .withColumn("quarter", concat(lit("Q"), regexp_extract("period_col",  r"_q(\d)$", 1)))
    .withColumn("year", regexp_extract("period_col", r"^(\d{4})", 1).cast("int"))
    # Final column order; value is a double rounded to 2 decimals.
    .select(
        "supply_demand_components",
        "unit",
        "period_type",
        "year",
        "quarter",
        round(col("value").cast("double"), 2).alias("value"),
        "ingestion_date",
    )
)

## 6. Consolidate and Write ET 5.2 Gold Table
Merging the annual and quarterly views into a final Gold table partitioned by year for query performance.

In [None]:
# Combine the annual and quarterly frames by column name, then overwrite the
# Delta fact table at the gold path, partitioned by year.
df_et52_gold = df_et52_gold_annual.unionByName(df_et52_gold_quarter)

(
    df_et52_gold.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("year")
    .save("abfss://gold@stenergyplatformadls.dfs.core.windows.net/energy_trends/fact_supply_demand")
)

## Notebook exit

In [None]:
# Presumably ends the notebook run and hands the string "Success" back to the caller.
# NOTE(review): `dbsparkutils` is not defined or imported anywhere in the visible notebook,
# and it is not one of the usual helper names (Databricks exposes `dbutils`, Synapse
# `mssparkutils`). Confirm it resolves at runtime - otherwise this cell raises NameError
# after both gold tables have already been written.
dbsparkutils.notebook.exit("Success")