In [None]:
# Portfolio-project shortcut: authenticate to the storage account with its access key.
# In a real production setup, link the storage account to Databricks through a service principal instead.
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.azure.account.key.stenergyplatformadls.dfs.core.windows.net", "ACCESS_KEY")

In [None]:
# Sanity check: list the bronze prices folder to confirm Databricks can reach the storage account
bronze_prices_dir = "abfss://bronze@stenergyplatformadls.dfs.core.windows.net/energy_trends/prices"
dbutils.fs.ls(bronze_prices_dir)

### Import libraries and functions

In [None]:
from pyspark.sql.functions import col, when, coalesce, last, monotonically_increasing_id, lit, to_date, year, month, regexp_extract, expr, round
from pyspark.sql.window import Window
from pyspark.sql.types import FloatType, DoubleType

### Define functions

In [None]:
def forward_fill_tablename(df_bronze):
    """
    Carry table headings and generator types down onto the rows beneath them.

    Rows whose generator_type matches "table" (case-insensitive) are treated as
    table headings. Their text is copied into a new table_name column, that
    column and generator_type are forward-filled in row order, and the heading
    rows are then dropped. A value with nothing earlier to fill from becomes
    the string '0'.

    Returns the filled DataFrame without the heading rows and without the
    temporary row_id column.
    """
    heading_pattern = "(?i)table"

    # Only heading rows get a table_name here; every other row starts as null.
    tagged = df_bronze.withColumn(
        "table_name",
        when(col("generator_type").rlike(heading_pattern), col("generator_type")),
    )

    # One partition is used with the intent that the generated ids follow row order.
    numbered = tagged.repartition(1).withColumn("row_id", monotonically_increasing_id())

    # Forward fill: keep the row's own value, else the latest non-null value seen
    # so far in row_id order, else '0'.
    fill_window = Window.orderBy('row_id')
    filled = numbered
    for fill_col in ('generator_type', 'table_name'):
        filled = filled.withColumn(
            fill_col,
            coalesce(col(fill_col), last(fill_col, True).over(fill_window), lit('0')),
        )

    # Heading rows have served their purpose once table_name is populated.
    data_rows = filled.filter(~col("generator_type").rlike(heading_pattern))
    return data_rows.drop('row_id')

In [None]:
def transform_col_name(original_name, header_value):
    """
    Build a clean column name from a header cell.

    Args:
        original_name: The column's current name. Used as the fallback when
            the header cell is missing or blank.
        header_value: The value found in the header row for this column. It
            is converted with str(), so non-string cells are accepted.

    Returns:
        - original_name, unchanged, when header_value is None or blank;
        - "fuel" when the normalised header contains "fuel";
        - otherwise the normalised header: lower-cased, spaces and newlines
          turned into underscores, round and square brackets removed, and any
          "_provisional" fragment stripped.
    """
    # A null/blank header cell used to become the literal name "none" (or ""),
    # so several such columns would collide. Keep the existing name instead.
    if header_value is None or not str(header_value).strip():
        return original_name

    name = str(header_value).lower()
    for old, new in (
        (" ", "_"),
        ("(", ""),
        (")", ""),
        ("[", ""),
        ("]", ""),
        ("\n", "_"),
    ):
        name = name.replace(old, new)

    # Any header mentioning fuel maps onto the single "fuel" column name.
    if "fuel" in name:
        return "fuel"

    return name.replace("_provisional", "")

In [None]:
def column_mapping(df_bronze):
    """
    Rename every column of the raw sheet from its in-sheet header row.

    The header row is the first row returned whose _c0 value is
    "Generator type". Each cell of that row is passed, together with the
    current column name, to transform_col_name to get the new name for the
    column at the same position.

    Raises IndexError when no such header row exists.
    """
    header_cells = df_bronze.filter('_c0 like "Generator type"').limit(1).collect()[0]

    renamed = []
    for position, source_name in enumerate(df_bronze.columns):
        target_name = transform_col_name(source_name, header_cells[position])
        renamed.append(col(source_name).alias(target_name))

    return df_bronze.select(*renamed)

In [None]:
def round_float_columns(df):
    """
    Return df with every FloatType / DoubleType column rounded to 2 decimals.

    Columns of any other type are left untouched. Note that `round` here is
    the pyspark.sql.functions import, not the Python builtin.
    """
    rounded = df
    # The field list is taken from the input frame once, before any rewriting.
    for field in df.schema.fields:
        if not isinstance(field.dataType, (FloatType, DoubleType)):
            continue
        rounded = rounded.withColumn(field.name, round(col(field.name), 2))
    return rounded

In [None]:
def clean_et51_tables(bronze_path, table):
    """
    Read one sheet of the ET 5.1 workbook and return it as a cleaned DataFrame.

    Args:
        bronze_path: Location of the Excel workbook to load.
        table: Which sheet to read; one of 'main', 'annual' or 'quarter'.

    Returns:
        A DataFrame whose columns are renamed from the in-sheet header row,
        with table_name / generator_type forward-filled, the header rows
        removed, and fuel reduced to its leading run of letters and spaces.

    Raises:
        ValueError: If table is not one of the supported names. Previously
            this fell through and failed with an UnboundLocalError.
    """
    # Sheet name and top-left cell where each table's data starts.
    sheet_addresses = {
        'main': "'Main Table'!A6",
        'annual': "'Annual'!A5",
        'quarter': "'Quarter'!A5",
    }
    if table not in sheet_addresses:
        raise ValueError(
            f"Unknown table {table!r}; expected one of: {', '.join(sheet_addresses)}"
        )
    excel_data_address = sheet_addresses[table]

    # header=false: the real header row sits inside the data and is resolved
    # by column_mapping rather than by the Excel reader.
    df_bronze = (
        spark.read
        .format("com.crealytics.spark.excel")
        .option("header", "false")
        .option("inferSchema", "true")
        .option("dataAddress", excel_data_address)
        .load(bronze_path)
    )
    df_correct_columns = column_mapping(df_bronze)
    df_with_tablename = forward_fill_tablename(df_correct_columns)

    # Drop the header row(s) that were read in as ordinary data.
    df_cleaned = df_with_tablename.filter('generator_type not like "Generator type"')

    # Keep only the leading letters/spaces of fuel, discarding whatever follows.
    df_final = df_cleaned.withColumn("fuel", regexp_extract(col("fuel"), r"^([A-Za-z ]+)", 1))

    return df_final

### Implementation

In [None]:
# Source workbook in the bronze container
bronze_path = "abfss://bronze@stenergyplatformadls.dfs.core.windows.net/energy_trends/prices/ET_5.1_SEP_25.xlsx"

# Clean each sheet and overwrite its own Delta folder in the silver container
for table in ('main', 'annual', 'quarter'):
    df = clean_et51_tables(bronze_path, table)
    silver_path = f"abfss://silver@stenergyplatformadls.dfs.core.windows.net/energy_trends_generation/{table}/"
    (
        df.write
        .format("delta")
        .mode("overwrite")
        .save(silver_path)
    )

In [None]:
# End the notebook run, handing "Success" back as its exit value
# (presumably read by a calling notebook or job — confirm against the orchestrator).
dbutils.notebook.exit("Success")