# 1. Get dynamic pipeline parameters

In [None]:
# Parameters cell: the three values below are the notebook's inputs.
# The literals are defaults for interactive runs.

# Name of the pipeline that triggered this notebook
pipelinename = "pipeline_name"

# Pipeline run id; later cells use it as the load id when filtering interim data
loadid = ""

# Name of the Key Vault linked service used to look up secrets
keyvaultlsname = "Ls_KeyVault_01"

# 2. Transform and load Dimension tables


In [None]:
import datetime
import os
from pyspark.sql.functions import col, lit
import ddo_transform.transform as t
import ddo_transform.util as util

# Run metadata handed to every transform call in this cell and the next
load_id = loadid
loaded_on = datetime.datetime.now()

# JVM-side Synapse TokenLibrary, used to fetch secrets via the Key Vault linked service
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary

# Primary storage info
account_name = token_library.getSecretWithLS(keyvaultlsname, "datalakeaccountname")
container_name = 'datalake' # fill in your container name
relative_path = 'data/dw/' # fill in your relative folder path

# Root folder (abfss URI) under which each dimension table gets its own sub-folder
base_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (container_name, account_name, relative_path)

# Interim cleansed data, restricted to the rows of the current load
current_load_filter = col('load_id') == lit(load_id)
parkingbay_sdf = spark.read.table("interim.parking_bay").filter(current_load_filter)
sensordata_sdf = spark.read.table("interim.sensor").filter(current_load_filter)

# Current contents of the dimension tables
dim_parkingbay_sdf = spark.read.table("dw.dim_parking_bay")
dim_location_sdf = spark.read.table("dw.dim_location")
dim_st_marker = spark.read.table("dw.dim_st_marker")

# Transform: each call takes the interim data and the existing dimension; results are cached
new_dim_parkingbay_sdf = t.process_dim_parking_bay(
    parkingbay_sdf, dim_parkingbay_sdf, load_id, loaded_on
).cache()
new_dim_location_sdf = t.process_dim_location(
    sensordata_sdf, dim_location_sdf, load_id, loaded_on
).cache()
new_dim_st_marker_sdf = t.process_dim_st_marker(
    sensordata_sdf, dim_st_marker, load_id, loaded_on
).cache()

# Load: same order as before (parking bay, location, st marker), one folder per table
for dim_table, dim_folder, dim_sdf in (
    ("dw.dim_parking_bay", "dim_parking_bay", new_dim_parkingbay_sdf),
    ("dw.dim_location", "dim_location", new_dim_location_sdf),
    ("dw.dim_st_marker", "dim_st_marker", new_dim_st_marker_sdf),
):
    util.save_overwrite_unmanaged_table(
        spark,
        dim_sdf,
        table_name=dim_table,
        path=os.path.join(base_path, dim_folder),
    )

# 3. Transform and load Fact tables

In [None]:
# Read the dimension tables again; the previous cell passed new versions of them
# to util.save_overwrite_unmanaged_table.
dim_parkingbay_sdf = spark.read.table("dw.dim_parking_bay")
dim_location_sdf = spark.read.table("dw.dim_location")
dim_st_marker = spark.read.table("dw.dim_st_marker")

# Process
# The result feeds two actions below (the insert and the row count). Cache it so the
# transform is evaluated once instead of once per action, mirroring the .cache()
# calls used for the dimension transforms.
new_fact_parking = t.process_fact_parking(
    sensordata_sdf, dim_parkingbay_sdf, dim_location_sdf, dim_st_marker, load_id, loaded_on
).cache()

# Insert new rows
# NOTE: insertInto matches columns by position, not by name.
new_fact_parking.write.mode("append").insertInto("dw.fact_parking")

# Record counts, consumed by the logging cells further down
new_dim_parkingbay_count = spark.read.table("dw.dim_parking_bay").count()
new_dim_location_count = spark.read.table("dw.dim_location").count()
new_dim_st_marker_count = spark.read.table("dw.dim_st_marker").count()
new_fact_parking_count = new_fact_parking.count()

# Both actions have run; release the cached blocks. The DataFrame itself stays usable.
new_fact_parking.unpersist()

# 4. Observability: Logging to Azure Application Insights using OpenCensus Library

In [None]:
import logging
import os
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.log_exporter import AzureEventHandler
from datetime import datetime

# Getting Application Insights instrumentation key
appi_key = token_library.getSecretWithLS(keyvaultlsname,"applicationInsightsKey")

# Enable App Insights
# logging.getLogger(__name__) hands back the same logger object every time this cell
# runs in a session. Attach the event handler only once; otherwise each re-run adds
# another handler and every event is exported once per accumulated handler.
aiLogger = logging.getLogger(__name__)
if not any(isinstance(handler, AzureEventHandler) for handler in aiLogger.handlers):
    aiLogger.addHandler(AzureEventHandler(connection_string = 'InstrumentationKey=' + appi_key))

aiLogger.setLevel(logging.INFO)

# NOTE(review): this "Started" event is emitted after the transform cells above have
# already run, so its timestamp marks the start of logging, not of the transform.
aiLogger.info("Transform (ai): Started at " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# The handler picks up the values under "custom_dimensions" from `extra`
properties = {"custom_dimensions": {"pipeline": pipelinename, "run_id": loadid, "new_parkingbay_count": new_dim_parkingbay_count}}
aiLogger.info("Transform (ai): Completed at " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"), extra=properties)

# To query these events, open Azure Monitor and run the following Kusto query (Scope: Application Insights instance):
#customEvents
#| order by timestamp desc
#| project timestamp, appName, name,
#    pipelineName             = customDimensions.pipeline,
#    pipelineRunId            = customDimensions.run_id,
#    parkingbayCount          = customDimensions.parkingbay_count,
#    sensordataCount          = customDimensions.sensordata_count,
#    parkingbayMalformedCount = customDimensions.parkingbay_malformed_count,
#    sensordataMalformedCount = customDimensions.sensordata_malformed_count,
#    dimParkingbayCount       = customDimensions.new_parkingbay_count

# 5. Observability: Logging to Log Analytics workspace using log4j

In [None]:
import logging
import sys

# Log Analytics logging goes through the JVM's log4j, reached via the Spark context gateway
log4jLogger = sc._jvm.org.apache.log4j
log4j_log_manager = log4jLogger.LogManager
# Named logger; the Kusto query below filters on this name
logger = log4j_log_manager.getLogger("ParkingSensorLogs")

def log(msg = ''):
    """Write one '~'-delimited INFO line to the log4j logger.

    The line is 'Transform (log4j): ' followed by six fields joined with '~':
    msg, pipeline name, job id, pool name, workspace name, user id.
    The last four are read from mssparkutils.env on every call.

    Args:
        msg: Free-text message placed in the first field. Defaults to ''.
    """
    runtime_env = mssparkutils.env
    job_id = runtime_env.getJobId()
    pool_name = runtime_env.getPoolName()
    workspace_name = runtime_env.getWorkspaceName()
    user_id = runtime_env.getUserId()
    logger.info(
        f'Transform (log4j): {msg}~{pipelinename}~{job_id}~{pool_name}~{workspace_name}~{user_id}'
    )

# Import the datetime class here rather than relying on the App Insights cell having
# rebound the name: an earlier cell binds `datetime` to the module, and with that
# binding `datetime.now()` raises AttributeError.
from datetime import datetime

log("Started at " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# One log line per record count collected in the fact-table cell
log(f'new_dim_parkingbay_count: {new_dim_parkingbay_count}')
log(f'new_dim_location_count: {new_dim_location_count}')
log(f'new_dim_st_marker_count: {new_dim_st_marker_count}')
log(f'new_fact_parking_count: {new_fact_parking_count}')

log("Completed at " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# To query these logs, open the Log Analytics workspace and run the following Kusto query (Scope: Log Analytics Workspace):
#SparkLoggingEvent_CL
#| where logger_name_s == "ParkingSensorLogs"
#| order by TimeGenerated desc
#| project TimeGenerated, workspaceName_s, Level,
#    message         = split(Message, '~', 0),
#    pipelineName    = split(Message, '~', 1),
#    jobId           = split(Message, '~', 2),
#    SparkPoolName   = split(Message, '~', 3),
#    UserId          = split(Message, '~', 5)