# 1. Get dynamic pipeline parameters

In [2]:
# Pipeline run id. The literal below is a placeholder default
# (presumably overridden by the calling pipeline at run time — confirm).
loadid = 0

# Pipeline name; placeholder default, used later only to label log output.
pipelinename = 'pipeline_name'

# Name of the linked service through which Key Vault secrets are read
# (passed to TokenLibrary.getSecretWithLS in later cells).
keyvaultlsname = 'Ls_KeyVault_01'

StatementMeta(synspdevdep70, 39, 2, Finished, Available)

# 2. Prepare variables for the observability mechanisms

In [28]:
from pyspark.sql import SparkSession

# Reuse the notebook's Spark session (getOrCreate returns the active one) to reach the JVM.
sc = SparkSession.builder.getOrCreate()
# JVM-side Synapse TokenLibrary class; later cells also use it to read Key Vault
# secrets through the linked service named in `keyvaultlsname`.
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary

env = mssparkutils.env
# Run-context label: [pipeline]::[load id]::[job id]::[pool]::[workspace]::[user id].
# getPoolName / getWorkspaceName have to be *called*: interpolating the bare
# attribute would embed the method object's repr instead of the actual name.
pipelineruninfo = (
    f'[{pipelinename}]::[{loadid}]::[{env.getJobId()}]::'
    f'[{env.getPoolName()}]::[{env.getWorkspaceName()}]::[{env.getUserId()}]'
)

# Needed to get App Insights Key
appi_key = token_library.getSecretWithLS(keyvaultlsname, "applicationInsightsKey")


StatementMeta(synspdevdep70, 39, 29, Finished, Available)

syndpdevdep70

# 3. Transform and load Dimension tables


In [42]:
import datetime
import os
from pyspark.sql.functions import col, lit
import ddo_transform.transform as t
import ddo_transform.util as util

# Every row produced by this run is stamped with the same load id and timestamp.
load_id = loadid
loaded_on = datetime.datetime.now()

# Primary storage info
account_name = token_library.getSecretWithLS(keyvaultlsname, "datalakeaccountname")
container_name = 'datalake'  # fill in your container name
relative_path = 'data/dw/'  # fill in your relative folder path

# Root ABFSS location under which each dimension table gets its own folder.
base_path = f'abfss://{container_name}@{account_name}.dfs.core.windows.net/{relative_path}'

# Read interim cleansed data, restricted to the rows of the current load
current_load_only = col('load_id') == lit(load_id)
parkingbay_sdf = spark.read.table("interim.parking_bay").filter(current_load_only)
sensordata_sdf = spark.read.table("interim.sensor").filter(current_load_only)

# Read existing Dimensions
dim_parkingbay_sdf = spark.read.table("dw.dim_parking_bay")
dim_location_sdf = spark.read.table("dw.dim_location")
dim_st_marker = spark.read.table("dw.dim_st_marker")

# Transform: each result is marked for caching before it is written back
new_dim_parkingbay_sdf = t.process_dim_parking_bay(
    parkingbay_sdf, dim_parkingbay_sdf, load_id, loaded_on
).cache()
new_dim_location_sdf = t.process_dim_location(
    sensordata_sdf, dim_location_sdf, load_id, loaded_on
).cache()
new_dim_st_marker_sdf = t.process_dim_st_marker(
    sensordata_sdf, dim_st_marker, load_id, loaded_on
).cache()

# Load: overwrite each dimension table, in the same order as above
for new_dim_sdf, dim_name in (
    (new_dim_parkingbay_sdf, "dim_parking_bay"),
    (new_dim_location_sdf, "dim_location"),
    (new_dim_st_marker_sdf, "dim_st_marker"),
):
    util.save_overwrite_unmanaged_table(
        spark,
        new_dim_sdf,
        table_name="dw." + dim_name,
        path=os.path.join(base_path, dim_name),
    )




StatementMeta(synspdevdep70, 39, 43, Finished, Available)

# 4. Transform and load Fact tables

In [11]:
# Re-read the dimension tables from the dw database before building the fact rows
dim_parkingbay_sdf, dim_location_sdf, dim_st_marker = (
    spark.read.table("dw." + dim_table)
    for dim_table in ("dim_parking_bay", "dim_location", "dim_st_marker")
)

# Build the fact rows from the sensor data and the three dimensions
nr_fact_parking = t.process_fact_parking(
    sensordata_sdf,
    dim_parkingbay_sdf,
    dim_location_sdf,
    dim_st_marker,
    load_id,
    loaded_on,
)

# Append the new rows to the existing fact table
(
    nr_fact_parking
    .write
    .mode("append")
    .insertInto("dw.fact_parking")
)

StatementMeta(synspdevdep70, 39, 11, Finished, Available)

AnalysisException: Table or view not found: `dw`.`dim_parking_bay`;;
'UnresolvedRelation `dw`.`dim_parking_bay`


# 5.  Observability: create log messages

In [35]:
# Dimension counts are read back from the dw tables; the fact count is taken
# from the DataFrame built in this run (nr_fact_parking), not from dw.fact_parking.
new_dim_parkingbay_count = spark.read.table("dw.dim_parking_bay").count()
new_dim_location_count = spark.read.table("dw.dim_location").count()
new_dim_st_marker_count = spark.read.table("dw.dim_st_marker").count()
nr_fact_parking_count = nr_fact_parking.count()

# Message layout: '::' separates the bracketed groups, ':' separates name from
# value inside a group. The first group previously used '::' internally, which
# collided with the group separator and differed from every other group.
final_message = (
    'Transform : Completed load'
    f'::[new_dim_parkingbay_count:{new_dim_parkingbay_count}]'
    f'::[new_dim_location_count:{new_dim_location_count}]'
    f'::[new_dim_st_marker_count:{new_dim_st_marker_count}]'
    f'::[nr_fact_parking_count:{nr_fact_parking_count}]'
)


StatementMeta(synspdevdep70, 39, 36, Finished, Available)

NameError: name 'nr_fact_parking' is not defined

# 7. Observability: logging on App Insights using OpenCensus Library

In [None]:
import logging
import os
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.log_exporter import AzureEventHandler
from pyspark.sql.session import SparkSession
from datetime import datetime

# Enable App Insights
aiLogger = logging.getLogger("ParkingSensorLogs-Standardize")
aiLogger.addHandler(AzureEventHandler(connection_string = 'InstrumentationKey=' + appi_key))
#logger.addHandler(AzureLogHandler(connection_string = 'InstrumentationKey=' + appi_key))


aiLogger.warning("Starting at: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
properties = {'custom_dimensions': {'pipeline': pipelinename, 'run_id': loadid, 'new parking count': new_dim_parkingbay_count}}
aiLogger.warning(final_message, extra=properties)
# To query this log go to the Azure Monitor and run the following kusto query (if you are using the EventHandler)
#customEvents
#|order by timestamp desc
# To query this log go to the Azure Monitor and run the following kusto query (if you are using the LogHandler)
# traces
#|order by timestamp desc

# 6. Observability: logging on Log Analytics workspace using Log4J

In [None]:
import logging
import sys
from pyspark.sql import SparkSession

# Reuse the active Spark session to reach the JVM-side Log4J classes.
sc = SparkSession.builder.getOrCreate()
env = mssparkutils.env
# Run-context label: [pipeline]::[load id]::[job id]::[pool]::[workspace]::[user id].
# getPoolName / getWorkspaceName have to be *called*: interpolating the bare
# attribute would embed the method object's repr instead of the actual name.
pipelineruninfo = (
    f'[{pipelinename}]::[{loadid}]::[{env.getJobId()}]::'
    f'[{env.getPoolName()}]::[{env.getWorkspaceName()}]::[{env.getUserId()}]'
)
# Summary of the row counts gathered in the "create log messages" cell.
final_message = (
    'Standardize : Completed load'
    f'::[new_dim_parkingbay_count:{new_dim_parkingbay_count}]'
    f'::[new_dim_location_count:{new_dim_location_count}]'
    f'::[new_dim_st_marker_count :{new_dim_st_marker_count}]'
    f'::[nr_fact_parking_count:{nr_fact_parking_count}]'
)

# Enable Log Analytics using Log4J
log4jLogger = sc._jvm.org.apache.log4j
logger = log4jLogger.LogManager.getLogger("ParkingSensorLogs-Standardize")
logger.info(final_message)
# To query this log go to the log analytics workspace and run the following kusto query:
# SparkLoggingEvent_CL
# | where logger_name_s == "ParkingSensorLogs-Standardize"

# NOTE(review): this cell used to end with a bare `sc.stop` (no parentheses),
# which only references the method and never stops the session. That no-op is
# dropped and the session is still left running; if an explicit shutdown is
# wanted, call it deliberately (e.g. sc.stop()) after confirming the pipeline
# tolerates it.