# 1. Get dynamic pipeline parameters

In [1]:
# Notebook parameters. The literals below are the values used for this run;
# presumably the calling pipeline overrides them at run time — TODO confirm.

# Folder where the REST downloads were placed (joined onto the landing path later)
infilefolder = '2021_10_05_07_58_15/'

# Pipeline name (used in log messages)
pipelinename = 'P_Ingest_MelbParkingData'

# Pipeline run id. The stray trailing '"' that was embedded in the literal has
# been removed so the id is a clean GUID in logs and in the load_id column value.
loadid = 'df2ddb82-9004-449f-84da-ae9484b446f4'

# Key Vault linked service name (used to look up secrets)
keyvaultlsname = 'Ls_KeyVault_01'


StatementMeta(synspdevdep70, 43, 1, Finished, Available)

# 2. Prepare observability mechanisms variables

In [None]:
from pyspark.sql import SparkSession

# Reuse the active Spark session (or create one) to reach the JVM-side TokenLibrary.
sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary

# Run context that is embedded in the log messages:
#   [pipeline]::[run id]::[job id]::[pool name]::[workspace name]::[user id]
# getPoolName / getWorkspaceName must be *called*; without the parentheses the
# f-string would contain the method objects' repr instead of the actual names.
env = mssparkutils.env
pipelineruninfo = f'[{pipelinename}]::[{loadid}]::[{env.getJobId()}]::[{env.getPoolName()}]::[{env.getWorkspaceName()}]::[{env.getUserId()}]'

# Needed to get App Insights Key
appi_key = token_library.getSecretWithLS(keyvaultlsname, "applicationInsightsKey")

# NOTE: the former trailing `sc.stop` (no parentheses) only referenced the method
# and never stopped anything, so it has been dropped. The session is still used
# by later cells, so it must not be stopped here.



StatementMeta(synspdevdep63, 3, 2, Finished, Available)

# 3. Load file path variables

In [None]:
import os
import datetime
from pyspark.sql import SparkSession

# TokenLibrary (JVM side) is used to read secrets through the Key Vault linked service.
sc = SparkSession.builder.getOrCreate()
token_library = sc._jvm.com.microsoft.azure.synapse.tokenlibrary.TokenLibrary

# For testing
#infilefolder = '2021_08_17_09_23_52/'

# Primary storage info: the account name is read from Key Vault,
# the container and relative folder are set here.
account_name = token_library.getSecretWithLS(keyvaultlsname, "datalakeaccountname")
container_name = 'datalake'  # fill in your container name
relative_path = 'data/lnd/'  # fill in your relative folder path

adls_path = f'abfss://{container_name}@{account_name}.dfs.core.windows.net/{relative_path}'
print(f'Primary storage account path: {adls_path}')

# Load metadata handed to the standardize step
load_id = loadid
loaded_on = datetime.datetime.now()

# Input files live in the per-run folder underneath the landing path
base_path = os.path.join(adls_path, infilefolder)
parkingbay_filepath = os.path.join(base_path, "MelbParkingBayData.json")
sensors_filepath = os.path.join(base_path, "MelbParkingSensorData.json")

for input_path in (parkingbay_filepath, sensors_filepath):
    print(input_path)

StatementMeta(synspdevdep63, 3, 3, Finished, Available)

Primary storage account path: abfss://datalake@mdwdopsstdevdep63.dfs.core.windows.net/data/lnd/
abfss://datalake@mdwdopsstdevdep63.dfs.core.windows.net/data/lnd/2021_10_05_07_58_15/MelbParkingBayData.json
abfss://datalake@mdwdopsstdevdep63.dfs.core.windows.net/data/lnd/2021_10_05_07_58_15/MelbParkingSensorData.json


<bound method SparkSession.stop of <pyspark.sql.session.SparkSession object at 0x7f6ac9edf908>>

# 4. Transform: Standardize

In [None]:
import ddo_transform.standardize as s

# Input schemas are provided by the transform package
parkingbay_schema = s.get_schema("in_parkingbay_schema")
sensordata_schema = s.get_schema("in_sensordata_schema")

# badRecordsPath for each dataset is <base_path>/__corrupt/<dataset name>
corrupt_root = os.path.join(base_path, "__corrupt")

# Read both multi-line JSON files with their explicit schemas
parkingbay_sdf = (
    spark.read
    .schema(parkingbay_schema)
    .option("badRecordsPath", os.path.join(corrupt_root, "MelbParkingBayData"))
    .option("multiLine", True)
    .json(parkingbay_filepath)
)
sensordata_sdf = (
    spark.read
    .schema(sensordata_schema)
    .option("badRecordsPath", os.path.join(corrupt_root, "MelbParkingSensorData"))
    .option("multiLine", True)
    .json(sensors_filepath)
)

# Standardize: each call returns a (standardized, malformed) pair of DataFrames
t_parkingbay_sdf, t_parkingbay_malformed_sdf = s.standardize_parking_bay(parkingbay_sdf, load_id, loaded_on)
t_sensordata_sdf, t_sensordata_malformed_sdf = s.standardize_sensordata(sensordata_sdf, load_id, loaded_on)

# Append new rows first, then bad rows, in the same order as before
append_targets = (
    (t_parkingbay_sdf, "interim.parking_bay"),
    (t_sensordata_sdf, "interim.sensor"),
    (t_parkingbay_malformed_sdf, "malformed.parking_bay"),
    (t_sensordata_malformed_sdf, "malformed.sensor"),
)
for frame, table_name in append_targets:
    frame.write.mode("append").insertInto(table_name)

StatementMeta(synspdevdep63, 3, 4, Finished, Available)

: 

# 5. Observability: create log messages

In [None]:
# Row counts of the standardized outputs
parkingbay_count = t_parkingbay_sdf.count()
sensordata_count = t_sensordata_sdf.count()

# Row counts of the malformed outputs
parkingbay_malformed_count = t_parkingbay_malformed_sdf.count()
sensordata_malformed_count = t_sensordata_malformed_sdf.count()

# Summary line: run context followed by '::'-separated [name:value] segments.
# Adjacent f-string literals are concatenated into one string.
final_message = (
    f'Standardize : Completed load {pipelineruninfo}'
    f'::[parkingbay_filepath::{parkingbay_filepath}]'
    f'::[sensors_filepath:{sensors_filepath}]'
    f'::[parkingbay_count:{parkingbay_count}]'
    f'::[sensordata_count:{sensordata_count}]'
    f'::[parkingbay_malformed_count:{parkingbay_malformed_count}]'
    f'::[sensordata_malformed_count:{sensordata_malformed_count}]'
)


# 6. Observability: logging on App Insights using OpenCensus Library

In [None]:
import logging
import os
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.log_exporter import AzureEventHandler
#from pyspark.sql.session import SparkSession
from datetime import datetime

# Enable App Insights
aiLogger = logging.getLogger("ParkingSensorLogs-Standardize")

# logging.getLogger returns the same logger object for a given name, so running
# this cell again would stack another handler and every record would be exported
# once per handler. Attach the event handler only if one is not already present.
if not any(isinstance(handler, AzureEventHandler) for handler in aiLogger.handlers):
    aiLogger.addHandler(AzureEventHandler(connection_string = 'InstrumentationKey=' + appi_key))
# Alternative: send the records as traces instead of custom events
#aiLogger.addHandler(AzureLogHandler(connection_string = 'InstrumentationKey=' + appi_key))

# warning() is used because no level is set on this logger in this cell
# (presumably the effective level is the default WARNING — TODO confirm).
aiLogger.warning("Starting at: " + datetime.now().strftime("%Y-%m-%d %H:%M:%S"))

# Extra fields attached to the record under the 'custom_dimensions' key
properties = {'custom_dimensions': {'pipeline': pipelinename, 'run_id': loadid, 'parking count': parkingbay_count, 'sensor count': sensordata_count}}
aiLogger.warning(final_message, extra=properties)
# To query this log go to the Azure Monitor and run the following kusto query (if you are using the EventHandler)
#customEvents
#|order by timestamp desc
# To query this log go to the Azure Monitor and run the following kusto query (if you are using the LogHandler)
# traces
#|order by timestamp desc



# 7. Observability: logging on Log Analytics workspace using Log4J

In [None]:
import logging
import sys
from pyspark.sql import SparkSession

sc = SparkSession.builder.getOrCreate()

# Rebuild the run context and summary message for the Log4J output.
# getPoolName / getWorkspaceName must be *called*; without the parentheses the
# f-string would contain the method objects' repr instead of the actual names.
env = mssparkutils.env
pipelineruninfo = f'[{pipelinename}]::[{loadid}]::[{env.getJobId()}]::[{env.getPoolName()}]::[{env.getWorkspaceName()}]::[{env.getUserId()}]'
final_message = f'Standardize:Completed load::[parkingbay_filepath::{parkingbay_filepath}]::[sensors_filepath:{sensors_filepath}]::[parkingbay_count:{parkingbay_count}]::[sensor_count:{sensordata_count}]::[parkingbay_malformed_count:{parkingbay_malformed_count}]::[sensor_malformed_count:{sensordata_malformed_count}]'

# Enable Log Analytics using Log4J (accessed through the Spark JVM gateway)
log4jLogger = sc._jvm.org.apache.log4j
logger = log4jLogger.LogManager.getLogger("ParkingSensorLogs-Standardize")
logger.info(pipelineruninfo)
logger.info(final_message)

# To query this log go to the log analytics workspace and run the following kusto query:
# SparkLoggingEvent_CL
# | where logger_name_s == "ParkingSensorLogs-Standardize"

# NOTE(review): the former trailing `sc.stop` (no parentheses) only referenced
# the method and never stopped the session, so it has been removed. If the
# session should really end here, call sc.stop() explicitly — TODO confirm.