## Housekeeping script for all lines
This script contains common configurations needed to transform all lines. 

### Configuration variables/setup included in this script:
1. Authentication details for connecting to the Kafka topic on the Confluent cluster
2. The checkpoint location, output root directories, and trigger interval for the data stream
3. The output directory of DF_RUN (for RCA)
4. The technical schema of Kafka messages
5. Schema of the pivot-schema table
6. Column name mapping and schema of mapped fields for running the business schema mapping logic
7. Creation of the local business-schema-mapping and source-meta-data tables (dev/testing only)

In [0]:
import pyspark.sql.types as T
from pyspark.sql.types import *
import json


### Kafka source settings for the Confluent cluster
# NOTE(review): the SASL username and password below are committed in plain
# text. They should be rotated and loaded from a secret store (for example a
# Databricks secret scope) -- TODO confirm the scope/key names with the
# platform owner before changing how they are read.
# `starting_offsets` and `kafka_topic` are not defined in this script; they
# must already exist in the notebook namespace when this cell runs.
conf = {
    "bootstrap.servers": "pkc-4rn2p.canadacentral.azure.confluent.cloud:9092",
    "security.protocol": "SASL_SSL",
    "sasl.mechanisms": "PLAIN",
    "sasl.username": "D27IHIL45XD46XTF",
    "sasl.password": "UB3NYoxYI1NYvLMUEZrHuu5nYO9ZFR4jwAJMxQckb10QxvWtjU3zP1363Y2Akgcg",
    "startingOffsets": starting_offsets,
    "topic": kafka_topic,
}


### Settings specific to the data stream
# One checkpoint directory per topic under the v5 checkpoint root.
# `kafka_topic` is not defined in this script and must already be in scope.
checkpointPath = 'dbfs:/acerta/kafka-spark/checkpoint/v5/{}/'.format(kafka_topic)
# Trigger interval for the data stream.
triggerProcessingInterval = '30 seconds'


### Output locations, all rooted under the v5 output directory
# Un-versioned alternatives previously noted here for the pivot and cube-run
# roots: 'dbfs:/acerta/output/cube/pivot/' and 'dbfs:/acerta/output/cube/run/'.
_output_root = 'dbfs:/acerta/output/v5/'
_cube_root = _output_root + 'cube/'
_rca_root = _output_root + 'rca/'

# Cube outputs: pivot tables, their schema table, and per-run output.
pivotRootDir = _cube_root + 'pivot/'
pivotSchemaDir = pivotRootDir + 'schema'
cubeRunRootDir = _cube_root + 'run/'

# RCA outputs: DF_RUN and its history.
outputDfRunRootDir = _rca_root + 'run/'
outputDfHistoryRootDir = _rca_root + 'history/'


# Ingress data configuration
# Technical schema of an incoming message, written as a Spark DDL string:
# three string identifiers plus `rows`, an array of string-to-string maps.
ingress_schema_ddl = (
    "`dataSourceId` STRING, "
    "`sourceFileId` STRING, "
    "`schemaVersion` STRING, "
    "`rows` ARRAY<MAP<STRING, STRING>>"
)
# NOTE(review): `_parse_datatype_string` is an underscore-prefixed (private)
# PySpark helper; it is kept here because `ingress_schema` is exposed as a
# parsed type object rather than as a DDL string.
ingress_schema = T._parse_datatype_string(ingress_schema_ddl)


# Pivot-schema table configuration
# Column layout of the pivot-schema table, as a Spark DDL string.
pivot_schema_table_ddl = ", ".join([
    "`dataSourceId` STRING",
    "`pivotSchemaVersion` INTEGER",
    "`pivotIndexDdl` STRING",
    "`featureList` STRING",
    "`updated_on` STRING",
])
# DDL for the pivot index columns (presumably the kind of value stored in
# `pivotIndexDdl` -- confirm against the code that writes the table).
pivot_idx_ddl = ", ".join([
    "`part_number` STRING",
    "`serial_number` STRING",
    "`timestamp` STRING",
    "`station_list` ARRAY<STRING>",
    "`measurement_date` DATE",
])


# Maps each configuration field of the business-schema-mapping table (keyword)
# to the column header used in the results dataframe (value).
config_col_mapping = dict(
    line="line",
    station_config="station",
    sensor_config="sensor",
    part_number="part_number",
    serial_number="serial_number",
    measured_time="timestamp",
)

# Schema of the mapped fields once the business/logic schema mapping has been
# applied: six nullable string columns followed by a nullable
# string-to-string map of measurements.
mapped_schema = StructType(
    [
        StructField(field_name, StringType(), True)
        for field_name in (
            "line",
            "part_number",
            "serial_number",
            "station",
            "sensor",
            "timestamp",
        )
    ]
    + [StructField("measurements", MapType(StringType(), StringType()), True)]
)

### For dev/testing only, creates local table mirrors of configuration tables
# Parquet locations of the two mirrored tables.
businessSchemaTabDir = 'dbfs:/acerta/schema/business/'
sourceDataMetaTabDir = 'dbfs:/acerta/schema/source_meta/'

# --- business schema mapping table ---------------------------------------
# Each row is (data_source_id, schema_version, schema_ddl, logic_mapping):
# `schema_ddl` is the DDL of the raw source columns and `logic_mapping` is a
# JSON document naming which source columns feed each configuration field.
business_schema_table_ddl = '`data_source_id` STRING, `schema_version` STRING, `schema_ddl` STRING, `logic_mapping` STRING'

# Three of the four data sources share the same column layout and mapping, so
# both strings are defined once and reused.
_dana_schema_ddl = "`serial_number` STRING, `process_name` STRING, `process_attribute_1` STRING, `process_attribute_2` STRING, `process_attribute_3` STRING, `process_attribute_4` STRING, `process_attribute_5` STRING, `data_element_name` STRING, `data_element_attribute_1` STRING, `data_element_attribute_2` STRING, `data_element_attribute_3` STRING, `data_element_attribute_4` STRING, `data_element_attribute_5` STRING, `location_name` STRING, `parent_location_name` STRING, `part_number` STRING, `data_value` STRING, `created` STRING, `trace` STRING, `test` STRING"
# The backslash continuations sit inside the string literal, so the leading
# spaces of each continuation line are part of the stored value; they are kept
# exactly as before so the rows written to the table do not change.
_dana_logic_mapping = '{"line":["parent_location_name"], \
                              "station_config":["location_name"], \
                              "sensor_config":["parent_location_name", "location_name", "process_name", "process_attribute_1", "process_attribute_2", "process_attribute_3", "process_attribute_4", "process_attribute_5", "data_element_name", "data_element_attribute_1", "data_element_attribute_2", "data_element_attribute_3", "data_element_attribute_4", "data_element_attribute_5"], \
                              "part_number": ["part_number"], \
                              "serial_number": ["serial_number"], \
                              "measurement":["data_value"], \
                              "measured_time":"created"}'

# NOTE(review): in the first row the mapping refers to `Part_Number`,
# `Serial_Number` and `Ang__deg__` while the DDL declares `Part Number`,
# `Serial Number` and `Ang (deg.)`; presumably column names are sanitised
# before the mapping is applied -- confirm against the mapping logic.
# NOTE(review): `measured_time` is a bare string while the other mapping
# entries are lists; left unchanged because the consumer is not visible here.
business_schema_df = spark.createDataFrame(
    data=[
        (
            'b31d684c-d84b-44fa-9873-47c561542df9',
            '6',
            '`Line` STRING, `Station` STRING, `Part Number` STRING, `Database Code` STRING, `Serial Number` STRING, `Time` STRING, `Gun` STRING, `Job` STRING, `Pass` STRING, `Torque` STRING, `Ang (deg.)` STRING',
            '{"line":["Line"], "station_config":["Line", "Station"], "sensor_config":["Line", "Station", "Gun", "Job"], "part_number": ["Part_Number"], "serial_number": ["Serial_Number"], "measurement":["Torque", "Ang__deg__"], "measured_time":"Time"}',
        ),
        ('d21ae4b8-51d8-4990-b9da-91c5cfc67927', '7', _dana_schema_ddl, _dana_logic_mapping),
        ('86494c9c-143c-2e85-3b50-2021de22c403', '9', _dana_schema_ddl, _dana_logic_mapping),
        ('2665e070-cbde-ac9f-d6d0-228d82bc75ac', '8', _dana_schema_ddl, _dana_logic_mapping),
    ],
    # createDataFrame accepts a DDL string directly, so the private
    # `T._parse_datatype_string` helper is not needed here.
    schema=business_schema_table_ddl,
)
# `registerTempTable` is deprecated; `createOrReplaceTempView` is its
# documented replacement and registers the same session-scoped view.
business_schema_df.createOrReplaceTempView("business_schema_table")
# Recursively remove any previous copy before writing the parquet mirror.
dbutils.fs.rm(businessSchemaTabDir, True)
business_schema_df.write.parquet(businessSchemaTabDir)
display(spark.sql("select * from business_schema_table"))

# --- data source meta data table (dev/testing mirror) ---------------------
# Each row is (data_source_id, client, location, line, source_type,
# folder_location); the ids match those used in the business schema table.
data_source_meta_table_ddl = '`data_source_id` STRING, `client` STRING, `location` STRING, `line` STRING, `source_type` STRING, `folder_location` STRING'
data_source_meta_df = spark.createDataFrame(
    data=[
        ('b31d684c-d84b-44fa-9873-47c561542df9', 'Borg Warner', 'XYZ', 'L14', 'csv', 'server://path'),
        ('d21ae4b8-51d8-4990-b9da-91c5cfc67927', 'Dana', 'Birmingham', 'ASC1', 'csv', 'server://path'),
        ('86494c9c-143c-2e85-3b50-2021de22c403', 'Dana', 'Dry Ridge', '700_line', 'csv', 'server://path'),
        ('2665e070-cbde-ac9f-d6d0-228d82bc75ac', 'Dana', 'Columbia', 'Loop_1', 'csv', 'server://path'),
    ],
    # createDataFrame accepts a DDL string directly, so the private
    # `T._parse_datatype_string` helper is not needed here.
    schema=data_source_meta_table_ddl,
)
# `registerTempTable` is deprecated; `createOrReplaceTempView` is its
# documented replacement and registers the same session-scoped view.
data_source_meta_df.createOrReplaceTempView("data_source_meta_table")
# Recursively remove any previous copy before writing the parquet mirror.
# `sourceDataMetaTabDir` is defined earlier in this script.
dbutils.fs.rm(sourceDataMetaTabDir, True)
data_source_meta_df.write.parquet(sourceDataMetaTabDir)
display(spark.sql("select * from data_source_meta_table"))