# Bronze Layer
The bronze layer represents the raw, unprocessed data ingested into the data pipeline.
The code below sets up streaming data ingestion from three Kafka topics and creates a corresponding Delta table for each topic. Here's a summary of the code:

1. Configure parameters for connecting to Kafka topics and the Event Hub.
2. Create the first bronze table (`dim1_raw`) by reading streaming data from Kafka topic 1 and casting the binary key and value columns to strings.
3. Write the `dim1_raw` stream to a Delta table with `writeStream`, using append mode, schema merging, and a dedicated checkpoint location.
4. Create the second bronze table (`dim2_raw`) by following the same steps as for `dim1_raw`, but reading from Kafka topic 2.
5. Write the `dim2_raw` stream to its own Delta table in the same way.
6. Create the third bronze table (`fact_raw`, written to `/data/delta/dim3_raw` in the code below) by following the same steps as for `dim1_raw`, but reading from Kafka topic 3.
7. Write the third stream to the final Delta table in the same way.

The code performs the Kafka integration and sets up Delta tables for further processing and analysis of the streaming data.

In [None]:
from delta.tables import *
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Configure parameters for connecting to the Kafka endpoint of Azure Event Hubs.
# Every value written as "<...>" is a placeholder and must be replaced before
# the notebook is run.

Topic_Name1 = "<topic1>"  # name of the first Kafka topic
Topic_Name2 = "<topic2>"  # name of the second Kafka topic
Topic_Name3 = "<topic3>"  # name of the third Kafka topic

# Event Hubs namespace name; it is the host prefix of the Kafka endpoint below.
EH_NS_NAME = "<event-hubs-namespace-name>"

# Bootstrap server address for connecting to Kafka
BOOTSTRAP_SERVERS = f"{EH_NS_NAME}.servicebus.windows.net:9093"

# Shared access key used for authentication with the event hub.
# It is read from a Databricks secret scope so that it never appears in the
# notebook source. A key that was ever committed in plain text should be
# treated as compromised and rotated.
SAKEY = dbutils.secrets.get(scope="<secret-scope>", key="<secret-key-name>")

# Connection string for connecting to the event hub
CONN_STRING = (
    f"Endpoint=sb://{EH_NS_NAME}.servicebus.windows.net/;"
    f"SharedAccessKeyName=RootManageSharedAccessKey;SharedAccessKey={SAKEY}"
)

# Login module used for authentication with Kafka
LOGIN_MODULE = "kafkashaded.org.apache.kafka.common.security.plain.PlainLoginModule"

# SASL (Simple Authentication and Security Layer) JAAS configuration string:
# the user name is the literal "$ConnectionString" and the password is the
# full connection string built above.
EH_SASL = (
    f'{LOGIN_MODULE} required username="$ConnectionString" password="{CONN_STRING}";'
)

# --Create first bronze table--#

# Read streaming data from Kafka topic 1
df1 = (
    spark.readStream.format("kafka")  # use the Kafka streaming source
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)  # bootstrap server of the Kafka cluster
    .option("subscribe", Topic_Name1)  # topic(s) to subscribe to
    .option("kafka.sasl.mechanism", "PLAIN")  # SASL mechanism used for authentication
    .option("kafka.security.protocol", "SASL_SSL")  # SASL authentication over an SSL-encrypted connection
    .option("kafka.sasl.jaas.config", EH_SASL)  # JAAS configuration for SASL authentication
    .option("kafka.request.timeout.ms", "60000")  # timeout in milliseconds for Kafka requests
    .option("kafka.session.timeout.ms", "60000")  # session timeout in milliseconds for Kafka consumer groups
    .option("failOnDataLoss", "false")  # keep the query running even if data loss is detected
    .option("startingOffsets", "earliest")  # start reading from the earliest available offset
    .load()
)

# The Kafka source delivers key and value as binary columns; cast both to
# strings so the raw payload is readable in the bronze table.
df1 = df1.withColumn("key", col("key").cast("string")).withColumn(
    "value", col("value").cast("string")
)

# Write into delta table (/data/delta/dim1_raw)
# Create a raw Delta table from the streaming DataFrame.
(
    df1.writeStream.format("delta")
    .outputMode("append")
    .option("mergeSchema", "true")
    # A checkpoint directory/location is required to track the streaming
    # updates. If not specified, a default checkpoint directory is created at
    # /local_disk0/tmp/. The value below is a placeholder path to replace.
    .option("checkpointLocation", "</data/delta/dim1_raw_checkpoint_path>")
    # trigger() accepts keyword arguments only; run a micro-batch every 30 seconds.
    .trigger(processingTime="30 seconds")
    .start("/data/delta/dim1_raw")
)
# .toTable("dim1_raw") can be used instead of .start() to write to a named table.


#### --Create second bronze table--#####

# Read streaming data from Kafka topic 2
df2 = (
    spark.readStream.format("kafka")  # use the Kafka streaming source
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)  # bootstrap server of the Kafka cluster
    .option("subscribe", Topic_Name2)  # topic(s) to subscribe to
    .option("kafka.sasl.mechanism", "PLAIN")  # SASL mechanism used for authentication
    .option("kafka.security.protocol", "SASL_SSL")  # SASL authentication over an SSL-encrypted connection
    .option("kafka.sasl.jaas.config", EH_SASL)  # JAAS configuration for SASL authentication
    .option("kafka.request.timeout.ms", "60000")  # timeout in milliseconds for Kafka requests
    .option("kafka.session.timeout.ms", "60000")  # session timeout in milliseconds for Kafka consumer groups
    .option("failOnDataLoss", "false")  # keep the query running even if data loss is detected
    .option("startingOffsets", "earliest")  # start reading from the earliest available offset
    .load()
)

# The Kafka source delivers key and value as binary columns; cast both to
# strings so the raw payload is readable in the bronze table.
df2 = df2.withColumn("key", col("key").cast("string")).withColumn(
    "value", col("value").cast("string")
)

# Write into delta table (/data/delta/dim2_raw)
# Create a raw Delta table from the streaming DataFrame.
(
    df2.writeStream.format("delta")
    .outputMode("append")
    .option("mergeSchema", "true")
    # A checkpoint directory/location is required to track the streaming
    # updates. If not specified, a default checkpoint directory is created at
    # /local_disk0/tmp/. The value below is a placeholder path to replace.
    .option("checkpointLocation", "</data/delta/dim2_raw_checkpoint_path>")
    # trigger() accepts keyword arguments only; run a micro-batch every 30 seconds.
    .trigger(processingTime="30 seconds")
    .start("/data/delta/dim2_raw")
)
# .toTable("dim2_raw") can be used instead of .start() to write to a named table.


#### --Create third bronze table--####

# Read streaming data from Kafka topic 3
df3 = (
    spark.readStream.format("kafka")  # use the Kafka streaming source
    .option("kafka.bootstrap.servers", BOOTSTRAP_SERVERS)  # bootstrap server of the Kafka cluster
    .option("subscribe", Topic_Name3)  # topic(s) to subscribe to
    .option("kafka.sasl.mechanism", "PLAIN")  # SASL mechanism used for authentication
    .option("kafka.security.protocol", "SASL_SSL")  # SASL authentication over an SSL-encrypted connection
    .option("kafka.sasl.jaas.config", EH_SASL)  # JAAS configuration for SASL authentication
    .option("kafka.request.timeout.ms", "60000")  # timeout in milliseconds for Kafka requests
    .option("kafka.session.timeout.ms", "60000")  # session timeout in milliseconds for Kafka consumer groups
    .option("failOnDataLoss", "false")  # keep the query running even if data loss is detected
    .option("startingOffsets", "earliest")  # start reading from the earliest available offset
    .load()
)

# The Kafka source delivers key and value as binary columns; cast both to
# strings so the raw payload is readable in the bronze table.
df3 = df3.withColumn("key", col("key").cast("string")).withColumn(
    "value", col("value").cast("string")
)

# Write into delta table (/data/delta/dim3_raw)
# Create a raw Delta table from the streaming DataFrame.
# NOTE(review): the notebook introduction refers to this table as `fact_raw`,
# while the path here uses `dim3_raw` -- confirm which name is intended.
(
    df3.writeStream.format("delta")
    .outputMode("append")
    .option("mergeSchema", "true")
    # A checkpoint directory/location is required to track the streaming
    # updates. If not specified, a default checkpoint directory is created at
    # /local_disk0/tmp/. The value below is a placeholder path to replace.
    .option("checkpointLocation", "</data/delta/dim3_raw_checkpoint_path>")
    # trigger() accepts keyword arguments only; run a micro-batch every 30 seconds.
    .trigger(processingTime="30 seconds")
    .start("/data/delta/dim3_raw")
)
# .toTable("dim3_raw") can be used instead of .start() to write to a named table.

#################End of File ##############