# <img src ='https://airsblobstorage.blob.core.windows.net/airstream/Asset 256.png' width="50px"> Stream data into Azure Databricks using Event Hubs

In [0]:
%scala

// Look up the Event Hubs streaming source class by its fully qualified name.
// Class.forName throws ClassNotFoundException when the class is not on the classpath.
// NOTE(review): presumably a sanity check that the Event Hubs Spark connector
// library is attached to the cluster -- confirm.
Class.forName("org.apache.spark.sql.eventhubs.EventHubsSource")

In [0]:
# Connection string of the Event Hubs namespace.
# The values below are placeholders and must be replaced before running.
# NOTE(review): do not commit a real access key here; consider reading it from a secret scope instead.
namespaceConnectionString = "Endpoint=sb://YOUREVENTHUB.servicebus.windows.net/;SharedAccessKeyName=YOUREVENTHUBACCESSPOLICY;SharedAccessKey=YOURACCESSKEY"

# Name of the Event Hub inside the namespace
eventHubName = "databricks-eh"

# Full connection string = namespace connection string + the EntityPath of the Event Hub
eventHubConnectionString = "".join([namespaceConnectionString, ";EntityPath=", eventHubName])

# Options passed to the "eventhubs" reader; the connection string is handed over
# in the form returned by the connector's EventHubsUtils.encrypt helper.
eventHubConfiguration = {}
eventHubConfiguration['eventhubs.connectionString'] = (
    sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(eventHubConnectionString)
)

In [0]:
%python

event_hub_name = "databricks-eh"
connection_string = dbutils.secrets.get(scope="akv-bck-scope", key="EventHubCnx-Azustream") + ";EntityPath=" + event_hub_name

ehConf = {
  'eventhubs.connectionString' : connection_string
}
  
print("Event Hub Connection String From Azure Key Vault: {}".format(connection_string))

In [0]:
# Create the streaming DataFrames.
# Both read from the "eventhubs" source; they differ only in the configuration used.

# Stream configured with the connection string defined directly in this notebook
inputDF = spark.readStream.format("eventhubs").options(**eventHubConfiguration).load()

# Stream configured with the connection string built from the secret scope
eventhubstreamDF = spark.readStream.format("eventhubs").options(**ehConf).load()

# A schema must be provided for a streaming DataFrame.
# Some data sources, such as Event Hubs, provide the schema out of the box.

In [0]:
# Check whether each DataFrame is a streaming DataFrame.
# Only the last expression of a cell is displayed, so both flags are returned
# together instead of as two separate bare expressions.
{
    "inputDF": inputDF.isStreaming,
    "eventhubstreamDF": eventhubstreamDF.isStreaming,
}

In [0]:
# Attach a memory sink to the input stream for debugging.
# Remember there are two sinks available for debugging: the memory sink and the console sink.
streamingMemoryQuery = (
    inputDF.writeStream
        .format("memory")
        .queryName("MemoryQuery")
        .trigger(processingTime="10 seconds")   # start a micro-batch every 10 seconds
        .start()
)

# Show users the raw data and the partitions

In [0]:
#streamingMemoryQuery.lastProgress

In [0]:
# Let's now look at what is in the inputDF stream.
# This shows the raw data that is in the Event Hub.
display(inputDF, streamName="DisplayMemoryQuery", processingTime="10 seconds")

In [0]:
from pyspark.sql.functions import *

# Cast the binary "body" column to a string so the JSON text coming from
# Event Hub becomes readable, and keep only that column.
rawDF = (
    inputDF
        .withColumn("rawdata", col("body").cast("string"))
        .select("rawdata")
)

# Active streaming queries must have unique names, so this display gets its own
# stream name instead of reusing "DisplayMemoryQuery" from the inputDF display.
display(
    rawDF,
    streamName="DisplayRawDataQuery",
    processingTime="10 seconds"
)

In [0]:
from pyspark.sql.types import *

# To extract the actual data from the raw JSON string, define the schema of one message.
# Every field is nullable, which is the default for StructField.
schema = StructType([
    StructField("Id", IntegerType()),
    StructField("VendorId", IntegerType()),
    StructField("PickupTime", TimestampType()),
    StructField("CabLicense", StringType()),
    StructField("DriverLicense", StringType()),
    StructField("PickupLocationId", IntegerType()),
    StructField("PassengerCount", IntegerType()),
    StructField("RateCodeId", IntegerType()),
])

In [0]:
# Parse the JSON payload into typed columns.
# The frame is built from inputDF rather than from rawDF itself, so the cell is
# idempotent: re-running it no longer fails because "rawdata" was already consumed
# by a previous run.
rawDF = (
    inputDF
        .select(
            # body (binary) -> JSON string -> struct described by `schema`
            from_json(col("body").cast("string"), schema).alias("taxidata")
        )
        .select(
            "taxidata.Id",
            "taxidata.VendorId",
            "taxidata.PickupTime",
            "taxidata.CabLicense",
            "taxidata.DriverLicense",
            "taxidata.PickupLocationId",
            "taxidata.PassengerCount",
            "taxidata.RateCodeId",
        )
)

In [0]:
# Derive TripType from RateCodeId, then drop RateCodeId.
# RateCodeId is declared as an integer in the schema, so it is compared with the
# integer 6 instead of the string "6" (no reliance on implicit type coercion).
transformedDF = (
    rawDF
        .withColumn(
            "TripType",
            when(col("RateCodeId") == 6, "SharedTrip").otherwise("SoloTrip")
        )
        .drop("RateCodeId")
)

In [0]:
# Keep only the rows that have at least one passenger
transformedDF = transformedDF.filter("PassengerCount > 0")

In [0]:
# Show the transformed stream.
# Active streaming queries must have unique names, so this display gets its own
# stream name instead of reusing "DisplayMemoryQuery" from the inputDF display.
display(
    transformedDF,
    streamName="DisplayTransformedQuery",
    processingTime="10 seconds"
)

In [0]:
from pyspark.sql.functions import *

# Let's chain everything together: the same steps as the previous cells,
# written as a single pipeline from Event Hub to the transformed stream.

transformedDF = (
    spark
        .readStream
        .format("eventhubs")                                  # 1. Read messages from Event Hub
        .options(**eventHubConfiguration)
        .load()

        .withColumn("rawdata", col("body").cast("string"))    # 2. Convert the raw binary body to a string

        .select(                                              # 3. Parse the JSON string into a struct and alias it
            from_json(col("rawdata"), schema).alias("taxidata")
        )

        .select(                                              # 4. Extract the struct fields into separate columns
            "taxidata.Id",
            "taxidata.VendorId",
            "taxidata.PickupTime",
            "taxidata.CabLicense",
            "taxidata.DriverLicense",
            "taxidata.PickupLocationId",
            "taxidata.PassengerCount",
            "taxidata.RateCodeId",
        )

        .withColumn(                                          # 5. Derive a new column, then drop the source column
            "TripType",
            # RateCodeId is an integer in the schema, so compare with the integer 6
            # rather than the string "6" (no reliance on implicit type coercion).
            when(col("RateCodeId") == 6, "SharedTrip").otherwise("SoloTrip")
        )
        .drop("RateCodeId")

        .where("PassengerCount > 0")                          # 6. Filter out rows without passengers
)

In [0]:
# Write the rawDF stream to the data lake as CSV files.
# NOTE(review): the previous comment on this cell talked about writing Parquet,
# but this query uses the "csv" format; the Parquet sink is the
# ProcessedTaxiQuery defined in the next cell.
rawStreamingFileQuery = (
    rawDF.writeStream
        .queryName("RawTaxiQuery")
        .format("csv")
        .option("path", "/mnt/datalake/Raw/")                          # output folder
        .option("checkpointLocation", "/mnt/datalake/checkpointRaw")   # checkpoint folder of this query
        .trigger(processingTime="10 seconds")
        .start()
)

In [0]:

# Write the transformedDF stream to the data lake as Parquet files.
processedStreamingFileQuery = (
    transformedDF.writeStream
        .queryName("ProcessedTaxiQuery")
        .format("parquet")
        .option("path", "/mnt/datalake/Processed/")                          # output folder
        .option("checkpointLocation", "/mnt/datalake/checkpointProcessed")   # checkpoint folder of this query
        .trigger(processingTime="3 seconds")
        .start()
)

In [0]:
# Register the transformed stream as the temporary view "ProcessedTaxiData"
# so that the following cells can query it with SQL.
transformedDF.createOrReplaceTempView("ProcessedTaxiData")

In [0]:
# Count the rows of the ProcessedTaxiData view per pickup location, using SQL from Python.
sqlDF = spark.sql(
    "SELECT PickupLocationId, COUNT(*) FROM ProcessedTaxiData GROUP BY PickupLocationId"
)

display(sqlDF)

In [0]:
%sql

-- Count the rows of the ProcessedTaxiData view per pickup location.
SELECT PickupLocationId, COUNT(*)
FROM ProcessedTaxiData
GROUP BY PickupLocationId

In [0]:
# Load the static taxi zone data from a CSV file.
# The first line of the file is used as the header and column types are inferred.
taxiZones = (
    spark.read
        .options(header="true", inferSchema="true")
        .csv("/mnt/datalake/StaticData/TaxiZones.csv")
)

display(taxiZones)

In [0]:
# Register the taxi zone data as the temporary view "TaxiZones"
# so that it can be joined with other views in SQL.
taxiZones.createOrReplaceTempView("TaxiZones")

In [0]:
%sql

-- Join the ProcessedTaxiData view with the TaxiZones view on the pickup location,
-- keep only the zones whose Borough is 'Manhattan', and count the rows per zone.
SELECT z.Zone
    , COUNT(*) AS TripCount
FROM ProcessedTaxiData p
  INNER JOIN TaxiZones z ON p.PickupLocationId = z.LocationID
WHERE z.Borough = 'Manhattan'
GROUP BY z.Zone