# <img src ='https://airsblobstorage.blob.core.windows.net/airstream/Asset 256.png' width="50px"> Stream data into Azure Databricks using Event Hubs

In [0]:
%scala

// Sanity check: try to load the Event Hubs streaming source class by name.
// Class.forName throws ClassNotFoundException when the class cannot be found,
// which presumably means the Event Hubs connector library is not attached to the cluster.
Class.forName("org.apache.spark.sql.eventhubs.EventHubsSource")

In [0]:
# Name of the Event Hub (entity) inside the namespace
eventHubName = "databricks-eh"

# Namespace-level connection string.
# NOTE: the values below are placeholders; substitute your own namespace, policy and key.
namespaceConnectionString = "Endpoint=sb://YOUREVENTHUB.servicebus.windows.net/;SharedAccessKeyName=YOUREVENTHUBACCESSPOLICY;SharedAccessKey=YOURACCESSKEY"

# Entity-level connection string: the namespace string with the hub appended as EntityPath
eventHubConnectionString = ";EntityPath=".join([namespaceConnectionString, eventHubName])

# Options handed to the "eventhubs" reader; the connection string is passed
# through the connector's EventHubsUtils.encrypt before being stored.
eventHubConfiguration = {
    "eventhubs.connectionString": sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(
        eventHubConnectionString
    ),
}

In [0]:
%python

event_hub_name = "databricks-eh"
connection_string = dbutils.secrets.get(scope="akv-bck-scope", key="EventHubCnx-Azustream") + ";EntityPath=" + event_hub_name

ehConf = {
  'eventhubs.connectionString' : connection_string
}
  
print("Event Hub Connection String From Azure Key Vault: {}".format(connection_string))

In [0]:
# Create two streaming DataFrames over the "eventhubs" source
# (Event Hub or IoT Hub, read through the EventHubs library for Databricks).

# Stream configured with the inline connection string
inputDF = spark.readStream.format("eventhubs").options(**eventHubConfiguration).load()

# Stream configured with the connection string taken from the secret scope
eventhubstreamDF = spark.readStream.format("eventhubs").options(**ehConf).load()

# A streaming DataFrame needs a schema; no explicit one is passed above because
# some sources, Event Hubs among them, provide the schema out of the box.

In [0]:
# Check that both DataFrames are streaming DataFrames.
# Only the value of a cell's last expression is rendered, so both flags are
# returned together; evaluating the first one on its own line discarded it.
(inputDF.isStreaming, eventhubstreamDF.isStreaming)

In [0]:
# Attach a memory sink to the input stream for debugging.
# Two sinks are available for debugging: the memory sink and the console sink.
streamingMemoryQuery = (
    inputDF.writeStream
    .format("memory")
    .queryName("MemoryQuery")
    .trigger(processingTime="10 seconds")
    .start()
)

# Show users the raw data and the partitions

In [0]:
#streamingMemoryQuery.lastProgress

In [0]:
# Let's now check what is in the inputDF stream:
# this shows the raw data that is in the Event Hub.
# The stream name is specific to this cell: active streaming queries must have
# unique names, and other display() cells in this notebook previously used the
# same name, which fails while an earlier display stream is still running.
display(
      inputDF,
      streamName = "DisplayInputQuery",
      processingTime = '10 seconds'
)

body,partition,offset,sequenceNumber,enqueuedTime,publisher,partitionKey,properties,systemProperties
eyJJZCI6MTMzLCJWZW5kb3JJZCI6MiwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MjhaIiwiQ2FiTGljZW5zZSI6IlQ0MzI4MDJDIiwiRHJpdmVyTGljZW5zZSI6NDc5MDQyLCJQaWNrdXBMb2NhdGlvbklkIjoxMDc= (truncated),0,33456,160,2021-07-19T15:44:26.981+0000,,,Map(),Map()
eyJJZCI6MTM1LCJWZW5kb3JJZCI6MiwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MjhaIiwiQ2FiTGljZW5zZSI6IlQ2MDk1NzlDIiwiRHJpdmVyTGljZW5zZSI6NTE1MTI0NSwiUGlja3VwTG9jYXRpb25JZCI6Nzk= (truncated),0,33672,161,2021-07-19T15:44:27.091+0000,,,Map(),Map()
eyJJZCI6MTM3LCJWZW5kb3JJZCI6MiwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MjhaIiwiQ2FiTGljZW5zZSI6IlQ2MjI1NzZDIiwiRHJpdmVyTGljZW5zZSI6NTAyMTIwNCwiUGlja3VwTG9jYXRpb25JZCI6MTY= (truncated),0,33888,162,2021-07-19T15:44:27.106+0000,,,Map(),Map()
eyJJZCI6MTM5LCJWZW5kb3JJZCI6MiwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MjlaIiwiQ2FiTGljZW5zZSI6IlQ2MTM1MTBDIiwiRHJpdmVyTGljZW5zZSI6NDk5MzE5LCJQaWNrdXBMb2NhdGlvbklkIjoxNDE= (truncated),0,34104,163,2021-07-19T15:44:28.122+0000,,,Map(),Map()
eyJJZCI6MTQxLCJWZW5kb3JJZCI6MSwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MjlaIiwiQ2FiTGljZW5zZSI6IkZBUkVMNzQiLCJEcml2ZXJMaWNlbnNlIjo0NzIzNTIsIlBpY2t1cExvY2F0aW9uSWQiOjIzMCw= (truncated),0,34320,164,2021-07-19T15:44:28.137+0000,,,Map(),Map()
eyJJZCI6MTQzLCJWZW5kb3JJZCI6MiwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MjlaIiwiQ2FiTGljZW5zZSI6IlQ1MjU0NDNDIiwiRHJpdmVyTGljZW5zZSI6NTE1MTI5MywiUGlja3VwTG9jYXRpb25JZCI6Nzk= (truncated),0,34536,165,2021-07-19T15:44:28.137+0000,,,Map(),Map()
eyJJZCI6MTQ1LCJWZW5kb3JJZCI6MiwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MjlaIiwiQ2FiTGljZW5zZSI6IlQ0NjIxNTJDIiwiRHJpdmVyTGljZW5zZSI6NTEyNDQ0NiwiUGlja3VwTG9jYXRpb25JZCI6MjM= (truncated),0,34752,166,2021-07-19T15:44:28.137+0000,,,Map(),Map()
eyJJZCI6MTQ3LCJWZW5kb3JJZCI6MSwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MzBaIiwiQ2FiTGljZW5zZSI6IlQ2MTU0MDBDIiwiRHJpdmVyTGljZW5zZSI6NTAyMDg4MSwiUGlja3VwTG9jYXRpb25JZCI6MTA= (truncated),0,34968,167,2021-07-19T15:44:29.216+0000,,,Map(),Map()
eyJJZCI6MTQ5LCJWZW5kb3JJZCI6MSwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MzBaIiwiQ2FiTGljZW5zZSI6IlQ0MDM1ODJDIiwiRHJpdmVyTGljZW5zZSI6NDQ4MDc1LCJQaWNrdXBMb2NhdGlvbklkIjoxNDg= (truncated),0,35184,168,2021-07-19T15:44:29.231+0000,,,Map(),Map()
eyJJZCI6MTUxLCJWZW5kb3JJZCI6MiwiUGlja3VwVGltZSI6IjIwMjAtMDItMDdUMDA6MDA6MzBaIiwiQ2FiTGljZW5zZSI6IlQ4MDc0NTJDIiwiRHJpdmVyTGljZW5zZSI6NDY0Mjg0LCJQaWNrdXBMb2NhdGlvbklkIjo0OCw= (truncated),0,35400,169,2021-07-19T15:44:29.231+0000,,,Map(),Map()


In [0]:
from pyspark.sql.functions import *

# Decode the binary "body" column to text so the JSON sent to the Event Hub is readable.
# The result has a single string column named "rawdata".
rawDF = inputDF.select(col("body").cast("string").alias("rawdata"))

# Render the decoded JSON strings.
# Uses its own stream name: active streaming queries must have unique names,
# so sharing one name across several display() cells fails while an earlier
# display stream is still running.
display(
      rawDF,
      streamName = "DisplayRawDataQuery",
      processingTime = '10 seconds'
)

rawdata
"{""Id"":1825,""VendorId"":2,""PickupTime"":""2020-02-07T00:06:32Z"",""CabLicense"":""T624072C"",""DriverLicense"":461097,""PickupLocationId"":142,""PassengerCount"":3,""RateCodeId"":1}"
"{""Id"":1827,""VendorId"":2,""PickupTime"":""2020-02-07T00:06:32Z"",""CabLicense"":""T424047C"",""DriverLicense"":5105834,""PickupLocationId"":48,""PassengerCount"":1,""RateCodeId"":1}"
"{""Id"":1829,""VendorId"":2,""PickupTime"":""2020-02-07T00:06:32Z"",""CabLicense"":""LEGND36"",""DriverLicense"":5103849,""PickupLocationId"":162,""PassengerCount"":3,""RateCodeId"":1}"
"{""Id"":1831,""VendorId"":2,""PickupTime"":""2020-02-07T00:06:33Z"",""CabLicense"":""T451110C"",""DriverLicense"":390797,""PickupLocationId"":186,""PassengerCount"":1,""RateCodeId"":1}"
"{""Id"":1833,""VendorId"":2,""PickupTime"":""2020-02-07T00:06:33Z"",""CabLicense"":""SKNY2"",""DriverLicense"":5135647,""PickupLocationId"":61,""PassengerCount"":6,""RateCodeId"":1}"
"{""Id"":1835,""VendorId"":1,""PickupTime"":""2020-02-07T00:06:33Z"",""CabLicense"":""T499206C"",""DriverLicense"":468992,""PickupLocationId"":79,""PassengerCount"":2,""RateCodeId"":1}"
"{""Id"":1837,""VendorId"":2,""PickupTime"":""2020-02-07T00:06:33Z"",""CabLicense"":""T603517C"",""DriverLicense"":5107700,""PickupLocationId"":79,""PassengerCount"":1,""RateCodeId"":1}"
"{""Id"":1839,""VendorId"":1,""PickupTime"":""2020-02-07T00:06:34Z"",""CabLicense"":""T607714C"",""DriverLicense"":5137754,""PickupLocationId"":234,""PassengerCount"":2,""RateCodeId"":1}"
"{""Id"":1841,""VendorId"":2,""PickupTime"":""2020-02-07T00:06:34Z"",""CabLicense"":""COMM289"",""DriverLicense"":489168,""PickupLocationId"":239,""PassengerCount"":1,""RateCodeId"":1}"
"{""Id"":1843,""VendorId"":2,""PickupTime"":""2020-02-07T00:06:34Z"",""CabLicense"":""T617537C"",""DriverLicense"":5104612,""PickupLocationId"":74,""PassengerCount"":3,""RateCodeId"":1}"


In [0]:
from pyspark.sql.types import *

# Schema used to extract the actual data from the raw JSON string.
# Every field is nullable; DriverLicense is kept as a string while the other
# identifiers and counters are integers.
schema = StructType([
    StructField("Id", IntegerType(), True),
    StructField("VendorId", IntegerType(), True),
    StructField("PickupTime", TimestampType(), True),
    StructField("CabLicense", StringType(), True),
    StructField("DriverLicense", StringType(), True),
    StructField("PickupLocationId", IntegerType(), True),
    StructField("PassengerCount", IntegerType(), True),
    StructField("RateCodeId", IntegerType(), True),
])

In [0]:
# Parse the JSON payload into typed columns using `schema`.
# The frame is built from inputDF rather than from the previous rawDF so the
# cell is idempotent: `rawDF = rawDF.select(...)` consumed the "rawdata" column
# and therefore failed when the cell was run a second time.
rawDF = (
            inputDF
                .select(
                          # Decode the binary body and parse it as JSON in one step
                          from_json(
                                      col("body").cast("string"),
                                      schema
                                   )
                          .alias("taxidata")
                       )
                # Promote each field of the parsed struct to a top-level column
                .select(
                          "taxidata.Id",
                          "taxidata.VendorId",
                          "taxidata.PickupTime",
                          "taxidata.CabLicense",
                          "taxidata.DriverLicense",
                          "taxidata.PickupLocationId",
                          "taxidata.PassengerCount",
                          "taxidata.RateCodeId",
                       )
        )

In [0]:
# Derive TripType from RateCodeId, then drop the source column.
# RateCodeId is declared as an integer in `schema`, so it is compared with the
# integer literal 6 instead of the string "6", which relied on an implicit cast.
transformedDF = (
                    rawDF
                        .withColumn(
                                      "TripType",
                                      when(col("RateCodeId") == 6, "SharedTrip")
                                          .otherwise("SoloTrip")
                                   )
                        .drop("RateCodeId")
                )

In [0]:
# Keep only the rows whose PassengerCount is greater than zero
transformedDF = transformedDF.filter(col("PassengerCount") > 0)

In [0]:
# Render the transformed stream.
# Uses its own stream name: active streaming queries must have unique names,
# so sharing one name across several display() cells fails while an earlier
# display stream is still running.
display(
      transformedDF,
      streamName = "DisplayTransformedQuery",
      processingTime = '10 seconds'
)

Id,VendorId,PickupTime,CabLicense,DriverLicense,PickupLocationId,PassengerCount,TripType
4,2,2020-02-07T00:00:00.000+0000,VG354,5012911,161,2,SoloTrip
8,2,2020-02-07T00:00:00.000+0000,T489378C,460439,234,5,SoloTrip
9,2,2020-02-07T00:00:00.000+0000,T602840C,460019,113,1,SoloTrip
7,2,2020-02-07T00:00:00.000+0000,T526628C,370606,48,1,SoloTrip
10,2,2020-02-07T00:00:01.000+0000,T515369C,493727,137,4,SoloTrip
1,1,2020-02-07T00:00:00.000+0000,TAC399,5131685,170,1,SoloTrip
14,1,2020-02-07T00:00:02.000+0000,T415818C,414057,164,3,SoloTrip
16,2,2020-02-07T00:00:02.000+0000,T606623C,489189,234,5,SoloTrip
18,1,2020-02-07T00:00:02.000+0000,T462832C,5066390,148,1,SoloTrip
5,2,2020-02-07T00:00:00.000+0000,T517119C,429996,239,1,SoloTrip


In [0]:
from pyspark.sql.functions import *

# Let's chain everything together: read, decode, parse, flatten, derive, filter.

transformedDF = (
                    spark
                        .readStream
                        .format("eventhubs")                      #1. Read messages from Event Hub
                        .options(**eventHubConfiguration)
                        .load()

                        .withColumn(                              #2. Convert raw binary body data to a string
                                      "rawdata",
                                      col("body").cast("string")
                                   )

                        .select(
                                  from_json(                      #3. Parse the JSON string into a struct and give it an alias
                                              col("rawdata"),
                                              schema
                                           )
                                  .alias("taxidata")
                               )

                        .select(                                  #4. Extract the data and create separate columns from the JSON
                                  "taxidata.Id",
                                  "taxidata.VendorId",
                                  "taxidata.PickupTime",
                                  "taxidata.CabLicense",
                                  "taxidata.DriverLicense",
                                  "taxidata.PickupLocationId",
                                  "taxidata.PassengerCount",
                                  "taxidata.RateCodeId",
                               )

                        .withColumn("TripType",                  #5. Add Transformations: Derive a new column & Drop a column
                                        when(
                                                # RateCodeId is an integer in `schema`: compare with the
                                                # integer 6, not the string "6" (implicit cast).
                                                col("RateCodeId") == 6,
                                                    "SharedTrip"
                                            )
                                        .otherwise("SoloTrip")
                                   )
                        .drop("RateCodeId")

                        .where("PassengerCount > 0")             #6. Keep only rows with at least one passenger
                )

In [0]:
# Write the parsed (untransformed) stream to the data lake as CSV files.
# Settings: query name "RawTaxiQuery", output under /mnt/datalake/Raw/,
# checkpoint under /mnt/datalake/checkpointRaw, one micro-batch every 10 seconds.
rawStreamingFileQuery = (
    rawDF.writeStream
    .format("csv")
    .queryName("RawTaxiQuery")
    .option("checkpointLocation", "/mnt/datalake/checkpointRaw")
    .option("path", "/mnt/datalake/Raw/")
    .trigger(processingTime="10 seconds")
    .start()
)

In [0]:

# Write the transformed stream to the data lake as Parquet files.
# Settings: query name "ProcessedTaxiQuery", output under /mnt/datalake/Processed/,
# checkpoint under /mnt/datalake/checkpointProcessed, one micro-batch every 3 seconds.
processedStreamingFileQuery = (
    transformedDF.writeStream
    .format("parquet")
    .queryName("ProcessedTaxiQuery")
    .option("checkpointLocation", "/mnt/datalake/checkpointProcessed")
    .option("path", "/mnt/datalake/Processed/")
    .trigger(processingTime="3 seconds")
    .start()
)

In [0]:
# Register the transformed stream as the temp view "ProcessedTaxiData"
# (replacing any existing view of that name) so it can be queried with SQL.
transformedDF.createOrReplaceTempView("ProcessedTaxiData")

In [0]:
# Count rows per pickup location by running SQL against the ProcessedTaxiData temp view.
sqlDF = spark.sql("SELECT PickupLocationId, COUNT(*) FROM ProcessedTaxiData GROUP BY PickupLocationId")

# Render the aggregated result in the notebook
display(sqlDF)

PickupLocationId,count(1)
148,4
230,3
224,1
132,1
142,4
164,8
48,2
262,1
107,4
231,2


In [0]:
%sql

-- Count rows per pickup location, read from the ProcessedTaxiData temp view
SELECT PickupLocationId, COUNT(*)
FROM ProcessedTaxiData
GROUP BY PickupLocationId

PickupLocationId,count(1)
148,1
137,1
211,1
26,1
230,3
233,1
140,1
142,2
164,3
48,2


In [0]:
# Load the static taxi-zone lookup file (CSV with a header row; column types are inferred)
taxiZones = spark.read.csv(
    "/mnt/datalake/StaticData/TaxiZones.csv",
    header=True,
    inferSchema=True,
)

display(taxiZones)

LocationID,Borough,Zone,service_zone
1,EWR,Newark Airport,EWR
2,Queens,Jamaica Bay,Boro Zone
3,Bronx,Allerton/Pelham Gardens,Boro Zone
4,Manhattan,Alphabet City,Yellow Zone
5,Staten Island,Arden Heights,Boro Zone
6,Staten Island,Arrochar/Fort Wadsworth,Boro Zone
7,Queens,Astoria,Boro Zone
8,Queens,Astoria Park,Boro Zone
9,Queens,Auburndale,Boro Zone
10,Queens,Baisley Park,Boro Zone


In [0]:
# Register the zone lookup as the temp view "TaxiZones" so SQL cells can join against it
taxiZones.createOrReplaceTempView("TaxiZones")

In [0]:
%sql

-- Trip counts per pickup zone, restricted to zones in the Manhattan borough.
-- Joins the ProcessedTaxiData view to the TaxiZones lookup on the location id.
SELECT
    z.Zone,
    COUNT(*) AS TripCount
FROM ProcessedTaxiData AS p
JOIN TaxiZones AS z
    ON p.PickupLocationId = z.LocationID
WHERE z.Borough = 'Manhattan'
GROUP BY z.Zone

Zone,TripCount
Upper East Side South,12
Yorkville West,5
TriBeCa/Civic Center,10
Upper West Side North,5
East Harlem South,2
Hudson Sq,3
Lenox Hill East,3
Upper East Side North,4
UN/Turtle Bay South,3
Union Sq,25
