In [0]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StringType, DoubleType

In [0]:
eh_conn = "Endpoint=sb://labb2-eh-namespace.servicebus.windows.net/; \
    SharedAccessKeyName=labb2-eh-key; \
    SharedAccessKey=---; \
    EntityPath=labb2-event-hub"

conf = {
    "eventhubs.connectionString": eh_conn,
    "eventhubs.consumerGroup": "$Default",
    "eventhubs.startingPosition": "{\"offset\":\"-1\"}"
}

starting_position = json.dumps({"enqueuedTime": t})

conf  # quick sanity check


{'eventhubs.connectionString': 'Endpoint=sb://labb2-eh-namespace.servicebus.windows.net/;     SharedAccessKeyName=labb2-eh-key;     SharedAccessKey=---;     EntityPath=labb2-event-hub',
 'eventhubs.consumerGroup': '$Default',
 'eventhubs.startingPosition': '{"offset":"-1"}'}

In [0]:
raw = spark.readStream.format("eventhubs").options(**conf).load()

raw.printSchema()

root
 |-- body: binary (nullable = true)
 |-- partition: string (nullable = true)
 |-- offset: string (nullable = true)
 |-- sequenceNumber: long (nullable = true)
 |-- enqueuedTime: timestamp (nullable = true)
 |-- publisher: string (nullable = true)
 |-- partitionKey: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)
 |-- systemProperties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [0]:
import json
from datetime import datetime, timedelta, timezone

eh_conn_raw = "Endpoint=sb://labb2-eh-namespace.servicebus.windows.net/;SharedAccessKeyName=labb2-eh-key;SharedAccessKey=---;EntityPath=labb2-event-hub"
eh_conn = sc._jvm.org.apache.spark.eventhubs.EventHubsUtils.encrypt(eh_conn_raw)

t = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%dT%H:%M:%S.%fZ")

starting_pos = {
    "offset": None,
    "seqNo": -1,
    "enqueuedTime": t,
    "isInclusive": True
}

starting_position_json = json.dumps(starting_pos)

conf = {
    "eventhubs.connectionString": eh_conn,
    "eventhubs.consumerGroup": "$Default",
    "eventhubs.startingPosition": starting_position_json,
    "eventhub.failOnDataLoss": "false"
}

raw = (spark.readStream
       .format("eventhubs")
       .options(**conf)
       .load()
       )
 
df_min = raw.select("body")

q = (df_min.writeStream
     .format("console")
     .option("truncate","false")
     .outputMode("append")
     .start()
     )

In [0]:
for s in spark.streams.active:
 try: s.stop()
 except: pass

In [0]:
import time; time.sleep(5)
print("STATUS:", q.status)
print("LAST PROGRESS:", q.lastProgress)
print("EXCEPTION:", q.exception())

STATUS: {'message': 'Stopped', 'isDataAvailable': False, 'isTriggerActive': False}
LAST PROGRESS: {'id': '0b82f6cf-72ca-4211-a3f0-3b25d67218e5', 'runId': '905cbbff-756a-4dfd-9fa4-6c2a31cda67a', 'name': None, 'timestamp': '2025-10-07T17:12:52.202Z', 'batchId': 2, 'batchDuration': 5, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0, 'durationMs': {'getOffset': 5, 'triggerExecution': 5}, 'stateOperators': [], 'sources': [{'description': 'org.apache.spark.sql.eventhubs.EventHubsSource@69f81aa7', 'startOffset': {'labb2-event-hub': {'0': 9247}}, 'endOffset': {'labb2-event-hub': {'0': 9247}}, 'latestOffset': {'labb2-event-hub': {'0': 9247}}, 'numInputRows': 0, 'inputRowsPerSecond': 0.0, 'processedRowsPerSecond': 0.0}], 'sink': {'description': 'org.apache.spark.sql.execution.streaming.ConsoleTable$@750595f3', 'numOutputRows': 0}}
EXCEPTION: None


In [0]:
query.stop()  # if you assigned the StreamingQuery to `query`


In [0]:
# Define your JSON schema (adjust to match your Event Hub messages)
json_schema = StructType() \
    .add("temperature", DoubleType()) \
    .add("humidity", DoubleType()) \
    .add("timestamp", StringType())

# Cast body to string
df_json = raw.selectExpr("cast(body as string) as json_str") \
             .select(from_json(col("json_str"), json_schema).alias("data")) \
             .select("data.*")


In [0]:
from pyspark.sql.functions import from_json, col

df_json = raw.selectExpr("cast(body as string) as json_str") \
             .select(from_json(col("json_str"), schema).alias("data")) \
             .select("data.*") \
             .na.drop()  # drop null rows caused by invalid JSON


In [0]:
raw.selectExpr("cast(body as string) as json_str").display()


In [0]:
# Write to Delta
(df_json.writeStream
       .format("delta")
       .outputMode("append")
       .option("checkpointLocation", "/mnt/delta/checkpoints/eventhub")
       .start("/mnt/delta/eventhub_data"))


<pyspark.sql.streaming.query.StreamingQuery at 0x7f650715c5f0>

In [0]:
query = (df_json.writeStream
         .format("console")
         .option("truncate", "false")
         .outputMode("append")
         .start())


In [0]:
for s in spark.streams.active:
    try:
        s.stop()
    except:
        pass


In [0]:
# Schema of your JSON messages
json_schema = StructType() \
    .add("temperature", DoubleType()) \
    .add("humidity", DoubleType()) \
    .add("timestamp", StringType())

# Read from Event Hub
df = (spark.readStream
      .format("eventhubs")
      .options(**conf)
      .load()
     )

# Event Hub payload is in 'body' as binary
df_json = df.selectExpr("cast(body as string) as json_str") \
            .select(from_json(col("json_str"), json_schema).alias("data")) \
            .select("data.*")

# Show streaming data in notebook
display(df_json)
