In [1]:
from datetime import datetime

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import *
from pyspark.sql.types import StructType, StructField, StringType
from pyspark.sql.window import Window
# Start (or reuse) the Spark session that backs every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("EventStatusTable")
    .getOrCreate()
)

# Schema of the event_status table: two non-nullable string columns.
# event_time is stored as a zero-padded "HH:MM" string in the sample data below.
event_status_schema = StructType(
    [
        StructField(column_name, StringType(), False)
        for column_name in ("event_time", "status")
    ]
)

# Sample rows: one (event_time, status) pair per event.
event_status_data = [
    ('10:01', 'on'),
    ('10:02', 'on'),
    ('10:03', 'on'),
    ('10:04', 'off'),
    ('10:07', 'on'),
    ('10:08', 'on'),
    ('10:09', 'off'),
    ('10:11', 'on'),
    ('10:12', 'off'),
]

# Materialise the DataFrame and register it as the "Event" temp view
# so it can be queried through spark.sql.
event_status_df = spark.createDataFrame(
    event_status_data,
    schema=event_status_schema,
)
event_status_df.createOrReplaceTempView("Event")

# Display the table contents.
event_status_df.show()


+----------+------+
|event_time|status|
+----------+------+
|     10:01|    on|
|     10:02|    on|
|     10:03|    on|
|     10:04|   off|
|     10:07|    on|
|     10:08|    on|
|     10:09|   off|
|     10:11|    on|
|     10:12|   off|
+----------+------+



In [9]:
# Split the "Event" view into consecutive runs and summarise each run.
#
#   cte  : attaches the previous row's status (ordered by event_time); the
#          first row has no predecessor and falls back to its own status.
#   cte2 : running count of "off -> on" transitions; every such transition
#          opens a new run, so the running count serves as the run id
#          (group_key).
#   final: first event_time, last event_time and row count per run.
#
# String literals are single-quoted throughout: a double-quoted "on" is only
# a string under Spark's default parser settings and would be read as an
# identifier if ANSI double-quoted identifiers were enabled.
# ORDER BY group_key makes the displayed row order deterministic; GROUP BY
# alone gives no ordering guarantee.
spark.sql(
"""
    with cte as (
    select *,
    lag(status, 1, status) over(order by event_time asc) as prev_status
    from Event
    ),
    cte2 as (
    select *,
    sum(case when status = 'on' and prev_status = 'off' then 1 else 0 end) over(order by event_time) as group_key
    from cte)
    select min(event_time), max(event_time), count(*)
    from cte2
    group by group_key
    order by group_key
""").show()

+---------------+---------------+--------+
|min(event_time)|max(event_time)|count(1)|
+---------------+---------------+--------+
|          10:01|          10:04|       4|
|          10:07|          10:09|       3|
|          10:11|          10:12|       2|
+---------------+---------------+--------+



In [10]:
# DataFrame-API version of the run-detection query over the "Event" view.
df = event_status_df

# One global window ordered by event_time (no partition key is available,
# so Spark evaluates it on a single partition).
window_spec = Window.orderBy("event_time")

# Compute previous status.
# The third argument of lag() is a *literal* default, so lag("status", 1, "status")
# would put the string "status" in the first row instead of that row's own
# status. coalesce() with the status column reproduces the SQL
# `lag(status, 1, status)`; the two agree because status is non-nullable.
cte = df.withColumn(
    "prev_status",
    F.coalesce(F.lag("status", 1).over(window_spec), F.col("status")),
)

# Compute group_key: a row opens a new run when it is "on" and the row before
# it was "off". Summing that flag over the ordered window (default frame:
# start of data up to the current row) numbers the runs 0, 1, 2, ...
starts_new_run = (F.col("status") == "on") & (F.col("prev_status") == "off")
cte2 = cte.withColumn(
    "group_key",
    F.sum(F.when(starts_new_run, 1).otherwise(0)).over(window_spec),
)

# Aggregate results: first/last event_time and number of events per run.
result = cte2.groupBy("group_key").agg(
    F.min("event_time").alias("start_time"),
    F.max("event_time").alias("end_time"),
    F.count("event_time").alias("event_count"),
)

# groupBy gives no ordering guarantee; sort only for display so `result`
# itself is unchanged.
result.orderBy("group_key").show()

+---------+----------+--------+-----------+
|group_key|start_time|end_time|event_count|
+---------+----------+--------+-----------+
|        0|     10:01|   10:04|          4|
|        1|     10:07|   10:09|          3|
|        2|     10:11|   10:12|          2|
+---------+----------+--------+-----------+

