In [1]:
!pip install pyspark

Collecting pyspark
  Downloading pyspark-3.4.1.tar.gz (310.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m310.8/310.8 MB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.4.1-py2.py3-none-any.whl size=311285397 sha256=e7c3fb797e8d2c1cae9065556e1e124ec8a2056e6cad28b297381fa072da91c7
  Stored in directory: /root/.cache/pip/wheels/0d/77/a3/ff2f74cc9ab41f8f594dabf0579c2a7c6de920d584206e0834
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.4.1


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.streaming import StreamingContext
import json
from pyspark.sql.window import Window

In [3]:
# Mount Google Drive at /content/drive so the input files stored there are
# readable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Location passed to spark.readStream...json(); despite the name this is a
# directory, not a single file.
# NOTE(review): this is the Drive root, so presumably every file found there is
# offered to the stream - confirm, or point this at a dedicated data folder.
json_file_path = '/content/drive/MyDrive/'

In [5]:
# Initialize (or reuse) a SparkSession with the given application name
spark = SparkSession.builder.appName("PyRidesUniqueDrivers").getOrCreate()

# Explicit schema for the JSON events; streaming file sources require one up front
schema = StructType([
    StructField("driver_id", StringType(), True),
    StructField("timestamp", TimestampType(), True),
    StructField("latitude", FloatType(), True),
    StructField("longitude", FloatType(), True),
    StructField("trip_distance", FloatType(), True),
    StructField("event_type", StringType(), True),
])

# Read the JSON files under json_file_path as a stream
json_data = spark.readStream.schema(schema).option("multiline", "true").json(json_file_path)

# Select all columns from the read JSON data
data_df = json_data.select("*")

# Re-running this cell would otherwise fail because a query named
# "streamingPyRidesdf" is still active; stop the previous one first.
for active_query in spark.streams.active:
    if active_query.name == "streamingPyRidesdf":
        active_query.stop()

# Configure and start the streaming query; rows are appended to an in-memory
# table that later cells read with spark.sql(...)
query = data_df.writeStream \
    .format("memory") \
    .outputMode("append") \
    .queryName("streamingPyRidesdf") \
    .trigger(processingTime='5 seconds') \
    .start()

# The query runs in the background. Block until everything currently available
# in the source has been committed to the memory table, so the following cells
# do not race the first trigger and read an empty or partial table.
# NOTE: this would block indefinitely if files kept arriving continuously.
query.processAllAvailable()

In [10]:
# Task1
# Count the unique drivers seen in each 10-minute sliding window, with a new
# window starting every 5 minutes.

# Snapshot of the in-memory streaming table, minus rows where every column is null
stream_df = spark.sql("select * from streamingPyRidesdf").dropna(how="all")

# Grouping key: 10-minute windows over the event timestamp, sliding by 5 minutes
sliding_window = window(col("timestamp"), "10 minutes", "5 minutes")

# Aggregate: number of distinct driver ids that fall inside each window
unique_drivers = countDistinct("driver_id").alias("distinct_driver_count")

distinct_driver_count = stream_df.groupBy(sliding_window).agg(unique_drivers)

# Heading for the table printed below
print('Distinct driver count :')

# Present the windows in chronological order
sorted_data = distinct_driver_count.orderBy("window")

# Show every row (count() supplies the row limit) without truncating cell values
row_total = sorted_data.count()
sorted_data.show(row_total, truncate=False)

Distinct driver count :
+------------------------------------------+---------------------+
|window                                    |distinct_driver_count|
+------------------------------------------+---------------------+
|{2023-07-30 05:05:00, 2023-07-30 05:15:00}|47                   |
|{2023-07-30 05:10:00, 2023-07-30 05:20:00}|50                   |
|{2023-07-30 05:15:00, 2023-07-30 05:25:00}|50                   |
|{2023-07-30 05:20:00, 2023-07-30 05:30:00}|50                   |
|{2023-07-30 05:25:00, 2023-07-30 05:35:00}|50                   |
|{2023-07-30 05:30:00, 2023-07-30 05:40:00}|50                   |
|{2023-07-30 05:35:00, 2023-07-30 05:45:00}|50                   |
|{2023-07-30 05:40:00, 2023-07-30 05:50:00}|50                   |
|{2023-07-30 05:45:00, 2023-07-30 05:55:00}|49                   |
|{2023-07-30 05:50:00, 2023-07-30 06:00:00}|50                   |
|{2023-07-30 05:55:00, 2023-07-30 06:05:00}|50                   |
|{2023-07-30 06:00:00, 2023-07-30 06:1

In [11]:
# Task2
# Calculate the average trip duration for each driver within a tumbling window
# of 15 minutes and display the result for each window.

# Step 1: Select relevant columns from the streaming DataFrame
driverwise_event_data = stream_df.select("driver_id", "timestamp", "event_type")

# Step 2: Per-driver window, ordered by event time
windows_spec = Window.partitionBy('driver_id').orderBy('timestamp')

# Step 3: Timestamp of the same driver's previous event
driverwise_event_data_with_lagged_TS = driverwise_event_data.withColumn("lagged_timestamp", lag('timestamp').over(windows_spec))

# Step 4: Seconds elapsed since the driver's previous event
driverwise_event_data_with_event_duration = driverwise_event_data_with_lagged_TS.withColumn(
    "event_duration", (col("timestamp").cast('long') - col('lagged_timestamp').cast('long')))

# Step 5: Use the same per-driver, time-ordered window to look ahead.
# An unpartitioned window ordered only by driver_id leaves the row order inside
# a driver undefined (and pulls all rows into one partition), so the "next" row
# would not reliably be the driver's next event in time.
window_spec_idle_duration = windows_spec

# Step 6: Duration of the current event = the next event's "seconds since
# previous event". The last event of each driver has no successor and gets null.
driverwise_event_data_with_event_duration = driverwise_event_data_with_event_duration.withColumn(
    "event_duration_actual", lead(col("event_duration")).over(window_spec_idle_duration))

# Step 7: Filter for events with event_type "Trip"
event_trip_data = driverwise_event_data_with_event_duration.filter(
    driverwise_event_data_with_event_duration.event_type == "Trip")

# Step 8: Average trip duration (in minutes) per driver and 15-minute tumbling window
average_trip_duration_min = event_trip_data.groupBy('driver_id', window('timestamp', "15 minutes")).agg(
    round(avg('event_duration_actual') / 60, 2).alias('average trip duration(minutes)'))

# Step 9: Keep only the rows that have a non-null average trip duration
average_trip_duration = average_trip_duration_min.filter(col('average trip duration(minutes)').isNotNull())

# Step 10: Display every row without truncating cell values
average_trip_duration.show(average_trip_duration.count(), truncate=False)


+---------+------------------------------------------+------------------------------+
|driver_id|window                                    |average trip duration(minutes)|
+---------+------------------------------------------+------------------------------+
|D001     |{2023-07-30 05:00:00, 2023-07-30 05:15:00}|3.02                          |
|D001     |{2023-07-30 05:15:00, 2023-07-30 05:30:00}|0.94                          |
|D001     |{2023-07-30 05:30:00, 2023-07-30 05:45:00}|0.9                           |
|D001     |{2023-07-30 05:45:00, 2023-07-30 06:00:00}|0.65                          |
|D001     |{2023-07-30 06:00:00, 2023-07-30 06:15:00}|1.09                          |
|D001     |{2023-07-30 06:15:00, 2023-07-30 06:30:00}|1.53                          |
|D001     |{2023-07-30 06:30:00, 2023-07-30 06:45:00}|1.18                          |
|D001     |{2023-07-30 06:45:00, 2023-07-30 07:00:00}|1.04                          |
|D001     |{2023-07-30 07:00:00, 2023-07-30 07:15:00}|

In [8]:
# Task 3
# Implement PySpark Streaming code to detect idle time for each driver using session windows.
# Consider it an idle session if the driver's location remains unchanged for more than 30 minutes.

In [24]:
# Idle-time detection, approach 1: gap between a GPS event and the driver's next event.
# NOTE(review): only driver_id, timestamp and event_type are used here; latitude
# and longitude are never compared, so "idle" means "a long gap after a GPS
# event", not "location unchanged" - confirm this matches the requirement.

# Step 1: Select relevant columns from the streaming DataFrame
driverwise_event_data = stream_df.select("driver_id", "timestamp", "event_type")

# Step 2: Per-driver window, ordered by event time
windows_spec = Window.partitionBy('driver_id').orderBy('timestamp')

# Step 3: Timestamp of the same driver's previous event
driverwise_event_data_with_lagged_TS = driverwise_event_data.withColumn("lagged_timestamp", lag('timestamp').over(windows_spec))

# Step 4: Seconds elapsed since the driver's previous event
driverwise_event_data_with_event_duration = driverwise_event_data_with_lagged_TS.withColumn(
    "event_duration", (col("timestamp").cast('long') - col('lagged_timestamp').cast('long')))

# Step 5: Use the same per-driver, time-ordered window to look ahead.
# An unpartitioned window ordered only by driver_id leaves the row order inside
# a driver undefined (and pulls all rows into one partition), so the "next" row
# would not reliably be the driver's next event in time.
window_spec_idle_duration = windows_spec

# Step 6: Duration of the current event = the next event's "seconds since
# previous event". The last event of each driver has no successor and gets null.
driverwise_event_data_with_event_duration = driverwise_event_data_with_event_duration.withColumn(
    "event_duration_actual", lead(col("event_duration")).over(window_spec_idle_duration))

# Filter events with event_type "GPS"
event_GPS_data = driverwise_event_data_with_event_duration.filter(driverwise_event_data_with_event_duration.event_type == "GPS")

# Flag events whose duration exceeds 1800 seconds (30 minutes) as idle sessions
idle_sessions = event_GPS_data.withColumn("Idle Session Present", when(col("event_duration_actual") > 1800, 1).otherwise(0))

# Keep only the flagged rows and display all of them
idle_sessions_present = idle_sessions.filter(col("Idle Session Present") > 0)
idle_sessions_present.show(idle_sessions_present.count(), truncate=False)

# Longest GPS-event duration per driver, converted to minutes
driverwise_max_idle_time = event_GPS_data.groupBy(col('driver_id')).agg(
    round(max(col('event_duration_actual')) / 60, 2).alias('Max Idle Duration (minutes)')
)

# Display the computed maximum idle times for each driver
driverwise_max_idle_time.show(driverwise_max_idle_time.count())

+---------+---------+----------+----------------+--------------+---------------------+--------------------+
|driver_id|timestamp|event_type|lagged_timestamp|event_duration|event_duration_actual|Idle Session Present|
+---------+---------+----------+----------------+--------------+---------------------+--------------------+
+---------+---------+----------+----------------+--------------+---------------------+--------------------+

+---------+---------------------------+
|driver_id|Max Idle Duration (minutes)|
+---------+---------------------------+
|     D001|                        6.7|
|     D002|                       4.97|
|     D003|                       6.37|
|     D004|                       6.17|
|     D005|                       4.02|
|     D006|                       3.17|
|     D007|                       4.65|
|     D008|                       5.73|
|     D009|                       5.88|
|     D010|                       7.17|
|     D011|                       4.32|
|     D

In [33]:
# Approach 2 using session windows
# Step 1: Select relevant columns from the streaming DataFrame
driverwise_event_data = stream_df.select("driver_id", "timestamp", "event_type")

# Step 2: Per-driver window, ordered by event time
windows_spec = Window.partitionBy('driver_id').orderBy('timestamp')

# Step 3: Timestamp of the same driver's previous event
driverwise_event_data_with_lagged_TS = driverwise_event_data.withColumn("lagged_timestamp", lag('timestamp').over(windows_spec))

# Step 4: Seconds elapsed since the driver's previous event
driverwise_event_data_with_event_duration = driverwise_event_data_with_lagged_TS.withColumn(
    "event_duration", (col("timestamp").cast('long') - col('lagged_timestamp').cast('long')))

# Step 5: Use the same per-driver, time-ordered window to look ahead.
# An unpartitioned window ordered only by driver_id leaves the row order inside
# a driver undefined (and pulls all rows into one partition), so the "next" row
# would not reliably be the driver's next event in time.
window_spec_idle_duration = windows_spec

# Step 6: Duration of the current event = the next event's "seconds since
# previous event". The last event of each driver has no successor and gets null.
driverwise_event_data_with_event_duration = driverwise_event_data_with_event_duration.withColumn(
    "event_duration_actual", lead(col("event_duration")).over(window_spec_idle_duration))

# Filter events with event_type "GPS"
event_GPS_data = driverwise_event_data_with_event_duration.filter(driverwise_event_data_with_event_duration.event_type == "GPS")

# Group each driver's GPS events into session windows with a 30-minute gap and
# report the longest event duration (in minutes) inside each session, largest first
sessionWindows = event_GPS_data.groupBy("driver_id", session_window("timestamp", "30 minutes")).agg(
    round(max(col("event_duration_actual")) / 60, 2).alias('max Idle Duration (minutes)')).orderBy(col("max Idle Duration (minutes)").desc())

# Display every session row without truncating cell values
sessionWindows.show(sessionWindows.count(),truncate=False)


+---------+------------------------------------------+---------------------------+
|driver_id|session_window                            |max Idle Duration (minutes)|
+---------+------------------------------------------+---------------------------+
|D035     |{2023-07-30 05:17:12, 2023-07-30 07:41:25}|7.72                       |
|D010     |{2023-07-30 05:13:19, 2023-07-30 07:39:54}|7.17                       |
|D032     |{2023-07-30 05:14:15, 2023-07-30 07:41:18}|6.82                       |
|D016     |{2023-07-30 05:11:51, 2023-07-30 07:41:08}|6.8                        |
|D001     |{2023-07-30 05:15:57, 2023-07-30 07:41:06}|6.7                        |
|D018     |{2023-07-30 05:12:53, 2023-07-30 07:39:37}|6.57                       |
|D003     |{2023-07-30 05:14:34, 2023-07-30 07:41:19}|6.37                       |
|D023     |{2023-07-30 05:14:03, 2023-07-30 07:41:08}|6.27                       |
|D037     |{2023-07-30 05:13:26, 2023-07-30 07:39:55}|6.27                       |
|D00