# Question 1
![image.png](attachment:image.png)

# Install Kafka and Zookeeper

In [None]:
!curl -sSOL https://downloads.apache.org/kafka/3.5.0/kafka_2.12-3.5.0.tgz
!tar -xzf kafka_2.12-3.5.0.tgz

In [None]:
!echo "Starting ZooKeeper service..."
!./kafka_2.12-3.5.0/bin/zookeeper-server-start.sh -daemon ./kafka_2.12-3.5.0/config/zookeeper.properties

!echo "Starting Kafka service..."
!./kafka_2.12-3.5.0/bin/kafka-server-start.sh -daemon ./kafka_2.12-3.5.0/config/server.properties

!echo "Waiting for 10 secs until Kafka and ZooKeeper services are up and running..."

!sleep 10

!ps -ef | grep kafka


Starting ZooKeeper service...
Starting Kafka service...
Waiting for 10 secs until Kafka and ZooKeeper services are up and running...
root       10429       1  0 18:10 ?        00:00:06 java -Xmx512M -Xms512M -server -XX:+UseG1GC -XX:MaxGCPauseMillis=20 -XX:InitiatingHeapOccupancyPercent=35 -XX:+ExplicitGCInvokesConcurrent -XX:MaxInlineLevel=15 -Djava.awt.headless=true -Xlog:gc*:file=/content/kafka_2.12-3.5.0/bin/../logs/zookeeper-gc.log:time,tags:filecount=10,filesize=100M -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.ssl=false -Dkafka.logs.dir=/content/kafka_2.12-3.5.0/bin/../logs -Dlog4j.configuration=file:./kafka_2.12-3.5.0/bin/../config/log4j.properties -cp /content/kafka_2.12-3.5.0/bin/../libs/activation-1.1.1.jar:/content/kafka_2.12-3.5.0/bin/../libs/aopalliance-repackaged-2.6.1.jar:/content/kafka_2.12-3.5.0/bin/../libs/argparse4j-0.7.0.jar:/content/kafka_2.12-3.5.0/bin/../libs/audience-annotations-0.13.0.jar:/cont

## Running Kafka and Zookeeper in daemon mode on port 9092


In [None]:
!./kafka_2.12-3.5.0/bin/kafka-topics.sh --create --bootstrap-server 127.0.0.1:9092 --replication-factor 1 --partitions 1 --topic yelp_reviews

Error while executing topic command : Topic 'yelp_reviews' already exists.
[2023-08-08 18:56:00,817] ERROR org.apache.kafka.common.errors.TopicExistsException: Topic 'yelp_reviews' already exists.
 (kafka.admin.TopicCommand$)


## Create new topics in kafka


In [None]:
!./kafka_2.12-3.5.0/bin/kafka-topics.sh --describe --bootstrap-server 127.0.0.1:9092 --topic yelp_reviews

Topic: yelp_reviews	TopicId: bTYawxo6R3ujR6EBj8a_Aw	PartitionCount: 1	ReplicationFactor: 1	Configs: 
	Topic: yelp_reviews	Partition: 0	Leader: 0	Replicas: 0	Isr: 0


## Install OpenJDK

In [None]:
!echo "Installing OpenJDK 8 JDK..."

!apt-get install openjdk-8-jdk-headless -qq > /dev/null

Installing OpenJDK 8 JDK...


## Install Kafka's client


In [None]:
!pip install kafka-python



## Importing the necessary packages

In [None]:
import pandas as pd
from kafka import KafkaProducer, KafkaConsumer
import json
import time
from pandas import Timestamp
import threading

# Reading the JSON data into pandas dataframe

In [None]:
yelp_reviews = pd.read_json('yelp_reviews_condensed.json', lines = True)
dat = yelp_reviews.to_dict ("reviews")

  dat = yelp_reviews.to_dict ("reviews")


In [None]:
# Number of datapoints and columns
no_of_datapoints, no_of_columns = len(yelp_reviews), len(yelp_reviews.columns)
print("Data/Sample Size: ", no_of_datapoints)
print("Number of columns: ", no_of_columns)

Data/Sample Size:  1000
Number of columns:  9


# Kafka Producer Logic

In [None]:
# Kafka topic
topic = "yelp_reviews"

# Kafka producer
producer = KafkaProducer(bootstrap_servers="localhost:9092")
producer.flush()

# Iterate over the datapoints and send 10 records -> 10 secs sleep time -> total 100 records
# to Kafka
row_no = 0
count = 0
for row in dat:
  row_no += 1
  print(row)
  row['date'] = row['date'].strftime('%Y-%m-%d %H:%M:%S')
  message = json.dumps(row).encode("utf-8")

  if count > 9:
    break

  producer.send(topic, message)
  if row_no == 10:
    print('Batch', count+1, 'completed')
    row_no = 0
    count += 1
    time.sleep(10)
# Flush the producer
producer.flush()

{'review_id': 'KU_O5udG6zpxOg-VcAEodg', 'user_id': 'mh_-eMZ6K5RLWhZyISBhwA', 'business_id': 'XQfwVwDr-v0ZS3_CbbE5Xw', 'stars': 3, 'useful': 0, 'funny': 0, 'cool': 0, 'text': "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", 'date': Timestamp('2018-07-07 22:09:11')}
{'review_id': 'BiTunyQ73aT9WBnpR9DZGw', 'user_id': 'OyoGAe7OKpv6SyGZT5g77Q', 'business_id': '7ATYjTIgM3jUlt4UM3IypQ', 'stars': 5, 'useful': 1, 'funny': 0, 'cool': 1, 'text': "I've taken a lot of spin classes over the years, and nothing compares to the classes at Bo

## View the data from the topic

In [None]:
!./kafka_2.12-3.5.0/bin/kafka-console-consumer.sh --bootstrap-server localhost:9092 --topic yelp_reviews --from-beginning --max-messages 10

{"review_id": "KU_O5udG6zpxOg-VcAEodg", "user_id": "mh_-eMZ6K5RLWhZyISBhwA", "business_id": "XQfwVwDr-v0ZS3_CbbE5Xw", "stars": 3, "useful": 0, "funny": 0, "cool": 0, "text": "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it's other locations in NJ and never had a bad experience. \n\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", "date": "2018-07-07 22:09:11"}
{"review_id": "BiTunyQ73aT9WBnpR9DZGw", "user_id": "OyoGAe7OKpv6SyGZT5g77Q", "business_id": "7ATYjTIgM3jUlt4UM3IypQ", "stars": 5, "useful": 1, "funny": 0, "cool": 1, "text": "I've taken a lot of spin classes over the years, and nothing compares to the classes at Body Cycle. F

# Kafka Consumer Logic

In [None]:
def message_consumer(consumer):

  try:
    while True:  # Run until interrupted by keyboard
      for msg in consumer:
        record = msg.value
        print(record)
        time.sleep(5)  # Sleep for 5 seconds before reading more messages

  except Exception as e:
    print(f"Error occurred while consuming messages: {e}")

  finally:

    consumer.close()

In [None]:
consumer = KafkaConsumer(topic, bootstrap_servers='localhost:9092', auto_offset_reset='earliest')

# Start consuming messages in a separate thread
consumption_thread = threading.Thread(target=message_consumer, args=(consumer,))
consumption_thread.start()
# Main thread sleep time is 30 seconds
time.sleep(30)
consumer.close()
consumption_thread.join()




b'{"review_id": "KU_O5udG6zpxOg-VcAEodg", "user_id": "mh_-eMZ6K5RLWhZyISBhwA", "business_id": "XQfwVwDr-v0ZS3_CbbE5Xw", "stars": 3, "useful": 0, "funny": 0, "cool": 0, "text": "If you decide to eat here, just be aware it is going to take about 2 hours from beginning to end. We have tried it multiple times, because I want to like it! I have been to it\'s other locations in NJ and never had a bad experience. \\n\\nThe food is good, but it takes a very long time to come out. The waitstaff is very young, but usually pleasant. We have just had too many experiences where we spent way too long waiting. We usually opt for another diner or restaurant on the weekends, in order to be done quicker.", "date": "2018-07-07 22:09:11"}'
b'{"review_id": "BiTunyQ73aT9WBnpR9DZGw", "user_id": "OyoGAe7OKpv6SyGZT5g77Q", "business_id": "7ATYjTIgM3jUlt4UM3IypQ", "stars": 5, "useful": 1, "funny": 0, "cool": 1, "text": "I\'ve taken a lot of spin classes over the years, and nothing compares to the classes at Body

KeyboardInterrupt: ignored

b'{"review_id": "JrIxlS1TzJ-iCu79ul40cQ", "user_id": "eUta8W_HdHMXPzLBBZhL1A", "business_id": "04UD14gamNjLY0IDYVhHJg", "stars": 1, "useful": 1, "funny": 2, "cool": 1, "text": "I am a long term frequent customer of this establishment. I just went in to order take out (3 apps) and was told they\'re too busy to do it. Really? The place is maybe half full at best. Does your dick reach your ass? Yes? Go fuck yourself! I\'m a frequent customer AND great tipper. Glad that Kanella just opened. NEVER going back to dmitris!", "date": "2015-09-23 23:10:31"}'


# Question 2
![image.png](attachment:image.png)


# Importing necessary packages

In [None]:
! pip install pyspark



In [2]:
import os
import numpy as np
import pandas as pd
import warnings
import threading
from pyspark.sql import SparkSession
from pyspark.sql.functions import * # window, col, countDistinct, avg, lag, when, approx_count_distinct, sum, min, max,from_unixtime, unix_timestamp,expr
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType, LongType
from pyspark.sql.window import Window
from pyspark.streaming import StreamingContext
from pyspark import SparkConf, SparkContext

# Declarations

In [4]:
# Create a SparkSession
spark_session = SparkSession.builder.master("local").appName("PyRidesDriverPerformance").config('spark.ui.port', '4050').getOrCreate()
# File path to the JSON data
file_path = '/content/'
# Define the sliding window duration and slide duration
window_duration_part1, slide_duration_part1 = "10 minutes", "5 minutes"
# Define the tumbling window of 15 minutes
window_duration_part2 = "15 minutes"
# Define the session window of 30 minutes
session_gap_duration = "30 minutes"
# Define a threshold for idle session detection (30 minutes in seconds)
idle_threshold_seconds = 1800
# Define the schema for the streaming data
schema = StructType([
    StructField("driver_id", StringType(), nullable=False),
    StructField("timestamp", TimestampType(), nullable=False),
    StructField("latitude", DoubleType(), nullable=False),
    StructField("longitude", DoubleType(), nullable=False),
    StructField("trip_distance", DoubleType(), nullable=True),
    StructField("event_type", StringType(), nullable=True)  # Set nullable=True if trip_distance can be missing
])

In [5]:
# Read the JSON file into a DataFrame using the specified schema.
ride_information_stream = spark_session.readStream.format("json").option('multiline', True).schema(schema).json(file_path)
# Display the schema of the DataFrame.
ride_information_stream.printSchema()
# Select all columns from the streaming DataFrame.
ride_information_dataframe = ride_information_stream.select("*")
# Start a streaming query to write the DataFrame into a memory sink.
# The query is named "ride_information_using_stream" and operates in append mode.
# The data is processed every 5 seconds as specified by the 'trigger' parameter.
ride_information_query = ride_information_dataframe.writeStream.format("memory").outputMode("append").queryName("ride_information_using_stream").trigger(processingTime='5 seconds').start()

root
 |-- driver_id: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- trip_distance: double (nullable = true)
 |-- event_type: string (nullable = true)



In [6]:
# Retrieve data from the "ride_information_using_stream" temporary view using SQL query
ride_information = spark_session.sql("select * from ride_information_using_stream")

In [7]:
# Display the data

ride_information.show()

+---------+-------------------+------------------+-------------------+------------------+----------+
|driver_id|          timestamp|          latitude|          longitude|     trip_distance|event_type|
+---------+-------------------+------------------+-------------------+------------------+----------+
|     D001|2023-07-30 05:55:48|37.538849579360246|-121.22748988885533| 3.423076573047031|      Trip|
|     D001|2023-07-30 07:10:08| 37.93476356947273| -121.0215678805398| 5.072880200441588|      Trip|
|     D001|2023-07-30 07:11:01|37.795197133875064|  -121.920222434928|              null|       GPS|
|     D001|2023-07-30 05:20:21| 37.46871545044007|-121.05888928225791|1.5685949947600415|      Trip|
|     D001|2023-07-30 05:27:29| 37.67930257604861| -121.7174686646489|              null|       GPS|
|     D001|2023-07-30 06:20:49| 37.57858135955766|-121.45726820764473|              null|       GPS|
|     D001|2023-07-30 07:06:38| 37.74438722614793|-121.17710081904086|7.7232027230627365|  

# Task 1: Count of Unique Drivers

In [8]:
# Sorting the ride information DataFrame by 'driver_id' and 'timestamp'
ride_information = ride_information.orderBy('driver_id', 'timestamp')
# Grouping the ride information DataFrame by time windows and calculating the count of unique drivers within each window
unique_drivers_count = ride_information.groupBy(
    window(col("timestamp"), window_duration_part1, slide_duration_part1)
).agg(countDistinct("driver_id").alias("unique_drivers_count"))

In [9]:
# Print a descriptive message indicating the task being performed
print('Task 1: Count of Unique Drivers : ')
# Perform ordering of the DataFrame containing the count of unique drivers using the 'window' column
# The 'orderBy' function arranges the data based on the specified column(s)
ordered_unique_drivers_data = unique_drivers_count.orderBy('window')

# Display the ordered DataFrame containing the count of unique drivers
# The 'show' function displays a specified number of rows from the DataFrame
# The 'count()' function is used to determine the total number of rows in the DataFrame
# The 'False' parameter in the 'show' function ensures that the content of the cells is not truncated
ordered_unique_drivers_data.show(ordered_unique_drivers_data.count(), False)

Task 1: Count of Unique Drivers : 
+------------------------------------------+--------------------+
|window                                    |unique_drivers_count|
+------------------------------------------+--------------------+
|{2023-07-30 05:05:00, 2023-07-30 05:15:00}|47                  |
|{2023-07-30 05:10:00, 2023-07-30 05:20:00}|50                  |
|{2023-07-30 05:15:00, 2023-07-30 05:25:00}|50                  |
|{2023-07-30 05:20:00, 2023-07-30 05:30:00}|50                  |
|{2023-07-30 05:25:00, 2023-07-30 05:35:00}|50                  |
|{2023-07-30 05:30:00, 2023-07-30 05:40:00}|50                  |
|{2023-07-30 05:35:00, 2023-07-30 05:45:00}|50                  |
|{2023-07-30 05:40:00, 2023-07-30 05:50:00}|50                  |
|{2023-07-30 05:45:00, 2023-07-30 05:55:00}|49                  |
|{2023-07-30 05:50:00, 2023-07-30 06:00:00}|50                  |
|{2023-07-30 05:55:00, 2023-07-30 06:05:00}|50                  |
|{2023-07-30 06:00:00, 2023-07-30 06:10:0

# Task 2: Average Trip Duration

In [10]:
# Selecting specific columns "driver_id", "timestamp", and "event_type" from the ride_information DataFrame
eventwise_ride = ride_information.select("driver_id", "timestamp", "event_type")
# Dropping rows where all columns have null values
eventwise_ride = eventwise_ride.dropna(how="all")

In [11]:
windows_specs = Window.partitionBy('driver_id').orderBy('timestamp')
eventwise_data_within_window = eventwise_ride.withColumn("prev_timestamp", lag('timestamp').over(windows_specs))
eventwise_data_within_window = eventwise_data_within_window.withColumn(
    "event_duration", (col("timestamp").cast('long') - col('prev_timestamp').cast('long'))
)

In [12]:
# Display the DataFrame containing trip data within the specified window
eventwise_data_within_window.show()

+---------+-------------------+----------+-------------------+--------------+
|driver_id|          timestamp|event_type|     prev_timestamp|event_duration|
+---------+-------------------+----------+-------------------+--------------+
|     D001|2023-07-30 05:12:56|      Trip|               null|          null|
|     D001|2023-07-30 05:15:57|       GPS|2023-07-30 05:12:56|           181|
|     D001|2023-07-30 05:16:08|      Trip|2023-07-30 05:15:57|            11|
|     D001|2023-07-30 05:17:46|      Trip|2023-07-30 05:16:08|            98|
|     D001|2023-07-30 05:17:51|      Trip|2023-07-30 05:17:46|             5|
|     D001|2023-07-30 05:20:21|      Trip|2023-07-30 05:17:51|           150|
|     D001|2023-07-30 05:20:57|      Trip|2023-07-30 05:20:21|            36|
|     D001|2023-07-30 05:21:07|       GPS|2023-07-30 05:20:57|            10|
|     D001|2023-07-30 05:23:03|      Trip|2023-07-30 05:21:07|           116|
|     D001|2023-07-30 05:23:27|       GPS|2023-07-30 05:23:03|  

In [13]:
window_spec_idle_duration = Window.orderBy(col("driver_id"))
eventwise_data_within_window = eventwise_data_within_window.withColumn(
    "event_duration_actual", lag(col("event_duration"), -1).over(window_spec_idle_duration)
)
eventwise_data_within_window.show()

+---------+-------------------+----------+-------------------+--------------+---------------------+
|driver_id|          timestamp|event_type|     prev_timestamp|event_duration|event_duration_actual|
+---------+-------------------+----------+-------------------+--------------+---------------------+
|     D001|2023-07-30 05:12:56|      Trip|               null|          null|                  181|
|     D001|2023-07-30 05:15:57|       GPS|2023-07-30 05:12:56|           181|                   11|
|     D001|2023-07-30 05:16:08|      Trip|2023-07-30 05:15:57|            11|                   98|
|     D001|2023-07-30 05:17:46|      Trip|2023-07-30 05:16:08|            98|                    5|
|     D001|2023-07-30 05:17:51|      Trip|2023-07-30 05:17:46|             5|                  150|
|     D001|2023-07-30 05:20:21|      Trip|2023-07-30 05:17:51|           150|                   36|
|     D001|2023-07-30 05:20:57|      Trip|2023-07-30 05:20:21|            36|                   10|


In [14]:
# Filtering the eventwise_ride DataFrame to keep only rows where the "event_type" column is "Trip"
only_trip_data = eventwise_data_within_window.filter(eventwise_ride.event_type == "Trip")
# Displaying the resulting DataFrame with only "Trip" event type data
only_trip_data.show()

+---------+-------------------+----------+-------------------+--------------+---------------------+
|driver_id|          timestamp|event_type|     prev_timestamp|event_duration|event_duration_actual|
+---------+-------------------+----------+-------------------+--------------+---------------------+
|     D001|2023-07-30 05:12:56|      Trip|               null|          null|                  181|
|     D001|2023-07-30 05:16:08|      Trip|2023-07-30 05:15:57|            11|                   98|
|     D001|2023-07-30 05:17:46|      Trip|2023-07-30 05:16:08|            98|                    5|
|     D001|2023-07-30 05:17:51|      Trip|2023-07-30 05:17:46|             5|                  150|
|     D001|2023-07-30 05:20:21|      Trip|2023-07-30 05:17:51|           150|                   36|
|     D001|2023-07-30 05:20:57|      Trip|2023-07-30 05:20:21|            36|                   10|
|     D001|2023-07-30 05:23:03|      Trip|2023-07-30 05:21:07|           116|                   24|


In [15]:
# Calculate the average trip duration within the specified window for each driver
average_trip_duration = only_trip_data.groupBy('driver_id', window('timestamp', window_duration_part2)).agg(round(avg('event_duration_actual')/60,2).alias('average trip duration'))
# Filter out rows where the calculated average trip duration is not null
average_trip_duration = average_trip_duration.filter(col('average trip duration').isNotNull())

In [16]:
# Display the calculated average trip duration along with the count of rows in the DataFrame
average_trip_duration.show(average_trip_duration.count(), truncate=False)

+---------+------------------------------------------+---------------------+
|driver_id|window                                    |average trip duration|
+---------+------------------------------------------+---------------------+
|D001     |{2023-07-30 05:00:00, 2023-07-30 05:15:00}|3.02                 |
|D001     |{2023-07-30 05:15:00, 2023-07-30 05:30:00}|0.94                 |
|D001     |{2023-07-30 05:30:00, 2023-07-30 05:45:00}|0.9                  |
|D001     |{2023-07-30 05:45:00, 2023-07-30 06:00:00}|0.65                 |
|D001     |{2023-07-30 06:00:00, 2023-07-30 06:15:00}|1.09                 |
|D001     |{2023-07-30 06:15:00, 2023-07-30 06:30:00}|1.53                 |
|D001     |{2023-07-30 06:30:00, 2023-07-30 06:45:00}|1.18                 |
|D001     |{2023-07-30 06:45:00, 2023-07-30 07:00:00}|1.04                 |
|D001     |{2023-07-30 07:00:00, 2023-07-30 07:15:00}|1.24                 |
|D002     |{2023-07-30 05:00:00, 2023-07-30 05:15:00}|0.43                 |

# Task 3: Idle Time Detection

In [17]:
# Select specific columns "driver_id", "timestamp", and "event_type" from the ride_information DataFrame.
eventwise_ride = ride_information.select("driver_id", "timestamp", "event_type")
# Drop rows where all columns have missing (null) values.
eventwise_ride = eventwise_ride.dropna(how="all")
# Display the resulting DataFrame "eventwise_ride" to inspect the data.
eventwise_ride.show()

+---------+-------------------+----------+
|driver_id|          timestamp|event_type|
+---------+-------------------+----------+
|     D001|2023-07-30 05:12:56|      Trip|
|     D001|2023-07-30 05:15:57|       GPS|
|     D001|2023-07-30 05:16:08|      Trip|
|     D001|2023-07-30 05:17:46|      Trip|
|     D001|2023-07-30 05:17:51|      Trip|
|     D001|2023-07-30 05:20:21|      Trip|
|     D001|2023-07-30 05:20:57|      Trip|
|     D001|2023-07-30 05:21:07|       GPS|
|     D001|2023-07-30 05:23:03|      Trip|
|     D001|2023-07-30 05:23:27|       GPS|
|     D001|2023-07-30 05:27:29|       GPS|
|     D001|2023-07-30 05:29:41|      Trip|
|     D001|2023-07-30 05:29:56|      Trip|
|     D001|2023-07-30 05:31:50|       GPS|
|     D001|2023-07-30 05:32:08|       GPS|
|     D001|2023-07-30 05:32:20|      Trip|
|     D001|2023-07-30 05:32:34|      Trip|
|     D001|2023-07-30 05:34:54|      Trip|
|     D001|2023-07-30 05:36:21|       GPS|
|     D001|2023-07-30 05:38:09|       GPS|
+---------+

In [18]:
# Define a window specification for partitioning the data by driver_id and ordering by timestamp.
windows_specs = Window.partitionBy('driver_id').orderBy('timestamp')
# Create a new DataFrame 'eventwise_ride_within_window' by adding a new column 'prev_timestamp'
eventwise_ride_within_window = eventwise_ride.withColumn("prev_timestamp", lag('timestamp').over(windows_specs))

In [19]:

# Display the contents of the DataFrame "eventwise_ride_within_window"
eventwise_ride_within_window.show()

+---------+-------------------+----------+-------------------+
|driver_id|          timestamp|event_type|     prev_timestamp|
+---------+-------------------+----------+-------------------+
|     D001|2023-07-30 05:12:56|      Trip|               null|
|     D001|2023-07-30 05:15:57|       GPS|2023-07-30 05:12:56|
|     D001|2023-07-30 05:16:08|      Trip|2023-07-30 05:15:57|
|     D001|2023-07-30 05:17:46|      Trip|2023-07-30 05:16:08|
|     D001|2023-07-30 05:17:51|      Trip|2023-07-30 05:17:46|
|     D001|2023-07-30 05:20:21|      Trip|2023-07-30 05:17:51|
|     D001|2023-07-30 05:20:57|      Trip|2023-07-30 05:20:21|
|     D001|2023-07-30 05:21:07|       GPS|2023-07-30 05:20:57|
|     D001|2023-07-30 05:23:03|      Trip|2023-07-30 05:21:07|
|     D001|2023-07-30 05:23:27|       GPS|2023-07-30 05:23:03|
|     D001|2023-07-30 05:27:29|       GPS|2023-07-30 05:23:27|
|     D001|2023-07-30 05:29:41|      Trip|2023-07-30 05:27:29|
|     D001|2023-07-30 05:29:56|      Trip|2023-07-30 05

In [20]:
# Calculate the duration of each event within the specified window
eventwise_ride_details_with_idle_duration = eventwise_ride_within_window.withColumn(
    "event_duration",
    (col("timestamp").cast('long') - col('prev_timestamp').cast('long'))
)

In [21]:
# Display the DataFrame containing ride details along with calculated idle durations
eventwise_ride_details_with_idle_duration.show()

+---------+-------------------+----------+-------------------+--------------+
|driver_id|          timestamp|event_type|     prev_timestamp|event_duration|
+---------+-------------------+----------+-------------------+--------------+
|     D001|2023-07-30 05:12:56|      Trip|               null|          null|
|     D001|2023-07-30 05:15:57|       GPS|2023-07-30 05:12:56|           181|
|     D001|2023-07-30 05:16:08|      Trip|2023-07-30 05:15:57|            11|
|     D001|2023-07-30 05:17:46|      Trip|2023-07-30 05:16:08|            98|
|     D001|2023-07-30 05:17:51|      Trip|2023-07-30 05:17:46|             5|
|     D001|2023-07-30 05:20:21|      Trip|2023-07-30 05:17:51|           150|
|     D001|2023-07-30 05:20:57|      Trip|2023-07-30 05:20:21|            36|
|     D001|2023-07-30 05:21:07|       GPS|2023-07-30 05:20:57|            10|
|     D001|2023-07-30 05:23:03|      Trip|2023-07-30 05:21:07|           116|
|     D001|2023-07-30 05:23:27|       GPS|2023-07-30 05:23:03|  

In [22]:
# Create a window specification for ordering by the "driver_id" column
window_spec_idle_duration = Window.orderBy(col("driver_id"))
# Add a new column "event_duration_actual" to the DataFrame
eventwise_ride_details_with_idle_duration_new = eventwise_ride_details_with_idle_duration.withColumn(
    "event_duration_actual", lag(col("event_duration"), -1).over(window_spec_idle_duration)
)
# Display the resulting DataFrame with the added column
eventwise_ride_details_with_idle_duration_new.show(eventwise_ride_details_with_idle_duration_new.count(), truncate=False)

+---------+-------------------+----------+-------------------+--------------+---------------------+
|driver_id|timestamp          |event_type|prev_timestamp     |event_duration|event_duration_actual|
+---------+-------------------+----------+-------------------+--------------+---------------------+
|D001     |2023-07-30 05:12:56|Trip      |null               |null          |181                  |
|D001     |2023-07-30 05:15:57|GPS       |2023-07-30 05:12:56|181           |11                   |
|D001     |2023-07-30 05:16:08|Trip      |2023-07-30 05:15:57|11            |98                   |
|D001     |2023-07-30 05:17:46|Trip      |2023-07-30 05:16:08|98            |5                    |
|D001     |2023-07-30 05:17:51|Trip      |2023-07-30 05:17:46|5             |150                  |
|D001     |2023-07-30 05:20:21|Trip      |2023-07-30 05:17:51|150           |36                   |
|D001     |2023-07-30 05:20:57|Trip      |2023-07-30 05:20:21|36            |10                   |


In [23]:
# Filter the DataFrame to retain only rows with the "GPS" event type
only_GPS_data = eventwise_ride_details_with_idle_duration_new.filter(eventwise_ride_details_with_idle_duration_new.event_type == "GPS")
# Display the content of the filtered DataFrame along with its total row count
only_GPS_data.show(only_GPS_data.count())

+---------+-------------------+----------+-------------------+--------------+---------------------+
|driver_id|          timestamp|event_type|     prev_timestamp|event_duration|event_duration_actual|
+---------+-------------------+----------+-------------------+--------------+---------------------+
|     D001|2023-07-30 05:15:57|       GPS|2023-07-30 05:12:56|           181|                   11|
|     D001|2023-07-30 05:21:07|       GPS|2023-07-30 05:20:57|            10|                  116|
|     D001|2023-07-30 05:23:27|       GPS|2023-07-30 05:23:03|            24|                  242|
|     D001|2023-07-30 05:27:29|       GPS|2023-07-30 05:23:27|           242|                  132|
|     D001|2023-07-30 05:31:50|       GPS|2023-07-30 05:29:56|           114|                   18|
|     D001|2023-07-30 05:32:08|       GPS|2023-07-30 05:31:50|            18|                   12|
|     D001|2023-07-30 05:36:21|       GPS|2023-07-30 05:34:54|            87|                  108|


In [24]:
# Identify idle sessions
idle_sessions = only_GPS_data.withColumn("Idle Duration Flag", when(col("event_duration_actual") > idle_threshold_seconds, 1).otherwise(0))
idle_sessions.show()

+---------+-------------------+----------+-------------------+--------------+---------------------+------------------+
|driver_id|          timestamp|event_type|     prev_timestamp|event_duration|event_duration_actual|Idle Duration Flag|
+---------+-------------------+----------+-------------------+--------------+---------------------+------------------+
|     D001|2023-07-30 05:15:57|       GPS|2023-07-30 05:12:56|           181|                   11|                 0|
|     D001|2023-07-30 05:21:07|       GPS|2023-07-30 05:20:57|            10|                  116|                 0|
|     D001|2023-07-30 05:23:27|       GPS|2023-07-30 05:23:03|            24|                  242|                 0|
|     D001|2023-07-30 05:27:29|       GPS|2023-07-30 05:23:27|           242|                  132|                 0|
|     D001|2023-07-30 05:31:50|       GPS|2023-07-30 05:29:56|           114|                   18|                 0|
|     D001|2023-07-30 05:32:08|       GPS|2023-0

In [25]:
# Print a descriptive message indicating the purpose of the following code block
print('Driver idle for 30 minutes')
# Filter the DataFrame 'idle_sessions' to retain rows where the "Idle Duration Flag" column is greater than 0
idle_sessions = idle_sessions.filter(col("Idle Duration Flag") > 0)
# Display the filtered DataFrame 'idle_sessions' using the 'show' method
idle_sessions.show(idle_sessions.count(), truncate=False)

Driver idle for 30 minutes
+---------+---------+----------+--------------+--------------+---------------------+------------------+
|driver_id|timestamp|event_type|prev_timestamp|event_duration|event_duration_actual|Idle Duration Flag|
+---------+---------+----------+--------------+--------------+---------------------+------------------+
+---------+---------+----------+--------------+--------------+---------------------+------------------+



In [26]:
# Group the DataFrame 'only_GPS_data' by the 'driver_id' column
driver_wise_max_idle_time = only_GPS_data.groupBy(col('driver_id')).agg(round(max(col('event_duration_actual'))/60,2).alias('Max Idle Duration'))
# Display the results of the 'driver_wise_max_idle_time' DataFrame
driver_wise_max_idle_time.show(driver_wise_max_idle_time.count())

+---------+-----------------+
|driver_id|Max Idle Duration|
+---------+-----------------+
|     D001|              6.7|
|     D002|             4.97|
|     D003|             6.37|
|     D004|             6.17|
|     D005|             4.02|
|     D006|             3.17|
|     D007|             4.65|
|     D008|             5.73|
|     D009|             5.88|
|     D010|             7.17|
|     D011|             4.32|
|     D012|             4.13|
|     D013|              5.4|
|     D014|             3.48|
|     D015|             4.82|
|     D016|              6.8|
|     D017|             4.43|
|     D018|             6.57|
|     D019|             5.53|
|     D020|             4.78|
|     D021|              4.8|
|     D022|              5.5|
|     D023|             6.27|
|     D024|             5.27|
|     D025|             4.72|
|     D026|             4.12|
|     D027|              4.3|
|     D028|              4.7|
|     D029|             4.98|
|     D030|             4.58|
|     D031