In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("Driver Analytics Case Study").getOrCreate()

#DATASET 1 — DRIVER MASTER (CORRUPTED)

In [2]:
raw_drivers = [
("D001","Ramesh","35","Hyderabad","Car,Bike"),
("D002","Suresh","Forty","Bangalore","Auto"),
("D003","Anita",None,"Mumbai",["Car"]),
("D004","Kiran","29","Delhi","Car|Bike"),
("D005","", "42","Chennai",None)
]

dr_schema = StructType([
    StructField("driver_id", StringType(), True),
    StructField("dnam", StringType(), True),
    StructField("age_raw", StringType(), True),
    StructField("city", StringType(), True),
    StructField("vehicle_raw", StringType(), True)
])
dr_df = spark.createDataFrame(raw_drivers, dr_schema)
dr_df.show(truncate=False)
dr_df.printSchema()

+---------+------+-------+---------+-----------+
|driver_id|dnam  |age_raw|city     |vehicle_raw|
+---------+------+-------+---------+-----------+
|D001     |Ramesh|35     |Hyderabad|Car,Bike   |
|D002     |Suresh|Forty  |Bangalore|Auto       |
|D003     |Anita |NULL   |Mumbai   |[Car]      |
|D004     |Kiran |29     |Delhi    |Car|Bike   |
|D005     |      |42     |Chennai  |NULL       |
+---------+------+-------+---------+-----------+

root
 |-- driver_id: string (nullable = true)
 |-- dnam: string (nullable = true)
 |-- age_raw: string (nullable = true)
 |-- city: string (nullable = true)
 |-- vehicle_raw: string (nullable = true)



#DATASET 2 — CITY MASTER (SMALL LOOKUP)

In [3]:
raw_cities = [
("Hyderabad","South"),
("Bangalore","South"),
("Mumbai","West"),
("Delhi","North"),
("Chennai","South")
]

city_schema = StructType([
    StructField("city", StringType(), True),
    StructField("region", StringType(), True),
])

city_df = spark.createDataFrame(raw_cities, city_schema)
city_df.show(truncate=False)
city_df.printSchema()

+---------+------+
|city     |region|
+---------+------+
|Hyderabad|South |
|Bangalore|South |
|Mumbai   |West  |
|Delhi    |North |
|Chennai  |South |
+---------+------+

root
 |-- city: string (nullable = true)
 |-- region: string (nullable = true)



#DATASET 3 — TRIPS DATA

In [4]:
raw_trips = [
("T001","D001","Hyderabad","2024-01-05","Completed","450"),
("T002","D002","Bangalore","05/01/2024","Cancelled","0"),
("T003","D003","Mumbai","2024/01/06","Completed","620"),
("T004","D004","Delhi","invalid_date","Completed","540"),
("T005","D001","Hyderabad","2024-01-10","Completed","700"),
("T006","D005","Chennai","2024-01-12","Completed","350")
]

trip_schema = StructType([
    StructField("trip_id", StringType(), True),
    StructField("driver_id", StringType(), True),
    StructField("city", StringType(), True),
    StructField("date_raw", StringType(), True),
    StructField("status", StringType(), True),
    StructField("fare_raw", StringType(), True),
])

tdf = spark.createDataFrame(raw_trips, trip_schema)
tdf.show(truncate=False)
tdf.printSchema()

+-------+---------+---------+------------+---------+--------+
|trip_id|driver_id|city     |date_raw    |status   |fare_raw|
+-------+---------+---------+------------+---------+--------+
|T001   |D001     |Hyderabad|2024-01-05  |Completed|450     |
|T002   |D002     |Bangalore|05/01/2024  |Cancelled|0       |
|T003   |D003     |Mumbai   |2024/01/06  |Completed|620     |
|T004   |D004     |Delhi    |invalid_date|Completed|540     |
|T005   |D001     |Hyderabad|2024-01-10  |Completed|700     |
|T006   |D005     |Chennai  |2024-01-12  |Completed|350     |
+-------+---------+---------+------------+---------+--------+

root
 |-- trip_id: string (nullable = true)
 |-- driver_id: string (nullable = true)
 |-- city: string (nullable = true)
 |-- date_raw: string (nullable = true)
 |-- status: string (nullable = true)
 |-- fare_raw: string (nullable = true)



#DATASET 4 — DRIVER ACTIVITY LOGS

In [5]:
raw_activity = [
("D001","login,accept_trip,logout","{'device':'mobile'}",180),
("D002",["login","logout"],"device=laptop",60),
("D003","login|accept_trip",None,120),
("D004",None,"{'device':'tablet'}",90),
("D005","login","{'device':'mobile'}",30)
]
act_schema = StructType([
    StructField("driver_id", StringType(), True),
    StructField("actions_raw", StringType(), True),
    StructField("metadata_raw", StringType(), True),
    StructField("duration", StringType(), True)
])

acdf = spark.createDataFrame(raw_activity, act_schema)
acdf.show(truncate=False)
acdf.printSchema()

+---------+------------------------+-------------------+--------+
|driver_id|actions_raw             |metadata_raw       |duration|
+---------+------------------------+-------------------+--------+
|D001     |login,accept_trip,logout|{'device':'mobile'}|180     |
|D002     |[login, logout]         |device=laptop      |60      |
|D003     |login|accept_trip       |NULL               |120     |
|D004     |NULL                    |{'device':'tablet'}|90      |
|D005     |login                   |{'device':'mobile'}|30      |
+---------+------------------------+-------------------+--------+

root
 |-- driver_id: string (nullable = true)
 |-- actions_raw: string (nullable = true)
 |-- metadata_raw: string (nullable = true)
 |-- duration: string (nullable = true)



PART A — DATA CLEANING & STRUCTURING

- Design explicit schemas for all datasets
- Normalize:

Age

Fare

Dates

3. Convert vehicle types and actions into arrays
4. Handle missing and invalid records gracefully
5. Produce clean DataFrames:
drivers_df

cities_df

trips_df

activity_df

In [None]:
#1

In [6]:
df1 = dr_df.withColumn("dnam", when(trim(col("dnam")) == "", None).otherwise(col("dnam")))\
    .withColumn("age",when(col("age_raw").rlike("^[0-9]+$"), col("age_raw").cast("int"))
    ) \
    .withColumn("vehicles",split(regexp_replace(coalesce(col("vehicle_raw"), lit("")), "[|]", ","),",")
    ) \
    .drop("age_raw", "vehicle_raw")
df1.show()

+---------+------+---------+----+-----------+
|driver_id|  dnam|     city| age|   vehicles|
+---------+------+---------+----+-----------+
|     D001|Ramesh|Hyderabad|  35|[Car, Bike]|
|     D002|Suresh|Bangalore|NULL|     [Auto]|
|     D003| Anita|   Mumbai|NULL|    [[Car]]|
|     D004| Kiran|    Delhi|  29|[Car, Bike]|
|     D005|  NULL|  Chennai|  42|         []|
+---------+------+---------+----+-----------+



In [None]:
#2

In [7]:
df2 = tdf.withColumn("trip_date",coalesce(
            to_date(try_to_timestamp(col("date_raw"),lit("yyyy-MM-dd"))),
                                        to_date(try_to_timestamp(col("date_raw"),lit("dd/MM/yyyy"))),
                                        to_date(try_to_timestamp(col("date_raw"),lit("yyyy/MM/dd")))
        )).withColumn("fare", col("fare_raw").cast("double")) \
    .drop("date_raw", "fare_raw")
df2.show()

+-------+---------+---------+---------+----------+-----+
|trip_id|driver_id|     city|   status| trip_date| fare|
+-------+---------+---------+---------+----------+-----+
|   T001|     D001|Hyderabad|Completed|2024-01-05|450.0|
|   T002|     D002|Bangalore|Cancelled|2024-01-05|  0.0|
|   T003|     D003|   Mumbai|Completed|2024-01-06|620.0|
|   T004|     D004|    Delhi|Completed|      NULL|540.0|
|   T005|     D001|Hyderabad|Completed|2024-01-10|700.0|
|   T006|     D005|  Chennai|Completed|2024-01-12|350.0|
+-------+---------+---------+---------+----------+-----+



In [10]:
from pyspark.sql.functions import split, regexp_replace, col, lit, when

df3 = acdf.withColumn("actions",when(col("actions_raw").isNull(), array())
        .otherwise(split(regexp_replace(col("actions_raw"),"[|]",","),","))
    ) \
    .withColumn("metadata",
        from_json(col("metadata_raw"), MapType(StringType(),StringType()))
    ).drop("actions_raw","metadata_raw")
df3.show(truncate=False)


+---------+--------+----------------------------+------------------+
|driver_id|duration|actions                     |metadata          |
+---------+--------+----------------------------+------------------+
|D001     |180     |[login, accept_trip, logout]|{device -> mobile}|
|D002     |60      |[[login,  logout]]          |NULL              |
|D003     |120     |[login, accept_trip]        |NULL              |
|D004     |90      |[]                          |{device -> tablet}|
|D005     |30      |[login]                     |{device -> mobile}|
+---------+--------+----------------------------+------------------+



PART B — DATA INTEGRATION (JOINS)

6. Join trips with drivers
7. Join trips with cities
8. Decide which dataset should be broadcast
9. Prove your decision using explain(True)
10. Remove orphan trips (drivers not in master)

In [16]:
#6
t = df2.alias("t")
d = df1.alias("d")

trip_driver_df = t.join(
    d, col("t.driver_id")==col("d.driver_id"), "inner")
trip_driver_df.show()

#7
c = city_df.alias("c")

trip_full_df = trip_driver_df.join(
    broadcast(c),col("t.city")==col("c.city"),"left")
trip_full_df.show()

#8
#City DataFrame should be broadcast because of the very small lookup table

#9
trip_full_df.explain(True)

#10
#Orphan trips are already removed by Inner Join

+-------+---------+---------+---------+----------+-----+---------+------+---------+----+-----------+
|trip_id|driver_id|     city|   status| trip_date| fare|driver_id|  dnam|     city| age|   vehicles|
+-------+---------+---------+---------+----------+-----+---------+------+---------+----+-----------+
|   T001|     D001|Hyderabad|Completed|2024-01-05|450.0|     D001|Ramesh|Hyderabad|  35|[Car, Bike]|
|   T005|     D001|Hyderabad|Completed|2024-01-10|700.0|     D001|Ramesh|Hyderabad|  35|[Car, Bike]|
|   T002|     D002|Bangalore|Cancelled|2024-01-05|  0.0|     D002|Suresh|Bangalore|NULL|     [Auto]|
|   T003|     D003|   Mumbai|Completed|2024-01-06|620.0|     D003| Anita|   Mumbai|NULL|    [[Car]]|
|   T004|     D004|    Delhi|Completed|      NULL|540.0|     D004| Kiran|    Delhi|  29|[Car, Bike]|
|   T006|     D005|  Chennai|Completed|2024-01-12|350.0|     D005|  NULL|  Chennai|  42|         []|
+-------+---------+---------+---------+----------+-----+---------+------+---------+----+---

PART C — ANALYTICS & AGGREGATIONS

11. Total trips per city
12. Total revenue per city
13. Average fare per driver
14. Total completed trips per driver
15. Identify drivers with no completed trips

In [25]:
#11
trip_full_df.groupBy(col("t.city").alias("city")).count().show()

#12
trip_full_df.filter(col("t.status")=="Completed").groupBy(col("t.city").alias("city")) \
    .agg(sum("fare").alias("total_revenue")).show()

#13
trip_full_df.filter(col("t.status")=="Completed").groupBy("d.driver_id") \
    .agg(avg("fare").alias("avg_fare")).show()

#14
trip_full_df.filter(col("t.status")=="Completed").groupBy("t.driver_id").count().show()

#15
completed = trip_full_df.filter(col("t.status")=="Completed").select("t.driver_id").distinct()

dr_df.select("driver_id").subtract(completed).show()

+---------+-----+
|     city|count|
+---------+-----+
|Bangalore|    1|
|  Chennai|    1|
|   Mumbai|    1|
|    Delhi|    1|
|Hyderabad|    2|
+---------+-----+

+---------+-------------+
|     city|total_revenue|
+---------+-------------+
|  Chennai|        350.0|
|   Mumbai|        620.0|
|    Delhi|        540.0|
|Hyderabad|       1150.0|
+---------+-------------+

+---------+--------+
|driver_id|avg_fare|
+---------+--------+
|     D001|   575.0|
|     D003|   620.0|
|     D004|   540.0|
|     D005|   350.0|
+---------+--------+

+---------+-----+
|driver_id|count|
+---------+-----+
|     D001|    2|
|     D003|    1|
|     D004|    1|
|     D005|    1|
+---------+-----+

+---------+
|driver_id|
+---------+
|     D002|
+---------+



PART D — WINDOW FUNCTIONS

16. Rank drivers by total revenue (overall)
17. Rank drivers by revenue within each city
18. Calculate running revenue per city by date
19. Compare GroupBy vs Window for one metric

In [30]:
#16
rev_df = trip_full_df.filter(col("t.status")=="Completed").groupBy("t.driver_id").agg(sum("fare").alias("revenue"))

w = Window.orderBy(desc("revenue"))

rev_df.withColumn("rank", dense_rank().over(w)).show()

#17
city_rev = trip_full_df.filter(col("t.status")=="Completed").groupBy(col("t.city"),"t.driver_id").agg(sum("fare").alias("revenue"))

w_city = Window.partitionBy("city").orderBy(desc("revenue"))

city_rev.withColumn("city_rank", dense_rank().over(w_city)).show()

#18
w_run = Window.partitionBy("t.city").orderBy("trip_date").rowsBetween(Window.unboundedPreceding, Window.currentRow)

trip_full_df.filter(col("t.status")=="Completed").withColumn("running_revenue", sum("fare").over(w_run)).show()


+---------+-------+----+
|driver_id|revenue|rank|
+---------+-------+----+
|     D001| 1150.0|   1|
|     D003|  620.0|   2|
|     D004|  540.0|   3|
|     D005|  350.0|   4|
+---------+-------+----+

+---------+---------+-------+---------+
|     city|driver_id|revenue|city_rank|
+---------+---------+-------+---------+
|  Chennai|     D005|  350.0|        1|
|    Delhi|     D004|  540.0|        1|
|Hyderabad|     D001| 1150.0|        1|
|   Mumbai|     D003|  620.0|        1|
+---------+---------+-------+---------+

+-------+---------+---------+---------+----------+-----+---------+------+---------+----+-----------+---------+------+---------------+
|trip_id|driver_id|     city|   status| trip_date| fare|driver_id|  dnam|     city| age|   vehicles|     city|region|running_revenue|
+-------+---------+---------+---------+----------+-----+---------+------+---------+----+-----------+---------+------+---------------+
|   T006|     D005|  Chennai|Completed|2024-01-12|350.0|     D005|  NULL|  C

19
###GroupBy
- it collapses the rows
- Gives aggregated results

###Window
- it retains the rows
- does row level analytics

PART E — UDF (ONLY IF REQUIRED)

20. Classify drivers into performance levels:

High
Medium
Low

Rules:

Prefer built-in functions,
Use UDF only if unavoidable,
Justify your choice

In [32]:
rev_df2 = rev_df.withColumn(
    "performance",when(col("revenue")>=600,"High")
    .when(col("revenue")>=400,"Medium")
    .otherwise("Low"))

print("Classifying drivers into performance levels\n")
rev_df2.show()

# Justification for not using a UDF:
# PySpark's `when().otherwise()` provides native, optimized functionality for conditional logic.
# It is executed within the Spark engine, benefiting from Catalyst Optimizer and Tungsten execution engine,
# leading to significantly better performance compared to Python UDFs. UDFs involve serialization/deserialization
# overhead and context switching between JVM and Python, which can be very slow for large datasets.
# Since `when().otherwise()` perfectly handles the tier classification logic, a UDF is unnecessary and less efficient

Classifying drivers into performance levels

+---------+-------+-----------+
|driver_id|revenue|performance|
+---------+-------+-----------+
|     D001| 1150.0|       High|
|     D003|  620.0|       High|
|     D004|  540.0|     Medium|
|     D005|  350.0|        Low|
+---------+-------+-----------+



PART F — SORTING & ORDERING

21. Sort cities by total revenue (descending)
22. Sort drivers by revenue within each city
23. Explain why sorting caused a shuffle

In [34]:
#21
city_rev.orderBy(desc("revenue")).show()


#22
city_rev.orderBy("city", desc("revenue")).show()

#23
# Sorting a DataFrame in Spark often triggers a 'shuffle' operation.
# A shuffle is the process of redistributing data across partitions (and potentially across machines in a cluster).
# This is necessary because to perform a global sort (or even a sort within groups if data is not pre-partitioned
# or pre-sorted), all data relevant to a specific sort key range might need to be collected on the same partition.
# For example, when sorting categories by total revenue, Spark needs to know the total revenue for all categories
# to correctly order them. If different parts of a category's data reside on different partitions,
# Spark must move this data to ensure a consistent global order. This involves serializing data,
# sending it over the network, and deserializing it on the receiving end, which is a resource-intensive operation.

+---------+---------+-------+
|     city|driver_id|revenue|
+---------+---------+-------+
|Hyderabad|     D001| 1150.0|
|   Mumbai|     D003|  620.0|
|    Delhi|     D004|  540.0|
|  Chennai|     D005|  350.0|
+---------+---------+-------+

+---------+---------+-------+
|     city|driver_id|revenue|
+---------+---------+-------+
|  Chennai|     D005|  350.0|
|    Delhi|     D004|  540.0|
|Hyderabad|     D001| 1150.0|
|   Mumbai|     D003|  620.0|
+---------+---------+-------+



PART G — SET OPERATIONS

Create two DataFrames:
Drivers who completed trips
Drivers who were active (login)

24. Find drivers who logged in but never completed trips
25. Find drivers who completed trips and were active
26. Explain why set operations differ from joins

In [44]:
completed_set = trip_full_df.filter(col("t.status")=="Completed").select("t.driver_id").distinct()
completed_set.show()

ac_clean_df = acdf.withColumn("actions",when(col("actions_raw").isNull(), array())
    .otherwise(split(regexp_replace(col("actions_raw"),"[|]",","),",")))

active_set = ac_clean_df.filter(array_contains(col("actions"),"login")).select("driver_id").distinct()

active_set.show()


#24
active_set.subtract(completed_set).show()


#25
active_set.intersect(completed_set).show()



+---------+
|driver_id|
+---------+
|     D001|
|     D003|
|     D004|
|     D005|
+---------+

+---------+
|driver_id|
+---------+
|     D001|
|     D003|
|     D005|
+---------+

+---------+
|driver_id|
+---------+
+---------+

+---------+
|driver_id|
+---------+
|     D003|
|     D005|
|     D001|
+---------+



In [45]:
#26
# Differences between Set Operations and Joins:
#
# Set Operations (UNION, INTERSECT, EXCEPT/SUBTRACT):
# - Operate on the *rows* of DataFrames.
# - Require the DataFrames to have a compatible schema (same number of columns, same column names, and compatible data types).
# - Combine or compare rows based on their *entire content*.
# - The result has the same schema as the input DataFrames.
#
# Join Operations (INNER, LEFT, RIGHT, FULL, ANTI, SEMI):
# - Combine *columns* from two DataFrames.
# - Combine data based on a *common key* or a specified condition.
# - Typically result in a wider DataFrame (more columns) by merging information from both DataFrames.
# - The schema of the result is a combination of the schemas of the input DataFrames (excluding duplicate join keys if specified).

print("\n--- Set Operations (Operating on rows) ---")
print("Drivers active but never completed trips (using subtract):")
active_set.subtract(completed_set).show()

print("Drivers who completed trips AND were active (using intersect):")
active_set.subtract(completed_set).show()

print("\n--- Join Operations (Operating on columns based on keys) ---")
print("Inner Join: Combining trips and drivers for matching driver_ids:")
t = df2.alias("t")
d = df1.alias("d")
trip_driver_df = t.join(d, col("t.driver_id")==col("d.driver_id"), "inner")
trip_driver_df.show()

print("Left Join: Combining trips and  cities")
c = city_df.alias("c")
trip_full_df = trip_driver_df.join(broadcast(c),col("t.city")==col("c.city"),"left")
trip_full_df.show()


--- Set Operations (Operating on rows) ---
Drivers active but never completed trips (using subtract):
+---------+
|driver_id|
+---------+
+---------+

Drivers who completed trips AND were active (using intersect):
+---------+
|driver_id|
+---------+
+---------+


--- Join Operations (Operating on columns based on keys) ---
Inner Join: Combining trips and drivers for matching driver_ids:
+-------+---------+---------+---------+----------+-----+---------+------+---------+----+-----------+
|trip_id|driver_id|     city|   status| trip_date| fare|driver_id|  dnam|     city| age|   vehicles|
+-------+---------+---------+---------+----------+-----+---------+------+---------+----+-----------+
|   T001|     D001|Hyderabad|Completed|2024-01-05|450.0|     D001|Ramesh|Hyderabad|  35|[Car, Bike]|
|   T005|     D001|Hyderabad|Completed|2024-01-10|700.0|     D001|Ramesh|Hyderabad|  35|[Car, Bike]|
|   T002|     D002|Bangalore|Cancelled|2024-01-05|  0.0|     D002|Suresh|Bangalore|NULL|     [Auto]|
|  

PART H — DAG & PERFORMANCE ANALYSIS

27. Run explain(True) for:
Join with city master,
Window ranking,
Sorting
28. Identify:
Shuffles
Broadcast joins
Sort stages
29. Suggest one performance improvemen

In [48]:
#27
trip_full_df.explain(True)
rev_df.explain(True)
city_rev.orderBy(desc("revenue")).explain(True)

#28
#Shuffles: GroupBy, sort
#Broadcast joins: Seller join
#Sort stages: Window+OrderBy

#29
# Performance Improvement Suggestion:

trip_full_df.cache()
# Cache the 'orders_products_df' DataFrame.
# This DataFrame is the result of a join and is used multiple times in subsequent calculations
# (e.g., total revenue per category/seller, running revenue, top products).
# Caching it will prevent Spark from recomputing this DataFrame every time it's accessed.

== Parsed Logical Plan ==
Join LeftOuter, (city#32 = city#21)
:- Join Inner, (driver_id#31 = driver_id#0)
:  :- SubqueryAlias t
:  :  +- Project [trip_id#30, driver_id#31, city#32, status#34, trip_date#91, fare#92]
:  :     +- Project [trip_id#30, driver_id#31, city#32, date_raw#33, status#34, fare_raw#35, trip_date#91, cast(fare_raw#35 as double) AS fare#92]
:  :        +- Project [trip_id#30, driver_id#31, city#32, date_raw#33, status#34, fare_raw#35, coalesce(to_date(try_to_timestamp(date_raw#33, Some(yyyy-MM-dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(date_raw#33, Some(dd/MM/yyyy), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true), to_date(try_to_timestamp(date_raw#33, Some(yyyy/MM/dd), TimestampType, Some(Etc/UTC), false), None, Some(Etc/UTC), true)) AS trip_date#91]
:  :           +- LogicalRDD [trip_id#30, driver_id#31, city#32, date_raw#33, status#34, fare_raw#35], false
:  +- SubqueryAlias d
:     +- Project [d

DataFrame[trip_id: string, driver_id: string, city: string, status: string, trip_date: date, fare: double, driver_id: string, dnam: string, city: string, age: int, vehicles: array<string>, city: string, region: string]