In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder.getOrCreate()
spark

Data Ingestion & Schema Analysis

In [0]:
# Load CSV using PySpark with schema inference
df = spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/traffic_logs.csv")
df.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|      80|      50|
| L005|     V005|     GateB|    GateA|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|      40|      70|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+



In [0]:
# Manually define schema and compare
from pyspark.sql.types import *
schema = StructType([
    StructField("LogID", StringType(), True),
    StructField("VehicleID", StringType(), True),
    StructField("EntryPoint", StringType(), True),
    StructField("ExitPoint", StringType(), True),
    StructField("EntryTime", TimestampType(), True),
    StructField("ExitTime", TimestampType(), True),
    StructField("VehicleType", StringType(), True),
    StructField("SpeedKMH", IntegerType(), True),
    StructField("TollPaid", IntegerType(), True)
])

df_manual = spark.read.schema(schema).option("header", True).csv("file:/Workspace/Shared/traffic_logs.csv")
df_manual.printSchema()

root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid: integer (nullable = true)



In [0]:
# Ensure EntryTime/ExitTime are timestamp
from pyspark.sql.functions import *
df = df_manual.withColumn("EntryTime", to_timestamp("EntryTime")).withColumn("ExitTime", to_timestamp("ExitTime"))

Derived Column Creation

In [0]:
# Calculate TripDurationMinutes = ExitTime - EntryTime
df = df.withColumn("TripDurationMinutes", (unix_timestamp("ExitTime") - unix_timestamp("EntryTime")) / 60)
df.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|      80|      50|               20.0|
| L005|     V005|     GateB|    GateA|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|      4

In [0]:
# Add IsOverspeed = SpeedKMH > 60
df = df.withColumn("IsOverspeed", col("SpeedKMH") > 60)
df.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|      false|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|      80|      50|               20.0|       true|
| L005|     V

Vehicle Behavior Aggregations

In [0]:
# Average speed per VehicleType
avg_speed = df.groupBy("VehicleType").agg(avg("SpeedKMH").alias("AvgSpeed"))
avg_speed.show()

+-----------+--------+
|VehicleType|AvgSpeed|
+-----------+--------+
|       Bike|    55.0|
|        Car|    70.0|
|      Truck|    45.0|
|        Bus|    40.0|
+-----------+--------+



In [0]:
# Total toll collected per gate (EntryPoint)
toll_per_gate = df.groupBy("EntryPoint").agg(sum("TollPaid").alias("TotalToll"))
toll_per_gate.show()

+----------+---------+
|EntryPoint|TotalToll|
+----------+---------+
|     GateA|       80|
|     GateB|      170|
|     GateC|       50|
+----------+---------+



In [0]:
# Most used ExitPoint
most_exit = df.groupBy("ExitPoint").count().orderBy(desc("count")).limit(1)
most_exit.show()

+---------+-----+
|ExitPoint|count|
+---------+-----+
|    GateD|    2|
+---------+-----+



Window Functions

In [0]:
# Rank vehicles by speed within VehicleType
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, lag

vt_window = Window.partitionBy("VehicleType").orderBy(col("SpeedKMH").desc())
df = df.withColumn("SpeedRank", rank().over(vt_window))
df.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|      false|        1|
| L005|     V005|     GateB|    GateA|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|      40|      70|               35.0|      false|        1|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|      80|      50|               20.0|       true|        1|
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    

In [0]:
# Find last exit time for each vehicle using lag()
v_window = Window.partitionBy("VehicleID").orderBy("ExitTime")
df = df.withColumn("LastExitTime", lag("ExitTime").over(v_window))
df.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+------------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExitTime|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+------------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|        2|        NULL|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|        1|        NULL|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|      false|        1|        NULL|
| L004|     V004

Session Segmentation

In [0]:
# Group by VehicleID to simulate route sessions
# Find duration between subsequent trips (idle time)
df = df.withColumn("IdleTimeMinutes",(unix_timestamp("EntryTime") - unix_timestamp("LastExitTime")) / 60)

Anomaly Detection

In [0]:
# Identify vehicles with speed > 70 and TripDuration < 10 minutes
df.filter((col("SpeedKMH") > 70) & (col("TripDurationMinutes") < 10)).select("LogID","VehicleID","VehicleType","SpeedKMH","TripDurationMinutes").show()

+-----+---------+-----------+--------+-------------------+
|LogID|VehicleID|VehicleType|SpeedKMH|TripDurationMinutes|
+-----+---------+-----------+--------+-------------------+
+-----+---------+-----------+--------+-------------------+



In [0]:
# Vehicles that paid less toll for longer trips
df.filter((col("TripDurationMinutes") > 30) & (col("TollPaid") < 40)).select("LogId","VehicleID","TripDurationMinutes","TollPaid").show()

+-----+---------+-------------------+--------+
|LogId|VehicleID|TripDurationMinutes|TollPaid|
+-----+---------+-------------------+--------+
+-----+---------+-------------------+--------+



In [0]:
# Suspicious backtracking (ExitPoint earlier than EntryPoint)
df.filter(col("ExitPoint") < col("EntryPoint")).select("LogID","VehicleID","ExitPoint","EntryPoint").show()

+-----+---------+---------+----------+
|LogID|VehicleID|ExitPoint|EntryPoint|
+-----+---------+---------+----------+
| L005|     V005|    GateA|     GateB|
+-----+---------+---------+----------+



Join with Metadata

In [0]:
# Join and group trips by RegisteredCity
registry = spark.read.option("header", True).csv("file:/Workspace/Shared/vehicle_registry.csv")
df_joined = df.join(registry, on="VehicleID", how="left")
city_trips = df_joined.groupBy("RegisteredCity").agg(count("*").alias("TotalTrips"))
city_trips.show()

+--------------+----------+
|RegisteredCity|TotalTrips|
+--------------+----------+
|     Bangalore|         1|
|       Chennai|         1|
|        Mumbai|         1|
|          Pune|         1|
|         Delhi|         1|
+--------------+----------+



Delta Lake Features

In [0]:
# Save traffic_logs as Delta Table
df.write.format("delta").mode("overwrite").save("file:/Workspace/Shared/traffic_delta")

In [0]:
# Apply MERGE INTO to update toll rates for all Bikes
from delta.tables import DeltaTable
dt = DeltaTable.forPath(spark, "file:/Workspace/Shared/traffic_delta")
dt.update(condition=col("VehicleType") == "Bike",set={"TollPaid": "35"})

In [0]:
# Delete trips longer than 60 minutes
dt.delete(col("TripDurationMinutes") > 60)

In [0]:
# Use DESCRIBE HISTORY and VERSION AS OF
dt.history().show()
old_df = spark.read.format("delta").option("versionAsOf", 0).load("file:/Workspace/Shared/traffic_delta")


+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|version|           timestamp|          userId|            userName|operation| operationParameters| job|          notebook|           clusterId|readVersion|   isolationLevel|isBlindAppend|    operationMetrics|userMetadata|          engineInfo|
+-------+--------------------+----------------+--------------------+---------+--------------------+----+------------------+--------------------+-----------+-----------------+-------------+--------------------+------------+--------------------+
|      2|2025-06-19 09:52:...|4042796083082360|azuser3548_mml.lo...|   DELETE|{predicate -> ["(...|NULL|{4419187724732167}|0612-043650-nhuexwr6|          1|WriteSerializable|        false|{numRemovedFiles ...|        NULL|Databricks-Runtim...|
|      1|2025-06-19 09:5

Advanced Conditions

In [0]:
# when/otherwise : Tag trip type as:
# "Short" <15min
# "Medium" 15-30min
# "Long" >30min
df = df.withColumn("TripType", when(col("TripDurationMinutes") < 15, "Short")
.when(col("TripDurationMinutes") <= 30, "Medium").otherwise("Long"))
df.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+-----------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExitTime|IdleTimeMinutes|IsBacktrack|TripType|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+-----------+--------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|        2|        NULL|           NULL|      false|  Medium|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|        1|        NULL|           NULL|      false|    L

In [0]:
# Flag vehicles with more than 3 trips in a day
df = df.withColumn("TripDate", to_date("EntryTime"))
trips_df=df.groupBy("VehicleID", "TripDate").agg(count("*").alias("DailyTrips"))
trips_df=trips_df.withColumn("IsFrequent", col("DailyTrips") > 3).show()

+---------+----------+----------+----------+
|VehicleID|  TripDate|DailyTrips|IsFrequent|
+---------+----------+----------+----------+
|     V002|2024-05-01|         1|     false|
|     V001|2024-05-01|         1|     false|
|     V005|2024-05-01|         1|     false|
|     V003|2024-05-01|         1|     false|
|     V004|2024-05-01|         1|     false|
+---------+----------+----------+----------+



Export & Reporting

In [0]:
# Write final enriched DataFrame to:
# Parquet partitioned by VehicleType
df.write.mode("overwrite").partitionBy("VehicleType").parquet("file:/Workspace/Shared/traffic_parquet")

# CSV for dashboards
df.write.mode("overwrite").csv("file:/Workspace/Shared/traffic_csv", header=True)


In [0]:
# Create summary SQL View: total toll by VehicleType + ExitPoint
df.createOrReplaceTempView("traffic")
spark.sql("""CREATE OR REPLACE TEMP VIEW toll_summary AS
SELECT VehicleType, ExitPoint, SUM(TollPaid) AS TotalToll
FROM traffic
GROUP BY VehicleType, ExitPoint""")

DataFrame[]