In [0]:
# 1
# Data Ingestion & Schema Analysis
# Load CSV using PySpark with schema inference
# Manually define schema and compare
# Ensure EntryTime/ExitTime are timestamp
from pyspark.sql.types import *
from pyspark.sql.functions import *

df = spark.read.csv("file:/Workspace/Shared/traffic_logs19.csv",header=True,inferSchema=True)
df.printSchema()

schema = StructType([
    StructField("LogID", StringType(), True),
    StructField("VehicleID", StringType(), True),
    StructField("EntryPoint", StringType(), True),
    StructField("ExitPoint", StringType(), True),
    StructField("EntryTime", TimestampType(), True),
    StructField("ExitTime", TimestampType(), True),
    StructField("VehicleType", StringType(), True),
    StructField("SpeedKMH", DoubleType(), True),
    StructField("TollPaid", DoubleType(), True),
])
tl= spark.read.csv("file:/Workspace/Shared/traffic_logs19.csv",header=True,inferSchema=True)
tl.show()


root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid: integer (nullable = true)

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:0

In [0]:

# 2
# Derived Column Creation
# Calculate TripDurationMinutes = ExitTime - EntryTime
# Add IsOverspeed = SpeedKMH > 60
df = df.withColumn("TripDurationMinutes",round((col("ExitTime").cast("long") - col("EntryTime").cast("long")) / 60.0, 2)).withColumn("IsOverspeed", col("SpeedKMH") > 60)
df.show()



+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|      false|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|      80|      50|               20.0|       true|
| L005|     V

In [0]:
# 3
# Vehicle Behavior Aggregations
# Average speed per VehicleType
# Total toll collected per gate (EntryPoint)
# Most used ExitPoint

df.groupBy("VehicleType").avg("SpeedKMH").show()


df.groupBy("EntryPoint").sum("TollPaid").show()


df.groupBy("ExitPoint").agg(count("*").alias("cnt")).orderBy(col("cnt").desc()).show(1)


+-----------+-------------+
|VehicleType|avg(SpeedKMH)|
+-----------+-------------+
|       Bike|         55.0|
|        Car|         70.0|
|      Truck|         45.0|
|        Bus|         40.0|
+-----------+-------------+

+----------+-------------+
|EntryPoint|sum(TollPaid)|
+----------+-------------+
|     GateA|           80|
|     GateB|          170|
|     GateC|           50|
+----------+-------------+

+---------+---+
|ExitPoint|cnt|
+---------+---+
|    GateD|  2|
+---------+---+
only showing top 1 row



In [0]:

# 4
# Window Functions
# Rank vehicles by speed within VehicleType
# Find last exit time for each vehicle using lag()
from pyspark.sql.window import Window

win = Window.partitionBy("VehicleType").orderBy(col("SpeedKMH").desc())
df3 = df.withColumn("SpeedRank", rank().over(win))

win2 = Window.partitionBy("VehicleID").orderBy("ExitTime")
df3 = df3.withColumn("LastExit", lag("ExitTime").over(win2))
df3.show()



+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExit|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|        2|    NULL|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|        1|    NULL|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|      false|        1|    NULL|
| L004|     V004|     GateC|    GateD|20

In [0]:

# 5
# Session Segmentation
# Group by VehicleID to simulate route sessions
# Find duration between subsequent trips (idle time)
df3 = df3.withColumn("IdleMinutes",(unix_timestamp("EntryTime") - unix_timestamp("LastExit"))/60)
df3.show()


+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExit|IdleMinutes|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+-----------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|        2|    NULL|       NULL|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|        1|    NULL|       NULL|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|      false|

In [0]:

# 6
# Anomaly Detection
# Identify vehicles with speed > 70 and TripDuration < 10 minutes
# Vehicles that paid less toll for longer trips
# Suspicious backtracking (ExitPoint earlier than EntryPoint)
anomalies = df3.filter((col("SpeedKMH") > 70) & (col("TripDurationMinutes") < 10)|(col("TollPaid") < 5) & (col("TripDurationMinutes") > 30) |(col("ExitPoint") < col("EntryPoint")))
anomalies.show()


+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExit|IdleMinutes|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+-----------+
| L005|     V005|     GateB|    GateA|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|      40|      70|               35.0|      false|        1|    NULL|       NULL|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+-----------+



In [0]:

# 7
# Join with Metadata
# Prepare this small vehicle_registry.csv :
# VehicleID,OwnerName,Model,RegisteredCity
# V001,Anil,Hyundai i20,Delhi
# V002,Rakesh,Tata Truck,Chennai
# V003,Sana,Yamaha R15,Mumbai
# V004,Neha,Honda City,Bangalore
# V005,Zoya,Volvo Bus,Pune
# Join and group trips by RegisteredCity
veh = spark.read.csv("file:/Workspace/Shared/vehicle_registry19.csv",header=True,inferSchema=True)

df4 = df3.join(veh, "VehicleID", "left")
df4.groupBy("RegisteredCity").count().show()


+--------------+-----+
|RegisteredCity|count|
+--------------+-----+
|     Bangalore|    1|
|       Chennai|    1|
|        Mumbai|    1|
|          Pune|    1|
|         Delhi|    1|
+--------------+-----+



In [0]:

# 8
# Delta Lake Features
# Save traffic_logs as Delta Table
# Apply MERGE INTO to update toll rates for all Bikes
# Delete trips longer than 60 minutes
# Use DESCRIBE HISTORY and VERSION AS OF
df4.write.format("delta").mode("overwrite").save("file:/Workspace/Shared/traffic_logs")
spark.read.format("delta").load("file:/Workspace/Shared/traffic_logs").createOrReplaceTempView("traffic_delta")

from delta.tables import DeltaTable
dl = DeltaTable.forPath(spark, "file:/Workspace/Shared/traffic_logs")
dl.update(condition=col("VehicleType")=="Bike", set={"TollPaid": col("TollPaid")*1.1})

dl.delete(condition=col("TripDurationMinutes") > 60)

spark.read.format("delta").option("versionAsOf", 0).load("file:/Workspace/Shared/traffic_logs").show()
spark.sql("DESCRIBE HISTORY delta.`file:/Workspace/Shared/traffic_logs`").show()


+---------+-----+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+-----------+---------+-----------+--------------+
|VehicleID|LogID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExit|IdleMinutes|OwnerName|      Model|RegisteredCity|
+---------+-----+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+--------+-----------+---------+-----------+--------------+
|     V001| L001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|               19.0|      false|        2|    NULL|       NULL|     Anil|Hyundai i20|         Delhi|
|     V002| L002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|               35.0|      false|        1|    NULL|     

In [0]:

# 9
# Advanced Conditions
# when/otherwise : Tag trip type as:
# "Short" <15min
# "Medium" 15-30min
# "Long" >30min

df3 = df.withColumn(
    "TripType",
    when(col("TripDurationMinutes") < 15, "Short")
    .when((col("TripDurationMinutes") >= 15) & (col("TripDurationMinutes") <= 30), "Medium")
    .otherwise("Long")
)


df3 = df3.withColumn("TripDate", to_date("EntryTime"))

trip_counts = df3.groupBy("VehicleID", "TripDate").agg(count("*").alias("TripCount"))

df4 = df3.join(trip_counts, ["VehicleID", "TripDate"]).withColumn(
    "FlaggedFrequent", when(col("TripCount") > 3, True).otherwise(False)
)



In [0]:
# Export & Reporting
# Write final enriched DataFrame to:
# Parquet partitioned by VehicleType
# CSV for dashboards
# Create summary SQL View: total toll by VehicleType + ExitPoint
df4.write.mode("overwrite").partitionBy("VehicleType").parquet("file:/Workspace/Shared/traffic_data_parquet")
df4.write.mode("overwrite").option("header", "true").csv("file:/Workspace/Shared/traffic_data_csv")
df4.createOrReplaceTempView("traffic_logs_view")

summary_df = spark.sql(
    """
    SELECT VehicleType, ExitPoint, SUM(TollPaid) AS TotalToll
    FROM traffic_logs_view
    GROUP BY VehicleType, ExitPoint
    ORDER BY TotalToll DESC
    """
)

summary_df.show()

+-----------+---------+---------+
|VehicleType|ExitPoint|TotalToll|
+-----------+---------+---------+
|      Truck|    GateC|      100|
|        Bus|    GateA|       70|
|        Car|    GateC|       50|
|        Car|    GateD|       50|
|       Bike|    GateD|       30|
+-----------+---------+---------+

