**Intialize the SparkSession**

In [0]:
from pyspark.sql import SparkSession
spark=SparkSession.builder\
      .appName("Traffic-log analysis")\
      .getOrCreate()
spark

**Data Ingestion & Schema Analysis**

In [0]:
#Load CSV using PySpark with schema inference
a= spark.read.option("header", True).option("inferSchema", True).csv("file:/Workspace/Shared/traffic_logs.csv")
a.printSchema()
a.show()
#Manually define schema and compare
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
schema = StructType([
    StructField("LogID", StringType(), True),
    StructField("VehicleID", StringType(), True),
    StructField("EntryPoint", StringType(), True),
    StructField("ExitPoint", StringType(), True),
    StructField("EntryTime", TimestampType(), True),
    StructField("ExitTime", TimestampType(), True),
    StructField("VehicleType", StringType(), True),
    StructField("SpeedKMH", DoubleType(), True),
    StructField("TollPaid", DoubleType(), True),
])
df=spark.read.option("header", True).schema(schema).csv("file:/Workspace/Shared/traffic_logs.csv")
df.printSchema()
df.show()

root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid : double (nullable = true)

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+---------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid |
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+---------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|     50.0|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|    100.0|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09

**Derived Column Creation**

In [0]:
#Calculate TripDurationMinutes = ExitTime - EntryTime
#Add IsOverspeed = SpeedKMH > 60
from pyspark.sql.functions import col
df = df.withColumn(
    "TripDurationMinutes",
    (col("ExitTime").cast("long") - col("EntryTime").cast("long")) / 60
).withColumn(
    "IsOverspeed",
    col("SpeedKMH") > 60
)
df.select("EntryTime","ExitTime","TripDurationMinutes", "IsOverspeed").show()

+-------------------+-------------------+-------------------+-----------+
|          EntryTime|           ExitTime|TripDurationMinutes|IsOverspeed|
+-------------------+-------------------+-------------------+-----------+
|2024-05-01 08:01:00|2024-05-01 08:20:00|               19.0|      false|
|2024-05-01 08:10:00|2024-05-01 08:45:00|               35.0|      false|
|2024-05-01 09:00:00|2024-05-01 09:18:00|               18.0|      false|
|2024-05-01 09:15:00|2024-05-01 09:35:00|               20.0|       true|
|2024-05-01 10:05:00|2024-05-01 10:40:00|               35.0|      false|
+-------------------+-------------------+-------------------+-----------+



**Vehicle Behavior Aggregations**

In [0]:
#Average speed per VehicleType
from pyspark.sql.functions import *
print("Average speed per VehicleType")
df.groupBy("VehicleType").agg(avg("SpeedKMH").alias("AvgSpeed")).show()
#Total toll collected per gate (EntryPoint)
print("Total toll collected per gate (EntryPoint)")
df.groupBy("EntryPoint").agg(sum("TollPaid").alias("TotalToll")).show()
#Most used ExitPoint
print("Most used ExitPoint")
df.groupBy("ExitPoint").count().orderBy(desc("count")).show(1)

Average speed per VehicleType
+-----------+--------+
|VehicleType|AvgSpeed|
+-----------+--------+
|       Bike|    55.0|
|        Car|    70.0|
|      Truck|    45.0|
|        Bus|    40.0|
+-----------+--------+

Total toll collected per gate (EntryPoint)
+----------+---------+
|EntryPoint|TotalToll|
+----------+---------+
|     GateA|     80.0|
|     GateB|    170.0|
|     GateC|     50.0|
+----------+---------+

Most used ExitPoint
+---------+-----+
|ExitPoint|count|
+---------+-----+
|    GateD|    2|
+---------+-----+
only showing top 1 row



**Window Functions**

In [0]:
#Rank vehicles by speed within VehicleType
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, lag
print("Rank vehicles by speed within VehicleType")
w=Window.partitionBy("VehicleType").orderBy(col("SpeedKMH").desc())
df=df.withColumn("SpeedRank", rank().over(w))
df.select("VehicleID","VehicleType","SpeedKMH","SpeedRank").show()
#Find last exit time for each vehicle using lag()
print("Last exit time for each vehicle")
w2=Window.partitionBy("VehicleID").orderBy("ExitTime")
df=df.withColumn("LastExitTime", lag("ExitTime").over(w2))
df.select("VehicleID","ExitTime","LastExitTime").show()

Rank vehicles by speed within VehicleType
+---------+-----------+--------+---------+
|VehicleID|VehicleType|SpeedKMH|SpeedRank|
+---------+-----------+--------+---------+
|     V003|       Bike|    55.0|        1|
|     V005|        Bus|    40.0|        1|
|     V004|        Car|    80.0|        1|
|     V001|        Car|    60.0|        2|
|     V002|      Truck|    45.0|        1|
+---------+-----------+--------+---------+

Last exit time for each vehicle
+---------+-------------------+------------+
|VehicleID|           ExitTime|LastExitTime|
+---------+-------------------+------------+
|     V001|2024-05-01 08:20:00|        NULL|
|     V002|2024-05-01 08:45:00|        NULL|
|     V003|2024-05-01 09:18:00|        NULL|
|     V004|2024-05-01 09:35:00|        NULL|
|     V005|2024-05-01 10:40:00|        NULL|
+---------+-------------------+------------+



**Session Segmentation**

In [0]:
 #Group by VehicleID to simulate route sessions
 #Find duration between subsequent trips (idle time)
 df = df.withColumn(
    "IdleTimeMinutes",
    (col("EntryTime").cast("long") - col("LastExitTime").cast("long")) / 60
)
df.show()

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExitTime|IdleTimeMinutes|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|               19.0|      false|        2|        NULL|           NULL|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|               35.0|      false|        1|        NULL|           NULL|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|    55.0| 

**Anomaly Detection**

In [0]:
print(df.columns)

['LogID', 'VehicleID', 'EntryPoint', 'ExitPoint', 'EntryTime', 'ExitTime', 'VehicleType', 'SpeedKMH', 'TollPaid', 'TripDurationMinutes', 'IsOverspeed', 'SpeedRank', 'LastExitTime', 'IdleTimeMinutes']


In [0]:
#Identify vehicles with speed > 70 and TripDuration < 10 minutes
print("Vehicles with speed > 70 and TripDuration < 10 minutes")
a=df.filter((col("SpeedKMH")>70) & (col("TripDurationMinutes") < 10))
a.select("VehicleID","SpeedKMH","TripDurationMinutes").show()
#Vehicles that paid less toll for longer trips
print("Vehicles that paid less toll for longer trips")
a1=df.filter((col("TripDurationMinutes") > 60) & (col("TollPaid") < 20))
a1.select("VehicleID","TripDurationMinutes","TollPaid").show()
#Suspicious backtracking (ExitPoint earlier than EntryPoint)
print("Suspicious backtracking (ExitPoint earlier than EntryPoint)")
a2=df.filter((col("ExitPoint") < col("EntryPoint")))
a2.select("VehicleID","ExitPoint","EntryPoint").show()

Vehicles with speed > 70 and TripDuration < 10 minutes
+---------+--------+-------------------+
|VehicleID|SpeedKMH|TripDurationMinutes|
+---------+--------+-------------------+
+---------+--------+-------------------+

Vehicles that paid less toll for longer trips
+---------+-------------------+--------+
|VehicleID|TripDurationMinutes|TollPaid|
+---------+-------------------+--------+
+---------+-------------------+--------+

Suspicious backtracking (ExitPoint earlier than EntryPoint)
+---------+---------+----------+
|VehicleID|ExitPoint|EntryPoint|
+---------+---------+----------+
|     V005|    GateA|     GateB|
+---------+---------+----------+



**Join with Metadata**

In [0]:
# Join and group trips by RegisteredCity
r=spark.createDataFrame([
    ("V001","Anil","Hyundai i20","Delhi"),
    ("V002","Rakesh","Tata Truck","Chennai"),
    ("V003","Sana","Yamaha R15","Mumbai"),
    ("V004","Neha","Honda City","Bangalore"),
    ("V005","Zoya","Volvo Bus","Pune"),
], ["VehicleID","OwnerName","Model","RegisteredCity"])
df = df.join(r, "VehicleID", "left")
df.groupBy("RegisteredCity").agg(count("*").alias("TripsByCity")).show()

+--------------+-----------+
|RegisteredCity|TripsByCity|
+--------------+-----------+
|     Bangalore|          1|
|       Chennai|          1|
|        Mumbai|          1|
|          Pune|          1|
|         Delhi|          1|
+--------------+-----------+



**Delta Lake Features**

In [0]:
#Save traffic_logs as Delta Table
df.write.mode("overwrite").format("delta").save("file:/Workspace/Shared/traffic_logs")
#Apply MERGE INTO to update toll rates for all Bikes
from delta.tables import DeltaTable
dt=DeltaTable.forPath(spark,"file:/Workspace/Shared/traffic_logs")
dt.update(condition="VehicleType='Bike'", set={"TollPaid":"TollPaid + 10"})
#Delete trips longer than 60 minutes
dt.delete("TripDurationMinutes > 60")
#Use DESCRIBE HISTORY and VERSION AS OF
spark.read.format("delta").option("versionAsOf", 0).load("file:/Workspace/Shared/traffic_logs").show()

+---------+-----+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+---------+-----------+--------------+
|VehicleID|LogID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|LastExitTime|IdleTimeMinutes|OwnerName|      Model|RegisteredCity|
+---------+-----+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+------------+---------------+---------+-----------+--------------+
|     V001| L001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    60.0|    50.0|               19.0|      false|        2|        NULL|           NULL|     Anil|Hyundai i20|         Delhi|
|     V002| L002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|    45.0|   100.0|               35.0|    

**Advanced Conditions**

In [0]:
 #when/otherwise : Tag trip type as:"Short" <15min "Medium" 15-30min "Long" >30min
 df=df.withColumn(
    "TripCategory",
    when(col("TripDurationMinutes") < 15, "Short")
      .when(col("TripDurationMinutes") < 30, "Medium")
      .otherwise("Long")
).withColumn(
    "TripsPerDay",
    count("*").over(Window.partitionBy("VehicleID", col("EntryTime").substr(1,10)))
)
df.select("VehicleID","TripDurationMinutes","TripCategory","TripsPerDay").show()
 #Flag vehicles with more than 3 trips in a day
 df = df.withColumn("IsFrequent", col("TripsPerDay") > 3)
 df.select("VehicleID","TripsPerDay","IsFrequent").show()

+---------+-------------------+------------+-----------+
|VehicleID|TripDurationMinutes|TripCategory|TripsPerDay|
+---------+-------------------+------------+-----------+
|     V001|               19.0|      Medium|          1|
|     V002|               35.0|        Long|          1|
|     V003|               18.0|      Medium|          1|
|     V004|               20.0|      Medium|          1|
|     V005|               35.0|        Long|          1|
+---------+-------------------+------------+-----------+

+---------+-----------+----------+
|VehicleID|TripsPerDay|IsFrequent|
+---------+-----------+----------+
|     V001|          1|     false|
|     V002|          1|     false|
|     V003|          1|     false|
|     V004|          1|     false|
|     V005|          1|     false|
+---------+-----------+----------+



**Export & Reporting**

In [0]:
#Write final enriched DataFrame to:Parquet partitioned by VehicleType and CSV for dashboard
df.write.partitionBy("VehicleType").mode("overwrite").parquet("file/Workspace/Shared/trafficlog_parquet")
df.write.mode("overwrite").option("header",True).csv("file:/Workspace/Shared/trafficlog_csv")
#Create summary SQL View: total toll by VehicleType + ExitPoint
df.createOrReplaceTempView("traffic_view")
spark.sql("""
CREATE OR REPLACE TEMP VIEW traffic_summary AS
SELECT VehicleType, ExitPoint, SUM(TollPaid) AS TotalToll
FROM traffic_view
GROUP BY VehicleType, ExitPoint
""")


DataFrame[]