In [1]:

# Environment bootstrap via IPython shell escapes; the steps below run in order.
# Install the headless Java 8 JDK quietly, discarding apt-get's standard output.
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Download the Spark 3.3.0 (Hadoop 3 build) tarball from the Apache archive.
!wget -q https://archive.apache.org/dist/spark/spark-3.3.0/spark-3.3.0-bin-hadoop3.tgz
# Unpack the tarball into the current working directory.
!tar xf spark-3.3.0-bin-hadoop3.tgz
# Install findspark quietly (no version is pinned here).
!pip install -q findspark

import os

# Tell the tooling where the Java and Spark installations live before
# findspark is initialised.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.3.0-bin-hadoop3",
})

import findspark

# Must run before the pyspark imports that follow in this cell.
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, IntegerType
from pyspark.sql.functions import *

# Session handle shared by every later cell; the application is named
# "TrafficMonitoring".
spark = (
    SparkSession.builder
    .appName("TrafficMonitoring")
    .getOrCreate()
)


In [3]:
# Colab-specific upload step; the call's result is kept in `uploaded`.
from google.colab import files
# NOTE(review): the recorded output of this cell shows the files being saved as
# "traffic_logs (1).csv" and "vehicle_registry (1).csv", while the reads later in
# this cell use the un-suffixed names -- confirm they are not loading an older copy.
uploaded = files.upload()  # Upload traffic_logs.csv and vehicle_registry.csv

from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit schema for traffic_logs.csv: every column is nullable; the two
# time columns are timestamps and speed/toll are integers.
schema_logs = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in [
        ("LogID", StringType()),
        ("VehicleID", StringType()),
        ("EntryPoint", StringType()),
        ("ExitPoint", StringType()),
        ("EntryTime", TimestampType()),
        ("ExitTime", TimestampType()),
        ("VehicleType", StringType()),
        ("SpeedKMH", IntegerType()),
        ("TollPaid", IntegerType()),
    ]
])

# Explicit schema for vehicle_registry.csv: four nullable string columns.
schema_registry = StructType([
    StructField(field_name, StringType(), True)
    for field_name in ("VehicleID", "OwnerName", "Model", "RegisteredCity")
])

# Load both CSVs with their explicit schemas; the first line of each file is
# treated as a header row.
df = spark.read.csv("traffic_logs.csv", schema=schema_logs, header=True)
registry = spark.read.csv("vehicle_registry.csv", schema=schema_registry, header=True)


Saving traffic_logs.csv to traffic_logs (1).csv
Saving vehicle_registry.csv to vehicle_registry (1).csv


In [4]:
# Re-read the traffic logs with the explicit schema and inspect the result.
# The original referenced an undefined name `schema`, which raises NameError
# on a fresh kernel; the schema defined in this notebook for the traffic logs
# is `schema_logs`.
df_manual = spark.read.option("header", True).schema(schema_logs).csv("traffic_logs.csv")
df_manual.printSchema()
df_manual.show()

root
 |-- LogID: string (nullable = true)
 |-- VehicleID: string (nullable = true)
 |-- EntryPoint: string (nullable = true)
 |-- ExitPoint: string (nullable = true)
 |-- EntryTime: timestamp (nullable = true)
 |-- ExitTime: timestamp (nullable = true)
 |-- VehicleType: string (nullable = true)
 |-- SpeedKMH: integer (nullable = true)
 |-- TollPaid: integer (nullable = true)

+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|      60|      50|
| L002|     V002|     GateB|    GateC|2024-05-01 08:10:00|2024-05-01 08:45:00|      Truck|      45|     100|
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:0

In [5]:
# Add two derived columns in one chained expression.
df = (
    df
    # Trip length in minutes: difference of the two epoch-second values / 60.
    .withColumn(
        "TripDurationMinutes",
        (unix_timestamp(col("ExitTime")) - unix_timestamp(col("EntryTime"))) / 60,
    )
    # Boolean flag that is true when the logged speed is above 60.
    .withColumn("IsOverspeed", col("SpeedKMH") > 60)
)


In [6]:

# Average speed per vehicle type.
speed_by_type = df.groupBy("VehicleType").agg(avg(col("SpeedKMH")).alias("AvgSpeed"))
speed_by_type.show()

# Total toll collected per entry point. `sum` here is the name brought in by
# the star import of pyspark.sql.functions, not the Python built-in.
toll_by_entry = df.groupBy("EntryPoint").agg(sum(col("TollPaid")).alias("TotalToll"))
toll_by_entry.show()

# Exit points ordered by usage, most used first. Only the first row is
# displayed, so exit points tied on count are not all shown.
exit_usage = df.groupBy("ExitPoint").count()
exit_usage.sort(col("count").desc()).show(1)


+-----------+--------+
|VehicleType|AvgSpeed|
+-----------+--------+
|       Bike|    55.0|
|        Car|    70.0|
|      Truck|    45.0|
|        Bus|    40.0|
+-----------+--------+

+----------+---------+
|EntryPoint|TotalToll|
+----------+---------+
|     GateA|       80|
|     GateB|      170|
|     GateC|       50|
+----------+---------+

+---------+-----+
|ExitPoint|count|
+---------+-----+
|    GateD|    2|
+---------+-----+
only showing top 1 row



In [7]:
from pyspark.sql.window import Window

# Rank trips by speed (highest first) within each vehicle type.
window_spec = Window.partitionBy("VehicleType").orderBy(col("SpeedKMH").desc())
speed_ranked = df.withColumn("SpeedRank", rank().over(window_spec))
speed_ranked.show()

# For each vehicle, look up the exit time of its previous trip (by entry time).
vehicle_window = Window.partitionBy("VehicleID").orderBy(col("EntryTime"))
with_last_exit = df.withColumn("LastExitTime", lag(col("ExitTime"), 1).over(vehicle_window))
with_last_exit.show()


+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+
|LogID|VehicleID|EntryPoint|ExitPoint|          EntryTime|           ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|SpeedRank|
+-----+---------+----------+---------+-------------------+-------------------+-----------+--------+--------+-------------------+-----------+---------+
| L003|     V003|     GateA|    GateD|2024-05-01 09:00:00|2024-05-01 09:18:00|       Bike|      55|      30|               18.0|      false|        1|
| L005|     V005|     GateB|    GateA|2024-05-01 10:05:00|2024-05-01 10:40:00|        Bus|      40|      70|               35.0|      false|        1|
| L004|     V004|     GateC|    GateD|2024-05-01 09:15:00|2024-05-01 09:35:00|        Car|      80|      50|               20.0|       true|        1|
| L001|     V001|     GateA|    GateC|2024-05-01 08:01:00|2024-05-01 08:20:00|        Car|    

In [8]:
# Idle time = minutes between a vehicle's previous exit and its next entry.
# Uses `vehicle_window` defined in the preceding cell.
df = (
    df
    .withColumn("PreviousExit", lag(col("ExitTime"), 1).over(vehicle_window))
    .withColumn(
        "IdleMinutes",
        (unix_timestamp(col("EntryTime")) - unix_timestamp(col("PreviousExit"))) / 60,
    )
)

idle_columns = ["VehicleID", "EntryTime", "PreviousExit", "IdleMinutes"]
df.select(*idle_columns).show()


+---------+-------------------+------------+-----------+
|VehicleID|          EntryTime|PreviousExit|IdleMinutes|
+---------+-------------------+------------+-----------+
|     V001|2024-05-01 08:01:00|        null|       null|
|     V002|2024-05-01 08:10:00|        null|       null|
|     V003|2024-05-01 09:00:00|        null|       null|
|     V004|2024-05-01 09:15:00|        null|       null|
|     V005|2024-05-01 10:05:00|        null|       null|
+---------+-------------------+------------+-----------+



In [9]:
# 1. Fast and very short trips: speed above 70 and duration under 10 minutes.
df.where(col("SpeedKMH") > 70).where(col("TripDurationMinutes") < 10).show()

# 2. Long trips with a low toll: duration over 30 minutes and toll below 50.
df.where(col("TripDurationMinutes") > 30).where(col("TollPaid") < 50).show()

# 3. Suspicious backtracking: rows whose exit gate name sorts before the entry
#    gate name. This is a plain string comparison of the gate labels, so it is
#    only a rough heuristic.
df.where(col("EntryPoint") > col("ExitPoint")).show()


+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|EntryTime|ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|PreviousExit|IdleMinutes|
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+------------+-----------+
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+------------+-----------+

+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+------------+-----------+
|LogID|VehicleID|EntryPoint|ExitPoint|EntryTime|ExitTime|VehicleType|SpeedKMH|TollPaid|TripDurationMinutes|IsOverspeed|PreviousExit|IdleMinutes|
+-----+---------+----------+---------+---------+--------+-----------+--------+--------+-------------------+-----------+----------

In [10]:
# Attach registry details to each trip; the left join keeps trips whose
# VehicleID has no registry match.
df_joined = df.join(registry, "VehicleID", "left")

# Number of trips per registered city.
trips_per_city = df_joined.groupBy(col("RegisteredCity")).count()
trips_per_city.show()


+--------------+-----+
|RegisteredCity|count|
+--------------+-----+
|     Bangalore|    1|
|       Chennai|    1|
|        Mumbai|    1|
|          Pune|    1|
|         Delhi|    1|
+--------------+-----+



In [11]:

from pyspark.sql.functions import to_date, count as spark_count

# Bucket trips by duration: under 15 minutes is "Short", up to and including
# 30 is "Medium", and everything else falls through to "Long".
trip_type = (
    when(col("TripDurationMinutes") < 15, "Short")
    .when(col("TripDurationMinutes") <= 30, "Medium")
    .otherwise("Long")
)

df = (
    df
    .withColumn("TripType", trip_type)
    # Calendar date of the entry timestamp, used for per-day grouping.
    .withColumn("TripDate", to_date(col("EntryTime")))
)

# Trips per vehicle per day, then keep only vehicles with more than 3 trips
# on a single day.
trip_counts = df.groupBy("VehicleID", "TripDate").agg(spark_count("*").alias("TripCount"))
frequent = trip_counts.where(col("TripCount") > 3)
frequent.show()


+---------+--------+---------+
|VehicleID|TripDate|TripCount|
+---------+--------+---------+
+---------+--------+---------+



In [13]:

from google.colab import files

# Cast the timestamp/date columns to strings before exporting.
df_export = df
for column_name in ("EntryTime", "ExitTime", "TripDate", "PreviousExit"):
    df_export = df_export.withColumn(column_name, col(column_name).cast("string"))

# Convert to pandas and write one CSV file without the index column.
dashboard_pdf = df_export.toPandas()
dashboard_pdf.to_csv("final_dashboard.csv", index=False)

# Hand the file to the browser through Colab's download helper.
files.download("final_dashboard.csv")


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>