PHASE 1 — INGESTION

In [2]:
# Colab-only upload step: files.upload() returns the uploaded file(s), kept in
# `uploaded`. The cell output shows the chosen CSV being saved under its own
# name, which is the path the later read cell uses.
from google.colab import files
uploaded = files.upload()

Saving traffic_data_large.csv to traffic_data_large.csv


In [3]:

from pyspark.sql import SparkSession
# One Spark session for the whole notebook; getOrCreate() reuses a session
# that is already running instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("SmartCityTraffic")
    .getOrCreate()
)

# Schema inference is off, so every column arrives as a string and malformed
# values survive ingestion for inspection in the cleaning phase.
df_raw = (
    spark.read
    .option("header", True)
    .option("inferSchema", False)
    .csv("traffic_data_large.csv")
)


In [4]:

# First look at the ingested data: column types, row count, and ten
# untruncated sample rows.
df_raw.printSchema()
total_records = df_raw.count()
print("Total records:", total_records)
df_raw.show(n=10, truncate=False)


root
 |-- sensor_id: string (nullable = true)
 |-- location: string (nullable = true)
 |-- road_name: string (nullable = true)
 |-- vehicle_count: string (nullable = true)
 |-- avg_speed: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- status: string (nullable = true)

Total records: 500000
+---------+---------+---------------+-------------+---------+-----------+-------------------+--------+
|sensor_id|location |road_name      |vehicle_count|avg_speed|temperature|timestamp          |status  |
+---------+---------+---------------+-------------+---------+-----------+-------------------+--------+
|S105     |Chennai  |OMR            |invalid      |NULL     |39         |12/01/2026 06:00:00|INACTIVE|
|S113     |Chennai  |Mount Road     |103          |73.5     |36         |2026-01-12 06:00:05|ACTIVE  |
|S228     |Delhi    |Janpath        |16           |20.0     |35         |2026-01-12 06:00:10|ACTIVE  |
|S160     |Bangalore|MG 

3. Identify data quality issues


You will observe:

* vehicle_count contains non-numeric values such as the literal string `invalid`
* avg_speed has empty strings
* timestamps appear in yyyy-MM-dd, dd/MM/yyyy, and yyyy/MM/dd
* INACTIVE sensors still send data
* temperature exists but is irrelevant for traffic analytics

PHASE 2 — CLEANING

In [5]:

from pyspark.sql.functions import trim, col

# Strip leading/trailing whitespace from every column.
# This is done in ONE select() rather than a withColumn() loop: each
# withColumn() call adds another projection to the plan, and the Spark docs
# recommend a single select() when rewriting many columns at once.
# Column order and names are unchanged; trim is idempotent, so re-running
# this cell is safe.
df_raw = df_raw.select([trim(col(c)).alias(c) for c in df_raw.columns])


In [6]:

from pyspark.sql.functions import when, regexp_replace

# A reading is usable only when it is a plain run of ASCII digits.
is_whole_number = col("vehicle_count").rlike("^[0-9]+$")

# when() without otherwise() yields NULL for non-matching rows, so invalid
# readings become NULL in the cleaned string column and in its integer cast.
df = (
    df_raw
    .withColumn("vehicle_count_clean", when(is_whole_number, col("vehicle_count")))
    .withColumn("vehicle_count_int", col("vehicle_count_clean").cast("int"))
)


In [7]:

# Accept exactly one non-negative decimal number: "42", "42.", "42.5" or ".5".
# The previous pattern "^[0-9.]+$" also let through strings such as ".",
# ".." or "1.2.3". Those ended up in avg_speed_clean and are not castable to
# double (NULL in legacy mode, a runtime error when ANSI mode is enabled).
is_decimal = col("avg_speed").rlike(r"^([0-9]+\.?[0-9]*|\.[0-9]+)$")

df = df.withColumn(
    "avg_speed_clean",
    # Non-matching rows (including empty strings and NULLs) become NULL.
    when(is_decimal, col("avg_speed")).otherwise(None)
).withColumn("avg_speed_double", col("avg_speed_clean").cast("double"))


In [9]:
from pyspark.sql.functions import coalesce, try_to_timestamp, col, lit

# Timestamp layouts seen in the feed. They are tried in this order and the
# first successful parse wins; a value matching none of them yields NULL.
timestamp_formats = (
    "yyyy-MM-dd HH:mm:ss",
    "dd/MM/yyyy HH:mm:ss",
    "yyyy/MM/dd HH:mm:ss",
)
parse_attempts = [
    try_to_timestamp(col("timestamp"), lit(fmt)) for fmt in timestamp_formats
]

df = df.withColumn("event_time", coalesce(*parse_attempts))

Keep the original timestamp for audit


The raw string is already preserved in the `timestamp` column; the parsed value is stored separately in `event_time`.

PHASE 3 — VALIDATION

In [10]:

# Rows whose vehicle_count could not be turned into an integer.
missing_vehicle_count = col("vehicle_count_int").isNull()
invalid_vehicle = df.where(missing_vehicle_count).count()
print("Invalid vehicle_count:", invalid_vehicle)


Invalid vehicle_count: 49873


In [11]:

# Rows whose timestamp matched none of the supported formats.
missing_event_time = col("event_time").isNull()
invalid_ts = df.where(missing_event_time).count()
print("Invalid timestamps:", invalid_ts)



Invalid timestamps: 4853


In [12]:
# Keep only rows whose status is exactly "ACTIVE"; every later phase works
# on this subset.
df_active = df.where(col("status") == "ACTIVE")

In [13]:
# Row count remaining after the ACTIVE filter.
active_rows = df_active.count()
print("After ACTIVE filter:", active_rows)

After ACTIVE filter: 475000


PHASE 4 — TRAFFIC METRICS

In [14]:
# Mean speed per location; the result column is named "avg(avg_speed_double)".
avg_speed_loc = df_active.groupBy("location").agg({"avg_speed_double": "avg"})

In [15]:
# Total vehicles per road; the result column is named "sum(vehicle_count_int)".
vehicle_per_road = df_active.groupBy("road_name").agg({"vehicle_count_int": "sum"})

In [16]:

from pyspark.sql.functions import max as Fmax, max_by

# Peak traffic per location.
#   peak_vehicles: highest single-reading vehicle count (unchanged column).
#   peak_time:     event_time of the reading that produced that count.
# The original only returned the count, so the "peak time" could not be read
# from the result. If several readings tie for the maximum, max_by may return
# any one of their timestamps.
peak_time_loc = df_active.groupBy("location").agg(
    Fmax("vehicle_count_int").alias("peak_vehicles"),
    max_by("event_time", "vehicle_count_int").alias("peak_time"),
)



In [17]:

# Mean speed per road, slowest first (lower speed is read as more congested).
# The generated column name "avg(avg_speed_double)" is left as-is because
# this frame is written to ORC later under that name.
road_avg_speed = df_active.groupBy("road_name").agg({"avg_speed_double": "avg"})
congestion = road_avg_speed.orderBy(col("avg(avg_speed_double)").asc())


PHASE 5 — WINDOW FUNCTIONS

In [18]:

from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Mean speed per road, then a global rank with the slowest road ranked 1.
road_speed = df_active.groupBy("road_name").agg({"avg_speed_double": "avg"})

# No partitionBy: the ranking spans all roads (ascending is the default order).
w = Window.orderBy(col("avg(avg_speed_double)"))

road_rank = road_speed.select(
    "*",
    rank().over(w).alias("congestion_rank"),
)


In [19]:

# Total vehicles for every (location, road) pair.
road_loc_df = (
    df_active
    .groupBy("location", "road_name")
    .agg({"vehicle_count_int": "sum"})
)

# Rank roads inside each location, busiest road first.
w_loc = (
    Window
    .partitionBy("location")
    .orderBy(col("sum(vehicle_count_int)").desc())
)

road_loc_rank = road_loc_df.select("*", rank().over(w_loc).alias("road_rank"))


In [20]:
# Top three roads per location by total vehicles. rank() gives tied roads the
# same rank, so a location can return more than three rows.
within_top_three = col("road_rank") <= 3
top3_congested = road_loc_rank.where(within_top_three)

PHASE 6 — ANOMALY DETECTION

In [22]:
from pyspark.sql.functions import lag
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col

# Per-sensor timeline: readings ordered by parsed event time.
w_event = Window.partitionBy("sensor_id").orderBy("event_time")

# Rows whose timestamp failed to parse have event_time = NULL. In an ordered
# window they all sort first, in no defined order relative to each other, so
# lag() would compare readings that are not actually consecutive. They are
# removed before the previous-reading comparison.
df_anom = (
    df_active
    .filter(col("event_time").isNotNull())
    .withColumn("prev_speed", lag("avg_speed_double").over(w_event))
    # Positive speed_drop = the sensor slowed down versus its previous reading.
    .withColumn("speed_drop", col("prev_speed") - col("avg_speed_double"))
)

In [23]:
# Readings whose speed fell by more than 10 versus the sensor's previous
# reading. Rows with a NULL speed_drop never satisfy the comparison.
dropped_sharply = col("speed_drop") > 10
sudden_drop = df_anom.where(dropped_sharply)

In [25]:

# Add the previous vehicle count for the same sensor and the change since it.
prev_count_col = lag("vehicle_count_int").over(w_event)
df_anom = (
    df_anom
    .withColumn("prev_count", prev_count_col)
    .withColumn("count_spike", col("vehicle_count_int") - col("prev_count"))
)

# Readings where the count rose by more than 20 versus the previous reading.
sudden_spike = df_anom.where(col("count_spike") > 20)



PHASE 7 — PERFORMANCE ENGINEERING

In [26]:
# Number of partitions backing df_active (the recorded output below is 2).
df_active.rdd.getNumPartitions()

2

In [27]:
# Print the query plans for the congestion query; True requests the extended
# output, which starts with the parsed logical plan shown below.
congestion.explain(True)

== Parsed Logical Plan ==
'Sort ['avg(avg_speed_double) ASC NULLS FIRST], true
+- Aggregate [road_name#73], [road_name#73, avg(avg_speed_double#82) AS avg(avg_speed_double)#198]
   +- Filter (status#78 = ACTIVE)
      +- Project [sensor_id#71, location#72, road_name#73, vehicle_count#74, avg_speed#75, temperature#76, timestamp#77, status#78, vehicle_count_clean#79, vehicle_count_int#80, avg_speed_clean#81, avg_speed_double#82, coalesce(try_to_timestamp(timestamp#77, Some(yyyy-MM-dd HH:mm:ss), TimestampType, Some(Etc/UTC), false), try_to_timestamp(timestamp#77, Some(dd/MM/yyyy HH:mm:ss), TimestampType, Some(Etc/UTC), false), try_to_timestamp(timestamp#77, Some(yyyy/MM/dd HH:mm:ss), TimestampType, Some(Etc/UTC), false)) AS event_time#84]
         +- Project [sensor_id#71, location#72, road_name#73, vehicle_count#74, avg_speed#75, temperature#76, timestamp#77, status#78, vehicle_count_clean#79, vehicle_count_int#80, avg_speed_clean#81, cast(avg_speed_clean#81 as double) AS avg_speed_doubl

In [28]:
# Redistribute the active rows by location (shows up as
# RepartitionByExpression [location] in the plan).
df_part = df_active.repartition(col("location"))

In [30]:

# Mark df_active for caching. cache() returns the DataFrame, which is why the
# cell output is its repr. NOTE(review): cache() is lazy and this cell runs
# after the metric cells above, so only later actions can benefit — consider
# moving it to right after df_active is defined.
df_active.cache()



DataFrame[sensor_id: string, location: string, road_name: string, vehicle_count: string, avg_speed: string, temperature: string, timestamp: string, status: string, vehicle_count_clean: string, vehicle_count_int: int, avg_speed_clean: string, avg_speed_double: double, event_time: timestamp]

In [31]:
# Extended plan for the per-location average computed on the repartitioned
# frame, for comparison with the congestion plan printed earlier.
df_part.groupBy("location").avg("avg_speed_double").explain(True)

== Parsed Logical Plan ==
'Aggregate ['location], ['location, unresolvedalias('avg(avg_speed_double#82))]
+- RepartitionByExpression [location#72]
   +- Filter (status#78 = ACTIVE)
      +- Project [sensor_id#71, location#72, road_name#73, vehicle_count#74, avg_speed#75, temperature#76, timestamp#77, status#78, vehicle_count_clean#79, vehicle_count_int#80, avg_speed_clean#81, avg_speed_double#82, coalesce(try_to_timestamp(timestamp#77, Some(yyyy-MM-dd HH:mm:ss), TimestampType, Some(Etc/UTC), false), try_to_timestamp(timestamp#77, Some(dd/MM/yyyy HH:mm:ss), TimestampType, Some(Etc/UTC), false), try_to_timestamp(timestamp#77, Some(yyyy/MM/dd HH:mm:ss), TimestampType, Some(Etc/UTC), false)) AS event_time#84]
         +- Project [sensor_id#71, location#72, road_name#73, vehicle_count#74, avg_speed#75, temperature#76, timestamp#77, status#78, vehicle_count_clean#79, vehicle_count_int#80, avg_speed_clean#81, cast(avg_speed_clean#81 as double) AS avg_speed_double#82]
            +- Project [s

PHASE 8 — RDD OPERATIONS

In [32]:
# RDD view of the active readings; the lambdas in the next cells index each
# element by column name.
rdd = df_active.rdd

In [33]:
# Total vehicles across all active readings; NULL counts contribute 0.
# sum() replaces reduce(lambda a, b: a + b): reduce raises ValueError on an
# empty RDD (e.g. when no row is ACTIVE), whereas sum() returns 0.
total_vehicle = rdd.map(lambda r: r["vehicle_count_int"] or 0).sum()

In [34]:
# Number of active readings per location: emit (location, 1) for every row,
# then add up the ones for each key.
location_ones = rdd.map(lambda row: (row["location"], 1))
loc_counts = location_ones.reduceByKey(lambda left, right: left + right)

Why are DataFrames better?

* Catalyst optimizer = smarter query plans
* Tungsten execution = faster memory handling
* Column pruning, predicate pushdown
* Much less code
* RDDs lack schema and optimizations → slower and error-prone

PHASE 9 — SORTING & SET OPS

In [35]:

# Roads ordered by mean speed, slowest first.
high_congestion = road_speed.orderBy(col("avg(avg_speed_double)"), ascending=True)



In [36]:

# Roads with at least one active reading below speed 25.
low_speed_set = (
    df_active
    .where(col("avg_speed_double") < 25)
    .select("road_name")
    .distinct()
)

# Roads with at least one active reading above 60 vehicles.
high_traffic_set = (
    df_active
    .where(col("vehicle_count_int") > 60)
    .select("road_name")
    .distinct()
)


In [37]:

# Set operations on road names, expressed as joins.
# Intersection: roads in both sets.
both = low_speed_set.join(high_traffic_set, on="road_name", how="inner")

# Differences: roads in one set with no match in the other.
only_low_speed = low_speed_set.join(
    high_traffic_set, on="road_name", how="left_anti"
)
only_high_traffic = high_traffic_set.join(
    low_speed_set, on="road_name", how="left_anti"
)


PHASE 10 — STORAGE

In [38]:

# Write the cleaned active readings as Parquet, partitioned by location, and
# replace any previous output at this path.
df_active.write.parquet(
    "traffic_clean_parquet",
    mode="overwrite",
    partitionBy="location",
)


In [39]:
# Write the per-road congestion table as ORC, replacing any previous output.
congestion.write.orc("congestion_orc", mode="overwrite")

In [40]:

# Round-trip check: load both outputs back and compare schemas and row counts
# with what was written.
p = spark.read.format("parquet").load("traffic_clean_parquet")
o = spark.read.format("orc").load("congestion_orc")

for stored in (p, o):
    stored.printSchema()

print(p.count(), o.count())


root
 |-- sensor_id: string (nullable = true)
 |-- road_name: string (nullable = true)
 |-- vehicle_count: string (nullable = true)
 |-- avg_speed: string (nullable = true)
 |-- temperature: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- status: string (nullable = true)
 |-- vehicle_count_clean: string (nullable = true)
 |-- vehicle_count_int: integer (nullable = true)
 |-- avg_speed_clean: string (nullable = true)
 |-- avg_speed_double: double (nullable = true)
 |-- event_time: timestamp (nullable = true)
 |-- location: string (nullable = true)

root
 |-- road_name: string (nullable = true)
 |-- avg(avg_speed_double): double (nullable = true)

475000 21
