In [28]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, to_timestamp, sum as _sum, desc

In [2]:
# Start Spark session
spark = SparkSession.builder.appName("MTA Busiest Stations 2023-2024").config("spark.driver.memory", "8g").getOrCreate()

# Read the Parquet data
df = spark.read.parquet("D:/data/mta_hourly")

In [24]:
# Convert timestamp string to actual timestamp
df = df.withColumn("timestamp", to_timestamp(col("transit_timestamp"), "MM/dd/yyyy HH:mm:ss a"))

# Extract year from timestamp
df = df.withColumn("year", year(col("timestamp")))

# Filter for 2023 and 2024
df_filtered = df.filter(col("year").isin(2023, 2024))

In [25]:
# Group by station and aggregate total ridership
df_grouped = df_filtered.groupBy("station_complex").agg(
    _sum("ridership").alias("total_ridership")
)

# Sort and select top 10
top_10 = df_grouped.orderBy(col("total_ridership").desc()).limit(10)

# Show result
top_10.show(truncate=False)

+--------------------------------------------------+---------------+
|station_complex                                   |total_ridership|
+--------------------------------------------------+---------------+
|Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)  |32358463       |
|Grand Central-42 St (S,4,5,6,7)                   |21720564       |
|74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)|15294219       |
|Flushing-Main St (7)                              |14739922       |
|34 St-Penn Station (A,C,E)                        |14328644       |
|34 St-Penn Station (1,2,3)                        |14170110       |
|34 St-Herald Sq (B,D,F,M,N,Q,R,W)                 |12755374       |
|Fulton St (A,C,J,Z,2,3,4,5)                       |10585567       |
|14 St-Union Sq (L,N,Q,R,W,4,5,6)                  |9634957        |
|Junction Blvd (7)                                 |8088005        |
+--------------------------------------------------+---------------+



In [29]:
df_grouped_1 = df_filtered.groupBy("borough", "station_complex").agg(_sum("ridership").alias("total_ridership"))
boroughs = ["Brooklyn", "Manhattan", "Queens"]

for b in boroughs:
    print(f"\nTop 5 stations in {b}:")
    df_grouped_1.filter(col("borough") == b).orderBy(desc("total_ridership")).limit(5).show(truncate=False)


Top 5 stations in Brooklyn:
+--------+--------------------------------------------+---------------+
|borough |station_complex                             |total_ridership|
+--------+--------------------------------------------+---------------+
|Brooklyn|Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)|6874259        |
|Brooklyn|Crown Hts-Utica Av (3,4)                    |6385979        |
|Brooklyn|Bedford Av (L)                              |6099672        |
|Brooklyn|Myrtle-Wyckoff Avs (L,M)                    |5746788        |
|Brooklyn|Kings Hwy (B,Q)                             |4933015        |
+--------+--------------------------------------------+---------------+


Top 5 stations in Manhattan:
+---------+------------------------------------------------+---------------+
|borough  |station_complex                                 |total_ridership|
+---------+------------------------------------------------+---------------+
|Manhattan|Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)|3

In [31]:
top_10.write.mode("overwrite").csv("D:/data/top10_mta", header=True)

for b in boroughs:
    top5_df = df_grouped_1.filter(col("borough") == b) \
                          .orderBy(desc("total_ridership")) \
                          .limit(5)

    path = f"D:/data/top5_{b.lower().replace(' ', '_')}_stations"
    top5_df.write.mode("overwrite").csv(path, header=True)


In [32]:
spark.stop()