In [6]:
import numpy as np
import pandas as pd
import matplotlib as plt
import pyspark as sp
from pyspark.sql.functions import col, year, to_timestamp, sum as _sum, desc

In [7]:
spark = sp.sql.SparkSession.builder.appName("Reduced MTA").master("local[*]").config("spark.driver.memory", "8g").getOrCreate()

In [8]:
df = spark.read.parquet("D:/data/mta_hourly")

In [9]:
 #Convert timestamp string to actual timestamp
df = df.withColumn("timestamp", to_timestamp(col("transit_timestamp"), "MM/dd/yyyy HH:mm:ss a"))

# Extract year from timestamp
df = df.withColumn("year", year(col("timestamp")))

# Filter for 2023 and 2024
df_filtered = df.filter(col("year").isin(2023, 2024))

In [10]:
df_filtered.count()

25052735

In [11]:
top10_stations = ["Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)", "Grand Central-42 St (S,4,5,6,7)", "74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)", "Flushing-Main St (7)", "34 St-Penn Station (A,C,E)", "34 St-Penn Station (1,2,3)", "34 St-Herald Sq (B,D,F,M,N,Q,R,W)", "Fulton St (A,C,J,Z,2,3,4,5)", "14 St-Union Sq (L,N,Q,R,W,4,5,6)", "Junction Blvd (7)"]

In [12]:
df_top10 = df_filtered.filter(col("station_complex").isin(top10_stations))

In [13]:
df_top10.count()

735426

In [14]:
df_top10.show(5)

+--------------------+------------------+--------------------+--------------+--------------------+---------+---------+---------+----------+--------------------+------------+---------+-------------------+----+
|   transit_timestamp|station_complex_id|     station_complex|payment_method| fare_class_category|ridership|transfers| latitude| longitude|        Georeference|transit_mode|  borough|          timestamp|year|
+--------------------+------------------+--------------------+--------------+--------------------+---------+---------+---------+----------+--------------------+------------+---------+-------------------+----+
|05/04/2024 03:00:...|               602|14 St-Union Sq (L...|     metrocard|Metrocard - Unlim...|       24|        0|40.735737| -73.98995|POINT (-73.98995 ...|      subway|Manhattan|2024-05-04 03:00:00|2024|
|09/08/2024 12:00:...|               628|Fulton St (A,C,J,...|     metrocard|Metrocard - Fair ...|       22|        0|40.710373| -74.00657|POINT (-74.00657 ...|    

In [15]:
top15_stations = ["Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)", "Crown Hts-Utica Av (3,4)", "Bedford Av (L)", "Myrtle-Wyckoff Avs (L,M)", "Kings Hwy (B,Q)", "Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)", "Grand Central-42 St (S,4,5,6,7)", "34 St-Penn Station (A,C,E)", "34 St-Penn Station (1,2,3)", "34 St-Herald Sq (B,D,F,M,N,Q,R,W)", "74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)", "Flushing-Main St (7)", "Junction Blvd (7)", "103 St-Corona Plaza (7)", "Jamaica Center-Parsons/Archer (E,J,Z)"]

In [16]:
df_top15 = df_filtered.filter(col("station_complex").isin(top15_stations))

In [17]:
df_top15.count()

1073980

In [18]:
df_grouped_1 = df_filtered.groupBy("borough", "station_complex").agg(_sum("ridership").alias("total_ridership"))
df_grouped_1.filter(col("borough") == "Bronx").orderBy(desc("total_ridership")).limit(5).show(truncate=False)

+-------+-----------------------------+---------------+
|borough|station_complex              |total_ridership|
+-------+-----------------------------+---------------+
|Bronx  |161 St-Yankee Stadium (B,D,4)|4521294        |
|Bronx  |Parkchester (6)              |4070365        |
|Bronx  |3 Av-149 St (2,5)            |3514056        |
|Bronx  |Fordham Rd (4)               |2258342        |
|Bronx  |Hunts Point Av (6)           |2255898        |
+-------+-----------------------------+---------------+



In [19]:
top20_stations = ["Atlantic Av-Barclays Ctr (B,D,N,Q,R,2,3,4,5)", "Crown Hts-Utica Av (3,4)", "Bedford Av (L)", "Myrtle-Wyckoff Avs (L,M)", "Kings Hwy (B,Q)", "Times Sq-42 St (N,Q,R,W,S,1,2,3,7)/42 St (A,C,E)", "Grand Central-42 St (S,4,5,6,7)", "34 St-Penn Station (A,C,E)", "34 St-Penn Station (1,2,3)", "34 St-Herald Sq (B,D,F,M,N,Q,R,W)", "74-Broadway (7)/Jackson Hts-Roosevelt Av (E,F,M,R)", "Flushing-Main St (7)", "Junction Blvd (7)", "103 St-Corona Plaza (7)", "Jamaica Center-Parsons/Archer (E,J,Z)", "161 St-Yankee Stadium (B,D,4)", "Parkchester (6)", "3 Av-149 St (2,5)", "Fordham Rd (4)", "Hunts Point Av (6)"]

In [20]:
df_top20 = df_filtered.filter(col("station_complex").isin(top20_stations))
df_top20.count()

1411411

In [21]:
df_top20.show(5)

+--------------------+------------------+--------------------+--------------+--------------------+---------+---------+---------+----------+--------------------+------------+--------+-------------------+----+
|   transit_timestamp|station_complex_id|     station_complex|payment_method| fare_class_category|ridership|transfers| latitude| longitude|        Georeference|transit_mode| borough|          timestamp|year|
+--------------------+------------------+--------------------+--------------+--------------------+---------+---------+---------+----------+--------------------+------------+--------+-------------------+----+
|03/09/2024 12:00:...|               345|Crown Hts-Utica A...|          omny|    OMNY - Full Fare|      297|       19|40.668896|-73.932945|POINT (-73.932945...|      subway|Brooklyn|2024-03-09 12:00:00|2024|
|03/09/2024 12:00:...|               120|      Bedford Av (L)|     metrocard|Metrocard - Students|        2|        0|40.717304| -73.95687|POINT (-73.95687 ...|      su

In [22]:
df_top20.select("transit_mode").distinct().show(truncate=False)

+------------+
|transit_mode|
+------------+
|subway      |
+------------+



In [23]:
df_top20.select("Georeference").distinct().show(truncate=False)

+----------------------------+
|Georeference                |
+----------------------------+
|POINT (-73.95773 40.60867)  |
|POINT (-73.97767 40.68436)  |
|POINT (-73.91158 40.699814) |
|POINT (-73.97689 40.68446)  |
|POINT (-73.97881 40.683666) |
|POINT (-73.95687 40.717304) |
|POINT (-73.932945 40.668896)|
|POINT (-73.987495 40.75731) |
|POINT (-73.987495 40.75529) |
|POINT (-73.98795 40.749565) |
|POINT (-73.986755 40.75731) |
|POINT (-73.986755 40.75529) |
|POINT (-73.976845 40.751778)|
|POINT (-73.99339 40.75229)  |
|POINT (-73.98782 40.749718) |
|POINT (-73.99106 40.750374) |
|POINT (-73.98974 40.75731)  |
|POINT (-73.986755 40.754673)|
|POINT (-73.891335 40.74685) |
|POINT (-73.80111 40.70215)  |
+----------------------------+
only showing top 20 rows



In [24]:
df_top10.write.mode("overwrite").csv("D:/data/top10_mta_data", header=True)
df_top15.write.mode("overwrite").csv("D:/data/top15_mta_data", header=True)
df_top20.write.mode("overwrite").csv("D:/data/top20_mta_data", header=True)

In [30]:
df_top10.write.mode("overwrite").parquet("D:/data/top10_mta_data")
df_top15.write.mode("overwrite").parquet("D:/data/top15_mta_data")
df_top20.write.mode("overwrite").parquet("D:/data/top20_mta_data")

Py4JJavaError: An error occurred while calling o356.parquet.
: java.util.NoSuchElementException: None.get
	at scala.None$.get(Option.scala:529)
	at scala.None$.get(Option.scala:527)
	at org.apache.spark.sql.execution.datasources.BasicWriteJobStatsTracker$.metrics(BasicWriteStatsTracker.scala:239)
	at org.apache.spark.sql.execution.command.DataWritingCommand.metrics(DataWritingCommand.scala:55)
	at org.apache.spark.sql.execution.command.DataWritingCommand.metrics$(DataWritingCommand.scala:55)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.metrics$lzycompute(InsertIntoHadoopFsRelationCommand.scala:47)
	at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.metrics(InsertIntoHadoopFsRelationCommand.scala:47)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.metrics$lzycompute(commands.scala:109)
	at org.apache.spark.sql.execution.command.DataWritingCommandExec.metrics(commands.scala:109)
	at org.apache.spark.sql.execution.SparkPlanInfo$.fromSparkPlan(SparkPlanInfo.scala:63)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$6(SQLExecution.scala:115)
	at org.apache.spark.sql.execution.SQLExecution$.withSQLConfPropagated(SQLExecution.scala:195)
	at org.apache.spark.sql.execution.SQLExecution$.$anonfun$withNewExecutionId$1(SQLExecution.scala:103)
	at org.apache.spark.sql.SparkSession.withActive(SparkSession.scala:827)
	at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:65)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:98)
	at org.apache.spark.sql.execution.QueryExecution$$anonfun$eagerlyExecuteCommands$1.applyOrElse(QueryExecution.scala:94)
	at org.apache.spark.sql.catalyst.trees.TreeNode.$anonfun$transformDownWithPruning$1(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.trees.CurrentOrigin$.withOrigin(TreeNode.scala:104)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDownWithPruning(TreeNode.scala:512)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.org$apache$spark$sql$catalyst$plans$logical$AnalysisHelper$$super$transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning(AnalysisHelper.scala:267)
	at org.apache.spark.sql.catalyst.plans.logical.AnalysisHelper.transformDownWithPruning$(AnalysisHelper.scala:263)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.plans.logical.LogicalPlan.transformDownWithPruning(LogicalPlan.scala:31)
	at org.apache.spark.sql.catalyst.trees.TreeNode.transformDown(TreeNode.scala:488)
	at org.apache.spark.sql.execution.QueryExecution.eagerlyExecuteCommands(QueryExecution.scala:94)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted$lzycompute(QueryExecution.scala:81)
	at org.apache.spark.sql.execution.QueryExecution.commandExecuted(QueryExecution.scala:79)
	at org.apache.spark.sql.execution.QueryExecution.assertCommandExecuted(QueryExecution.scala:133)
	at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:856)
	at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:387)
	at org.apache.spark.sql.DataFrameWriter.saveInternal(DataFrameWriter.scala:360)
	at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:239)
	at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:789)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:566)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:834)


In [31]:
spark.stop()