In [1]:
import pandas as pd
import requests
import zipfile
import io
import os
from shapely.geometry import Point, LineString
from shapely.ops import transform
import pyproj

# --- Notebook settings -------------------------------------------------
# Source archive for the static subway GTFS feed.
GTFS_URL = "https://rrgtfsfeeds.s3.amazonaws.com/gtfs_subway.zip"
# Folder (relative to the working directory) that receives the unpacked feed.
DATA_DIR = "data/static"

# Create the folder up front; exist_ok makes re-running this cell harmless.
os.makedirs(name=DATA_DIR, exist_ok=True)

## 1. Download and Extract GTFS Data

In [2]:
# Fetch the GTFS zip archive and unpack every member into DATA_DIR.
print(f"Downloading GTFS data from {GTFS_URL}...")
# requests waits indefinitely by default; a timeout keeps the cell from
# hanging forever on a stalled connection.
r = requests.get(GTFS_URL, timeout=60)
# Stop here on an HTTP error status instead of handing an error page to
# ZipFile, which would fail with a misleading BadZipFile.
r.raise_for_status()
# The context manager closes the archive as soon as extraction finishes.
with zipfile.ZipFile(io.BytesIO(r.content)) as z:
    z.extractall(DATA_DIR)
print("Download and extraction complete.")

Downloading GTFS data from https://rrgtfsfeeds.s3.amazonaws.com/gtfs_subway.zip...
Download and extraction complete.


## 2. Load Shapes, Stops, Trips, and Routes

In [3]:
# Read the GTFS tables used by this notebook from the extracted feed.
# The tuple order below must match the order of the names being assigned.
shapes_df, stops_df, trips_df, routes_df = (
    pd.read_csv(os.path.join(DATA_DIR, filename))
    for filename in ("shapes.txt", "stops.txt", "trips.txt", "routes.txt")
)

# Quick sanity check: (rows, columns) of the main tables.
print("Shapes:", shapes_df.shape)
print("Stops:", stops_df.shape)
print("Trips:", trips_df.shape)

Shapes: (149834, 4)
Stops: (1488, 6)
Trips: (20304, 6)


In [13]:
# Pick a canonical shape for the A line: the A-line shape with the most points.

# Trips that belong to the A route
a_trips = trips_df[trips_df['route_id'] == 'A']

# Distinct shape_ids referenced by those trips
a_shape_ids = a_trips['shape_id'].unique()

# Shape points that belong to any A-line shape
a_shapes = shapes_df[shapes_df['shape_id'].isin(a_shape_ids)]

# Fail early with a clear message; otherwise an empty selection only
# surfaces further down as an unhelpful error from idxmax().
if a_shapes.empty:
    raise ValueError("No shape points found for route 'A'; check trips.txt and shapes.txt.")

# Point count per shape, used as a proxy for the longest / most detailed path.
# Selecting the column before count() avoids counting every other column too.
shape_counts = a_shapes.groupby('shape_id')['shape_pt_sequence'].count()

longest_shape_id = shape_counts.idxmax()

print(f"Selected canonical shape id: {longest_shape_id} with {shape_counts.max()} points")

# Points of the chosen shape, ordered along the path by their sequence number
main_shape_points = a_shapes[a_shapes['shape_id'] == longest_shape_id].sort_values('shape_pt_sequence')

main_shape_points.head()

Selected canonical shape id: A..N09R with 1133 points


Unnamed: 0,shape_id,shape_pt_sequence,shape_pt_lat,shape_pt_lon
46232,A..N09R,0,40.603995,-73.755405
46233,A..N09R,1,40.603729,-73.755898
46234,A..N09R,2,40.603715,-73.755925
46235,A..N09R,3,40.603701,-73.755953
46236,A..N09R,4,40.603687,-73.75598


In [17]:
# Build the route geometry and measure its length in metric units.

# One (lon, lat) pair per shape point, in path order.
line_coords = [
    (lon, lat)
    for lon, lat in zip(main_shape_points['shape_pt_lon'], main_shape_points['shape_pt_lat'])
]
line_geom = LineString(line_coords)

# Source CRS: EPSG:4326 (WGS84 latitude/longitude).
# Target CRS: EPSG:32618 (UTM zone 18N, coordinates in meters; covers NYC).
wgs84 = pyproj.CRS('EPSG:4326')
utm = pyproj.CRS('EPSG:32618')
# always_xy=True makes the transformer take (lon, lat) order, matching line_coords.
project = pyproj.Transformer.from_crs(crs_from=wgs84, crs_to=utm, always_xy=True).transform

# Re-express the line in the projected CRS so that .length is in meters.
line_geom_meters = transform(project, line_geom)

# 0.000621371 is the meters-to-miles conversion factor.
print(f"Total length of A line (longest path): {line_geom_meters.length / 1000:.2f} km")
print(f"Total length in miles: {line_geom_meters.length * 0.000621371:.2f} miles")

Total length of A line (longest path): 51.82 km
Total length in miles: 32.20 miles
