# Subway Data Analysis

In [23]:
import pandas as pd
import folium

### Load Files

In [24]:
# Read the four GTFS tables used below. Targets and paths are listed in the
# same order, so each frame is bound to the file of the same name.
routes, trips, shapes, stops = (
    pd.read_csv(gtfs_path)
    for gtfs_path in (
        "data/gtfs/routes.txt",
        "data/gtfs/trips.txt",
        "data/gtfs/shapes.txt",
        "data/gtfs/stops.txt",
    )
)

### Clean

In [25]:
# Pick one representative trip for every (route_id, direction_id) pair:
# discard trips that have no shape_id, then keep the first remaining row of
# each pair.
trips_with_shape = trips[trips["shape_id"].notna()]
is_first_of_pair = ~trips_with_shape.duplicated(subset=["route_id", "direction_id"])
trip_reps = trips_with_shape.loc[
    is_first_of_pair, ["route_id", "direction_id", "shape_id"]
]

# Preview expression; it is not the last statement of this cell, so a
# notebook would not display its result.
trip_reps.head()

# Build the route_id -> "#RRGGBB" lookup. Any route whose route_color is not
# a six-character string (including a missing value or a missing column)
# falls back to gray.
route_colors = {}
for route in routes.itertuples(index=False):
    raw_color = getattr(route, "route_color", None)
    is_hex6 = isinstance(raw_color, str) and len(raw_color) == 6
    route_colors[route.route_id] = f"#{raw_color}" if is_hex6 else "#888888"

# Rows kept as stations: location_type equal to 1, plus rows whose
# location_type is blank.
# NOTE(review): the GTFS spec treats a blank location_type as 0
# (stop/platform), so the blank clause may admit platform rows as well as
# stations — confirm this is intended.
location_type = stops["location_type"]
stations = stops[location_type.isna() | (location_type == 1)]

# Map center (Manhattan); used as the map's initial location.
nyc_center = [40.75, -73.97]

### Map

In [26]:
# Base map centered on nyc_center.
m = folium.Map(location=nyc_center, zoom_start=11, tiles="cartodbpositron")

# Index the shape points once, keyed by shape_id, instead of rescanning the
# whole shapes table for every representative trip. Only shapes that a
# representative trip references are kept, and each group is ordered by
# shape_pt_sequence so the line is drawn in travel order.
used_shapes = shapes[shapes["shape_id"].isin(trip_reps["shape_id"])]
shape_points = {
    shape_id: group.sort_values("shape_pt_sequence")
    for shape_id, group in used_shapes.groupby("shape_id", sort=False)
}

# Draw each representative shape as a PolyLine in its route color.
for route_id, shape_id in zip(trip_reps["route_id"], trip_reps["shape_id"]):
    pts = shape_points.get(shape_id)
    if pts is None:
        # The shapes table has no points for this shape_id; nothing to draw.
        continue

    folium.PolyLine(
        locations=list(zip(pts["shape_pt_lat"], pts["shape_pt_lon"])),
        color=route_colors.get(route_id, "#888888"),  # gray if route unknown
        weight=3,
        opacity=0.8,
        tooltip=f"Route {route_id}",
    ).add_to(m)

# One small filled dot per station row, with the stop name as hover text.
# Iterating the three columns directly avoids building a Series per row.
for stop_lat, stop_lon, stop_name in zip(
    stations["stop_lat"], stations["stop_lon"], stations["stop_name"]
):
    folium.CircleMarker(
        location=[stop_lat, stop_lon],
        radius=2,
        weight=0,
        fill=True,
        fill_opacity=0.9,
        color="black",
        tooltip=stop_name,
    ).add_to(m)

# Write the finished map to a standalone HTML file.
m.save('map.html')