In [1]:
import os
import pandas as pd
DATA_DIR = os.path.join("..", "data", "raw")
ace_violations = pd.read_csv(os.path.join(DATA_DIR, "ACE_violations_MAR.csv"))
ace_violations.head()



Unnamed: 0,Violation ID,Vehicle ID,First Occurrence,Last Occurrence,Violation Status,Violation Type,Bus Route ID,Violation Latitude,Violation Longitude,Stop ID,Stop Name,Bus Stop Latitude,Bus Stop Longitude,Violation Georeference,Bus Stop Georeference
0,360443322,cd8aedcc2c966c2c3128818deb321c84747128d77c9f3f...,03/30/2023 12:29:00 PM,03/30/2023 12:39:41 PM,EXEMPT - BUS/PARATRANSIT,MOBILE BUS LANE,B44+,40.6345,-73.947773,303477,NOSTRAND AV/NEWKIRK AV,40.640085,-73.948517,POINT (-73.9477735 40.6344998333333),POINT (-73.948517 40.640085)
1,360443314,7b5b5e516b899a43bcb7123fc2c252b4b8fdcd9567d035...,03/30/2023 10:57:32 AM,03/30/2023 12:01:47 PM,EXEMPT - OTHER,MOBILE BUS LANE,B44+,40.632408,-73.947633,901278,NOSTRAND AV/AV H,40.631827,-73.947438,POINT (-73.9476335 40.6324081666667),POINT (-73.947438 40.631827)
2,360003985,04abd244dbc4036fe4031950085a23b3ec7dfc94a4db00...,03/31/2023 08:53:13 AM,03/31/2023 10:13:57 AM,EXEMPT - EMERGENCY VEHICLE,MOBILE BUS LANE,B44+,40.632347,-73.94762,901278,NOSTRAND AV/AV H,40.631827,-73.947438,POINT (-73.9476205 40.632347),POINT (-73.947438 40.631827)
3,360000317,3f458190e67894eae37e3e122850dc0f47bafbb4501663...,03/31/2023 07:13:36 AM,03/31/2023 08:06:02 AM,VIOLATION ISSUED,MOBILE BUS LANE,B44+,40.634646,-73.947921,303477,NOSTRAND AV/NEWKIRK AV,40.640085,-73.948517,POINT (-73.9479213333333 40.6346456666667),POINT (-73.948517 40.640085)
4,360000315,f4c07b886377c8c5ae94fe3908d4514183e7672e3234d4...,03/31/2023 10:58:00 AM,03/31/2023 11:11:00 AM,VIOLATION ISSUED,MOBILE BUS LANE,B44+,40.695957,-73.952719,303441,LEE AV/FLUSHING AV,40.699506,-73.953484,POINT (-73.95271867 40.69595733),POINT (-73.953484 40.699506)


In [4]:
import json
import math

features = []

for _, row in ace_violations.iterrows():
    try:
        v_lat, v_lon = row["Violation Latitude"], row["Violation Longitude"]

        # Skip rows with bad coords
        if (
            pd.isna(v_lat) or pd.isna(v_lon) or
            not (math.isfinite(v_lat) and math.isfinite(v_lon))
        ):
            continue

        # Properties (convert everything to string to avoid JSON issues)
        props = {col: str(row[col]) for col in [
            "Violation ID",
            "Vehicle ID",
            "First Occurrence",
            "Last Occurrence",
            "Violation Status",
            "Violation Type",
            "Bus Route ID",
            "Stop ID",
            "Stop Name"
        ] if col in row}

        feature = {
            "type": "Feature",
            "properties": props,
            "geometry": {
                "type": "Point",
                "coordinates": [float(v_lon), float(v_lat)]
            }
        }
        features.append(feature)
    except Exception as e:
        print(f"Skipping row due to error: {e}")

geojson = {
    "type": "FeatureCollection",
    "features": features
}

with open("/Users/danielluna/Desktop/datathon_project/data/processed/violations.geojson", "w", encoding="utf-8") as f:
    json.dump(geojson, f, indent=2, ensure_ascii=False)

print(f"✅ Saved {len(features)} point features to violations_points.geojson")


preview = {
    "type": "FeatureCollection",
    "features": features[:3]
}

print(json.dumps(preview, indent=2, ensure_ascii=False))


✅ Saved 796 point features to violations_points.geojson
{
  "type": "FeatureCollection",
  "features": [
    {
      "type": "Feature",
      "properties": {
        "Violation ID": "360443322",
        "Vehicle ID": "cd8aedcc2c966c2c3128818deb321c84747128d77c9f3f1c00d767e6fc4192d4",
        "First Occurrence": "03/30/2023 12:29:00 PM",
        "Last Occurrence": "03/30/2023 12:39:41 PM",
        "Violation Status": "EXEMPT - BUS/PARATRANSIT",
        "Violation Type": "MOBILE BUS LANE",
        "Bus Route ID": "B44+",
        "Stop ID": "303477",
        "Stop Name": "NOSTRAND AV/NEWKIRK AV"
      },
      "geometry": {
        "type": "Point",
        "coordinates": [
          -73.9477735,
          40.6344998333333
        ]
      }
    },
    {
      "type": "Feature",
      "properties": {
        "Violation ID": "360443314",
        "Vehicle ID": "7b5b5e516b899a43bcb7123fc2c252b4b8fdcd9567d035232f216de2eb45939e",
        "First Occurrence": "03/30/2023 10:57:32 AM",
        "Las

In [5]:
!pip install geopandas


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [6]:

import geopandas as gpd
violations_gdf = gpd.read_file(
    "/Users/danielluna/Desktop/datathon_project/data/processed/violations.geojson"
).set_crs(epsg=4326)


#Bus Segment Speeds
'''
Purpose
This code processes MTA bus route segment speeds and converts them into a GeoJSON file 
of line segments along each bus route, including aggregated speed and trip information. 
This is useful for mapping bus route performance in GIS tools like Kepler.gl or Mapbox.
'''

# Path to data folder relative to the notebook
DATA_DIR = os.path.join("..", "data", "raw")

import pandas as pd
import json
from math import radians, sin, cos, sqrt, atan2

# --- 1. Load GTFS shapes and trips ---
shapes = pd.read_csv("/Users/danielluna/Desktop/shapes.txt")
shapes = shapes.sort_values(["shape_id", "shape_pt_sequence"])

trips = pd.read_csv("/Users/danielluna/Desktop/trips.txt")

# --- 2. Load speeds ---
DATA_DIR = os.path.join("..", "data", "raw")
speeds = pd.read_csv(os.path.join(DATA_DIR, "MTA_Bus_Speed_MAR.csv"),parse_dates=["Timestamp"])
speeds.head()


# --- 2b. Aggregate speeds by Next Timepoint Stop Name and Direction ---
agg_speeds = (
    speeds.groupby(["Route ID","Next Timepoint Stop Name", "Direction"], as_index=False)
    .agg(
        avg_speed=("Average Road Speed", "mean"),
        total_trips=("Bus Trip Count", "sum"),
        start_lat=("Timepoint Stop Latitude", "first"),
        start_lon=("Timepoint Stop Longitude", "first"),
        end_lat=("Next Timepoint Stop Latitude", "first"),
        end_lon=("Next Timepoint Stop Longitude", "first"),
        route_id=("Route ID", "first")
    )
)

# --- 2c. Map directions and add segment numbers ---
dir_map = {'W': 0, 'E': 1, 'N': 0, 'S': 1}  # adjust if needed
agg_speeds['direction_id'] = agg_speeds['Direction'].map(dir_map)

agg_speeds = agg_speeds.sort_values(["route_id", "direction_id"])
agg_speeds["segment_number"] = (
    agg_speeds.groupby(["route_id", "direction_id"]).cumcount() + 1
)

# --- 3. Helper functions ---
def haversine(lat1, lon1, lat2, lon2):
    R = 6371e3
    phi1, phi2 = radians(lat1), radians(lat2)
    dphi, dlambda = radians(lat2 - lat1), radians(lon2 - lon1)
    a = sin(dphi/2)**2 + cos(phi1) * cos(phi2) * sin(dlambda/2)**2
    return 2 * R * atan2(sqrt(a), sqrt(1 - a))

def closest_index(lat, lon, coords):
    return min(range(len(coords)), key=lambda i: haversine(lat, lon, coords[i][1], coords[i][0]))

# --- 4. Map route+direction to BEST shape_id (longest shape) ---
# Get all shape_ids per route/direction
direction_map = trips.groupby(['route_id', 'direction_id'])['shape_id'].unique().to_dict()

best_shape_map = {}
for key, shape_ids in direction_map.items():
    # pick the shape_id with the most points
    shape_counts = shapes[shapes["shape_id"].isin(shape_ids)].groupby("shape_id").size()
    best_shape_id = shape_counts.idxmax()
    best_shape_map[key] = best_shape_id

# --- 5. Build GeoJSON ---
features = []

for _, row in agg_speeds.iterrows():
    route_dir_key = (row["route_id"], row["direction_id"])
    if route_dir_key not in best_shape_map:
        continue

    shape_id = best_shape_map[route_dir_key]
    route_shape = shapes[shapes["shape_id"] == shape_id]
    shape_coords = list(zip(route_shape["shape_pt_lon"], route_shape["shape_pt_lat"]))

    # Find indices along shape for stop and next stop
    i1 = closest_index(row["start_lat"], row["start_lon"], shape_coords)
    i2 = closest_index(row["end_lat"], row["end_lon"], shape_coords)
    if i1 > i2:
        i1, i2 = i2, i1

    # Fallback if points are the same
    if i1 == i2:
        segment_coords = [(row["start_lon"], row["start_lat"]), (row["end_lon"], row["end_lat"])]
    else:
        segment_coords = shape_coords[i1:i2+1]

    features.append({
        "type": "Feature",
        "geometry": {"type": "LineString", "coordinates": segment_coords},
        "properties": {
            "route_id": row["route_id"],
            "direction": row["Direction"],
            "segment_number": int(row["segment_number"]),
            "speed": row["avg_speed"],
            "trips": row["total_trips"]
        }
    })

geojson = {"type": "FeatureCollection", "features": features}

with open("/Users/danielluna/Desktop/bx19_speeds_shapes_directions_agg_3.geojson", "w") as f:
    json.dump(geojson, f)

print(f"GeoJSON created with {len(features)} features")

# Ensure CRS is set (assuming WGS84, EPSG:4326)
import geopandas as gpd

segments_gdf = gpd.read_file(
    "/Users/danielluna/Desktop/bx19_speeds_shapes_directions_agg_3.geojson"
).set_crs(epsg=4326)

#print(geojson)


'''
Combines the violations dataset with segment speed dataset. 
This enables use to see which WHERE and WHICH DIRECTION the bus was travelling 
when the violation occured.
'''

# Ensure route_id columns align
violations_gdf = violations_gdf.rename(columns={"Bus Route ID": "route_id"})

# Only join violations to segments from the same route
joined = gpd.sjoin_nearest(
    violations_gdf,
    segments_gdf,
    how="left",
    max_distance=50,   # tolerance in meters, adjust if needed
    distance_col="dist_meters"
)

# Filter so that routes actually match
joined = joined[joined["route_id_left"] == joined["route_id_right"]]

print(joined[[
    "Violation ID", "route_id_left", "Stop Name", 
    "segment_number", "direction", "speed", "dist_meters"
]].head())


# Extract longitude and latitude from Point geometry
joined["longitude"] = joined.geometry.x
joined["latitude"] = joined.geometry.y

print(joined)

# Optional: export to CSV
output_fp = '/Users/danielluna/Desktop/violation_speed_segment.csv'
joined.to_csv(output_fp, index=False)
print(f"Saved merged CSV to {output_fp}")

  speeds = pd.read_csv(os.path.join(DATA_DIR, "MTA_Bus_Speed_MAR.csv"),parse_dates=["Timestamp"])


GeoJSON created with 51 features
  Violation ID route_id_left               Stop Name  segment_number  \
0    360443322          B44+  NOSTRAND AV/NEWKIRK AV               2   
1    360443314          B44+        NOSTRAND AV/AV H               4   
1    360443314          B44+        NOSTRAND AV/AV H               1   
2    360003985          B44+        NOSTRAND AV/AV H               4   
2    360003985          B44+        NOSTRAND AV/AV H               1   

  direction     speed  dist_meters  
0         S  8.464248     0.000047  
1         N  7.886065     0.000006  
1         N  8.701053     0.000006  
2         N  7.886065     0.000012  
2         N  8.701053     0.000012  
    Violation ID                                         Vehicle ID  \
0      360443322  cd8aedcc2c966c2c3128818deb321c84747128d77c9f3f...   
1      360443314  7b5b5e516b899a43bcb7123fc2c252b4b8fdcd9567d035...   
1      360443314  7b5b5e516b899a43bcb7123fc2c252b4b8fdcd9567d035...   
2      360003985  04abd244db


