## Adding Geospatial Data

In [3]:
import pandas as pd

# Load the raw export and preview its first five rows.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# previously saved index - confirm whether it should be kept.
df = pd.read_csv(filepath_or_buffer="data_original.csv")
df.head()

Unnamed: 0,Toll Date,Toll Hour,Toll 10 Minute Block,Minute of Hour,Hour of Day,Day of Week Int,Day of Week,Toll Week,Time Period,Vehicle Class,Detection Group,Detection Region,CRZ Entries,Excluded Roadway Entries
0,03/29/2025,03/29/2025 11:00:00 PM,03/29/2025 11:50:00 PM,50,23,7,Saturday,03/23/2025,Overnight,"1 - Cars, Pickups and Vans",Brooklyn Bridge,Brooklyn,103,99
1,03/29/2025,03/29/2025 11:00:00 PM,03/29/2025 11:50:00 PM,50,23,7,Saturday,03/23/2025,Overnight,TLC Taxi/FHV,West Side Highway at 60th St,West Side Highway,97,2
2,03/29/2025,03/29/2025 11:00:00 PM,03/29/2025 11:50:00 PM,50,23,7,Saturday,03/23/2025,Overnight,TLC Taxi/FHV,West 60th St,West 60th St,197,0
3,03/29/2025,03/29/2025 11:00:00 PM,03/29/2025 11:50:00 PM,50,23,7,Saturday,03/23/2025,Overnight,TLC Taxi/FHV,Queensboro Bridge,Queens,77,0
4,03/29/2025,03/29/2025 11:00:00 PM,03/29/2025 11:50:00 PM,50,23,7,Saturday,03/23/2025,Overnight,TLC Taxi/FHV,Queens Midtown Tunnel,Queens,137,0


In [4]:
# Get latitude and longitude using geopy
from geopy.geocoders import Nominatim
import pandas as pd

# Nominatim client used by get_coordinates. The timeout is set explicitly
# because geopy's default is short (1 second per the geopy docs), and a
# timed-out lookup would otherwise be indistinguishable from "not found".
geolocator = Nominatim(user_agent="mta-map-app", timeout=10)

def get_coordinates(location_name):
    """Geocode a place name, scoped to New York City.

    The query sent to the module-level ``geolocator`` is ``location_name``
    with ", New York City" appended.

    Args:
        location_name: Place name to look up (here, a detection group name).

    Returns:
        A ``(latitude, longitude)`` tuple for the match, or ``(None, None)``
        when the geocoder finds nothing or the lookup fails.
    """
    try:
        location = geolocator.geocode(location_name + ", New York City")
        if location:
            return location.latitude, location.longitude
    except Exception as exc:
        # Best-effort lookup: a failure is reported and treated as "no
        # coordinates". Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit stop a long-running geocoding loop.
        print(f"Geocoding failed for {location_name!r}: {exc!r}")
        return None, None
    return None, None

import time

# Nominatim's usage policy caps clients at one request per second, so the
# lookups are spaced out instead of being fired back-to-back.
NOMINATIM_MIN_DELAY_SECONDS = 1.0

# Geocode each distinct, non-null detection group exactly once.
unique_groups = df["Detection Group"].dropna().unique()
location_map = {}
for position, group in enumerate(unique_groups):
    if position:  # no wait needed before the very first request
        time.sleep(NOMINATIM_MIN_DELAY_SECONDS)
    location_map[group] = get_coordinates(group)

In [5]:
# Attach the geocoded coordinates to every row via its detection group.
# location_map values are (latitude, longitude) tuples; groups absent from the
# map fall back to (None, None).
for pos, column in enumerate(("Latitude", "Longitude")):
    df[column] = df["Detection Group"].map(
        lambda name, pos=pos: location_map.get(name, (None, None))[pos]
    )

In [6]:
# List the detection groups that still lack a latitude or a longitude.
null_detection_groups = (
    df.loc[df[["Latitude", "Longitude"]].isna().any(axis=1), "Detection Group"]
    .dropna()
    .unique()
)
print(null_detection_groups)

['West Side Highway at 60th St' 'FDR Drive at 60th St']


In [7]:
# Two detection groups came back without coordinates, so they are supplied by
# hand here as (latitude, longitude) pairs.
manual_coords = {
    "West Side Highway at 60th St": (40.773557, -73.992791),
    "FDR Drive at 60th St": (40.759008, -73.958702),
}

# Write the hand-entered coordinates onto every row of each listed group.
for group in manual_coords:
    lat, lon = manual_coords[group]
    mask = df["Detection Group"].eq(group)
    df.loc[mask, "Latitude"] = lat
    df.loc[mask, "Longitude"] = lon

In [8]:
# Verify no more missing geospatial data
null_detection_groups = df[df["Latitude"].isna() | df["Longitude"].isna()]["Detection Group"].dropna().unique()
print(null_detection_groups)

# Geocoding depends on a live service, so a later run may miss different
# groups than the ones patched by hand. Stop here rather than let a later
# cell run on incomplete coordinates.
assert len(null_detection_groups) == 0, (
    f"Detection groups still missing coordinates: {list(null_detection_groups)}"
)

[]


In [9]:
# Persist the frame, now carrying Latitude/Longitude, without the row index.
df.to_csv(path_or_buf="data_augmented.csv", index=False)