# Multimodal Best-Mode Demo (Taxi vs Citi Bike vs Subway)

Quick demo: given an origin/destination in NYC, compare Taxi vs Citi Bike vs Subway using simple heuristics (wait estimates from counts, travel-time approximations).

In [20]:
import pandas as pd
import numpy as np
from pathlib import Path
import sys
import math
import plotly.graph_objects as go
from pathlib import Path


In [21]:
def haversine_km(lat1, lon1, lat2, lon2):
    R = 6371.0
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi/2)**2 + math.cos(phi1)*math.cos(phi2)*math.sin(dlambda/2)**2
    c = 2*math.atan2(math.sqrt(a), math.sqrt(1-a))
    return R*c


In [22]:
root = Path.cwd().resolve()
for candidate in [root, *root.parents]:
    if (candidate/'src').exists():
        sys.path.append(str(candidate/'src'))
        break
from modeling.poisson_zone import load_taxi_pickups


## Load data (trim for speed)
- Taxi: Jan 2024 parquet, columns needed for zone + pickup time.
- Citi Bike: Jan 2024 CSVs (sample first shard) to compute start counts per station/hour.
- Subway: entrance locations only; assume fixed 5-minute headway.

In [25]:
# TAXI_PATH = Path('data/raw/yellow_tripdata_2024-01.parquet')
# CITIBIKE_FILES = sorted(Path('data/raw/citibike').glob('202401-citibike-tripdata_*\.csv'))
TAXI_PATH = Path('/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/yellow_tripdata_2024-01.parquet')
LOOKUP_CSV = Path('/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/taxi_zone_lookup.csv')

CITIBIKE_FILES = sorted(
    Path("/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/citibike")
    .glob("202401-citibike-tripdata_*.csv")
)
# CITIBIKE_FILES = sorted(Path('/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/citibike').glob('202401-citibike-tripdata_*\.csv'))

SUBWAY_TURNSTILE = Path('/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/subway/turnstile_data.csv')
MAX_TAXI = 400_000  # adjust for speed
MAX_BIKE = 500_000


In [27]:
# Taxi
taxi = load_taxi_pickups(TAXI_PATH, max_rows=MAX_TAXI)
taxi['hour'] = taxi['event_time'].dt.hour
taxi_rates = taxi.groupby(['PULocationID','hour']).size().rename('rides').reset_index()
taxi_rates['lambda_per_min'] = taxi_rates['rides'] / 60.0
zone_centroids = pd.read_csv('/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/taxi_zone_centroids.csv')[['LocationID','lon','lat']].rename(columns={'LocationID':'PULocationID'})


In [33]:
taxi

Unnamed: 0,event_time,passenger_count,trip_distance,fare_amount,PULocationID,hour
0,2024-01-01 00:57:55+00:00,1.0,1.72,17.7,186,0
1,2024-01-01 00:03:00+00:00,1.0,1.80,10.0,140,0
2,2024-01-01 00:17:06+00:00,1.0,4.70,23.3,236,0
3,2024-01-01 00:36:38+00:00,1.0,1.40,10.0,79,0
4,2024-01-01 00:46:51+00:00,1.0,0.80,7.9,211,0
...,...,...,...,...,...,...
399995,2024-01-05 19:26:04+00:00,1.0,1.80,14.2,237,19
399996,2024-01-05 19:43:09+00:00,1.0,1.40,10.0,230,19
399997,2024-01-05 19:41:30+00:00,1.0,2.18,14.2,166,19
399998,2024-01-05 19:16:53+00:00,1.0,1.00,8.6,107,19


In [28]:
# Citi Bike (sample first shard)
bike_frames = []
for i, f in enumerate(CITIBIKE_FILES):
    bike_frames.append(pd.read_csv(f, nrows=MAX_BIKE if i==0 else 0))
bike = pd.concat(bike_frames, ignore_index=True)
bike['started_at'] = pd.to_datetime(bike['started_at'])
bike['hour'] = bike['started_at'].dt.hour
bike = bike.dropna(subset=['start_station_id','start_lat','start_lng'])
bike['start_station_id'] = bike['start_station_id'].astype(str)
bike_rates = bike.groupby(['start_station_id','hour']).size().rename('rides').reset_index()
bike_rates['lambda_per_min'] = bike_rates['rides'] / 60.0
stations = bike[['start_station_id','start_lat','start_lng']].drop_duplicates().rename(columns={'start_lat':'lat','start_lng':'lon'})


  bike_frames.append(pd.read_csv(f, nrows=MAX_BIKE if i==0 else 0))
  bike = pd.concat(bike_frames, ignore_index=True)


In [None]:
bike

In [29]:
# Subway entrances (location only)
subway = pd.read_csv(SUBWAY_TURNSTILE)
subway = subway.rename(columns={'Entrance Latitude':'lat','Entrance Longitude':'lon','Stop Name':'station'})
subway = subway[['station','lat','lon']].dropna().drop_duplicates()
SUBWAY_HEADWAY_MIN = 5.0  # assumed


## Scoring heuristics
- Access distance: origin → nearest station/zone (km) → walking time at 5 km/h.
- Travel distance: origin → dest straight-line (km).
- Speeds: taxi 15 km/h, bike 12 km/h, subway 25 km/h.
- Wait: taxi from λ_zone, bike from station λ, subway fixed headway.
- Total time = wait + walk access + in-vehicle time. Rank by total time.

In [30]:
WALK_SPEED_KMH = 5.0
TAXI_SPEED_KMH = 15.0
BIKE_SPEED_KMH = 12.0
SUBWAY_SPEED_KMH = 25.0

def nearest_point(df, lat, lon, id_col):
    dists = df.apply(lambda r: haversine_km(lat, lon, r['lat'], r['lon']), axis=1)
    idx = dists.idxmin()
    return df.loc[idx, id_col], df.loc[idx, 'lat'], df.loc[idx, 'lon'], dists.min()

def get_lambda(rate_df, key_col, key_val, hour_col='hour', hour=12):
    subset = rate_df[(rate_df[key_col]==key_val) & (rate_df[hour_col]==hour)]
    if subset.empty:
        return np.nan
    return subset['lambda_per_min'].iloc[0]


In [31]:
def recommend(origin, dest, hour=12):
    o_lat, o_lon = origin
    d_lat, d_lon = dest
    direct_km = haversine_km(o_lat, o_lon, d_lat, d_lon)
    # Taxi
    tz_id, tz_lat, tz_lon, tz_walk_km = nearest_point(zone_centroids, o_lat, o_lon, 'PULocationID')
    lam_taxi = get_lambda(taxi_rates, 'PULocationID', tz_id, hour=hour)
    wait_taxi = np.nan if not np.isfinite(lam_taxi) or lam_taxi<=0 else 1/lam_taxi
    walk_taxi_min = (tz_walk_km / WALK_SPEED_KMH) * 60
    travel_taxi_min = (direct_km / TAXI_SPEED_KMH) * 60
    total_taxi = wait_taxi + walk_taxi_min + travel_taxi_min if np.isfinite(wait_taxi) else np.nan
    # Bike
    st_id, st_lat, st_lon, st_walk_km = nearest_point(stations, o_lat, o_lon, 'start_station_id')
    lam_bike = get_lambda(bike_rates, 'start_station_id', st_id, hour=hour)
    wait_bike = np.nan if not np.isfinite(lam_bike) or lam_bike<=0 else 1/lam_bike
    walk_bike_min = (st_walk_km / WALK_SPEED_KMH) * 60
    travel_bike_min = (direct_km / BIKE_SPEED_KMH) * 60
    total_bike = wait_bike + walk_bike_min + travel_bike_min if np.isfinite(wait_bike) else np.nan
    # Subway (coarse)
    sb_name, sb_lat, sb_lon, sb_walk_km = nearest_point(subway, o_lat, o_lon, 'station')
    wait_subway = SUBWAY_HEADWAY_MIN / 2
    walk_subway_min = (sb_walk_km / WALK_SPEED_KMH) * 60
    travel_subway_min = (direct_km / SUBWAY_SPEED_KMH) * 60
    total_subway = wait_subway + walk_subway_min + travel_subway_min
    rows = [
        {'mode':'Taxi', 'wait_min':wait_taxi, 'walk_min':walk_taxi_min, 'travel_min':travel_taxi_min, 'total_min':total_taxi},
        {'mode':'Citi Bike', 'wait_min':wait_bike, 'walk_min':walk_bike_min, 'travel_min':travel_bike_min, 'total_min':total_bike},
        {'mode':'Subway', 'wait_min':wait_subway, 'walk_min':walk_subway_min, 'travel_min':travel_subway_min, 'total_min':total_subway},
    ]
    return pd.DataFrame(rows).sort_values('total_min')


## Demo
Edit the origin/destination below (lat, lon) and set `hour` (0-23).

In [32]:
origin = (40.7580, -73.9855)  # Times Square
dest = (40.8075, -73.9626)    # Columbia University
hour = 9  # morning
recommend(origin, dest, hour=hour)


Unnamed: 0,mode,wait_min,walk_min,travel_min,total_min
2,Subway,2.5,1.760156,13.996994,18.25715
0,Taxi,0.102041,2.759964,23.328324,26.190328
1,Citi Bike,1.666667,2.974789,29.160405,33.801861
