
# Taxi Cohort + Tail Modeling Dashboard

Interactive notebook that merges the cohort explorer and the Poisson vs. Neg-Bin tail demo so you can:
- Load a slice of NYC Yellow Taxi data with zone metadata.
- Filter by weekday/weekend and rush/off-peak cohorts.
- Inspect arrival histograms and dispersion per zone/bucket.
- Compare empirical tails vs. Poisson/NB tail approximations with classic bounds.



## Setup
Run the cell below if any of the notebook's Python dependencies (plotting, widgets, data handling) are missing in your environment, then restart the kernel before continuing.


In [12]:
%pip install --quiet plotly pandas numpy ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [13]:
import os
import sys
from pathlib import Path

import ipywidgets as widgets
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from IPython.display import display
from scipy.stats import poisson as sp_poisson, nbinom as sp_nbinom


In [14]:
# Add src/ to path for project helpers.
# Walk from the current working directory up through its parents and use the
# first directory that contains a `src` entry.
root = Path.cwd().resolve()
for candidate in [root, *root.parents]:
    src_dir = candidate / 'src'
    if src_dir.exists():
        # Guard the append so re-running this cell does not stack duplicate
        # entries on sys.path.
        if str(src_dir) not in sys.path:
            sys.path.append(str(src_dir))
        break
from modeling.poisson_zone import load_taxi_pickups, attach_zone_metadata, bucket_counts_by_group


## Configuration

In [15]:
# Directory holding the raw inputs. Set the NYC_TRANSIT_DATA_DIR environment
# variable to point at your own copy; the fallback is the original author's
# local checkout, so existing setups keep working unchanged.
DATA_DIR = Path(os.environ.get(
    'NYC_TRANSIT_DATA_DIR',
    '/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw',
))
TAXI_PATH = DATA_DIR / 'yellow_tripdata_2024-01.parquet'
LOOKUP_CSV = DATA_DIR / 'taxi_zone_lookup.csv'
MAX_ROWS = 4_000_000   # forwarded to load_taxi_pickups(max_rows=...)
BUCKET_BASE = '15min'  # bucket width used for zone screening and the cohort counts
MIN_MEAN = 1.0         # minimum mean arrivals per bucket for a zone/selection to be shown
MIN_NONZERO = 0.3      # minimum fraction of buckets with at least one arrival


In [16]:
RUSH_RANGES = [(7, 10), (16, 19)]  # half-open hour windows: start included, end excluded

def is_rush(hour, ranges=RUSH_RANGES):
    """Return True when `hour` lies inside at least one ``[start, end)`` window.

    Parameters
    ----------
    hour : scalar hour value compared against each window.
    ranges : iterable of ``(start, end)`` pairs; defaults to ``RUSH_RANGES``.
    """
    for start, end in ranges:
        if start <= hour < end:
            return True
    return False

def cohort_label(is_weekend: pd.Series, is_rush: pd.Series) -> pd.Series:
    """Combine two boolean flags into a ``'<weekday|weekend>_<rush|offpeak>'`` label.

    Parameters
    ----------
    is_weekend : boolean Series; True selects ``'weekend'``, False ``'weekday'``.
    is_rush : boolean Series of the same length; True selects ``'rush'``,
        False ``'offpeak'``. Inside this function the name shadows the
        module-level ``is_rush`` helper.

    Returns
    -------
    pd.Series of label strings carrying ``is_weekend``'s index. The two inputs
    are paired by position, not by index alignment.
    """
    weekend = np.where(is_weekend, 'weekend', 'weekday')
    rush = np.where(is_rush, 'rush', 'offpeak')
    # np.char.add concatenates element-wise on both NumPy 1.x and 2.x; the `+`
    # operator on fixed-width string arrays has no ufunc loop before NumPy 2.0
    # and raises there.
    labels = np.char.add(np.char.add(weekend, '_'), rush)
    return pd.Series(labels, index=is_weekend.index)


## Load taxi sample & derive cohorts

In [17]:
# Fail fast, naming the missing file, before any loading starts.
if not TAXI_PATH.exists():
    raise FileNotFoundError(TAXI_PATH)
if not LOOKUP_CSV.exists():
    raise FileNotFoundError(LOOKUP_CSV)

trips = load_taxi_pickups(TAXI_PATH, max_rows=MAX_ROWS)
# Rows without a zone name after the lookup join are dropped.
trips = attach_zone_metadata(trips, LOOKUP_CSV).dropna(subset=['Zone'])
# NOTE(review): tz_convert(None) converts to UTC and then drops the timezone.
# If load_taxi_pickups returns timestamps in a non-UTC zone, the hours below
# are UTC hours and tz_localize(None) would be needed to keep local
# wall-clock time — confirm against the helper.
trips['event_time'] = trips['event_time'].dt.tz_convert(None)
trips['hour'] = trips['event_time'].dt.hour
trips['is_weekend'] = trips['event_time'].dt.dayofweek >= 5  # dayofweek: Monday=0 ... Sunday=6
# Evaluate is_rush once per hour of the day and test membership vectorised,
# instead of calling the Python function once per trip.
rush_hours = [h for h in range(24) if is_rush(h)]
trips['is_rush'] = trips['hour'].isin(rush_hours)
trips['cohort'] = cohort_label(trips['is_weekend'], trips['is_rush'])
trips['bucket_start'] = trips['event_time'].dt.floor(BUCKET_BASE)

print(f"Loaded {len(trips):,} trips across {trips['Zone'].nunique()} zones")


Loaded 2,954,264 trips across 258 zones


In [18]:
# Screen for active zones and prep cohort-level counts
base_counts = bucket_counts_by_group(trips, freq=BUCKET_BASE, group_cols='Zone')
means = base_counts.mean()
nonzero = (base_counts > 0).mean()
active_zones = [z for z in base_counts.columns if means[z] >= MIN_MEAN and nonzero[z] >= MIN_NONZERO]
active_zones = sorted(active_zones) or sorted(trips['Zone'].unique())

cohort_counts = (
    trips
    .groupby(['Zone', 'cohort', 'bucket_start'])
    .size()
    .rename('arrivals')
    .reset_index()
)

print(f"Active zones: {len(active_zones)} | Cohorts: {sorted(trips['cohort'].unique())}")


Active zones: 58 | Cohorts: ['weekday_offpeak', 'weekday_rush', 'weekend_offpeak', 'weekend_rush']


## Modeling helpers

In [19]:
def fit_nb(series: pd.Series):
    """Fit a negative binomial by matching the sample mean and variance.

    Uses the population variance (``ddof=0``). Returns ``(r, p)`` with
    ``r = mean**2 / (var - mean)`` and ``p = r / (r + mean)``. When the mean is
    not positive or the variance does not exceed the mean, returns
    ``(nan, nan)``.
    """
    mu = series.mean()
    sigma2 = series.var(ddof=0)
    if mu <= 0 or sigma2 <= mu:
        return np.nan, np.nan
    excess = sigma2 - mu
    size = mu ** 2 / excess
    prob = size / (size + mu)
    return size, prob


## Tail fit explorer

In [None]:
def build_tail_app():
    """Build the tail-fit explorer widget.

    Offers Zone / Cohort / Bucket dropdowns. On every change it re-buckets the
    selected trips, overlays Poisson and (when the method-of-moments fit is
    valid) negative-binomial expected frequencies on the observed histogram of
    per-bucket counts, and shows a one-row summary table.

    Reads the module-level ``trips``, ``active_zones``, ``BUCKET_BASE``,
    ``MIN_MEAN`` and ``MIN_NONZERO``.

    Returns
    -------
    widgets.VBox containing the controls and the output area.
    """
    bucket_opts = ['5min', '15min', '30min', '1H']
    zone_dd = widgets.Dropdown(options=active_zones, description='Zone')
    cohort_opts = ['All trips'] + sorted(trips['cohort'].unique())
    cohort_dd = widgets.Dropdown(options=cohort_opts, description='Cohort')
    # Start on the width the zones were screened with (the histogram explorer
    # also defaults to BUCKET_BASE); fall back to '15min' if that width is not
    # one of the offered options.
    default_bucket = BUCKET_BASE if BUCKET_BASE in bucket_opts else '15min'
    freq_dd = widgets.Dropdown(options=bucket_opts, value=default_bucket, description='Bucket')
    out = widgets.Output()

    def refresh(*_):
        """Redraw the figure and summary for the current dropdown values."""
        out.clear_output()
        with out:
            cohort = cohort_dd.value
            subset = trips if cohort == 'All trips' else trips[trips['cohort'] == cohort]
            if subset.empty:
                print('No trips for this cohort.')
                return
            # NOTE(review): how bucket_counts_by_group treats buckets outside
            # the cohort's hours (omitted vs. zero-filled) is not visible
            # here; zero-filling would inflate the dispersion — confirm.
            counts = bucket_counts_by_group(subset, freq=freq_dd.value, group_cols='Zone')
            if zone_dd.value not in counts.columns:
                print('Zone missing for this selection.')
                return
            series = counts[zone_dd.value]
            nonzero_frac = (series > 0).mean()
            mean = series.mean()
            # Apply the same sparsity thresholds used for zone screening to
            # the current selection.
            if mean < MIN_MEAN or nonzero_frac < MIN_NONZERO:
                print(f'Selection too sparse (mean={mean:.2f}, nonzero={nonzero_frac:.2f}).')
                return
            var = series.var(ddof=0)
            disp = var / mean if mean > 0 else np.nan  # variance-to-mean ratio
            nb_r, nb_p = fit_nb(series)
            # Integer support from 0 up to the larger of the observed maximum
            # and the 99th percentile plus 5.
            grid = np.arange(0, max(series.max(), int(series.quantile(0.99)) + 5) + 1)
            obs = series.value_counts().reindex(grid, fill_value=0).values
            fig = go.Figure()
            fig.add_bar(x=grid, y=obs, name='Observed', marker=dict(color='#4B6BFB'), opacity=0.75)
            # Scale each PMF by the number of buckets so the curves are
            # expected frequencies, comparable to the observed bars.
            pois_exp = sp_poisson.pmf(grid, mean) * len(series)
            fig.add_scatter(x=grid, y=pois_exp, mode='lines', name='Poisson', line=dict(color='#FFA500', width=2))
            # fit_nb returns NaNs when the data are not overdispersed; draw
            # the NB curve only for a valid fit.
            if np.isfinite(nb_r) and np.isfinite(nb_p) and nb_r > 0 and 0 < nb_p < 1:
                nb_exp = sp_nbinom.pmf(grid, nb_r, nb_p) * len(series)
                fig.add_scatter(x=grid, y=nb_exp, mode='lines', name='Neg-Bin', line=dict(color='#D62728', width=3))
            fig.update_layout(title=f"{zone_dd.value} ({freq_dd.value}) — {cohort_dd.value}", xaxis_title='Arrivals per bucket', yaxis_title='Frequency', template='plotly_white')
            fig.show()
            summary = {
                'zone': zone_dd.value,
                'cohort': cohort_dd.value,
                'freq': freq_dd.value,
                'mean': mean,
                'variance': var,
                'dispersion': disp,
            }
            display(pd.DataFrame([summary]))

    zone_dd.observe(refresh, names='value')
    cohort_dd.observe(refresh, names='value')
    freq_dd.observe(refresh, names='value')

    # Render once so the output area is populated before any interaction.
    refresh()
    return widgets.VBox([widgets.HBox([zone_dd, cohort_dd]), freq_dd, out])

tail_app = build_tail_app()
tail_app


VBox(children=(HBox(children=(Dropdown(description='Zone', options=('Alphabet City', 'Battery Park City', 'Blo…

## Cohort histogram explorer

In [21]:
def build_hist_app():
    """Build the cohort histogram explorer widget.

    Offers Zone / Cohort / Bucket dropdowns. On every change it counts the
    selected zone's pickups per bucket for the selected cohort, draws a
    histogram of those counts and shows summary statistics.

    Reads the module-level ``trips`` and ``BUCKET_BASE``.

    Returns
    -------
    widgets.VBox containing the controls and the output area.
    """
    zones = sorted(trips['Zone'].unique())
    cohorts = sorted(trips['cohort'].unique())
    zone_dd = widgets.Dropdown(options=zones, description='Zone')
    cohort_dd = widgets.Dropdown(options=cohorts, description='Cohort')
    bucket_dd = widgets.Dropdown(options=['5min', '15min', '30min', '1H'], value=BUCKET_BASE, description='Bucket')
    out = widgets.Output()

    def refresh(*_):
        """Redraw the histogram and statistics for the current dropdown values."""
        out.clear_output()
        with out:
            selected = (trips['Zone'] == zone_dd.value) & (trips['cohort'] == cohort_dd.value)
            pickup_times = trips.loc[selected, 'event_time']
            if pickup_times.empty:
                print('No data for this zone/cohort.')
                return
            bucket = bucket_dd.value
            # Bucket the raw pickup times rather than re-flooring the
            # pre-aggregated cohort_counts: those are already summed at
            # BUCKET_BASE, so a finer width such as '5min' would be a no-op
            # and show BUCKET_BASE-wide counts under a '5min' label.
            bucket_starts = pickup_times.dt.floor(bucket)
            agg = (
                bucket_starts
                .groupby(bucket_starts)
                .size()
                .rename('arrivals')
                .rename_axis('bucket_start')
                .reset_index()
            )
            # Only buckets with at least one pickup appear in `agg`, so the
            # statistics below are conditional on non-empty buckets.
            stats = agg['arrivals'].describe(percentiles=[0.5, 0.9]).to_frame().T
            fig = px.histogram(agg, x='arrivals', nbins=40, title=f"{zone_dd.value} / {cohort_dd.value} ({bucket})")
            fig.update_layout(xaxis_title='Arrivals per bucket', yaxis_title='Frequency')
            fig.show()
            display(stats[['mean', 'std', 'min', '50%', '90%', 'max']])

    zone_dd.observe(refresh, names='value')
    cohort_dd.observe(refresh, names='value')
    bucket_dd.observe(refresh, names='value')

    # Render once so the output area is populated before any interaction.
    refresh()
    return widgets.VBox([widgets.HBox([zone_dd, cohort_dd, bucket_dd]), out])

hist_app = build_hist_app()
hist_app


VBox(children=(HBox(children=(Dropdown(description='Zone', options=('Allerton/Pelham Gardens', 'Alphabet City'…

## Combined dashboard

In [22]:
# Put both explorers into one tabbed container, one titled tab per app.
tab_titles = ('Tail fits', 'Cohort histograms')
tabs = widgets.Tab(children=[tail_app, hist_app])
for position, title in enumerate(tab_titles):
    tabs.set_title(position, title)
tabs


Tab(children=(VBox(children=(HBox(children=(Dropdown(description='Zone', options=('Alphabet City', 'Battery Pa…