# Dataset Overview

Quick reference notebook to summarize the raw taxi + Citi Bike data. Run this before writing the report to capture fleet sizes, ride counts, average durations/distances, and station/zone coverage.

In [1]:
import json
from pathlib import Path
import sys

import numpy as np
import pandas as pd

In [2]:
# Add src/ to path for shared loaders
root = Path.cwd().resolve()
for candidate in [root, *root.parents]:
    src_dir = candidate / 'src'
    if src_dir.exists():
        sys.path.append(str(src_dir))
        break

from modeling.travel_diagnostics import load_taxi_trips, load_bike_trips


In [3]:
DATA_ROOT = Path('/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw')
TAXI_PATHS = sorted(DATA_ROOT.glob('yellow_tripdata_2024-*.parquet'))
BIKE_ROOT = DATA_ROOT / 'citibike'
BIKE_GLOB = '20240*-citibike-tripdata_*.csv'
print(f'Taxi files: {len(TAXI_PATHS)}')
print(f'Bike glob: {BIKE_GLOB}')


Taxi files: 6
Bike glob: 20240*-citibike-tripdata_*.csv


## Taxi sample statistics

In [4]:
path = Path("/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/yellow_tripdata_2024-01.parquet")
df = pd.read_parquet(path)
print(df.head())
print(df.columns)

   VendorID tpep_pickup_datetime tpep_dropoff_datetime  passenger_count  \
0         2  2024-01-01 00:57:55   2024-01-01 01:17:43              1.0   
1         1  2024-01-01 00:03:00   2024-01-01 00:09:36              1.0   
2         1  2024-01-01 00:17:06   2024-01-01 00:35:01              1.0   
3         1  2024-01-01 00:36:38   2024-01-01 00:44:56              1.0   
4         1  2024-01-01 00:46:51   2024-01-01 00:52:57              1.0   

   trip_distance  RatecodeID store_and_fwd_flag  PULocationID  DOLocationID  \
0           1.72         1.0                  N           186            79   
1           1.80         1.0                  N           140           236   
2           4.70         1.0                  N           236            79   
3           1.40         1.0                  N            79           211   
4           0.80         1.0                  N           211           148   

   payment_type  fare_amount  extra  mta_tax  tip_amount  tolls_amount  \


In [5]:
taxi_frames = []
for path in TAXI_PATHS:
    print(f'Loading {path.name} ...')
    taxi_frames.append(load_taxi_trips(path, max_rows=None))

taxi = pd.concat(taxi_frames, ignore_index=True)
print(f"Taxi trips after filters: {len(taxi):,}")
if 'PULocationID' in taxi.columns:
    print('Unique pickup zones:', taxi['PULocationID'].nunique())
else:
    print('Unique pickup zones: unavailable (column missing)')
print('Unique hours observed:', taxi['hour'].nunique())

taxi_summary = taxi[['travel_min', 'distance_km', 'is_rush', 'is_weekend']].describe(include='all')
display(taxi_summary)

Loading yellow_tripdata_2024-01.parquet ...
Loading yellow_tripdata_2024-02.parquet ...
Loading yellow_tripdata_2024-03.parquet ...
Loading yellow_tripdata_2024-04.parquet ...
Loading yellow_tripdata_2024-05.parquet ...
Loading yellow_tripdata_2024-06.parquet ...
Taxi trips after filters: 17,550,334
Unique pickup zones: 258
Unique hours observed: 24


Unnamed: 0,travel_min,distance_km,is_rush,is_weekend
count,17550330.0,17550330.0,17550334,17550334
unique,,,2,2
top,,,False,False
freq,,,12062480,12624192
mean,13.13588,3.23856,,
std,8.106863,2.310941,,
min,1.0,0.0160934,,
25%,7.266667,1.60934,,
50%,11.41667,2.574944,,
75%,17.15,4.152097,,


## Citi Bike sample statistics

In [8]:
df = pd.read_csv('/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/citibike/202401-citibike-tripdata_1.csv')
print(df.head())

            ride_id  rideable_type               started_at  \
0  8E865410DBDE0CA9  electric_bike  2024-01-01 13:00:04.563   
1  0403D0B3FC9CA77D  electric_bike  2024-01-08 19:36:43.520   
2  F6DE7BB42FF550BE  electric_bike  2024-01-12 15:00:41.580   
3  84A995BFD98030D4   classic_bike  2024-01-12 16:52:19.025   
4  7BBEAD4F2B535813  electric_bike  2024-01-05 19:50:19.202   

                  ended_at           start_station_name start_station_id  \
0  2024-01-01 13:04:04.652                 3 St & 3 Ave          4028.03   
1  2024-01-08 19:53:16.266  Franklin Ave & St Marks Ave          4107.05   
2  2024-01-12 15:36:29.622           W 67 St & Broadway          7116.04   
3  2024-01-12 17:17:29.773  Central Park West & W 68 St          7079.06   
4  2024-01-05 20:34:42.517           W 67 St & Broadway          7116.04   

            end_station_name end_station_id  start_lat  start_lng    end_lat  \
0      Carroll St & Smith St        4225.14  40.675070 -73.987752  40.680611   
1   

  df = pd.read_csv('/Users/atharvramesh/UCSD/Fall2025/ECE225A/NYC_Public_Transit/data/raw/citibike/202401-citibike-tripdata_1.csv')


In [6]:
bike = load_bike_trips(BIKE_ROOT, BIKE_GLOB, max_rows=None)
print(f"Bike trips after filters: {len(bike):,}")
print('Unique start stations:', bike['rideable_type'].nunique())
print('Rideable mix:', bike['rideable_type'].value_counts())

bike_summary = bike[['travel_min', 'distance_km', 'rideable_type', 'is_rush', 'is_weekend']].describe(include='all')
display(bike_summary)


Bike trips after filters: 18,253,964
Unique start stations: 2
Rideable mix: rideable_type
electric_bike    11854157
classic_bike      6399807
Name: count, dtype: int64


Unnamed: 0,travel_min,distance_km,rideable_type,is_rush,is_weekend
count,18253960.0,18253960.0,18253964,18253964,18253964
unique,,,2,2,2
top,,,electric_bike,False,False
freq,,,11854157,10745186,13527779
mean,12.09192,2.024503,,,
std,10.80158,1.671537,,,
min,1.000383,0.0005373035,,,
25%,5.1868,0.8744288,,,
50%,8.890842,1.511654,,,
75%,15.33917,2.628416,,,
