# Citi Bike Dataset

### Bias: Data filtering in the dataset

> “This data has been processed to remove trips taken by staff, trips to/from test stations, and any trips under 60 seconds (likely false starts or users re-docking bikes).”

These biases (staff trips excluded and sub-60-second trips removed) **do not impact our analysis.**

# Data Cleaning

In [1]:
from __future__ import annotations

# Standard library
import math
from typing import Any, Dict, Optional, Tuple

# Scientific computing
import numpy as np
import pandas as pd
import itertools
from scipy.signal import periodogram
import statsmodels
from statsmodels.graphics.tsaplots import plot_pacf
from statsmodels.tsa.deterministic import CalendarFourier, DeterministicProcess

# Visualization
import matplotlib.pyplot as plt
from matplotlib.axes import Axes
from matplotlib.figure import Figure
from matplotlib.offsetbox import AnchoredText
import seaborn as sns

# Machine learning
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from xgboost import XGBRegressor

In [35]:
# Show complete frames in notebook output: no row/column/cell truncation and
# a wider line width.
for option_name, option_value in {
    "display.max_rows": None,
    "display.max_columns": None,
    "display.width": 200,
    "display.max_colwidth": None,
}.items():
    pd.set_option(option_name, option_value)

- Don't load the Jersey City files: we only want to analyse the New York data (the motor vehicle accident report covers NY only).
- The `unpacked` folder contains all unpacked data folders from 202411-citibike-tripdata to 202510-citibike-tripdata, so our df is sampled from this data.

In [3]:
from __future__ import annotations

from pathlib import Path
import pandas as pd

# SCHEMA DEFINITION

# Exact header (names AND order) that every raw trip CSV is compared against.
EXPECTED_RAW_COLS = [
    "ride_id", "rideable_type",
    "started_at", "ended_at",
    "start_station_name", "start_station_id",
    "end_station_name", "end_station_id",
    "start_lat", "start_lng",
    "end_lat", "end_lng",
    "member_casual",
]

# Dtypes enforced during pd.read_csv, grouped by target dtype.
# Station ids are read as strings, not numbers.
LOAD_DTYPES = {
    **dict.fromkeys(
        [
            "ride_id",
            "start_station_name",
            "start_station_id",
            "end_station_name",
            "end_station_id",
        ],
        "string",
    ),
    **dict.fromkeys(["rideable_type", "member_casual"], "category"),
    **dict.fromkeys(["start_lat", "start_lng", "end_lat", "end_lng"], "float32"),
}

# Columns handed to read_csv(parse_dates=...).
DATETIME_COLS = ["started_at", "ended_at"]

# LOADING FUNCTIONS

def sample_one_csv(
    csv_path: Path,
    expected_raw_cols: list[str],
    dtypes: dict,
    datetime_cols: list[str],
    sample_frac: float,
    random_state: int,
    add_source_file: bool = True,
) -> pd.DataFrame:
    """Read one CSV with schema enforcement and return a random sample of rows.

    The header is validated *before* the typed parse. Otherwise a file with a
    missing or renamed column fails inside ``pd.read_csv`` (for example on
    ``parse_dates``) with a pandas error instead of the schema message below.

    Args:
        csv_path: CSV file to read.
        expected_raw_cols: Required column names, in the required order.
        dtypes: Column -> dtype mapping passed to ``pd.read_csv``.
        datetime_cols: Columns parsed as datetimes.
        sample_frac: Fraction of rows to keep.
        random_state: Seed for the row sampling.
        add_source_file: If true, add a ``source_file`` column holding the
            file name.

    Returns:
        The sampled rows with all raw columns (plus ``source_file`` when
        requested). The original row index labels are kept.

    Raises:
        ValueError: If the whitespace-stripped header differs from
            ``expected_raw_cols`` in names or order.
    """
    print(f"Reading: {csv_path.name}")

    # Header-only read: cheap, and lets us report schema problems ourselves.
    found_cols = list(pd.read_csv(csv_path, nrows=0).columns.str.strip())
    if found_cols != expected_raw_cols:
        raise ValueError(
            f"Schema mismatch in {csv_path.name}\n"
            f"Expected: {expected_raw_cols}\n"
            f"Found:    {found_cols}"
        )

    # The header is known to match, so replace it with the canonical names.
    # This keeps the dtype/parse_dates keys valid even when the raw header
    # carried stray whitespace.
    df = pd.read_csv(
        csv_path,
        header=0,
        names=expected_raw_cols,
        dtype=dtypes,
        parse_dates=datetime_cols,
    )

    # Random row sample; every column is kept.
    df = df.sample(frac=sample_frac, random_state=random_state)

    if add_source_file:
        df["source_file"] = csv_path.name

    return df


def load_or_create_sample(
    base_dir: Path,
    out_path: Path,
    sample_frac: float,
    random_state: int = 42,
) -> pd.DataFrame:
    """Load the cached sample, or build it from the raw CSVs and cache it.

    Args:
        base_dir: Directory searched recursively for ``*.csv`` files.
        out_path: Location of the cached sample CSV.
        sample_frac: Fraction of rows sampled from each raw file.
        random_state: Seed used for every per-file sample.

    Returns:
        The sampled trips. When ``out_path`` already exists it is read back
        with ``LOAD_DTYPES`` / ``DATETIME_COLS`` and returned as-is; the raw
        files are not touched and ``sample_frac`` is not re-applied.

    Raises:
        FileNotFoundError: If no cache exists and ``base_dir`` holds no CSVs.
        ValueError: Propagated from ``sample_one_csv`` on a schema mismatch.
    """
    if out_path.exists():
        print(f"Found existing sampled file -> loading: {out_path}")
        df = pd.read_csv(
            out_path,
            dtype=LOAD_DTYPES,
            parse_dates=DATETIME_COLS,
        )
        print("Loaded shape:", df.shape)
        return df

    csv_files = sorted(base_dir.rglob("*.csv"))
    if not csv_files:
        raise FileNotFoundError(f"No CSV files found under {base_dir}")

    dfs = [
        sample_one_csv(
            csv_path=p,
            expected_raw_cols=EXPECTED_RAW_COLS,
            dtypes=LOAD_DTYPES,
            datetime_cols=DATETIME_COLS,
            sample_frac=sample_frac,
            random_state=random_state,
        )
        for p in csv_files
    ]

    df = pd.concat(dfs, ignore_index=True)
    print("Final sampled shape:", df.shape)

    print(f"Saving sampled data to: {out_path}")
    # Create the target directory first so a missing folder does not throw
    # away the sampling work that was just done.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(out_path, index=False)
    return df

# USAGE

# Build the paths from parts rather than from a backslash-separated string:
# backslashes are only path separators on Windows, so the raw-string form
# would be read as one odd file name on other systems.
BASE_DIR = Path("..", "data", "citi_bike_raw", "unpacked")
OUT_DIR = Path("..", "data", "processed")
SAMPLE_FRAC = 0.03
RANDOM_STATE = 42

# The sampling fraction (as a whole percent) is part of the cache file name.
sample_percent = int(round(SAMPLE_FRAC * 100))
OUT_PATH = OUT_DIR / f"citi_bike_sampled_{sample_percent}_percent.csv"

df_citibike = load_or_create_sample(
    base_dir=BASE_DIR,
    out_path=OUT_PATH,
    sample_frac=SAMPLE_FRAC,
    random_state=RANDOM_STATE,
)


Found existing sampled file -> loading: ..\data\processed\citi_bike_sampled_3_percent.csv
Loaded shape: (1388512, 14)


## Overview

In [4]:
# Peek at the first rows to eyeball the parsed columns.
df_citibike.head()

Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file
0,38658C64FCBE6EAD,electric_bike,2024-11-14 07:06:27.367,2024-11-14 07:10:59.644,Greenwich St & Hubert St,5470.1,Centre St & Chambers St,5207.01,40.721317,-74.010063,40.712734,-74.004608,member,202411-citibike-tripdata_1.csv
1,4B8FB1EE481B7B5D,classic_bike,2024-11-07 11:23:45.587,2024-11-07 11:29:55.879,Frederick Douglass Blvd & W 115 St,7658.13,E 106 St & Madison Ave,7528.31,40.803864,-73.955933,40.793434,-73.949448,member,202411-citibike-tripdata_1.csv
2,B2244A69E4D70034,classic_bike,2024-11-02 17:37:09.850,2024-11-02 17:48:37.132,W 20 St & 7 Ave,6182.02,King St & Varick St,5687.11,40.74239,-73.997261,40.727898,-74.005363,member,202411-citibike-tripdata_1.csv
3,0394AFE351677235,electric_bike,2024-11-04 18:09:08.203,2024-11-04 18:26:57.723,Harrison Pl & Porter Ave,5090.06,5 St & 51 Ave,6137.04,40.70686,-73.928513,40.742374,-73.956596,member,202411-citibike-tripdata_1.csv
4,8C84290C44C57A40,electric_bike,2024-11-10 15:07:32.908,2024-11-10 15:47:25.503,10 St & 7 Ave,3762.08,W 54 St & 6 Ave,6771.13,40.666206,-73.982002,40.761864,-73.977242,member,202411-citibike-tripdata_1.csv


- It's a bit counterintuitive that the station IDs aren't natural numbers, but cross-checking on https://account.citibikenyc.com/map showed that the xxxx.xx format is exactly what Citi Bike uses.

In [5]:
# (rows, columns) of the loaded sample.
df_citibike.shape

(1388512, 14)

In [6]:
#CITIBIKE DATA QUALITY AUDIT

def audit_data_quality(df: pd.DataFrame) -> pd.DataFrame:
    """Build a per-column quality summary of *df*.

    Returns one row per column with its dtype, null count, null share
    (formatted as a percentage string), number of distinct non-null values
    and the first three non-null values as examples.
    """
    n_rows = len(df)

    def _describe(name):
        series = df[name]
        n_null = series.isna().sum()
        return {
            'column': name,
            'dtype': series.dtype,
            'null_count': n_null,
            'null_pct': f"{100 * n_null / n_rows:.2f}%",
            'unique_values': series.nunique(),
            # First three non-null values, in row order.
            'sample': series.dropna().head(3).tolist(),
        }

    return pd.DataFrame([_describe(name) for name in df.columns])


# Run audit
quality_report = audit_data_quality(df_citibike)

rule = "=" * 80
n_rows, n_cols = df_citibike.shape
# deep=True also counts the contents of object/string columns.
memory_mb = df_citibike.memory_usage(deep=True).sum() / 1024**2

print(rule)
print("DATA QUALITY AUDIT REPORT")
print(rule)
display(quality_report)

print(f"\n{rule}")
print(f"Dataset Shape: {n_rows:,} rows × {n_cols} columns")
print(f"Memory Usage: {memory_mb:.2f} MB")
print(f"{rule}\n")

DATA QUALITY AUDIT REPORT


Unnamed: 0,column,dtype,null_count,null_pct,unique_values,sample
0,ride_id,string[python],0,0.00%,1388512,"[38658C64FCBE6EAD, 4B8FB1EE481B7B5D, B2244A69E..."
1,rideable_type,category,0,0.00%,2,"[electric_bike, classic_bike, classic_bike]"
2,started_at,datetime64[ns],0,0.00%,1388453,"[2024-11-14 07:06:27.367000, 2024-11-07 11:23:..."
3,ended_at,datetime64[ns],0,0.00%,1388458,"[2024-11-14 07:10:59.644000, 2024-11-07 11:29:..."
4,start_station_name,string[python],650,0.05%,2215,"[Greenwich St & Hubert St, Frederick Douglass ..."
5,start_station_id,string[python],650,0.05%,2308,"[5470.10, 7658.13, 6182.02]"
6,end_station_name,string[python],3813,0.27%,2249,"[Centre St & Chambers St, E 106 St & Madison A..."
7,end_station_id,string[python],4075,0.29%,2334,"[5207.01, 7528.31, 5687.11]"
8,start_lat,float32,650,0.05%,2180,"[40.721317291259766, 40.803863525390625, 40.74..."
9,start_lng,float32,650,0.05%,2106,"[-74.01006317138672, -73.9559326171875, -73.99..."



Dataset Shape: 1,388,512 rows × 14 columns
Memory Usage: 630.35 MB



## Duplicates

In [7]:
# Count ride_id values that repeat an earlier row (first occurrence not counted).
n_dup_rides = df_citibike["ride_id"].duplicated().sum()
print("Duplicate ride_id rows:", n_dup_rides)

Duplicate ride_id rows: 0


In [8]:
# ride_id was only needed for the duplicate check above; drop it.
df_citibike = df_citibike.drop("ride_id", axis=1)

## Time features

In [9]:
# Earliest and latest timestamp of each datetime column.
for label, column in (("started", "started_at"), ("ended", "ended_at")):
    print(f"Date Range {label}: {df_citibike[column].min()} to {df_citibike[column].max()}")

Date Range started: 2024-10-31 15:11:07.863000 to 2025-10-31 23:54:48.606000
Date Range ended: 2024-11-01 00:00:55.765000 to 2025-10-31 23:59:30.796000


- It is odd that the earliest start date falls in 2024-10, since I only sampled from the files for 2024-11 through 2025-10.

In [10]:
# Analyze time outliers: rides that start or end on 2024-10-31

target_date = pd.Timestamp("2024-10-31").date()

# Per column: does the calendar date of the timestamp equal the target date?
on_target = {
    column: df_citibike[column].dt.date == target_date
    for column in ("started_at", "ended_at")
}
mask_1031 = on_target["started_at"] | on_target["ended_at"]

rides_1031 = df_citibike.loc[mask_1031].copy()

print("Number of rides touching 2024-10-31:", len(rides_1031))
rides_1031.head(33)

Number of rides touching 2024-10-31: 33


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file
608,electric_bike,2024-10-31 19:55:18.511,2024-11-01 03:09:57.592,Longwood Ave & Southern Blvd,7849.1,,,40.816341,-73.896446,,,casual,202411-citibike-tripdata_1.csv
3089,electric_bike,2024-10-31 23:49:31.894,2024-11-01 00:17:19.380,Suydam St & Knickerbocker Ave,4939.07,Perry St & Greenwich Ave,5955.12,40.702011,-73.923767,40.735916,-74.000938,member,202411-citibike-tripdata_1.csv
4516,classic_bike,2024-10-31 23:56:05.656,2024-11-01 00:01:16.220,W 24 St & 7 Ave,6257.03,W 26 St & 10 Ave,6382.05,40.744877,-73.9953,40.749718,-74.002953,member,202411-citibike-tripdata_1.csv
5491,electric_bike,2024-10-31 23:45:46.934,2024-11-01 00:11:02.154,E 43 St & Madison Ave,6551.11,Crescent St & 30 Ave,6958.06,40.753548,-73.978966,40.768692,-73.924957,member,202411-citibike-tripdata_1.csv
7380,classic_bike,2024-10-31 23:32:13.006,2024-11-01 00:06:11.116,Barrow St & Hudson St,5805.05,E 10 St & 2 Ave,5746.02,40.731724,-74.006744,40.72971,-73.986595,member,202411-citibike-tripdata_1.csv
8123,electric_bike,2024-10-31 23:55:57.436,2024-11-01 00:33:45.381,W 45 St & 8 Ave,6676.02,Broadway & Morris St,5033.01,40.759293,-73.988594,40.705944,-74.013222,member,202411-citibike-tripdata_1.csv
10889,electric_bike,2024-10-31 23:53:39.811,2024-11-01 00:01:13.283,Hudson Blvd W & W 36 St,6611.07,Lexington Ave & E 36 St,6313.1,40.756763,-73.999718,40.747574,-73.978798,member,202411-citibike-tripdata_1.csv
12764,electric_bike,2024-10-31 23:59:40.941,2024-11-01 00:20:18.958,Morton St & Greenwich St,5772.05,E 48 St & 5 Ave,6626.01,40.731152,-74.008873,40.757244,-73.978058,member,202411-citibike-tripdata_1.csv
12794,electric_bike,2024-10-31 15:11:07.863,2024-11-01 02:15:40.584,Westchester Ave & Jackson Ave,7853.01,E 149 St & Jackson Ave,7812.01,40.816124,-73.908119,40.813133,-73.909233,member,202411-citibike-tripdata_1.csv
14639,electric_bike,2024-10-31 23:51:48.272,2024-11-01 00:21:51.552,Central Park West & W 85 St,7354.01,W 82 St & Central Park West,7304.08,40.78476,-73.969864,40.782749,-73.971367,member,202411-citibike-tripdata_1.csv


- Explanation for rides starting in 2024-10: the monthly files are grouped by the end time of a ride.
- That's only a very small inconvenience (if we analyze the rides from the last day in our data, a few will be missing because they ended the day after).

## Location features

### A) Missingness

In [11]:
# NYC bounding box (degrees) used to sanity-check coordinates
NYC_BOUNDS = dict(lat_min=40.4, lat_max=41.0, lng_min=-74.3, lng_max=-73.6)


def flag_geographic_anomalies(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with boolean location-quality flag columns.

    Added columns, in this order: ``{start,end}_coord_missing``,
    ``{start,end}_out_of_bounds``, ``{start,end}_station_id_missing`` and
    ``{start,end}_station_name_missing``. A point counts as out of bounds
    only when both coordinates are present and at least one lies outside
    ``NYC_BOUNDS``. The input frame is not modified.
    """
    flagged = df.copy()
    sides = ("start", "end")

    # A coordinate pair is missing as soon as either half is missing.
    for side in sides:
        flagged[f"{side}_coord_missing"] = (
            flagged[f"{side}_lat"].isna() | flagged[f"{side}_lng"].isna()
        )

    for side in sides:
        lat = flagged[f"{side}_lat"]
        lng = flagged[f"{side}_lng"]
        outside_box = (
            (lat < NYC_BOUNDS['lat_min'])
            | (lat > NYC_BOUNDS['lat_max'])
            | (lng < NYC_BOUNDS['lng_min'])
            | (lng > NYC_BOUNDS['lng_max'])
        )
        flagged[f"{side}_out_of_bounds"] = (
            ~flagged[f"{side}_coord_missing"] & outside_box
        )

    # Missingness of the station identifiers: ids first, then names.
    for field in ("station_id", "station_name"):
        for side in sides:
            flagged[f"{side}_{field}_missing"] = flagged[f"{side}_{field}"].isna()

    return flagged


df_citibike = flag_geographic_anomalies(df_citibike)

# Summarize issues: report label -> number of flagged rows
_flag_labels = [
    ('Start Coord Missing', 'start_coord_missing'),
    ('Start Out of Bounds', 'start_out_of_bounds'),
    ('Start Station ID Missing', 'start_station_id_missing'),
    ('Start Station Name Missing', 'start_station_name_missing'),
    ('End Coord Missing', 'end_coord_missing'),
    ('End Out of Bounds', 'end_out_of_bounds'),
    ('End Station ID Missing', 'end_station_id_missing'),
    ('End Station Name Missing', 'end_station_name_missing'),
]
geo_issues = {label: df_citibike[flag].sum() for label, flag in _flag_labels}

banner = "=" * 60
print("GEOGRAPHIC ANOMALIES:")
print(banner)
for issue, count in geo_issues.items():
    pct = 100 * count / len(df_citibike)
    print(f"{issue:.<40} {count:>6,} ({pct:>5.2f}%)")
print(banner)

GEOGRAPHIC ANOMALIES:
Start Coord Missing.....................    650 ( 0.05%)
Start Out of Bounds.....................      0 ( 0.00%)
Start Station ID Missing................    650 ( 0.05%)
Start Station Name Missing..............    650 ( 0.05%)
End Coord Missing.......................  4,075 ( 0.29%)
End Out of Bounds.......................      3 ( 0.00%)
End Station ID Missing..................  4,075 ( 0.29%)
End Station Name Missing................  3,813 ( 0.27%)


In [12]:
#analyzing the missingnes of end(start) lat/long/station name/station

def missing_pattern_table(df, cols, prefix=None):
    """
    Count the rows of *df* per missingness pattern over *cols*.

    Each output row is one combination of True (missing) / False (present)
    flags that actually occurs in the data, with its row ``count`` and its
    share ``pct`` of all rows in percent. Patterns that never occur are not
    listed. If *prefix* is given, the flag columns are renamed to
    ``f"{prefix}_{col}"``.
    """
    is_missing = df[cols].isna()

    # One entry per distinct True/False tuple (not ordered by frequency).
    counts = is_missing.value_counts(sort=False).rename("count")
    table = counts.reset_index()

    table["pct"] = table["count"] / len(df) * 100

    if not prefix:
        return table
    return table.rename(columns={c: f"{prefix}_{c}" for c in cols})


def _fill_unobserved_patterns(observed, cols):
    """Left-join *observed* onto every True/False combination of *cols*, zero-filling gaps."""
    every_combo = pd.DataFrame(
        list(itertools.product([False, True], repeat=len(cols))),
        columns=cols,
    )
    merged = every_combo.merge(observed, on=cols, how="left")
    return merged.fillna({"count": 0, "pct": 0})


# ----- START patterns -----
start_cols = ["start_lat", "start_lng", "start_station_id", "start_station_name"]
# Ensure all 16 combinations appear, even if zero
start_patterns = _fill_unobserved_patterns(
    missing_pattern_table(df_citibike, start_cols), start_cols
)

print("\nSTART — Missingness Patterns (False = present, True = missing)")
display(start_patterns.sort_values(start_cols))


# ----- END patterns -----
end_cols = ["end_lat", "end_lng", "end_station_id", "end_station_name"]
end_patterns = _fill_unobserved_patterns(
    missing_pattern_table(df_citibike, end_cols), end_cols
)

print("\nEND — Missingness Patterns (False = present, True = missing)")
display(end_patterns.sort_values(end_cols))


# ----- OPTIONAL: Combined pattern table -----
# Only the patterns that actually occur are listed here (no zero-fill).
combined_cols = start_cols + end_cols

combined_missing = df_citibike[combined_cols].isna()
combined_counts = combined_missing.value_counts(sort=False).rename("count")
combined_table = combined_counts.reset_index()
combined_table["pct"] = combined_table["count"] / len(df_citibike) * 100

print("\nCOMBINED — Start + End Missingness Patterns (8 fields → 2^8 rows)")
display(combined_table)


START — Missingness Patterns (False = present, True = missing)


Unnamed: 0,start_lat,start_lng,start_station_id,start_station_name,count,pct
0,False,False,False,False,1387862.0,99.953187
1,False,False,False,True,0.0,0.0
2,False,False,True,False,0.0,0.0
3,False,False,True,True,0.0,0.0
4,False,True,False,False,0.0,0.0
5,False,True,False,True,0.0,0.0
6,False,True,True,False,0.0,0.0
7,False,True,True,True,0.0,0.0
8,True,False,False,False,0.0,0.0
9,True,False,False,True,0.0,0.0



END — Missingness Patterns (False = present, True = missing)


Unnamed: 0,end_lat,end_lng,end_station_id,end_station_name,count,pct
0,False,False,False,False,1384437.0,99.70652
1,False,False,False,True,0.0,0.0
2,False,False,True,False,0.0,0.0
3,False,False,True,True,0.0,0.0
4,False,True,False,False,0.0,0.0
5,False,True,False,True,0.0,0.0
6,False,True,True,False,0.0,0.0
7,False,True,True,True,0.0,0.0
8,True,False,False,False,0.0,0.0
9,True,False,False,True,0.0,0.0



COMBINED — Start + End Missingness Patterns (8 fields → 2^8 rows)


Unnamed: 0,start_lat,start_lng,start_station_id,start_station_name,end_lat,end_lng,end_station_id,end_station_name,count,pct
0,False,False,False,False,False,False,False,False,1383934,99.670295
1,False,False,False,False,True,True,True,False,260,0.018725
2,False,False,False,False,True,True,True,True,3668,0.264168
3,True,True,True,True,False,False,False,False,503,0.036226
4,True,True,True,True,True,True,True,False,2,0.000144
5,True,True,True,True,True,True,True,True,145,0.010443


In [13]:
# Shape check now that the anomaly flag columns have been added.
df_citibike.shape

(1388512, 21)

- Information about the start location is either fully present or fully missing (650 rows).
- If information about the end location is missing, either all four fields are missing (3813 rows) or everything except the station name is missing (262 rows).
- There are 145 rides which have no information about the start or the end point at all (what went wrong there? Bikes with broken GPS? A general problem in the Citi Bike system?).

- **IMPORTANT:** End station information is about 6 times more likely to be missing than start station information (4075 vs. 650 rows).
  It is very unlikely that this is due to a system failure, because there is no obvious reason why a system failure would be biased toward end stations.

→ These rows could contain a significant number of trips that ended in accidents.

In [14]:
# Analyze rows where end lat/lng/station id are missing but the end station
# name is present

# How many rows to display
N = 20

# ---- Identify affected rows ----
end_geo_missing = df_citibike[["end_lat", "end_lng", "end_station_id"]].isna().all(axis=1)
mask = end_geo_missing & df_citibike["end_station_name"].notna()

subset = df_citibike.loc[mask].copy()

print(f"Rows where end_lat/end_lng/end_station_id are missing but end_station_name is present: {len(subset)}")

# Show only first N affected trips
display(subset.head(N))


# ---- Build station-level summary ----
def _trips_per_end_station(frame, count_col):
    """Count trips per end station name; the counts go into column *count_col*."""
    counts = frame["end_station_name"].value_counts()
    return counts.rename_axis("end_station_name").reset_index(name=count_col)


subset_counts = _trips_per_end_station(subset, "subset_count")  # affected trips only
full_counts = _trips_per_end_station(df_citibike, "full_count")  # all trips

# Inner merge: only stations that occur in the affected subset remain.
summary = subset_counts.merge(full_counts, on="end_station_name")

# Percentage of each station's trips that are missing coords
summary["pct_of_station_trips"] = summary["subset_count"] / summary["full_count"] * 100

# ---- Sort by percentage, descending ----
summary = summary.sort_values("pct_of_station_trips", ascending=False)

print("\nEnd station names — sorted by % of trips missing coords:")
display(summary.head(N))


Rows where end_lat/end_lng/end_station_id are missing but end_station_name is present: 262


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file,start_coord_missing,end_coord_missing,start_out_of_bounds,end_out_of_bounds,start_station_id_missing,end_station_id_missing,start_station_name_missing,end_station_name_missing
7013,electric_bike,2024-11-11 08:38:09.546,2024-11-11 08:51:29.109,Schermerhorn St & Hoyt St,4479.1,Grand St & Elizabeth St,,40.688625,-73.985191,,,member,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,False
9644,electric_bike,2024-11-12 07:10:40.300,2024-11-12 07:30:04.790,56 St & Arnold Ave,5289.05,35 Ave & 37 St,,40.71484,-73.908997,,,casual,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,False
13862,electric_bike,2024-11-13 09:37:05.810,2024-11-13 09:40:42.380,Bainbridge Ave & E 196 St,8615.01,Strong St & University Ave,,40.866482,-73.890419,,,casual,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,False
39974,electric_bike,2024-11-13 08:42:51.487,2024-11-13 08:50:18.510,India St & Manhattan Ave,5826.02,S 2 St & Kent Ave,,40.732323,-73.955086,,,member,202411-citibike-tripdata_2.csv,False,True,False,False,False,True,False,False
56066,electric_bike,2024-11-03 08:50:15.079,2024-11-03 09:03:24.056,Central Park West & W 85 St,7354.01,E 51 St & Lexington Ave,,40.78476,-73.969864,,,member,202411-citibike-tripdata_2.csv,False,True,False,False,False,True,False,False
58088,electric_bike,2024-11-10 13:49:22.878,2024-11-10 14:04:33.948,E 84 St & Park Ave,7243.04,7 Ave & W 55 St,,40.778625,-73.957718,,,member,202411-citibike-tripdata_2.csv,False,True,False,False,False,True,False,False
59518,electric_bike,2024-11-11 12:14:01.598,2024-11-11 12:21:30.617,W 45 St & 8 Ave,6676.02,Broadway & W 37 St,,40.759293,-73.988594,,,member,202411-citibike-tripdata_2.csv,False,True,False,False,False,True,False,False
63471,electric_bike,2024-11-22 05:30:36.112,2024-11-22 05:35:41.110,Plaza St West & Flatbush Ave,4010.13,6 St & 7 Ave,,40.675022,-73.971115,,,casual,202411-citibike-tripdata_3.csv,False,True,False,False,False,True,False,False
70027,electric_bike,2024-11-28 23:57:55.936,2024-11-29 00:06:43.933,3 Ave & E 72 St,7028.04,E 94 St & Madison Ave,,40.769943,-73.960609,,,member,202411-citibike-tripdata_3.csv,False,True,False,False,False,True,False,False
77829,electric_bike,2024-11-21 20:34:32.003,2024-11-21 20:37:25.270,N 6 St & Bedford Ave,5379.1,S 2 St & Kent Ave,,40.717453,-73.958511,,,member,202411-citibike-tripdata_3.csv,False,True,False,False,False,True,False,False



End station names — sorted by % of trips missing coords:


Unnamed: 0,end_station_name,subset_count,full_count,pct_of_station_trips
36,Jerome Ave & W 177 St,2,35,5.714286
74,47 Ave & 109 St,1,43,2.325581
38,St Nicholas Ave & W 157 St,2,108,1.851852
135,Strong St & University Ave,1,69,1.449275
22,W 170 St & University Ave,2,149,1.342282
3,Coffey St & Ferris St,4,310,1.290323
0,Broadway & W 37 St,7,553,1.265823
23,Grand Concourse & E 156 St,2,171,1.169591
5,Greene Ave & Myrtle Ave,4,367,1.089918
91,W Fordham Rd & Loring Pl N,1,98,1.020408


In [15]:
# Bike-type breakdown of the rows in `subset`.
subset.rideable_type.value_counts()

rideable_type
electric_bike    262
classic_bike       0
Name: count, dtype: int64

- Pattern: 100% of the rows have rideable_type electric_bike

In [16]:
# Analyze rows where every start/end lat/lng/id/name field is missing

# Columns to check
all_cols = [
    "start_lat", "start_lng", "start_station_id", "start_station_name",
    "end_lat", "end_lng", "end_station_id", "end_station_name",
]

# Mask: none of the 8 columns holds a value
mask_all_missing = ~df_citibike[all_cols].notna().any(axis=1)

print("Rows where ALL 8 start/end fields are missing:", mask_all_missing.sum())

# Display the rows
rows_all_missing = df_citibike.loc[mask_all_missing]

display(rows_all_missing)

Rows where ALL 8 start/end fields are missing: 145


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file,start_coord_missing,end_coord_missing,start_out_of_bounds,end_out_of_bounds,start_station_id_missing,end_station_id_missing,start_station_name_missing,end_station_name_missing
67,electric_bike,2024-11-12 18:16:17.112,2024-11-12 18:20:20.579,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
964,electric_bike,2024-11-14 11:09:28.605,2024-11-14 12:10:52.756,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
4749,electric_bike,2024-11-09 22:43:09.260,2024-11-09 23:43:19.962,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
5048,electric_bike,2024-11-05 17:50:06.131,2024-11-05 17:53:40.088,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
5239,electric_bike,2024-11-06 17:39:03.491,2024-11-06 17:45:18.284,,,,,,,,,casual,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
5478,electric_bike,2024-11-07 16:14:52.980,2024-11-07 16:21:50.710,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
9062,electric_bike,2024-11-01 18:27:01.335,2024-11-01 18:49:59.771,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
9581,electric_bike,2024-11-10 01:31:33.180,2024-11-10 01:45:20.474,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
9834,electric_bike,2024-11-14 15:36:18.734,2024-11-14 15:44:29.959,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True
11032,electric_bike,2024-11-04 10:09:21.685,2024-11-04 10:14:43.461,,,,,,,,,member,202411-citibike-tripdata_1.csv,True,True,False,False,True,True,True,True


In [17]:
# Bike-type breakdown of the rows with no location information at all.
rows_all_missing.rideable_type.value_counts()

rideable_type
electric_bike    145
classic_bike       0
Name: count, dtype: int64

- Pattern: again, 100% of these trips were made with electric bikes

In [18]:
# Quick look at the first rows of the start-pattern table.
start_patterns.head()

Unnamed: 0,start_lat,start_lng,start_station_id,start_station_name,count,pct
0,False,False,False,False,1387862.0,99.953187
1,False,False,False,True,0.0,0.0
2,False,False,True,False,0.0,0.0
3,False,False,True,True,0.0,0.0
4,False,True,False,False,0.0,0.0


In [19]:
# Add "% electric_bike" to a pattern table (start / end / combined)

def add_pct_electric(pattern_table: pd.DataFrame, cols: list[str], df: pd.DataFrame) -> pd.DataFrame:
    """Attach the electric-bike share of each missingness pattern.

    For every True/False missingness combination of *cols* found in *df*, the
    percentage of rows whose ``rideable_type`` equals ``"electric_bike"`` is
    computed and left-merged onto *pattern_table* as ``pct_electric_bike``.
    Patterns of *pattern_table* that do not occur in *df* get NaN.
    """
    # Missingness flags plus a 0/1 electric indicator, aligned on df's index.
    pattern_flags = df[cols].isna()
    pattern_flags["_is_electric"] = (df["rideable_type"] == "electric_bike").astype("int64")

    # The mean of a 0/1 indicator is the fraction of electric bikes per pattern.
    electric_share = pattern_flags.groupby(cols, dropna=False)["_is_electric"].mean() * 100
    electric_share = electric_share.rename("pct_electric_bike").reset_index()

    return pattern_table.merge(electric_share, on=cols, how="left")

# ---- apply to the three pattern tables ----
def _observed_with_electric_share(table, cols):
    """Add pct_electric_bike and keep only the patterns that occur (pct > 0)."""
    enriched = add_pct_electric(table, cols, df_citibike)
    return enriched[enriched["pct"] > 0]


start_patterns = _observed_with_electric_share(start_patterns, start_cols)
end_patterns = _observed_with_electric_share(end_patterns, end_cols)
combined_table = _observed_with_electric_share(combined_table, combined_cols)

# show results
for heading, table, sort_cols in (
    ("\nSTART — Missingness Patterns + % electric_bike", start_patterns, start_cols),
    ("\nEND — Missingness Patterns + % electric_bike", end_patterns, end_cols),
    ("\nCOMBINED — Missingness Patterns + % electric_bike", combined_table, combined_cols),
):
    print(heading)
    display(table.sort_values(sort_cols))


START — Missingness Patterns + % electric_bike


Unnamed: 0,start_lat,start_lng,start_station_id,start_station_name,count,pct,pct_electric_bike
0,False,False,False,False,1387862.0,99.953187,70.042843
15,True,True,True,True,650.0,0.046813,100.0



END — Missingness Patterns + % electric_bike


Unnamed: 0,end_lat,end_lng,end_station_id,end_station_name,count,pct,pct_electric_bike
0,False,False,False,False,1384437.0,99.70652,69.987728
14,True,True,True,False,262.0,0.018869,100.0
15,True,True,True,True,3813.0,0.274611,93.102544



COMBINED — Missingness Patterns + % electric_bike


Unnamed: 0,start_lat,start_lng,start_station_id,start_station_name,end_lat,end_lng,end_station_id,end_station_name,count,pct,pct_electric_bike
0,False,False,False,False,False,False,False,False,1383934,99.670295,69.97682
1,False,False,False,False,True,True,True,False,260,0.018725,100.0
2,False,False,False,False,True,True,True,True,3668,0.264168,92.82988
3,True,True,True,True,False,False,False,False,503,0.036226,100.0
4,True,True,True,True,True,True,True,False,2,0.000144,100.0
5,True,True,True,True,True,True,True,True,145,0.010443,100.0


- 100% of the rows with any missing start value are trips with electric bikes.
- Roughly 94% of the rows with any missing end value are trips with electric bikes (3812 of 4075).
- **Conclusion:** there is definitely something wrong with the location tracking of electric bikes.

In [20]:
# Location columns checked for missingness
loc_cols = [
    "start_lat", "start_lng", "start_station_id", "start_station_name",
    "end_lat", "end_lng", "end_station_id", "end_station_name",
]

# True where at least one location field is missing
has_missing_loc = ~df_citibike[loc_cols].notna().all(axis=1)

# True for classic (non-electric) bikes
is_classic = df_citibike["rideable_type"].eq("classic_bike")

# Classic bikes with incomplete location info
mask = has_missing_loc & is_classic

print("Classic bike rows with any missing start/end info:", int(mask.sum()))

display(df_citibike.loc[mask])


Classic bike rows with any missing start/end info: 263


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file,start_coord_missing,end_coord_missing,start_out_of_bounds,end_out_of_bounds,start_station_id_missing,end_station_id_missing,start_station_name_missing,end_station_name_missing
3423,classic_bike,2024-11-04 19:06:35.508,2024-11-05 20:06:22.433,Albemarle Rd & E 21 St,3180.09,,,40.647411,-73.959328,,,casual,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
5729,classic_bike,2024-11-01 18:55:19.242,2024-11-02 19:55:11.943,Southpoint Park,6566.01,,,40.753704,-73.958649,,,casual,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
7503,classic_bike,2024-11-04 18:17:26.565,2024-11-05 19:17:18.497,Broadway & W 25 St,6173.08,,,40.74287,-73.989189,,,member,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
7537,classic_bike,2024-11-13 10:22:05.101,2024-11-14 11:21:59.254,E 11 St & 3 Ave,5788.16,,,40.73127,-73.988487,,,casual,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
9282,classic_bike,2024-11-06 16:41:35.310,2024-11-07 17:41:29.469,E 48 St & 5 Ave,6626.01,,,40.757244,-73.978058,,,member,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
9472,classic_bike,2024-11-05 15:33:08.461,2024-11-06 16:33:02.452,E 15 St & 5 Ave,5947.06,,,40.736591,-73.99295,,,member,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
10766,classic_bike,2024-11-09 16:00:11.177,2024-11-10 17:00:05.073,Central Park W & W 97 St,7538.18,,,40.792496,-73.964172,,,casual,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
13448,classic_bike,2024-11-14 14:44:14.553,2024-11-15 15:44:03.270,27 St & Hunter St,6310.06,,,40.748501,-73.941277,,,casual,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
13454,classic_bike,2024-11-10 12:38:33.467,2024-11-11 13:38:28.620,Schenectady Ave & Prospect Pl,4007.01,,,40.673512,-73.933632,,,casual,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True
19863,classic_bike,2024-11-08 14:54:27.188,2024-11-09 15:54:05.294,1 Ave & E 94 St,7286.05,,,40.781723,-73.945938,,,member,202411-citibike-tripdata_1.csv,False,True,False,False,False,True,False,True


In [21]:
# Duration (end minus start) of the rows selected by `mask`.
selected_rides = df_citibike.loc[mask]
ride_durations = selected_rides["ended_at"] - selected_rides["started_at"]

ride_durations.describe()

count                          263
mean     0 days 23:51:30.482692015
std      0 days 05:05:27.609026021
min         0 days 00:10:34.176000
25%         1 days 00:59:41.579500
50%         1 days 00:59:51.932000
75%         1 days 00:59:54.525500
max         1 days 00:59:57.480000
dtype: object

- Pattern: Over 75% of these rides are almost exactly 1 day 1 hour long.
- Explanation: The bike was not properly docked or was abandoned (intentionally or because of an accident). The maximum duration is oddly specific, which indicates that Citi Bike probably has a mechanism that detects bikes with an unusually long rental time and automatically locks them around that time.

In [22]:
# Columns to keep; this discards the temporary flag columns added during the
# missingness analysis.
ORIGINAL_COLS = [
    "rideable_type",
    "started_at", "ended_at",
    "start_station_name", "start_station_id",
    "end_station_name", "end_station_id",
    "start_lat", "start_lng",
    "end_lat", "end_lng",
    "member_casual",
    "source_file",
]

df_citibike = df_citibike.loc[:, ORIGINAL_COLS].copy()

In [23]:
# Columns that must be fully present
loc_cols = [
    "start_lat", "start_lng", "start_station_id", "start_station_name",
    "end_lat", "end_lng", "end_station_id", "end_station_name",
]

rows_before = len(df_citibike)

# Keep only rows in which every location column holds a value
location_complete = df_citibike[loc_cols].notna().all(axis=1)
df_citibike = df_citibike[location_complete]

rows_after = len(df_citibike)

print(f"Dropped {rows_before - rows_after} rows with missing start/end location info")
print("New shape:", df_citibike.shape)

Dropped 4578 rows with missing start/end location info
New shape: (1383934, 13)


### B) Out of Bounds

In [24]:
# NYC bounding box (degrees)
NYC_BOUNDS = dict(lat_min=40.4, lat_max=41.0, lng_min=-74.3, lng_max=-73.6)


def flag_geographic_anomalies(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with ``start_out_of_bounds`` / ``end_out_of_bounds``.

    A point is flagged when its latitude or longitude lies outside
    ``NYC_BOUNDS``. The input frame is not modified.
    """
    result = df.copy()
    lat_lo, lat_hi = NYC_BOUNDS['lat_min'], NYC_BOUNDS['lat_max']
    lng_lo, lng_hi = NYC_BOUNDS['lng_min'], NYC_BOUNDS['lng_max']

    for side in ("start", "end"):
        lat = result[f"{side}_lat"]
        lng = result[f"{side}_lng"]
        result[f"{side}_out_of_bounds"] = (
            (lat < lat_lo) | (lat > lat_hi) | (lng < lng_lo) | (lng > lng_hi)
        )

    return result


# Add the start/end out-of-bounds flag columns to the working frame.
df_citibike = flag_geographic_anomalies(df_citibike)

# Number of flagged rows per issue (summing a boolean column counts its Trues).
geo_issues = {
    label: df_citibike[flag_col].sum()
    for label, flag_col in (
        ('Start Out of Bounds', 'start_out_of_bounds'),
        ('End Out of Bounds', 'end_out_of_bounds'),
    )
}

n_rows = len(df_citibike)
rule = "=" * 60

print("GEOGRAPHIC ANOMALIES:")
print(rule)
for issue, count in geo_issues.items():
    pct = 100 * count / n_rows
    print(f"{issue:.<40} {count:>6,} ({pct:>5.2f}%)")
print(rule)

GEOGRAPHIC ANOMALIES:
Start Out of Bounds.....................      0 ( 0.00%)
End Out of Bounds.......................      3 ( 0.00%)


In [27]:
# Inspect every trip whose END coordinates were flagged as out of bounds.
is_end_oob = df_citibike["end_out_of_bounds"]
end_oob = df_citibike.loc[is_end_oob].copy()

print(f"Rows with end coords out of bounds: {len(end_oob)}")

# Lift the display limits so the notebook renders all rows and all columns.
for option, value in (
    ("display.max_rows", None),
    ("display.max_columns", None),
    ("display.width", 200),
):
    pd.set_option(option, value)

display(end_oob)

Rows with end coords out of bounds: 3


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file,start_out_of_bounds,end_out_of_bounds
101085,classic_bike,2024-11-20 14:33:29.099,2024-11-20 14:39:42.336,Jefferson St & Cypress Ave,5082.08,Bronx WH station,SYS018,40.709068,-73.92157,0.0,0.0,member,202411-citibike-tripdata_4.csv,False,True
108765,electric_bike,2024-11-18 06:41:54.926,2024-11-18 06:55:55.178,New York Ave & St Marks Ave,4098.06,Bronx WH station,SYS018,40.67514,-73.947159,0.0,0.0,member,202411-citibike-tripdata_4.csv,False,True
126655,electric_bike,2024-12-09 06:48:09.410,2024-12-09 07:02:47.872,New York Ave & St Marks Ave,4098.06,Bronx WH station,SYS018,40.67514,-73.947159,0.0,0.0,member,202412-citibike-tripdata_1.csv,False,True


- Very odd behavior. A possible explanation is that "WH" stands for warehouse and the Bronx WH station is a warehouse/maintenance facility rather than a public-facing dock (the "SYS..." prefix in the id points in the same direction)
- I don't think Citi Bike even meant for these trips to be in this dataset
- This is a good moment to look at station names/IDs in general

In [47]:
# Remove the temporary out-of-bounds flag columns now that they have been inspected.
# errors="ignore" keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the columns are already gone.
df_citibike = df_citibike.drop(
    columns=["start_out_of_bounds", "end_out_of_bounds"],
    errors="ignore",
)

### Station Name/ID

In [29]:
# Check for other stations with a standalone "WH" in the name or whose id begins with "SYS".

WH_PATTERN = r"\bWH\b"

# --- columns under inspection ---
start_name = df_citibike["start_station_name"]
end_name   = df_citibike["end_station_name"]
start_id   = df_citibike["start_station_id"]
end_id     = df_citibike["end_station_id"]

# --- per-column flags ---
# na=False makes a missing value count as "no match", so every flag is a plain
# boolean Series that is safe to combine with | and to use for indexing.
start_is_wh  = start_name.str.contains(WH_PATTERN, regex=True, na=False)
end_is_wh    = end_name.str.contains(WH_PATTERN, regex=True, na=False)
start_is_sys = start_id.str.startswith("SYS", na=False)
end_is_sys   = end_id.str.startswith("SYS", na=False)

# --- conditions ---
has_wh_name = start_is_wh | end_is_wh
has_sys_id  = start_is_sys | end_is_sys

# --- row mask (reused by the following cell to drop these rows) ---
mask = has_wh_name | has_sys_id

print("Rows matching (WH in name OR SYS in id):", int(mask.sum()), "of", len(df_citibike))

# --- distinct values, pooled over start and end columns ---
sys_ids = pd.unique(pd.concat([start_id[start_is_sys], end_id[end_is_sys]]))
wh_names = pd.unique(pd.concat([start_name[start_is_wh], end_name[end_is_wh]]))

print("Distinct SYS station IDs:", len(sys_ids))
print("Distinct WH station names:", len(wh_names))

# --- show them ---
print("\nSYS IDs:", list(sys_ids))
print("\nWH names:", list(wh_names))

# --- distinct station combinations among the flagged rows ---
display(
    df_citibike.loc[
        mask,
        ["start_station_name", "start_station_id",
         "end_station_name", "end_station_id", "source_file"]
    ].drop_duplicates()
)

Rows matching (WH in name OR SYS in id): 195 of 1383934
Distinct SYS station IDs: 4
Distinct WH station names: 1

SYS IDs: ['SYS016', 'SYS038', 'SYS033', 'SYS018']

WH names: ['Bronx WH station']


Unnamed: 0,start_station_name,start_station_id,end_station_name,end_station_id,source_file
2961,Bushwick Ave & Harman St,4640.01,Morgan Bike Mechanics,SYS016,202411-citibike-tripdata_1.csv
8047,Morgan Bike Mechanics,SYS016,Park Ave & Marcus Garvey Blvd,4768.02,202411-citibike-tripdata_1.csv
29658,Vesey St & Church St,5216.06,Morgan Bike Mechanics,SYS016,202411-citibike-tripdata_1.csv
37384,Morgan Bike Mechanics,SYS016,Schenectady Ave & Maple St,3639.04,202411-citibike-tripdata_2.csv
43570,Bushwick Ave & Harman St,4640.01,Morgan Bike Mechanics,SYS016,202411-citibike-tripdata_2.csv
57691,Morgan Loading Docks,SYS038,Morgan Loading Docks,SYS038,202411-citibike-tripdata_2.csv
72413,Front St & Gold St,4927.04,Morgan Bike Mechanics,SYS016,202411-citibike-tripdata_3.csv
72886,Madison St & Cypress Ave,4807.02,Morgan Bike Mechanics,SYS016,202411-citibike-tripdata_3.csv
75579,Malcolm X Blvd & DeKalb Ave,4648.03,Morgan Bike Mechanics,SYS016,202411-citibike-tripdata_3.csv
77428,Morgan Loading Docks,SYS038,Morgan Bike Mechanics,SYS016,202411-citibike-tripdata_3.csv


- There are 4 IDs that start with "SYS", with the corresponding names "Bronx WH station", "Morgan Bike Mechanics", "Morgan Loading Docks" and "Pier 40 X2". The names make it highly likely that these are internal maintenance stations

In [30]:
# Drop every row flagged by `mask`, which must already exist from an earlier cell.
rows_before = len(df_citibike)
df_citibike = df_citibike[~mask].copy()
rows_after = len(df_citibike)

n_removed = rows_before - rows_after
print(f"Dropped {n_removed} rows using existing mask")
print("New shape:", df_citibike.shape)

Dropped 195 rows using existing mask
New shape: (1383739, 15)


In [31]:
# Number of distinct non-missing values in each station column ...
station_columns = [
    "start_station_name",
    "start_station_id",
    "end_station_name",
    "end_station_id",
]
unique_counts = {
    col: df_citibike[col].nunique(dropna=True) for col in station_columns
}

# ... and for start and end columns pooled together.
pooled_names = pd.concat(
    [df_citibike["start_station_name"], df_citibike["end_station_name"]]
)
pooled_ids = pd.concat(
    [df_citibike["start_station_id"], df_citibike["end_station_id"]]
)
unique_counts["ALL_station_names_combined"] = pooled_names.nunique(dropna=True)
unique_counts["ALL_station_ids_combined"] = pooled_ids.nunique(dropna=True)

unique_station_stats = pd.DataFrame(
    {
        "column": list(unique_counts.keys()),
        "unique_values": list(unique_counts.values()),
    }
)

display(unique_station_stats)

Unnamed: 0,column,unique_values
0,start_station_name,2212
1,start_station_id,2305
2,end_station_name,2245
3,end_station_id,2330
4,ALL_station_names_combined,2245
5,ALL_station_ids_combined,2340


This is odd:
1. There should be a 1:1 mapping between station names and IDs, which is clearly not the case (there are roughly 100 more IDs than station names)
2. There are roughly 30 stations that only appear as end stations

Let's investigate

In [33]:
# Stack the start and end (name, id) pairs into one two-column table.
start_pairs = df_citibike[["start_station_name", "start_station_id"]].rename(
    columns={"start_station_name": "name", "start_station_id": "id"}
)
end_pairs = df_citibike[["end_station_name", "end_station_id"]].rename(
    columns={"end_station_name": "name", "end_station_id": "id"}
)
stations = pd.concat([start_pairs, end_pairs], ignore_index=True)

# A pair is only usable when both the name and the id are present.
stations = stations.dropna(subset=["name", "id"])

# How many different ids does each station name carry?
id_counts = stations.groupby("name")["id"].nunique()

# Names that are attached to more than one id.
ambiguous_names = id_counts.loc[id_counts > 1].index

# ---- Table 1: one row per distinct (name, id) pair of those names ----
is_ambiguous = stations["name"].isin(ambiguous_names)
ambiguous_rows = (
    stations.loc[is_ambiguous]
    .drop_duplicates()
    .sort_values(["name", "id"])
)

# ---- Table 2: one row per name with the sorted list of its ids ----
ids_per_name = ambiguous_rows.groupby("name")["id"].apply(
    lambda ids: sorted(ids.unique())
)
ambiguous_grouped = ids_per_name.reset_index(name="ids")

print(f"Stations with >1 ID: {len(ambiguous_grouped)}")

display(ambiguous_grouped)

Stations with >1 ID: 105


Unnamed: 0,name,ids
0,11 St & 43 Ave,"[6438.04, 6438.05]"
1,2 Ave & E 99 St,"[7386.1, 7386.10]"
2,24 Ave & 26 St,"[7152.1, 7152.10]"
3,3 Ave & E 82 St,"[7154.1, 7154.10]"
4,3 St & Hoyt St,"[4110.1, 4110.10]"
5,31 Ave & Crescent St,"[6893.1, 6893.10]"
6,31 St & Broadway,"[6789.2, 6789.20]"
7,31 St & Newtown Ave,"[6923.2, 6923.20]"
8,34 Ave & 38 St,"[6638.01, 6638.08]"
9,34 Ave & 45 St,"[6596.1, 6596.10]"


There are 3 problems:
1. By far the biggest problem seems to be missing trailing zeros (e.g. 2 Ave & E 99 St	[7386.1, 7386.10])
2. Both IDs are valid but different (e.g. 11 St & 43 Ave	[6438.04, 6438.05])
3. Odd outliers that do not conform to the ID naming convention (xxxx.xx) at all (e.g. Kent Ave & N 3 St	[5348.06, 5348.06_old])

In [32]:
# Look at ids which do not follow the usual "dddd.dd" layout.
import pandas as pd

# Expected layout: exactly 4 digits, a dot, then 2 digits (e.g. 5470.10).
pat = r"^\d{4}\.\d{2}$"

# Pool start and end ids into one Series of strings, without missing values.
id_columns = [df_citibike["start_station_id"], df_citibike["end_station_id"]]
all_ids = pd.concat(id_columns, ignore_index=True).dropna().astype(str)

# Ids that do NOT match the expected layout.
follows_layout = all_ids.str.match(pat)
bad_ids = all_ids[~follows_layout]

print("Total unique station IDs:", all_ids.nunique())
print("Unique IDs NOT matching iiii.ii:", bad_ids.nunique())

# List the distinct offending ids in sorted order.
distinct_bad_ids = pd.Series(bad_ids.unique())
display(distinct_bad_ids.sort_values().reset_index(drop=True))

Total unique station IDs: 2340
Unique IDs NOT matching iiii.ii: 134


0            3113.1
1            3391.1
2            3423.1
3            3576.1
4            3593.1
5            3776.1
6            3834.1
7            3981.1
8            4095.1
9            4110.1
10           4129.1
11           4137.1
12           4157.1
13           4181.1
14           4366.1
15           4404.1
16           4416.1
17           4432.1
18           4455.1
19           4479.1
20           4483.1
21           4485.1
22           4488.1
23           4632.1
24           4798.1
25           4821.1
26           4912.1
27           4994.1
28           5024.1
29           5065.1
30           5137.1
31           5235.1
32           5247.1
33         5303.06_
34         5308.04_
35           5332.1
36           5343.1
37      5348.06_old
38           5359.1
39           5379.1
40           5430.1
41           5470.1
42           5506.1
43           5553.1
44           5669.1
45           5779.1
46           5785.1
47           5854.1
48           5872.1
49           5980.1


- HB & JC stations also show up here even though we didn't import the Jersey City trip data; we will look at this in a moment

Cleaning the IDs would be tedious, so let's check whether the station name works better as a unique identifier

In [36]:
# Stack the start and end (name, id) pairs into one two-column table.
start_pairs = df_citibike[["start_station_name", "start_station_id"]].rename(
    columns={"start_station_name": "name", "start_station_id": "id"}
)
end_pairs = df_citibike[["end_station_name", "end_station_id"]].rename(
    columns={"end_station_name": "name", "end_station_id": "id"}
)
stations = pd.concat([start_pairs, end_pairs], ignore_index=True)

# A pair is only usable when both the name and the id are present.
stations = stations.dropna(subset=["name", "id"])

# How many different names does each station id carry?
name_counts = stations.groupby("id")["name"].nunique()

# Ids that are attached to more than one name.
ambiguous_ids = name_counts.loc[name_counts > 1].index

# ---- Table 1: one row per distinct (id, name) pair of those ids ----
is_ambiguous = stations["id"].isin(ambiguous_ids)
ambiguous_rows = (
    stations.loc[is_ambiguous]
    .drop_duplicates()
    .sort_values(["id", "name"])
)

# ---- Table 2: one row per id with the sorted list of its names ----
names_per_id = ambiguous_rows.groupby("id")["name"].apply(
    lambda names: sorted(names.unique())
)
ambiguous_grouped = names_per_id.reset_index(name="names")

print(f"Station IDs with >1 name: {len(ambiguous_grouped)}")

display(ambiguous_grouped)

Station IDs with >1 name: 11


Unnamed: 0,id,names
0,6505.01,"[24 St & 41 Ave, 24 St & 41 Ave (original)]"
1,6873.01,"[34 Ave & Vernon Blvd, 34th Ave & Vernon Blvd]"
2,7079.06,"[Central Park W & W 68 St, Central Park West & W 68 St]"
3,7141.07,"[Central Park W & W 72 St, Central Park West & W 72 St]"
4,7253.04,"[Central Park W & W 76 St, Central Park West & W 76 St]"
5,7304.08,"[W 82 St & Central Park W, W 82 St & Central Park West]"
6,7354.01,"[Central Park W & W 85 St, Central Park West & W 85 St]"
7,7606.01,"[W 106 St & Central Park W, W 106 St & Central Park West]"
8,7851.04,"[5 Av & W 139 St, 5 Ave & W 139 St]"
9,8226.07,"[W 168 St & Fort Washington Ave, W 168 St & Ft Washington Ave]"


In [37]:
# Spelling variant of a station name -> the spelling used from here on
# (the first entry of each list in the id -> names table).
NAME_CANON_MAP = {
    # suffix variant
    "24 St & 41 Ave (original)": "24 St & 41 Ave",
    # ordinal variant
    "34th Ave & Vernon Blvd": "34 Ave & Vernon Blvd",
    # "Central Park West" vs "Central Park W"
    "Central Park West & W 68 St": "Central Park W & W 68 St",
    "Central Park West & W 72 St": "Central Park W & W 72 St",
    "Central Park West & W 76 St": "Central Park W & W 76 St",
    "Central Park West & W 85 St": "Central Park W & W 85 St",
    "W 82 St & Central Park West": "W 82 St & Central Park W",
    "W 106 St & Central Park West": "W 106 St & Central Park W",
    # "Ave" vs "Av"
    "5 Ave & W 139 St": "5 Av & W 139 St",
    # "Ft" vs "Fort"
    "W 168 St & Ft Washington Ave": "W 168 St & Fort Washington Ave",
    "W 180 St & Ft Washington Ave": "W 180 St & Fort Washington Ave",
}

# Apply the same renaming to both station-name columns.
for station_col in ("start_station_name", "end_station_name"):
    df_citibike[station_col] = df_citibike[station_col].replace(NAME_CANON_MAP)

- now station names should uniquely identify each station

In [41]:
# Trips whose START station id begins with "JC" or "HB".
start_ids_as_text = df_citibike["start_station_id"].astype(str)
start_has_jc_hb_prefix = start_ids_as_text.str.startswith(("JC", "HB"), na=False)
start_jc_hb = df_citibike.loc[start_has_jc_hb_prefix].copy()

print("Rows with start_station_id starting with 'JC' or 'HB':", len(start_jc_hb))
display(start_jc_hb)

Rows with start_station_id starting with 'JC' or 'HB': 0


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file,start_out_of_bounds,end_out_of_bounds


In [42]:
# Trips whose END station id begins with "JC" or "HB".
end_ids_as_text = df_citibike["end_station_id"].astype(str)
end_has_jc_hb_prefix = end_ids_as_text.str.startswith(("JC", "HB"), na=False)
end_jc_hb = df_citibike.loc[end_has_jc_hb_prefix].copy()

print("Rows with end_station_id starting with 'JC' or 'HB':", len(end_jc_hb))
display(end_jc_hb)

Rows with end_station_id starting with 'JC' or 'HB': 95


Unnamed: 0,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual,source_file,start_out_of_bounds,end_out_of_bounds
13287,electric_bike,2024-11-06 18:19:35.622,2024-11-06 18:43:06.763,9 Ave & W 45 St,6717.06,14 St Ferry - 14 St & Shipyard Ln,HB202,40.760193,-73.991257,40.75296,-74.024353,member,202411-citibike-tripdata_1.csv,False,False
13698,electric_bike,2024-11-06 17:20:52.966,2024-11-06 18:01:12.754,King St & Varick St,5687.11,Dey St,JC065,40.727898,-74.005363,40.737713,-74.066917,member,202411-citibike-tripdata_1.csv,False,False
13712,electric_bike,2024-11-05 16:37:05.209,2024-11-05 16:59:25.585,E 43 St & Madison Ave,6551.11,14 St Ferry - 14 St & Shipyard Ln,HB202,40.753548,-73.978966,40.75296,-74.024353,member,202411-citibike-tripdata_1.csv,False,False
16532,electric_bike,2024-11-02 19:53:43.944,2024-11-02 20:06:06.819,Vesey Pl & River Terrace,5297.02,Paulus Hook,JC002,40.71534,-74.016586,40.714146,-74.033554,member,202411-citibike-tripdata_1.csv,False,False
32843,electric_bike,2024-11-13 18:38:59.680,2024-11-13 19:10:51.724,Broadway & W 36 St,6441.01,11 St & Washington St,HB502,40.750977,-73.987656,40.749985,-74.027153,member,202411-citibike-tripdata_2.csv,False,False
55487,classic_bike,2024-11-07 21:31:29.793,2024-11-07 21:47:30.900,West St & Chambers St,5329.03,Morris Canal,JC072,40.717548,-74.013222,40.712418,-74.038528,member,202411-citibike-tripdata_2.csv,False,False
55955,electric_bike,2024-11-07 20:01:48.919,2024-11-07 20:41:50.082,E 65 St & 2 Ave,6860.12,Columbus Park - Clinton St & 9 St,HB501,40.764717,-73.962219,40.748161,-74.032455,member,202411-citibike-tripdata_2.csv,False,False
60739,electric_bike,2024-11-19 17:15:28.135,2024-11-19 17:43:36.426,W 37 St & 5 Ave,6398.06,Bloomfield St & 15 St,HB203,40.750381,-73.983391,40.754532,-74.026581,member,202411-citibike-tripdata_3.csv,False,False
83866,electric_bike,2024-11-26 18:01:31.702,2024-11-26 18:40:29.594,Broadway & W 41 St,6560.01,12 St & Sinatra Dr N,HB201,40.755135,-73.98658,40.750603,-74.024017,member,202411-citibike-tripdata_3.csv,False,False
122528,electric_bike,2024-12-09 21:18:31.736,2024-12-09 21:48:49.977,Duane St & Hudson St,5359.12,Marin Light Rail,JC013,40.71703,-74.009247,40.714584,-74.042816,member,202412-citibike-tripdata_1.csv,False,False


- Explanation: Only rows where the start station is in New York City are included, but the end station can be anywhere
- **Inconsistency**: This is exactly the opposite of how start/end time is handled: there, the end time needs to be in the valid time frame, whereas for stations the start station needs to be in NYC. So a valid data point in our dataset could be, e.g., a ride which starts on 2024-10-31 in NYC and ends on 2024-11-01 in Jersey City. It is a bit counterintuitive, but this sample would be in the 2024-11 Citi Bike NYC dataset

In [45]:
# Distinct (non-missing) station names seen at each end of a trip.
start_names = set(df_citibike["start_station_name"].dropna())
end_names   = set(df_citibike["end_station_name"].dropna())

# Names that never occur as a start station.
end_only_names = end_names.difference(start_names)

print("Stations appearing only as END stations:", len(end_only_names))

# Rides ending at those stations, counted per (name, id), busiest first.
ends_at_end_only = df_citibike["end_station_name"].isin(end_only_names)
rides_per_station = (
    df_citibike.loc[ends_at_end_only]
    .groupby(["end_station_name", "end_station_id"])
    .size()
)
end_only_table = (
    rides_per_station
    .reset_index(name="n_rides_ended")
    .sort_values("n_rides_ended", ascending=False)
)

display(end_only_table)

Stations appearing only as END stations: 32


Unnamed: 0,end_station_name,end_station_id,n_rides_ended
25,Morris Canal,JC072,18
2,14 St Ferry - 14 St & Shipyard Ln,HB202,13
19,Harborside,JC104,7
1,12 St & Sinatra Dr N,HB201,7
23,Marin Light Rail,JC013,6
15,Exchange Pl,JC116,3
5,8 St & Washington St,HB603,3
21,JC Medical Center,JC110,3
18,Hamilton Park,JC009,3
11,Columbus Park - Clinton St & 9 St,HB501,3


- So the fact that there are more end stations than start stations can be explained by the JC and HB stations

In [51]:
# One row per (station name, lat, lng) observation, pooled over trip starts and ends.
start_geo = df_citibike[["start_station_name", "start_lat", "start_lng"]].rename(
    columns={"start_station_name": "station_name", "start_lat": "lat", "start_lng": "lng"}
)
end_geo = df_citibike[["end_station_name", "end_lat", "end_lng"]].rename(
    columns={"end_station_name": "station_name", "end_lat": "lat", "end_lng": "lng"}
)
stations_geo = pd.concat([start_geo, end_geo], ignore_index=True)
stations_geo = stations_geo.dropna(subset=["station_name", "lat", "lng"])

# describe() statistics of lat and lng for every station name.
geo_desc = stations_geo.groupby("station_name")[["lat", "lng"]].describe()

# Collapse the two-level column index into flat names: lat_mean, lat_std, ...
geo_desc.columns = ["_".join(pair) for pair in geo_desc.columns]
geo_desc = geo_desc.reset_index()

# The larger of the two coordinate standard deviations per station.
geo_desc["max_std"] = geo_desc[["lat_std", "lng_std"]].max(axis=1)

# Ascending order: head() lists the smallest spreads, tail() the largest.
geo_desc = geo_desc.sort_values("max_std", ascending=True)

display(geo_desc.head(40))

Unnamed: 0,station_name,lat_count,lat_mean,lat_std,lat_min,lat_25%,lat_50%,lat_75%,lat_max,lng_count,lng_mean,lng_std,lng_min,lng_25%,lng_50%,lng_75%,lng_max,max_std
1580,Montgomery St,2.0,40.719421,0.0,40.719421,40.719421,40.719421,40.719421,40.719421,2.0,-74.050987,0.0,-74.050987,-74.050987,-74.050987,-74.050987,-74.050987,0.0
482,86 St & Ridge Blvd,6.0,40.62402,0.0,40.62402,40.62402,40.62402,40.62402,40.62402,6.0,-74.033958,0.0,-74.033958,-74.033958,-74.033958,-74.033958,-74.033958,0.0
1331,Hamilton Park,3.0,40.727596,0.0,40.727596,40.727596,40.727596,40.727596,40.727596,3.0,-74.04425,0.0,-74.04425,-74.04425,-74.04425,-74.04425,-74.04425,0.0
2209,Willow Ave & 12 St,2.0,40.751869,0.0,40.751869,40.751869,40.751869,40.751869,40.751869,2.0,-74.03038,0.0,-74.03038,-74.03038,-74.03038,-74.03038,-74.03038,0.0
843,Columbus Park - Clinton St & 9 St,3.0,40.748161,0.0,40.748161,40.748161,40.748161,40.748161,40.748161,3.0,-74.032455,0.0,-74.032455,-74.032455,-74.032455,-74.032455,-74.032455,0.0
477,83 St & Narrows Ave,4.0,40.627571,0.0,40.627571,40.627571,40.627571,40.627571,40.627571,4.0,-74.038383,0.0,-74.038383,-74.038383,-74.038383,-74.038383,-74.038383,0.0
24,100 St & Humphreys St,54.0,40.765732,0.0,40.765739,40.765739,40.765739,40.765739,40.765739,54.0,-73.870117,0.0,-73.870117,-73.870117,-73.870117,-73.870117,-73.870117,0.0
635,Bloomfield St & 15 St,2.0,40.754532,0.0,40.754532,40.754532,40.754532,40.754532,40.754532,2.0,-74.026581,0.0,-74.026581,-74.026581,-74.026581,-74.026581,-74.026581,0.0
1539,Marin Light Rail,6.0,40.714584,0.0,40.714584,40.714584,40.714584,40.714584,40.714584,6.0,-74.042816,0.0,-74.042816,-74.042816,-74.042816,-74.042816,-74.042816,0.0
422,68 St & 5 Ave,14.0,40.634159,0.0,40.634159,40.634159,40.634159,40.634159,40.634159,14.0,-74.020576,0.0,-74.020576,-74.020576,-74.020576,-74.020576,-74.020576,0.0


- seems like there are a few stations where lat/long is hardcoded

In [52]:
# Show the last 40 rows of geo_desc. The previous cell sorts geo_desc by
# max_std in ascending order, so these rows sit at the high end of that ordering.
# NOTE(review): rows with a NaN max_std, if any exist, would also sort to the
# end and appear here — confirm before reading the last row as the maximum.
display(geo_desc.tail(40))

Unnamed: 0,station_name,lat_count,lat_mean,lat_std,lat_min,lat_25%,lat_50%,lat_75%,lat_max,lng_count,lng_mean,lng_std,lng_min,lng_25%,lng_50%,lng_75%,lng_max,max_std
662,Broadway & E 21 St,6662.0,40.739895,0.001923,40.739887,40.739887,40.739887,40.739887,40.739887,6662.0,-73.989578,0.003769,-73.989586,-73.989586,-73.989586,-73.989586,-73.989586,0.003769
1433,Lafayette Ave & Ft Greene Pl,3498.0,40.687008,0.000412,40.687004,40.687004,40.687004,40.687004,40.687004,3498.0,-73.976646,0.003777,-73.976646,-73.976646,-73.976646,-73.976646,-73.976646,0.003777
1050,E 20 St & FDR Dr,4324.0,40.733212,0.000889,40.733208,40.733208,40.733208,40.733208,40.733208,4324.0,-73.975685,0.003785,-73.975685,-73.975685,-73.975685,-73.975685,-73.975685,0.003785
465,8 Ave & W 31 St,7274.0,40.750584,0.000542,40.750584,40.750584,40.750584,40.750584,40.750584,7274.0,-73.994682,0.003792,-73.994682,-73.994682,-73.994682,-73.994682,-73.994682,0.003792
1017,E 17 St & Broadway,7422.0,40.737003,0.000122,40.737007,40.737007,40.737007,40.737007,40.737007,7422.0,-73.990128,0.003922,-73.990135,-73.990135,-73.990135,-73.990135,-73.990135,0.003922
431,7 Ave & Central Park South,6958.0,40.766735,0.000244,40.766739,40.766739,40.766739,40.766739,40.766739,6958.0,-73.979073,0.00396,-73.979073,-73.979073,-73.979073,-73.979073,-73.979073,0.00396
821,Clinton St & Grand St,5866.0,40.715611,0.002129,40.715595,40.715595,40.715595,40.715595,40.715736,5866.0,-73.98703,0.00401,-73.98703,-73.98703,-73.98703,-73.98703,-73.986992,0.00401
660,Broadway & E 14 St,8168.0,40.73455,0.003147,40.734547,40.734547,40.734547,40.734547,40.734547,8168.0,-73.99073,0.004089,-73.990738,-73.990738,-73.990738,-73.990738,-73.990738,0.004089
1864,Spring St & Hudson St,3913.0,40.725842,0.000748,40.725842,40.725842,40.725842,40.725842,40.725842,3913.0,-74.007652,0.004113,-74.007652,-74.007652,-74.007652,-74.007652,-74.007652,0.004113
1069,E 33 St & 1 Ave,7083.0,40.743225,0.003392,40.743225,40.743225,40.743225,40.743225,40.743225,7083.0,-73.974495,0.004159,-73.974495,-73.974495,-73.974495,-73.974495,-73.974495,0.004159


- The highest standard deviation is "0.006936" for longitude, which corresponds to roughly 600 meters. So there are some stations with a high deviation in their lat/lng data; maybe they were relocated, or the GPS tends to be inexact
- We could infer a mean lat/lng for each station, but this is not necessary for the upcoming analytics and modelling