In [1]:
# Cell 1: imports and locate files
import os, pathlib
import pandas as pd
import numpy as np
from keplergl import KeplerGl
import json

print("Working dir:", pathlib.Path.cwd())

# Try a few expected locations for processed data
candidates = list((pathlib.Path.cwd() / 'data' / 'processed').glob('*citibike*'))
candidates += list((pathlib.Path.cwd() / 'data' / 'processed').glob('*.csv'))
candidates = [p for p in candidates if p.is_file()]

print("Found candidate data files:")
for p in candidates:
    print("-", p.name)

# Set fn to the most likely file (adjust if needed)
if candidates:
    fn = candidates[0]
else:
    fn = pathlib.Path.cwd() / 'data' / 'processed' / 'citibike_with_weather_merged.csv'

print("Using file:", fn)

  from pkg_resources import resource_string


Working dir: c:\Users\Biswajit\citi-bike-nyc-2022-dashboard
Found candidate data files:
- citibike_2022_sample_1000rows.csv
- citibike_with_weather_merged.csv
- citibike_2022_sample_1000rows.csv
- citibike_with_weather_merged.csv
- la_guardia_weather_2022.csv
Using file: c:\Users\Biswajit\citi-bike-nyc-2022-dashboard\data\processed\citibike_2022_sample_1000rows.csv


In [2]:
# Cell 2: load data (use low_memory to avoid mixed types warning)
fn = str(fn)   # from previous cell
df = pd.read_csv(fn, low_memory=False, parse_dates=['date'] if 'date' in pd.read_csv(fn, nrows=0).columns else None)
print("Loaded:", df.shape)
print("Columns:", df.columns.tolist())
df.head()

Loaded: (1000, 13)
Columns: ['ride_id', 'rideable_type', 'started_at', 'ended_at', 'start_station_name', 'start_station_id', 'end_station_name', 'end_station_id', 'start_lat', 'start_lng', 'end_lat', 'end_lng', 'member_casual']


Unnamed: 0,ride_id,rideable_type,started_at,ended_at,start_station_name,start_station_id,end_station_name,end_station_id,start_lat,start_lng,end_lat,end_lng,member_casual
0,AB60147266CC5C80,classic_bike,2022-08-19 14:05:13,2022-08-19 14:18:51,Astor Place,JC077,Grove St PATH,JC005,40.719282,-74.071262,40.719586,-74.043117,member
1,B00E39547B9F173A,electric_bike,2022-12-19 15:25:30,2022-12-19 15:31:34,14 St Ferry - 14 St & Shipyard Ln,HB202,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,40.752961,-74.024353,40.736982,-74.027781,member
2,14220FEDCC1226DD,classic_bike,2022-09-12 07:45:55,2022-09-12 07:55:28,South Waterfront Walkway - Sinatra Dr & 1 St,HB103,Newport PATH,JC066,40.737132,-74.027709,40.727224,-74.033759,member
3,A773814956315760,classic_bike,2022-03-30 09:04:08,2022-03-30 09:08:00,Grove St PATH,JC005,Dixon Mills,JC076,40.719586,-74.043117,40.72163,-74.049968,member
4,6D3ECB87A8F2C16D,electric_bike,2022-11-17 14:37:01,2022-11-17 14:47:38,Lincoln Park,JC053,Grove St PATH,JC005,40.724605,-74.078406,40.719586,-74.043117,member


In [5]:
# Cell 3: create value and aggregate trips between stations
# Normalize column names if slightly different
cols = df.columns.str.lower().tolist()

# common names we expect
start_name_col = next((c for c in df.columns if c.lower() in ('start_station_name','from_station_name','start_station')), None)
end_name_col   = next((c for c in df.columns if c.lower() in ('end_station_name','to_station_name','end_station')), None)

print("Using start station col:", start_name_col)
print("Using end station col:  ", end_name_col)

# add value column if missing
if 'value' not in df.columns:
    df['value'] = 1
else:
    df['value'] = df['value'].fillna(1)

# groupby start->end
df_group = df.groupby([start_name_col, end_name_col])['value'].count().reset_index()
df_group.columns = ['start_station','end_station','trips']   # rename for clarity
df_group = df_group.sort_values('trips', ascending=False).reset_index(drop=True)
print("Aggregated routes:", df_group.shape)
df_group.head(10)

Using start station col: start_station_name
Using end station col:   end_station_name
Aggregated routes: (723, 3)


Unnamed: 0,start_station,end_station,trips
0,Grove St PATH,Marin Light Rail,8
1,Grove St PATH,Dixon Mills,7
2,South Waterfront Walkway - Sinatra Dr & 1 St,14 St Ferry - 14 St & Shipyard Ln,7
3,12 St & Sinatra Dr N,South Waterfront Walkway - Sinatra Dr & 1 St,6
4,Marin Light Rail,Grove St PATH,6
5,Lafayette Park,Grove St PATH,5
6,South Waterfront Walkway - Sinatra Dr & 1 St,South Waterfront Walkway - Sinatra Dr & 1 St,5
7,14 St Ferry - 14 St & Shipyard Ln,14 St Ferry - 14 St & Shipyard Ln,5
8,South Waterfront Walkway - Sinatra Dr & 1 St,12 St & Sinatra Dr N,5
9,Hoboken Terminal - Hudson St & Hudson Pl,Hoboken Ave at Monmouth St,5


In [6]:
# Cell 4: attach coordinates for start and end (if present)
# try to find lat/lng columns in original df
lat_cols = [c for c in df.columns if 'lat' in c.lower()]
lng_cols = [c for c in df.columns if 'lng' in c.lower() or 'lon' in c.lower()]

print("Latitude columns:", lat_cols)
print("Longitude columns:", lng_cols)

# If original has start_lat/start_lng etc, create station lookup and merge
# We'll build station->lat/lng mapping using start_station columns
station_lat_col = next((c for c in df.columns if 'start_lat' in c.lower()), None)
station_lng_col = next((c for c in df.columns if 'start_lng' in c.lower() or 'start_lon' in c.lower()), None)

if station_lat_col and station_lng_col:
    # station mapping from the original
    start_map = df[[start_name_col, station_lat_col, station_lng_col]].drop_duplicates().rename(
        columns={start_name_col:'start_station', station_lat_col:'start_lat', station_lng_col:'start_lng'}
    )
    end_map = df[[end_name_col, station_lat_col.replace('start','end') if station_lat_col.replace('start','end') in df.columns else station_lat_col, 
                  station_lng_col.replace('start','end') if station_lng_col.replace('start','end') in df.columns else station_lng_col]].drop_duplicates()
    # handle end_map naming generically if columns exist
    if end_name_col in df.columns and ('end_lat' in df.columns or 'start_lat' in df.columns):
        # attempt safer approach: build a station-level table using all station names and coords
        station_tbl = pd.concat([
            df[[start_name_col, station_lat_col, station_lng_col]].rename(columns={start_name_col:'station', station_lat_col:'lat', station_lng_col:'lng'}),
            df[[end_name_col, station_lat_col, station_lng_col]].rename(columns={end_name_col:'station', station_lat_col:'lat', station_lng_col:'lng'})
        ], ignore_index=True).drop_duplicates(subset=['station'])
        station_tbl = station_tbl.dropna(subset=['lat','lng'])
        # create mappings
        df_group = df_group.merge(station_tbl.rename(columns={'station':'start_station','lat':'start_lat','lng':'start_lng'}), on='start_station', how='left')
        df_group = df_group.merge(station_tbl.rename(columns={'station':'end_station','lat':'end_lat','lng':'end_lng'}), on='end_station', how='left')
    else:
        # fallback: try to merge using start and end coordinate columns with best-effort names
        df_group = df_group.merge(start_map, on='start_station', how='left')
        # attempt to build end_map from end columns if present
        end_lat_col = next((c for c in df.columns if 'end_lat' in c.lower()), None)
        end_lng_col = next((c for c in df.columns if 'end_lng' in c.lower() or 'end_lon' in c.lower()), None)
        if end_lat_col and end_lng_col:
            end_map = df[[end_name_col, end_lat_col, end_lng_col]].drop_duplicates().rename(
                columns={end_name_col:'end_station', end_lat_col:'end_lat', end_lng_col:'end_lng'}
            )
            df_group = df_group.merge(end_map, on='end_station', how='left')
else:
    # No start_lat available; try to read stations file in data/raw or data/lookup
    cand_stations = list((pathlib.Path.cwd() / 'data' / 'raw').glob('*station*')) + list((pathlib.Path.cwd() / 'data').glob('*stations*'))
    cand_stations = [p for p in cand_stations if p.is_file()]
    if cand_stations:
        print("Found station lookup:", cand_stations[0])
        stations = pd.read_csv(cand_stations[0])
        # try to guess columns for join
        station_name_col = next((c for c in stations.columns if 'name' in c.lower()), None)
        lat_col = next((c for c in stations.columns if 'lat' in c.lower()), None)
        lon_col = next((c for c in stations.columns if 'lon' in c.lower() or 'lng' in c.lower()), None)
        print("station lookup cols:", station_name_col, lat_col, lon_col)
        stations = stations[[station_name_col, lat_col, lon_col]].rename(columns={station_name_col:'start_station', lat_col:'start_lat', lon_col:'start_lng'})
        df_group = df_group.merge(stations, on='start_station', how='left')
        # duplicate for end (rename then merge)
        stations_end = stations.rename(columns={'start_station':'end_station','start_lat':'end_lat','start_lng':'end_lng'})
        df_group = df_group.merge(stations_end, on='end_station', how='left')

# Final check for coordinate availability
print("Missing start coords:", df_group['start_lat'].isna().sum() if 'start_lat' in df_group.columns else 'no start_lat')
print("Missing end coords:", df_group['end_lat'].isna().sum() if 'end_lat' in df_group.columns else 'no end_lat')
df_group.head()

Latitude columns: ['start_lat', 'end_lat']
Longitude columns: ['start_lng', 'end_lng']
Missing start coords: 0
Missing end coords: 0


Unnamed: 0,start_station,end_station,trips,start_lat,start_lng,end_lat,end_lng
0,Grove St PATH,Marin Light Rail,8,40.719586,-74.043117,40.714584,-74.042817
1,Grove St PATH,Dixon Mills,7,40.719586,-74.043117,40.72163,-74.049968
2,South Waterfront Walkway - Sinatra Dr & 1 St,14 St Ferry - 14 St & Shipyard Ln,7,40.737132,-74.027709,40.752961,-74.024353
3,12 St & Sinatra Dr N,South Waterfront Walkway - Sinatra Dr & 1 St,6,40.750604,-74.02402,40.737132,-74.027709
4,Marin Light Rail,Grove St PATH,6,40.714584,-74.042817,40.719586,-74.043117


In [7]:
# Cell 5: create df_final for mapping; drop rows missing coords
needed_cols = ['start_station','end_station','trips','start_lat','start_lng','end_lat','end_lng']
for c in needed_cols:
    if c not in df_group.columns:
        df_group[c] = np.nan

df_final = df_group.dropna(subset=['start_lat','start_lng','end_lat','end_lng']).copy()
print("Rows with full coords:", df_final.shape)
df_final = df_final.astype({'start_lat':float,'start_lng':float,'end_lat':float,'end_lng':float,'trips':int})
df_final.head()

Rows with full coords: (723, 7)


Unnamed: 0,start_station,end_station,trips,start_lat,start_lng,end_lat,end_lng
0,Grove St PATH,Marin Light Rail,8,40.719586,-74.043117,40.714584,-74.042817
1,Grove St PATH,Dixon Mills,7,40.719586,-74.043117,40.72163,-74.049968
2,South Waterfront Walkway - Sinatra Dr & 1 St,14 St Ferry - 14 St & Shipyard Ln,7,40.737132,-74.027709,40.752961,-74.024353
3,12 St & Sinatra Dr N,South Waterfront Walkway - Sinatra Dr & 1 St,6,40.750604,-74.02402,40.737132,-74.027709
4,Marin Light Rail,Grove St PATH,6,40.714584,-74.042817,40.719586,-74.043117


In [8]:
# Cell 6: initialize Kepler map and add our df_final
m = KeplerGl(height=700, data={"bike_trips": df_final})
m

User Guide: https://docs.kepler.gl/docs/keplergl-jupyter


KeplerGl(data={'bike_trips':                                     start_station  \
0                           â€¦

In [9]:
# Cell 7: Save the configuration and export to HTML
config = m.config  # capture current GUI settings (after you configure)
# write config to file
with open("kepler_config.json", "w") as f:
    json.dump(config, f)

# Save interactive HTML (this is a copy you can send to mentor)
m.save_to_html(file_name='Divvy_Bike_Trips_Aggregated.html', read_only=False, config=config)
print("Saved kepler_config.json and Divvy_Bike_Trips_Aggregated.html")

Map saved to Divvy_Bike_Trips_Aggregated.html!
Saved kepler_config.json and Divvy_Bike_Trips_Aggregated.html


## Kepler map customization notes
- **Layers enabled:** Points for start stations, Points for end stations, Arc (start -> end).
- **Encodings:** Arc color set to `trips` (quantitative) using a gradient palette so busiest routes are brightest; point color set to neutral single color.
- **Size:** Arc thickness mapped to `trips` to emphasize major corridors.
- **Filter used:** numeric filter on `trips` to show routes with trips >= [YOUR_THRESHOLD].
- **Why:** This highlights frequently used routes and reduces clutter from one-off trips.