Goal: Retrieve the species checklist for a region using the eBird API.


In [2]:
from __future__ import annotations

In [3]:
%load_ext autoreload
%autoreload 2

In [4]:
from birding.ebird_api import get_ebird_api_key

ebird_api_key = get_ebird_api_key()

In [1]:
locations_of_interest = ["Maine, United States"]

In [3]:
from birding.geocoding import retrieve_geocode
from birding.sqlite_cache import init_db

init_db()
for location in locations_of_interest:
    print(f"\n\nRetrieving geocode for location '{location}'...")
    data = retrieve_geocode(query=location)
    if data is None:
        print(f"Could not find geocode for location '{location}'.")
    else:
        print(data)



Retrieving geocode for location 'Maine, United States'...
Geocode for query 'Maine, United States' was already cached.
{'place_id': 336157849, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 63512, 'lat': '45.7090970', 'lon': '-68.8590201', 'class': 'boundary', 'type': 'administrative', 'place_rank': 8, 'importance': 0.7469097285923896, 'addresstype': 'state', 'name': 'Maine', 'display_name': 'Maine, United States', 'boundingbox': ['42.9222206', '47.4598397', '-71.0841694', '-66.8854174']}


In [8]:
from birding.primitives import Coordinate

raw_location_data = {loc: retrieve_geocode(query=loc) for loc in locations_of_interest}
location_data = {loc: data for loc, data in raw_location_data.items() if data is not None}
assert len(raw_location_data) == len(location_data), "A location's geocode data was None!"

location_coords = {loc: Coordinate.from_geocode_data(data) for loc, data in location_data.items()}
location_coords

Geocode for query 'Providence, Rhode Island, United States' was already cached.


{'Providence, Rhode Island, United States': Coordinate(latitude=41.8239891, longitude=-71.4128343)}

In [9]:
from birding.ebird_api import retrieve_nearby_hotspots

for loc, coord in location_coords.items():
    print(f"\n\nRetrieving nearby hotspots for location '{loc}'...")
    hotspots = retrieve_nearby_hotspots(ebird_api_key, coord=coord)
    print(f"Found {len(hotspots)} nearby hotspots.")

    if len(hotspots) < 5:
        print("Location had less than 5 nearby hotspots!")
        more_hotspots = retrieve_nearby_hotspots(ebird_api_key, coord=coord, distance_km=50)
        print(f"Found {len(more_hotspots)} hotspots within 50 km.")



Retrieving nearby hotspots for location 'Providence, Rhode Island, United States'...
Nearby hotspots for (41.8240, -71.4128) were already cached.
Found 336 nearby hotspots.


In [10]:
spots_per_loc = {
    loc: retrieve_nearby_hotspots(ebird_api_key, coord) for loc, coord in location_coords.items()
}

spots_per_loc

Nearby hotspots for (41.8240, -71.4128) were already cached.


{'Providence, Rhode Island, United States': [{'locId': 'L9189645',
   'locName': 'Allen Harbor Marina',
   'countryCode': 'US',
   'subnational1Code': 'US-RI',
   'subnational2Code': 'US-RI-009',
   'lat': 41.6197094,
   'lng': -71.4112937,
   'latestObsDt': '2025-11-04 15:39',
   'numSpeciesAllTime': 121},
  {'locId': 'L5070824',
   'locName': "Allin's Cove (Barrington Land Trust)",
   'countryCode': 'US',
   'subnational1Code': 'US-RI',
   'subnational2Code': 'US-RI-001',
   'lat': 41.7442266,
   'lng': -71.3481226,
   'latestObsDt': '2025-11-02 14:30',
   'numSpeciesAllTime': 123},
  {'locId': 'L33244456',
   'locName': 'Anthony Lawrence Wildlife Preserve',
   'countryCode': 'US',
   'subnational1Code': 'US-MA',
   'subnational2Code': 'US-MA-005',
   'lat': 41.92324,
   'lng': -71.349151,
   'latestObsDt': '2025-05-03 10:07',
   'numSpeciesAllTime': 55},
  {'locId': 'L634993',
   'locName': 'Apponaug Cove',
   'countryCode': 'US',
   'subnational1Code': 'US-RI',
   'subnational2Code

In [11]:
all_hotspot_keys = set()
for hotspots in spots_per_loc.values():
    for spot_data in hotspots:
        all_hotspot_keys.update(spot_data.keys())

all_hotspot_keys

{'countryCode',
 'lat',
 'latestObsDt',
 'lng',
 'locId',
 'locName',
 'numSpeciesAllTime',
 'subnational1Code',
 'subnational2Code'}

In [12]:
from birding.primitives import EBirdHotspot

hotspots_per_location = {
    loc: {EBirdHotspot.from_json(hotspot_data) for hotspot_data in hotspots}
    for loc, hotspots in spots_per_loc.items()
}

In [13]:
hotspots_per_location

{'Providence, Rhode Island, United States': {EBirdHotspot(location=EBirdLocation(id='L10059274', name='Diamond Hill Park, Cumberland', coord=Coordinate(latitude=42.0019027, longitude=-71.4131983), country_code='US', subnat1_code='US-RI', subnat1_name=None, subnat2_code='US-RI-007', subnat2_name=None), all_time_species=94),
  EBirdHotspot(location=EBirdLocation(id='L1007144', name='Goddard Memorial SP', coord=Coordinate(latitude=41.6611783, longitude=-71.4358091), country_code='US', subnat1_code='US-RI', subnat1_name=None, subnat2_code='US-RI-003', subnat2_name=None), all_time_species=163),
  EBirdHotspot(location=EBirdLocation(id='L10078182', name='North Attleboro National Fish Hatchery', coord=Coordinate(latitude=41.9927973, longitude=-71.2838421), country_code='US', subnat1_code='US-MA', subnat1_name=None, subnat2_code='US-MA-005', subnat2_name=None), all_time_species=157),
  EBirdHotspot(location=EBirdLocation(id='L10135757', name='Ripley St Marsh', coord=Coordinate(latitude=41.7122

In [14]:
from geopy.distance import geodesic

# Find the nearest hotspot to each location
nearest_hotspots: dict[str, EBirdHotspot] = {}
for loc, hotspots in hotspots_per_location.items():
    loc_coord = location_coords[loc]
    nearest = min(hotspots, key=lambda hs: geodesic(loc_coord, hs.location.coord).mi)
    nearest_hotspots[loc] = nearest

nearest_hotspots

{'Providence, Rhode Island, United States': EBirdHotspot(location=EBirdLocation(id='L24184797', name='Kennedy Plaza, Providence', coord=Coordinate(latitude=41.825473, longitude=-71.412127), country_code='US', subnat1_code='US-RI', subnat1_name=None, subnat2_code='US-RI-007', subnat2_name=None), all_time_species=38)}

In [17]:
# For now, only retrieve species lists for the nearest hotspot to each location of interest
from birding.ebird_api import retrieve_species_list

for hotspot in nearest_hotspots.values():
    print(f"\n\nRetrieving bird species list for hotspot '{hotspot.location.name}'...")

    species_list = retrieve_species_list(ebird_api_key, area_code=hotspot.location.id)
    print(f"'{hotspot.location.name}' has {len(species_list)} nearby species.")



Retrieving bird species list for hotspot 'Kennedy Plaza, Providence'...
Bird species list for 'L24184797' was already cached.
'Kennedy Plaza, Providence' has 39 nearby species.


In [18]:
# Retrieve species lists for all subnational and national areas represented in the current data

all_species_lists: dict[str, list[str]] = {}
invalid_area_codes = {}

for loc, hotspots in hotspots_per_location.items():
    print(f"Retrieving species lists for subnational regions around '{loc}'...")
    for hotspot in hotspots:
        hs_loc = hotspot.location
        for possible_area_code in {hs_loc.subnat1_code, hs_loc.subnat2_code, hs_loc.country_code}:
            if possible_area_code is None or possible_area_code in invalid_area_codes:
                continue

            new_list = retrieve_species_list(ebird_api_key, possible_area_code)
            all_species_lists[possible_area_code] = new_list

species_union = set()
for species_list in all_species_lists.values():
    species_union.update(species_list)

print(f"Current species lists include {len(species_union)} total distinct species.")

Retrieving species lists for subnational regions around 'Providence, Rhode Island, United States'...
Bird species list for 'US-RI' was already cached.
Bird species list for 'US' was already cached.
Bird species list for 'US-RI-001' was already cached.
Bird species list for 'US-RI' was already cached.
Bird species list for 'US' was already cached.
Bird species list for 'US-RI-003' was already cached.
Bird species list for 'US-RI' was already cached.
Bird species list for 'US' was already cached.
Bird species list for 'US-RI-003' was already cached.
Bird species list for 'US-RI' was already cached.
Bird species list for 'US' was already cached.
Bird species list for 'US-RI-001' was already cached.
Bird species list for 'US-RI' was already cached.
Bird species list for 'US-RI-007' was already cached.
Bird species list for 'US' was already cached.
Bird species list for 'US-MA-005' was already cached.
Bird species list for 'US-MA' was already cached.
Bird species list for 'US' was already c

In [20]:
# Finally, retrieve species lists for all hotspots near our locations of interest
for loc, hotspots in hotspots_per_location.items():
    print(f"Retrieving species lists for hotspots near '{loc}'...")
    for hotspot in hotspots:
        hotspot_species = retrieve_species_list(ebird_api_key, hotspot.location.id)
        all_species_lists[hotspot.location.id] = hotspot_species

species_union = set()
for species_list in all_species_lists.values():
    species_union.update(species_list)

print(f"Current species lists include {len(species_union)} total distinct species.")

Retrieving species lists for hotspots near 'Providence, Rhode Island, United States'...
Bird species list for 'L23688813' was already cached.
Bird species list for 'L5071219' was already cached.
Bird species list for 'L6530163' was already cached.
Bird species list for 'L2781443' was already cached.
Bird species list for 'L40384308' was already cached.
Bird species list for 'L5495329' was already cached.
Bird species list for 'L735002' was already cached.
Bird species list for 'L247829' was already cached.
Bird species list for 'L8677678' was already cached.
Bird species list for 'L21194566' was already cached.
Bird species list for 'L43251693' was already cached.
Bird species list for 'L1670551' was already cached.
Bird species list for 'L14885742' was already cached.
Bird species list for 'L207451' was already cached.
Bird species list for 'L2597004' was already cached.
Bird species list for 'L1888546' was already cached.
Bird species list for 'L3997605' was already cached.
Bird spec

In [21]:
# Compute the distances (km) from each location to all nearby hotspots
from collections import defaultdict

hotspot_distances_km: dict[str, dict[EBirdHotspot, float]] = defaultdict(dict)

for loc, hotspots in hotspots_per_location.items():
    loc_coord = location_coords[loc]
    for hotspot in hotspots:
        hotspot_distances_km[loc][hotspot] = geodesic(loc_coord, hotspot.location.coord).kilometers

dict(hotspot_distances_km)

{'Providence, Rhode Island, United States': {EBirdHotspot(location=EBirdLocation(id='L23688813', name='Mussachuck Beach', coord=Coordinate(latitude=41.7364935, longitude=-71.3435401), country_code='US', subnat1_code='US-RI', subnat1_name=None, subnat2_code='US-RI-001', subnat2_name=None), all_time_species=98): 11.297219090499365,
  EBirdHotspot(location=EBirdLocation(id='L5071219', name='Big River Management Area--Model Airplane Field', coord=Coordinate(latitude=41.654994, longitude=-71.56379), country_code='US', subnat1_code='US-RI', subnat1_name=None, subnat2_code='US-RI-003', subnat2_name=None), all_time_species=50): 22.583338720991573,
  EBirdHotspot(location=EBirdLocation(id='L6530163', name='Frenchtown Park, East Greenwich', coord=Coordinate(latitude=41.626978, longitude=-71.506429), country_code='US', subnat1_code='US-RI', subnat1_name=None, subnat2_code='US-RI-003', subnat2_name=None), all_time_species=114): 23.226112336598565,
  EBirdHotspot(location=EBirdLocation(id='L2781443

In [22]:
# Compute distance-based weightings per hotspot-location pair using log-sum-exp
import numpy as np
from scipy.special import logsumexp

loc_species_log_lists: dict[str, dict[str, list[float]]] = defaultdict(lambda: defaultdict(list))
"""Map from location names to a map from eBird species IDs to lists of log-weights."""

# Accumulate log-weights (avoid numerical issues by staying in log-space)
# Intended weighting: e^{-km / 20}
for loc, hs_km_mapping in hotspot_distances_km.items():
    for hotspot, dist_km in hs_km_mapping.items():
        log_weight = -dist_km / 20.0  # log of exp(-dist_km / 20)
        for species_id in all_species_lists[hotspot.location.id]:
            loc_species_log_lists[loc][species_id].append(log_weight)

# Convert to unnormalized log-weights using numerically stable log-sum-exp
location_species_log_weights: dict[str, dict[str, float]] = {}
for loc, species_dict in loc_species_log_lists.items():
    location_species_log_weights[loc] = {
        species_id: float(logsumexp(log_weights))  # log(sum of exp(log_weights))
        for species_id, log_weights in species_dict.items()
    }

# Normalize to get probabilities:
#       P(species_i | location) = exp(log_weight_i) / sum_j exp(log_weight_j)
# Do this in log-space for numerical stability:
#       log P(species_i) = log_weight_i - logsumexp(all_log_weights for the location)
location_species_probabilities: dict[str, dict[str, float]] = {}
for loc, species_log_weights in location_species_log_weights.items():
    log_weights_array = np.array(list(species_log_weights.values()))
    log_normalizer = float(logsumexp(log_weights_array))  # log(sum of unnormalized weights)

    location_species_probabilities[loc] = {
        species_id: float(np.exp(log_weight - log_normalizer))  # Convert to normalized probability
        for species_id, log_weight in species_log_weights.items()
    }

# Verify normalization (should sum to ~1.0)
for loc, probs in location_species_probabilities.items():
    total_prob = sum(probs.values())
    print(f"{loc}: {len(probs)} species, total probability = {total_prob:.6f}")

Providence, Rhode Island, United States: 354 species, total probability = 1.000000


In [23]:
sorted_probabilities = {
    loc: dict(sorted(probs.items(), key=lambda id_prob: -id_prob[1]))
    for loc, probs in location_species_probabilities.items()
}

sorted_probabilities

{'Providence, Rhode Island, United States': {'blujay': 0.011174614826877326,
  'amerob': 0.011064807178055193,
  'sonspa': 0.011015889942237672,
  'norcar': 0.010962912182333766,
  'amegfi': 0.010847102471298978,
  'moudov': 0.010797880755263913,
  'bkcchi': 0.01067454566962944,
  'carwre': 0.010669094353103639,
  'amecro': 0.010620024217765832,
  'dowwoo': 0.010565218656471257,
  'houspa': 0.010462501761306644,
  'comgra': 0.010363105226812658,
  'tuftit': 0.010330781440415602,
  'cangoo': 0.010327191220783185,
  'eursta': 0.010242387154516143,
  'norfli': 0.010146832250767334,
  'houfin': 0.010114068102226627,
  'rewbla': 0.010082716304456035,
  'grycat': 0.009952595602579732,
  'rebwoo': 0.009837113337059448,
  'mallar3': 0.00979287115176935,
  'daejun': 0.009710150811649079,
  'whbnut': 0.009685241615886088,
  'rethaw': 0.009668121120381368,
  'whtspa': 0.009547892539557734,
  'normoc': 0.009457582415865983,
  'chiswi': 0.009337356918752175,
  'grbher3': 0.009135312745668167,
  'ch

In [25]:
import time
from pathlib import Path

import yaml

time_min = int(time.time())
output_path = Path.cwd().parent / f"data/probabilities-{time_min}.yaml"
if not output_path.exists():
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.touch()

with output_path.open("w") as outfile:
    yaml.dump(sorted_probabilities, outfile, sort_keys=False)

print(f"Data written to {output_path}")

Data written to /home/benned/git/birding/data/probabilities-1768080541.yaml
