Goal: Retrieve the species list for each region of interest using the eBird API and export it to YAML (both as a flat species list and as a hierarchical taxonomy).


In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
# Regions of interest, written as free-form place names; each one is passed
# to the geocoder as a query string further down.
regions = [
    "Maine, United States",
    "Providence, Rhode Island, United States",
]

In [3]:
# Geocode all regions of interest
from birding.geocoding import retrieve_geocode
from birding.sqlite_cache import init_db

# init_db() comes from birding.sqlite_cache; presumably it prepares the local
# cache used by the lookups below — confirm against that module.
init_db()

# Look up each region in turn and echo either the geocode record or a
# not-found notice.
for region in regions:
    print(f"\n\nRetrieving geocode for region '{region}'...")
    data = retrieve_geocode(query=region)
    if data is not None:
        print(data)
        continue
    print(f"Could not find geocode for '{region}'.")



Retrieving geocode for region 'Maine, United States'...
Geocode for query 'Maine, United States' was already cached.
{'place_id': 336157849, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 63512, 'lat': '45.7090970', 'lon': '-68.8590201', 'class': 'boundary', 'type': 'administrative', 'place_rank': 8, 'importance': 0.7469097285923896, 'addresstype': 'state', 'name': 'Maine', 'display_name': 'Maine, United States', 'boundingbox': ['42.9222206', '47.4598397', '-71.0841694', '-66.8854174']}


Retrieving geocode for region 'Providence, Rhode Island, United States'...
Geocode for query 'Providence, Rhode Island, United States' was already cached.
{'place_id': 334465335, 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. http://osm.org/copyright', 'osm_type': 'relation', 'osm_id': 191210, 'lat': '41.8239891', 'lon': '-71.4128343', 'class': 'boundary', 'type': 'administrative', 'place_rank': 16, 'importance': 0.6540

In [4]:
from birding.primitives import Coordinate

# Look up the geocode record for every region, keyed by region name.
raw_geocode_data = {r: retrieve_geocode(query=r) for r in regions}
geocode_data = {r: data for r, data in raw_geocode_data.items() if data is not None}

# Fail fast if any lookup returned None, and say which region(s) were affected
# so the reader does not have to hunt for the offending query.
missing_geocodes = [r for r, data in raw_geocode_data.items() if data is None]
assert not missing_geocodes, f"Geocode data was None for region(s): {missing_geocodes}"

# Convert each geocode record into a Coordinate, still keyed by region name.
region_coords = {r: Coordinate.from_geocode_data(data) for r, data in geocode_data.items()}
region_coords

Geocode for query 'Maine, United States' was already cached.
Geocode for query 'Providence, Rhode Island, United States' was already cached.


{'Maine, United States': Coordinate(latitude=45.709097, longitude=-68.8590201),
 'Providence, Rhode Island, United States': Coordinate(latitude=41.8239891, longitude=-71.4128343)}

In [5]:
from birding.ebird_api import EBirdAPI
from birding.primitives import EBirdHotspot

ebird_api = EBirdAPI()

# Tunable search parameters.
min_hotspots = 5  # fewest nearby hotspots considered sufficient for a region
max_hotspot_dist_km = 5  # radius (km) of the initial search
fallback_hotspot_dist_km = 50  # wider radius (km) used when the initial search finds too few
region_to_hotspots: dict[str, list[EBirdHotspot]] = {}

# For each region, search the small radius first; if that yields fewer than
# `min_hotspots` results, repeat the search with the fallback radius and keep
# that wider result instead.
for region, coord in region_coords.items():
    print(f"\n\nRetrieving nearby hotspots for region '{region}'...")
    hotspots = ebird_api.retrieve_nearby_hotspots(coord, max_hotspot_dist_km)
    print(f"Found {len(hotspots)} nearby hotspots.")

    if len(hotspots) < min_hotspots:
        print(f"Region had less than {min_hotspots} nearby hotspots!")
        more_hotspots = ebird_api.retrieve_nearby_hotspots(
            coord, distance_km=fallback_hotspot_dist_km
        )
        print(f"Found {len(more_hotspots)} hotspots within {fallback_hotspot_dist_km} km.")

        region_to_hotspots[region] = more_hotspots
    else:
        region_to_hotspots[region] = hotspots



Retrieving nearby hotspots for region 'Maine, United States'...
Nearby hotspots within 5 km of (45.7091, -68.8590) were already cached.
Found 2 nearby hotspots.
Region had less than 5 nearby hotspots!
Nearby hotspots within 50 km of (45.7091, -68.8590) were already cached.
Found 96 hotspots within 50 km.


Retrieving nearby hotspots for region 'Providence, Rhode Island, United States'...
Nearby hotspots within 5 km of (41.8240, -71.4128) were already cached.
Found 41 nearby hotspots.


In [6]:
# Display the mapping of region name -> list of hotspots for inspection.
region_to_hotspots

{'Maine, United States': [EBirdHotspot(location=EBirdLocation(id='L3028558', name='AMC Gorman Chairback Lodge', coord=Coordinate(latitude=45.4616766, longitude=-69.3166101), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-021', subnat2_name=None), all_time_species=101),
  EBirdHotspot(location=EBirdLocation(id='L3896821', name='Appalachian Trail--White Cap Mountain', coord=Coordinate(latitude=45.5545086, longitude=-69.245882), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-021', subnat2_name=None), all_time_species=43),
  EBirdHotspot(location=EBirdLocation(id='L19790628', name='B-52 Site, Elephant Mtn', coord=Coordinate(latitude=45.5280512, longitude=-69.4344453), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-021', subnat2_name=None), all_time_species=53),
  EBirdHotspot(location=EBirdLocation(id='L296058', name='Baxter SP', coord=Coordinate(latitude=45.9754792, longitude=-68.9351746), c

In [7]:
from geopy.distance import geodesic

from birding.primitives import EBirdHotspot

# Find the nearest hotspot to each region's coordinate
nearest_hotspots: dict[str, EBirdHotspot] = {}
for region, hotspots in region_to_hotspots.items():
    coord = region_coords[region]
    if not hotspots:
        # min() on an empty list raises a ValueError that does not say which
        # region was affected; raise the same exception type with the region named.
        raise ValueError(f"No hotspots available for region '{region}'.")

    # Rank by geodesic distance in km, the unit used by the other distance
    # computations in this notebook; the choice of unit does not change the ranking.
    nearest = min(hotspots, key=lambda hs: geodesic(coord, hs.location.coord).km)
    nearest_hotspots[region] = nearest

nearest_hotspots

{'Maine, United States': EBirdHotspot(location=EBirdLocation(id='L1607267', name='Big Moose Inn and Campground', coord=Coordinate(latitude=45.7294675, longitude=-68.8376149), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-021', subnat2_name=None), all_time_species=104),
 'Providence, Rhode Island, United States': EBirdHotspot(location=EBirdLocation(id='L24184797', name='Kennedy Plaza, Providence', coord=Coordinate(latitude=41.825473, longitude=-71.412127), country_code='US', subnat1_code='US-RI', subnat1_name=None, subnat2_code='US-RI-007', subnat2_name=None), all_time_species=39)}

In [8]:
# Cross-check: the API class's own nearest-hotspot lookup must agree with the
# result computed by hand in this notebook.
for region in region_coords:
    region_coord = region_coords[region]
    nearest_per_class = ebird_api.find_nearest_hotspot(region_coord)
    nearest_per_notebook = nearest_hotspots.get(region)

    # The message is only built when the two results disagree.
    assert nearest_per_notebook == nearest_per_class, "\n".join(
        (
            f"Nearest hotspot according to the Jupyter notebook: {nearest_per_notebook}",
            f"Nearest hotspot according to the API class: {nearest_per_class}",
        )
    )

Nearby hotspots within 0 km of (45.7091, -68.8590) were already cached.
Nearby hotspots within 5 km of (45.7091, -68.8590) were already cached.
Nearby hotspots within 0 km of (41.8240, -71.4128) were already cached.
Nearby hotspots within 5 km of (41.8240, -71.4128) were already cached.


In [9]:
# Find eBird region codes for the regions of interest.
# Each code is stored under the region it was looked up for, so a failed
# lookup can never shift another region's code onto the wrong key.
region_codes: dict[str, str] = {}
regions_without_code: list[str] = []
for region, coord in region_coords.items():
    code = ebird_api.find_region_code(region, coord)
    if code is None:
        regions_without_code.append(region)
    else:
        region_codes[region] = code

# A missing code is still fatal (ValueError, as before), but the error now
# names the regions that could not be resolved.
if regions_without_code:
    raise ValueError(f"No eBird region code found for region(s): {regions_without_code}")

# Kept for backward compatibility with any code that still reads the flat list.
region_codes_list = list(region_codes.values())

region_codes

Nearby hotspots within 0 km of (45.7091, -68.8590) were already cached.
Nearby hotspots within 5 km of (45.7091, -68.8590) were already cached.
Region information for code 'US' was already cached.
Region information for code 'US-ME' was already cached.
Region information for code 'US-ME-021' was already cached.
Nearby hotspots within 0 km of (41.8240, -71.4128) were already cached.
Nearby hotspots within 5 km of (41.8240, -71.4128) were already cached.
Region information for code 'US' was already cached.
Region information for code 'US-RI' was already cached.
Region information for code 'US-RI-007' was already cached.


{'Maine, United States': 'US-ME',
 'Providence, Rhode Island, United States': 'US-RI-007'}

In [10]:
# Fetch the eBird hotspots for each region code, keyed by region name.
recent_hotspots: dict[str, list[EBirdHotspot]] = {}
for region, code in region_codes.items():
    recent_hotspots[region] = ebird_api.retrieve_hotspots_in_region(code)

recent_hotspots

eBird hotspots in region 'US-ME' were already cached.
eBird hotspots in region 'US-RI-007' were already cached.


{'Maine, United States': [EBirdHotspot(location=EBirdLocation(id='L5867205', name='250th Anniversary Park', coord=Coordinate(latitude=43.9191617, longitude=-69.9653551), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-005', subnat2_name=None), all_time_species=78),
  EBirdHotspot(location=EBirdLocation(id='L3028558', name='AMC Gorman Chairback Lodge', coord=Coordinate(latitude=45.4616766, longitude=-69.3166101), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-021', subnat2_name=None), all_time_species=101),
  EBirdHotspot(location=EBirdLocation(id='L459957', name='Abagadassett River Mouth', coord=Coordinate(latitude=44.0096078, longitude=-69.8517609), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-023', subnat2_name=None), all_time_species=154),
  EBirdHotspot(location=EBirdLocation(id='L3863342', name='Abagadassett Wild Rice Fields', coord=Coordinate(latitude=44.0017674, longitude=-69.8489

In [11]:
# Retrieve the species list for each region code, keyed by region name.
region_species_lists = {}
for region, code in region_codes.items():
    region_species_lists[region] = ebird_api.retrieve_species_list(code)

# Report how many species each list contains (after all lists are retrieved).
for region, species_list in region_species_lists.items():
    species_count = len(species_list)
    print(f"{species_count} species have been observed in region '{region}'.")

Bird species list for 'US-ME' was already cached.
Bird species list for 'US-RI-007' was already cached.
512 species have been observed in region 'Maine, United States'.
329 species have been observed in region 'Providence, Rhode Island, United States'.


In [12]:
# Compute distances (km) between each region and its recent hotspots
from collections import defaultdict

# region name -> {hotspot -> geodesic distance in km from the region's coordinate}
hotspot_distances_km: dict[str, dict[EBirdHotspot, float]] = {}

for region, hotspots in recent_hotspots.items():
    region_coord = region_coords[region]
    for hotspot in hotspots:
        distance_km = geodesic(region_coord, hotspot.location.coord).km
        # The per-region dict is created on first use, so a region without
        # any hotspots gets no entry at all.
        hotspot_distances_km.setdefault(region, {})[hotspot] = distance_km

hotspot_distances_km

{'Maine, United States': {EBirdHotspot(location=EBirdLocation(id='L5867205', name='250th Anniversary Park', coord=Coordinate(latitude=43.9191617, longitude=-69.9653551), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-005', subnat2_name=None), all_time_species=78): 217.306057429487,
  EBirdHotspot(location=EBirdLocation(id='L3028558', name='AMC Gorman Chairback Lodge', coord=Coordinate(latitude=45.4616766, longitude=-69.3166101), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-021', subnat2_name=None), all_time_species=101): 45.071174496056585,
  EBirdHotspot(location=EBirdLocation(id='L459957', name='Abagadassett River Mouth', coord=Coordinate(latitude=44.0096078, longitude=-69.8517609), country_code='US', subnat1_code='US-ME', subnat1_name=None, subnat2_code='US-ME-023', subnat2_name=None), all_time_species=154): 204.509353286793,
  EBirdHotspot(location=EBirdLocation(id='L3863342', name='Abagadassett Wild Rice Fields', 

In [14]:
# Look up the species entries for each region code, keyed by region name.
region_to_species_taxons = dict(
    (r, ebird_api.find_species_in_region(c)) for r, c in region_codes.items()
)
region_to_species_taxons

Bird species list for 'US-ME' was already cached.
Bird species list for 'US-RI-007' was already cached.


{'Maine, United States': [EBirdSpecies(common_name='Black-bellied Whistling-Duck', specific_name='autumnalis', generic_name='Dendrocygna', family_common_name='Ducks, Geese, and Waterfowl', family='Anatidae', order='Anseriformes', ebird_species_code='bbwduc'),
  EBirdSpecies(common_name='Snow Goose', specific_name='caerulescens', generic_name='Anser', family_common_name='Ducks, Geese, and Waterfowl', family='Anatidae', order='Anseriformes', ebird_species_code='snogoo'),
  EBirdSpecies(common_name="Ross's Goose", specific_name='rossii', generic_name='Anser', family_common_name='Ducks, Geese, and Waterfowl', family='Anatidae', order='Anseriformes', ebird_species_code='rosgoo'),
  EBirdSpecies(common_name="Snow x Ross's Goose (hybrid)", specific_name='caerulescens x rossii', generic_name='Anser', family_common_name='Ducks, Geese, and Waterfowl', family='Anatidae', order='Anseriformes', ebird_species_code='sxrgoo1'),
  EBirdSpecies(common_name='Graylag Goose', specific_name='anser', generic

In [15]:
# Convert every species entry to its YAML-ready form via to_yaml(), per region.
region_to_species_yaml = {}
for r, taxons in region_to_species_taxons.items():
    region_to_species_yaml[r] = [taxon.to_yaml() for taxon in taxons]

region_to_species_yaml

{'Maine, United States': [{'common_name': 'Black-bellied Whistling-Duck',
   'scientific_name': 'Dendrocygna autumnalis',
   'family_common_name': 'Ducks, Geese, and Waterfowl',
   'family': 'Anatidae',
   'order': 'Anseriformes',
   'ebird_species_code': 'bbwduc'},
  {'common_name': 'Snow Goose',
   'scientific_name': 'Anser caerulescens',
   'family_common_name': 'Ducks, Geese, and Waterfowl',
   'family': 'Anatidae',
   'order': 'Anseriformes',
   'ebird_species_code': 'snogoo'},
  {'common_name': "Ross's Goose",
   'scientific_name': 'Anser rossii',
   'family_common_name': 'Ducks, Geese, and Waterfowl',
   'family': 'Anatidae',
   'order': 'Anseriformes',
   'ebird_species_code': 'rosgoo'},
  {'common_name': "Snow x Ross's Goose (hybrid)",
   'scientific_name': 'Anser caerulescens x rossii',
   'family_common_name': 'Ducks, Geese, and Waterfowl',
   'family': 'Anatidae',
   'order': 'Anseriformes',
   'ebird_species_code': 'sxrgoo1'},
  {'common_name': 'Graylag Goose',
   'scienti

In [29]:
import datetime
from pathlib import Path

import yaml

# Directory that receives the exported YAML files: a "data" folder next to the
# current working directory. It is created on demand so the export does not
# fail with FileNotFoundError when the folder is missing.
output_dir = Path.cwd().parent / "data"
output_dir.mkdir(parents=True, exist_ok=True)

# Timezone-aware local timestamp shared by every file written in this cell.
# The rendered string is local wall-clock time and carries no UTC offset.
local_tz = datetime.datetime.now().astimezone().tzinfo
d = datetime.datetime.now(tz=local_tz)
today = f"{d:%I:%M:%S %p %B %d, %Y}"

# Write one "<region_code>-species.yaml" file per region.
for region, species_data in region_to_species_yaml.items():
    region_code = region_codes[region]
    yaml_path = output_dir / f"{region_code}-species.yaml"
    yaml_data = {
        "region": region,
        "region_code": region_code,
        "exported_on": today,
        "species": species_data,
    }

    # Explicit encoding keeps the written file independent of the platform's
    # default text encoding.
    with yaml_path.open("w", encoding="utf-8") as outfile:
        yaml.dump(yaml_data, outfile, default_flow_style=False)

In [31]:
# Alternative output format: Hierarchical taxonomy
from birding.taxonomies import TaxonomicClass

# region name -> taxonomy tree rooted at class "Aves"
region_taxonomies: dict[str, TaxonomicClass] = {}

for region in region_to_species_taxons:
    species_list = region_to_species_taxons[region]

    # Start from an empty tree and add the region's species one at a time.
    region_taxonomy = TaxonomicClass("Aves")
    for species in species_list:
        region_taxonomy.insert_species(species)

    region_taxonomies[region] = region_taxonomy

region_taxonomies

{'Maine, United States': TaxonomicClass(class_name='Aves', orders={'Anseriformes': TaxonomicOrder(order_name='Anseriformes', families={'Anatidae': TaxonomicFamily(family_name='Anatidae', genera={'Dendrocygna': TaxonomicGenus(generic_name='Dendrocygna', species={'autumnalis': TaxonomicSpecies(specific_name='autumnalis')}), 'Anser': TaxonomicGenus(generic_name='Anser', species={'caerulescens': TaxonomicSpecies(specific_name='caerulescens'), 'rossii': TaxonomicSpecies(specific_name='rossii'), 'caerulescens x rossii': TaxonomicSpecies(specific_name='caerulescens x rossii'), 'anser': TaxonomicSpecies(specific_name='anser'), 'cygnoides': TaxonomicSpecies(specific_name='cygnoides'), 'albifrons': TaxonomicSpecies(specific_name='albifrons'), 'brachyrhynchus': TaxonomicSpecies(specific_name='brachyrhynchus'), 'cygnoides x Branta canadensis': TaxonomicSpecies(specific_name='cygnoides x Branta canadensis'), 'albifrons x Branta canadensis': TaxonomicSpecies(specific_name='albifrons x Branta canaden

In [33]:
# Timezone-aware local timestamp shared by every file written in this cell.
# The rendered string is local wall-clock time and carries no UTC offset.
local_tz = datetime.datetime.now().astimezone().tzinfo
d = datetime.datetime.now(tz=local_tz)
today = f"{d:%I:%M:%S %p %B %d, %Y}"

# Make sure the target directory exists before writing, so this cell does not
# fail with FileNotFoundError when the folder is missing.
output_dir.mkdir(parents=True, exist_ok=True)

# Write one "<region_code>-hierarchy.yaml" file per region.
for region, taxonomy in region_taxonomies.items():
    region_code = region_codes[region]
    yaml_path = output_dir / f"{region_code}-hierarchy.yaml"
    yaml_data = {
        "region": region,
        "region_code": region_code,
        "exported_on": today,
        "bird_taxonomy": taxonomy.to_yaml(),
    }

    # Explicit encoding keeps the written file independent of the platform's
    # default text encoding.
    with yaml_path.open("w", encoding="utf-8") as outfile:
        yaml.dump(yaml_data, outfile, default_flow_style=False)