In [42]:
import requests
import csv
import time
import os
from tqdm import tqdm

Define some parameters:
- The time to wait between the calls to the API
- The output file location and name
- The query / filter to use
- The API URL

In [43]:
wait_time_between_requests = 1 # 1 second as requested in the API documentation (https://xeno-canto.org/explore/api)
file_save_location = "../data/raw/xeno-canto.csv"
audio_files_dir = "../data/raw/audio/xeno_canto/"

query = 'area:europe+len:"<10"+q:">C"+grp:birds' # query for european birds with length less than 10 seconds and quality A or B (the two best qualities)
api_url = f"https://xeno-canto.org/api/2/recordings?query={query}&page="
recordings_to_ignore = ["692343", "545395", "394335", "351080", "473934", "471533", "827119", "471258", "567174", "672135"] # recordings with an invalid / missing audio file url

In [44]:
if not os.path.exists(audio_files_dir):
    os.makedirs(audio_files_dir)

Define the CSV columns

In [45]:
all_data = []
csv_headers = ["id", "gen", "sp", "ssp", "group", "en", "rec", "cnt", "loc", 
               "lat", "lng", "type", "sex", "stage", "method", "url", 
               "file", "file-name", "sono_small", "sono_med", "sono_large", 
               "sono_full", "osci_small", "osci_med", "osci_large", "lic", 
               "q", "length", "time", "date", "uploaded", "also", "rmk", 
               "bird-seen", "animal-seen", "playback-used", "temperature", 
               "regnr", "auto", "dvc", "mic", "smp"]

Define the function to get the data from the API

In [46]:
def get_data_from_api(page):
    response = requests.get(api_url + str(page))
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching data from page {page}: {response.status_code}")
        return None

Define the function to save the data to a CSV file

In [48]:
def save_to_csv(data, filename):
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.DictWriter(file, fieldnames=csv_headers, quoting=csv.QUOTE_ALL) # quote all fields because some fields contain commas and newlines
        writer.writeheader()
        writer.writerows(data)
    print(f"Data saved to {filename}")

Define the function to save the audio files

In [49]:
def download_audio_file(file_url, file_name):
    try:
        audio_response = requests.get(file_url, stream=True)
        if audio_response.status_code == 200:
            # save the audio file to the specified directory
            file_path = os.path.join(audio_files_dir, file_name)

            with open(file_path, 'wb') as audio_file:
                for chunk in audio_response.iter_content(chunk_size=1024):
                    if chunk:
                        audio_file.write(chunk)
        else:
            print(f"Failed to download {file_name}: {audio_response.status_code}")
    except Exception as e:
        print(f"Error downloading {file_name}: {e}")

Loop through all pages and save all data into a list

In [50]:
# fetch the first page to get total pages
first_page_data = get_data_from_api(1)
if first_page_data:
    num_pages = first_page_data['numPages']
    print(f"Total pages to scrape: {num_pages}")

    # loop through all pages
    for page in tqdm(range(1, num_pages + 1)):
        print(f"Scraping page {page}...")
        
        # get the data from the current page
        page_data = get_data_from_api(page)
        if page_data and "recordings" in page_data:
            # extract relevant fields from each recording
            for recording in page_data["recordings"]:
                if recording.get("id") in recordings_to_ignore:
                    continue
                
                recording_data = {
                    "id": recording.get("id"),
                    "gen": recording.get("gen"),
                    "sp": recording.get("sp"),
                    "ssp": recording.get("ssp"),
                    "group": recording.get("group"),
                    "en": recording.get("en"),
                    "rec": recording.get("rec"),
                    "cnt": recording.get("cnt"),
                    "loc": recording.get("loc"),
                    "lat": recording.get("lat"),
                    "lng": recording.get("lng"),
                    "type": recording.get("type"),
                    "sex": recording.get("sex"),
                    "stage": recording.get("stage"),
                    "method": recording.get("method"),
                    "url": recording.get("url"),
                    "file": recording.get("file"),
                    "file-name": recording.get("file-name"),
                    "sono_small": recording["sono"].get("small") if "sono" in recording else "",
                    "sono_med": recording["sono"].get("med") if "sono" in recording else "",
                    "sono_large": recording["sono"].get("large") if "sono" in recording else "",
                    "sono_full": recording["sono"].get("full") if "sono" in recording else "",
                    "osci_small": recording["osci"].get("small") if "osci" in recording else "",
                    "osci_med": recording["osci"].get("med") if "osci" in recording else "",
                    "osci_large": recording["osci"].get("large") if "osci" in recording else "",
                    "lic": recording.get("lic"),
                    "q": recording.get("q"),
                    "length": recording.get("length"),
                    "time": recording.get("time"),
                    "date": recording.get("date"),
                    "uploaded": recording.get("uploaded"),
                    "also": ", ".join(recording.get("also", [])),
                    "rmk": recording.get("rmk"),
                    "bird-seen": recording.get("bird-seen"),
                    "animal-seen": recording.get("animal-seen"),
                    "playback-used": recording.get("playback-used"),
                    "temperature": recording.get("temp", ""),
                    "regnr": recording.get("regnr", ""),
                    "auto": recording.get("auto"),
                    "dvc": recording.get("dvc"),
                    "mic": recording.get("mic"),
                    "smp": recording.get("smp")
                }
                all_data.append(recording_data)

        # wait for 1 second before making the next request to not overload the server
        time.sleep(wait_time_between_requests)

Total pages to scrape: 79


  0%|          | 0/79 [00:00<?, ?it/s]

Scraping page 1...


  1%|▏         | 1/79 [00:04<06:03,  4.66s/it]

Scraping page 2...


  3%|▎         | 2/79 [00:08<05:41,  4.43s/it]

Scraping page 3...


  4%|▍         | 3/79 [00:13<05:42,  4.51s/it]

Scraping page 4...


  5%|▌         | 4/79 [00:18<05:43,  4.58s/it]

Scraping page 5...


  6%|▋         | 5/79 [00:22<05:39,  4.59s/it]

Scraping page 6...


  8%|▊         | 6/79 [00:27<05:32,  4.55s/it]

Scraping page 7...


  9%|▉         | 7/79 [00:31<05:30,  4.59s/it]

Scraping page 8...


 10%|█         | 8/79 [00:36<05:30,  4.66s/it]

Scraping page 9...


 11%|█▏        | 9/79 [00:41<05:33,  4.77s/it]

Scraping page 10...


 13%|█▎        | 10/79 [00:46<05:31,  4.80s/it]

Scraping page 11...


 14%|█▍        | 11/79 [00:51<05:33,  4.90s/it]

Scraping page 12...


 15%|█▌        | 12/79 [00:56<05:27,  4.89s/it]

Scraping page 13...


 16%|█▋        | 13/79 [01:01<05:18,  4.83s/it]

Scraping page 14...


 18%|█▊        | 14/79 [01:05<05:09,  4.76s/it]

Scraping page 15...


 19%|█▉        | 15/79 [01:10<05:00,  4.69s/it]

Scraping page 16...


 20%|██        | 16/79 [01:15<04:57,  4.72s/it]

Scraping page 17...


 22%|██▏       | 17/79 [01:20<04:54,  4.75s/it]

Scraping page 18...


 23%|██▎       | 18/79 [01:24<04:48,  4.73s/it]

Scraping page 19...


 24%|██▍       | 19/79 [01:29<04:45,  4.76s/it]

Scraping page 20...


 25%|██▌       | 20/79 [01:34<04:41,  4.77s/it]

Scraping page 21...


 27%|██▋       | 21/79 [01:39<04:39,  4.81s/it]

Scraping page 22...


 28%|██▊       | 22/79 [01:44<04:36,  4.85s/it]

Scraping page 23...


 29%|██▉       | 23/79 [01:49<04:33,  4.89s/it]

Scraping page 24...


 30%|███       | 24/79 [01:54<04:31,  4.94s/it]

Scraping page 25...


 32%|███▏      | 25/79 [01:59<04:31,  5.03s/it]

Scraping page 26...


 33%|███▎      | 26/79 [02:05<04:35,  5.20s/it]

Scraping page 27...


 34%|███▍      | 27/79 [02:10<04:27,  5.14s/it]

Scraping page 28...


 35%|███▌      | 28/79 [02:15<04:21,  5.13s/it]

Scraping page 29...


 37%|███▋      | 29/79 [02:19<04:10,  5.01s/it]

Scraping page 30...


 38%|███▊      | 30/79 [02:24<04:02,  4.95s/it]

Scraping page 31...


 39%|███▉      | 31/79 [02:29<03:56,  4.92s/it]

Scraping page 32...


 41%|████      | 32/79 [02:34<03:53,  4.96s/it]

Scraping page 33...


 42%|████▏     | 33/79 [02:39<03:46,  4.93s/it]

Scraping page 34...


 43%|████▎     | 34/79 [02:44<03:41,  4.93s/it]

Scraping page 35...


 44%|████▍     | 35/79 [02:49<03:35,  4.89s/it]

Scraping page 36...


 46%|████▌     | 36/79 [02:54<03:31,  4.93s/it]

Scraping page 37...


 47%|████▋     | 37/79 [02:59<03:32,  5.05s/it]

Scraping page 38...


 48%|████▊     | 38/79 [03:04<03:25,  5.02s/it]

Scraping page 39...


 49%|████▉     | 39/79 [03:09<03:21,  5.05s/it]

Scraping page 40...


 51%|█████     | 40/79 [03:14<03:19,  5.13s/it]

Scraping page 41...


 52%|█████▏    | 41/79 [03:20<03:15,  5.16s/it]

Scraping page 42...


 53%|█████▎    | 42/79 [03:25<03:14,  5.25s/it]

Scraping page 43...


 54%|█████▍    | 43/79 [03:30<03:08,  5.24s/it]

Scraping page 44...


 56%|█████▌    | 44/79 [03:36<03:12,  5.51s/it]

Scraping page 45...


 57%|█████▋    | 45/79 [03:43<03:14,  5.71s/it]

Scraping page 46...


 58%|█████▊    | 46/79 [03:48<03:02,  5.54s/it]

Scraping page 47...


 59%|█████▉    | 47/79 [03:53<02:57,  5.54s/it]

Scraping page 48...


 61%|██████    | 48/79 [03:59<02:49,  5.46s/it]

Scraping page 49...


 62%|██████▏   | 49/79 [04:04<02:46,  5.56s/it]

Scraping page 50...


 63%|██████▎   | 50/79 [04:11<02:46,  5.74s/it]

Scraping page 51...


 65%|██████▍   | 51/79 [04:17<02:45,  5.89s/it]

Scraping page 52...


 66%|██████▌   | 52/79 [04:23<02:39,  5.89s/it]

Scraping page 53...


 67%|██████▋   | 53/79 [04:29<02:39,  6.14s/it]

Scraping page 54...


 68%|██████▊   | 54/79 [04:37<02:41,  6.47s/it]

Scraping page 55...


 70%|██████▉   | 55/79 [04:44<02:38,  6.62s/it]

Scraping page 56...


 71%|███████   | 56/79 [04:50<02:27,  6.41s/it]

Scraping page 57...


 72%|███████▏  | 57/79 [04:56<02:19,  6.33s/it]

Scraping page 58...


 73%|███████▎  | 58/79 [05:02<02:10,  6.21s/it]

Scraping page 59...


 75%|███████▍  | 59/79 [05:08<02:03,  6.16s/it]

Scraping page 60...


 76%|███████▌  | 60/79 [05:15<02:01,  6.42s/it]

Scraping page 61...


 77%|███████▋  | 61/79 [05:22<02:02,  6.78s/it]

Scraping page 62...


 78%|███████▊  | 62/79 [05:29<01:55,  6.82s/it]

Scraping page 63...


 80%|███████▉  | 63/79 [05:35<01:45,  6.62s/it]

Scraping page 64...


 81%|████████  | 64/79 [05:42<01:37,  6.49s/it]

Scraping page 65...


 82%|████████▏ | 65/79 [05:47<01:27,  6.25s/it]

Scraping page 66...


 84%|████████▎ | 66/79 [05:53<01:18,  6.08s/it]

Scraping page 67...


 85%|████████▍ | 67/79 [05:59<01:12,  6.01s/it]

Scraping page 68...


 86%|████████▌ | 68/79 [06:05<01:05,  5.99s/it]

Scraping page 69...


 87%|████████▋ | 69/79 [06:13<01:06,  6.61s/it]

Scraping page 70...


 89%|████████▊ | 70/79 [06:19<00:58,  6.52s/it]

Scraping page 71...


 90%|████████▉ | 71/79 [06:26<00:53,  6.64s/it]

Scraping page 72...


 91%|█████████ | 72/79 [06:33<00:48,  6.86s/it]

Scraping page 73...


 92%|█████████▏| 73/79 [06:42<00:43,  7.27s/it]

Scraping page 74...


 94%|█████████▎| 74/79 [06:49<00:36,  7.29s/it]

Scraping page 75...


 95%|█████████▍| 75/79 [06:56<00:28,  7.10s/it]

Scraping page 76...


 96%|█████████▌| 76/79 [07:02<00:20,  6.74s/it]

Scraping page 77...


 97%|█████████▋| 77/79 [07:08<00:13,  6.56s/it]

Scraping page 78...


 99%|█████████▊| 78/79 [07:14<00:06,  6.40s/it]

Scraping page 79...


100%|██████████| 79/79 [07:20<00:00,  5.57s/it]


Export the scraped data to a CSV file

In [51]:
save_to_csv(all_data, file_save_location)

Data saved to ../data/raw/xeno-canto.csv


Also download all the audio files to a folder. **Note**: This takes a long time (2-3 hours) and a lot of space (~100GB).

In [37]:
for recording_data in tqdm(all_data):
    audio_url = recording_data["file"]
    original_audio_file_name = recording_data["file-name"]
    file_extension = original_audio_file_name.split(".")[-1]
    new_audio_file_name = f"{recording_data['id']}.{file_extension}"

    download_audio_file(audio_url, new_audio_file_name)

 26%|██▌       | 9985/39083 [43:23<13:26:04,  1.66s/it]

Failed to download 692343.mp3: 502


 35%|███▌      | 13721/39083 [58:49<23:18:17,  3.31s/it]

Error downloading 545395.mp3: HTTPSConnectionPool(host='xeno-canto.org', port=443): Max retries exceeded with url: /545395/download (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x7f3bfc547340>: Failed to establish a new connection: [Errno -3] Temporary failure in name resolution'))


 65%|██████▍   | 25222/39083 [1:43:42<22:07, 10.44it/s]   

Error downloading 394335.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 351080.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 473934.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 471533.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 79%|███████▉  | 30820/39083 [2:06:37<22:03,  6.25it/s]   

Error downloading 827119.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 80%|███████▉  | 31154/39083 [2:07:57<23:12,  5.69it/s]  

Failed to download 471258.mp3: 500


 81%|████████  | 31642/39083 [2:09:59<30:26,  4.07it/s]  

Failed to download 567174.mp3: 500


 86%|████████▌ | 33461/39083 [2:17:04<19:08,  4.89it/s]  

Error downloading 672135.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


100%|██████████| 39083/39083 [2:39:18<00:00,  4.09it/s]
