# Important
You don't need to execute this notebook if you have already downloaded the data using the notebook `00-download-data-archive.ipynb`.

In [15]:
import requests
import pandas as pd
import time
import os
from tqdm import tqdm
from pathlib import Path

Define some parameters:
- The time to wait between the calls to the API
- The output file location and name
- The query / filter to use
- The API URL

In [2]:
wait_time_between_requests = 1 # 1 second as requested in the API documentation (https://xeno-canto.org/explore/api)
file_save_location = "../data/raw/xeno-canto.parquet"
audio_files_dir = "../data/raw/audio/xeno_canto/"

query = 'area:europe+len:"5-20"+q:">D"+grp:birds' # query for european birds with length 5-20 seconds and quality A, B or C (the three best qualities)
api_url = f"https://xeno-canto.org/api/2/recordings?query={query}&page="

In [3]:
if not os.path.exists(audio_files_dir):
    os.makedirs(audio_files_dir)

Define the function to get the data from the API

In [5]:
def get_data_from_api(page):
    response = requests.get(api_url + str(page))
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error fetching data from page {page}: {response.status_code}")
        return None

Define the function to save the data to a CSV file

In [6]:
def save_to_parquet(data, filename):
    # Convert the list of dictionaries to a pandas DataFrame
    df = pd.DataFrame(data)
    # Save the DataFrame to a Parquet file
    df.to_parquet(filename, engine='pyarrow', index=False)
    print(f"Data saved to {filename}")

Define the function to save the audio files

In [7]:
def download_audio_file(file_url, file_name):
    try:
        audio_response = requests.get(file_url, stream=True)
        if audio_response.status_code == 200:
            # save the audio file to the specified directory
            file_path = os.path.join(audio_files_dir, file_name)

            with open(file_path, 'wb') as audio_file:
                for chunk in audio_response.iter_content(chunk_size=1024):
                    if chunk:
                        audio_file.write(chunk)
        else:
            print(f"Failed to download {file_name}: {audio_response.status_code}")
    except Exception as e:
        print(f"Error downloading {file_name}: {e}")

Loop through all pages and save all data into a list

In [8]:
all_data = []

# fetch the first page to get total pages
first_page_data = get_data_from_api(1)
if first_page_data:
    num_pages = first_page_data['numPages']
    print(f"Total pages to scrape: {num_pages}")

    # loop through all pages
    for page in tqdm(range(1, num_pages + 1)):
        print(f"Scraping page {page}...")
        
        # get the data from the current page
        page_data = get_data_from_api(page)
        if page_data and "recordings" in page_data:
            # extract relevant fields from each recording
            for recording in page_data["recordings"]:              
                recording_data = {
                    "id": recording.get("id"),
                    "gen": recording.get("gen"),
                    "sp": recording.get("sp"),
                    "ssp": recording.get("ssp"),
                    "group": recording.get("group"),
                    "en": recording.get("en"),
                    "rec": recording.get("rec"),
                    "cnt": recording.get("cnt"),
                    "loc": recording.get("loc"),
                    "lat": recording.get("lat"),
                    "lng": recording.get("lng"),
                    "type": recording.get("type"),
                    "sex": recording.get("sex"),
                    "stage": recording.get("stage"),
                    "method": recording.get("method"),
                    "url": recording.get("url"),
                    "file": recording.get("file"),
                    "file-name": recording.get("file-name"),
                    "sono_small": recording["sono"].get("small") if "sono" in recording else "",
                    "sono_med": recording["sono"].get("med") if "sono" in recording else "",
                    "sono_large": recording["sono"].get("large") if "sono" in recording else "",
                    "sono_full": recording["sono"].get("full") if "sono" in recording else "",
                    "osci_small": recording["osci"].get("small") if "osci" in recording else "",
                    "osci_med": recording["osci"].get("med") if "osci" in recording else "",
                    "osci_large": recording["osci"].get("large") if "osci" in recording else "",
                    "lic": recording.get("lic"),
                    "q": recording.get("q"),
                    "length": recording.get("length"),
                    "time": recording.get("time"),
                    "date": recording.get("date"),
                    "uploaded": recording.get("uploaded"),
                    "also": ", ".join(recording.get("also", [])),
                    "rmk": recording.get("rmk"),
                    "bird-seen": recording.get("bird-seen"),
                    "animal-seen": recording.get("animal-seen"),
                    "playback-used": recording.get("playback-used"),
                    "temperature": recording.get("temp", ""),
                    "regnr": recording.get("regnr", ""),
                    "auto": recording.get("auto"),
                    "dvc": recording.get("dvc"),
                    "mic": recording.get("mic"),
                    "smp": recording.get("smp")
                }
                all_data.append(recording_data)

        # wait for 1 second before making the next request to not overload the server
        time.sleep(wait_time_between_requests)

Total pages to scrape: 180


  0%|          | 0/180 [00:00<?, ?it/s]

Scraping page 1...


  1%|          | 1/180 [00:01<04:29,  1.51s/it]

Scraping page 2...


  1%|          | 2/180 [00:04<07:51,  2.65s/it]

Scraping page 3...


  2%|▏         | 3/180 [00:08<08:35,  2.91s/it]

Scraping page 4...


  2%|▏         | 4/180 [00:11<09:00,  3.07s/it]

Scraping page 5...


  3%|▎         | 5/180 [00:14<09:14,  3.17s/it]

Scraping page 6...


  3%|▎         | 6/180 [00:18<09:29,  3.27s/it]

Scraping page 7...


  4%|▍         | 7/180 [00:21<09:25,  3.27s/it]

Scraping page 8...


  4%|▍         | 8/180 [00:25<09:31,  3.32s/it]

Scraping page 9...


  5%|▌         | 9/180 [00:28<09:23,  3.30s/it]

Scraping page 10...


  6%|▌         | 10/180 [00:32<10:00,  3.53s/it]

Scraping page 11...


  6%|▌         | 11/180 [00:35<09:45,  3.47s/it]

Scraping page 12...


  7%|▋         | 12/180 [00:38<09:35,  3.43s/it]

Scraping page 13...


  7%|▋         | 13/180 [00:42<09:26,  3.39s/it]

Scraping page 14...


  8%|▊         | 14/180 [00:45<09:29,  3.43s/it]

Scraping page 15...


  8%|▊         | 15/180 [00:49<09:22,  3.41s/it]

Scraping page 16...


  9%|▉         | 16/180 [00:52<09:19,  3.41s/it]

Scraping page 17...


  9%|▉         | 17/180 [00:56<09:19,  3.43s/it]

Scraping page 18...


 10%|█         | 18/180 [00:59<09:05,  3.37s/it]

Scraping page 19...


 11%|█         | 19/180 [01:02<09:00,  3.36s/it]

Scraping page 20...


 11%|█         | 20/180 [01:05<08:53,  3.33s/it]

Scraping page 21...


 12%|█▏        | 21/180 [01:09<08:48,  3.33s/it]

Scraping page 22...


 12%|█▏        | 22/180 [01:12<08:51,  3.36s/it]

Scraping page 23...


 13%|█▎        | 23/180 [01:16<08:51,  3.39s/it]

Scraping page 24...


 13%|█▎        | 24/180 [01:19<08:52,  3.41s/it]

Scraping page 25...


 14%|█▍        | 25/180 [01:23<08:56,  3.46s/it]

Scraping page 26...


 14%|█▍        | 26/180 [01:26<08:54,  3.47s/it]

Scraping page 27...


 15%|█▌        | 27/180 [01:30<08:48,  3.45s/it]

Scraping page 28...


 16%|█▌        | 28/180 [01:33<08:50,  3.49s/it]

Scraping page 29...


 16%|█▌        | 29/180 [01:37<08:50,  3.51s/it]

Scraping page 30...


 17%|█▋        | 30/180 [01:40<08:39,  3.46s/it]

Scraping page 31...


 17%|█▋        | 31/180 [01:43<08:29,  3.42s/it]

Scraping page 32...


 18%|█▊        | 32/180 [01:47<08:27,  3.43s/it]

Scraping page 33...


 18%|█▊        | 33/180 [01:50<08:26,  3.45s/it]

Scraping page 34...


 19%|█▉        | 34/180 [01:54<08:26,  3.47s/it]

Scraping page 35...


 19%|█▉        | 35/180 [01:57<08:19,  3.44s/it]

Scraping page 36...


 20%|██        | 36/180 [02:01<08:12,  3.42s/it]

Scraping page 37...


 21%|██        | 37/180 [02:04<08:12,  3.45s/it]

Scraping page 38...


 21%|██        | 38/180 [02:08<08:17,  3.50s/it]

Scraping page 39...


 22%|██▏       | 39/180 [02:11<08:23,  3.57s/it]

Scraping page 40...


 22%|██▏       | 40/180 [02:15<08:26,  3.61s/it]

Scraping page 41...


 23%|██▎       | 41/180 [02:19<08:17,  3.58s/it]

Scraping page 42...


 23%|██▎       | 42/180 [02:22<08:15,  3.59s/it]

Scraping page 43...


 24%|██▍       | 43/180 [02:26<08:14,  3.61s/it]

Scraping page 44...


 24%|██▍       | 44/180 [02:30<08:11,  3.62s/it]

Scraping page 45...


 25%|██▌       | 45/180 [02:35<09:28,  4.21s/it]

Scraping page 46...


 26%|██▌       | 46/180 [02:39<09:30,  4.25s/it]

Scraping page 47...


 26%|██▌       | 47/180 [02:43<08:57,  4.04s/it]

Scraping page 48...


 27%|██▋       | 48/180 [02:47<09:05,  4.13s/it]

Scraping page 49...


 27%|██▋       | 49/180 [02:51<08:43,  4.00s/it]

Scraping page 50...


 28%|██▊       | 50/180 [02:55<08:19,  3.84s/it]

Scraping page 51...


 28%|██▊       | 51/180 [02:58<08:17,  3.86s/it]

Scraping page 52...


 29%|██▉       | 52/180 [03:04<09:07,  4.28s/it]

Scraping page 53...


 29%|██▉       | 53/180 [03:09<09:37,  4.54s/it]

Scraping page 54...


 30%|███       | 54/180 [03:13<09:06,  4.34s/it]

Scraping page 55...


 31%|███       | 55/180 [03:17<08:51,  4.26s/it]

Scraping page 56...


 31%|███       | 56/180 [03:21<08:28,  4.10s/it]

Scraping page 57...


 32%|███▏      | 57/180 [03:25<08:28,  4.13s/it]

Scraping page 58...


 32%|███▏      | 58/180 [03:29<08:15,  4.06s/it]

Scraping page 59...


 33%|███▎      | 59/180 [03:33<08:21,  4.15s/it]

Scraping page 60...


 33%|███▎      | 60/180 [03:37<08:01,  4.01s/it]

Scraping page 61...


 34%|███▍      | 61/180 [03:41<07:56,  4.00s/it]

Scraping page 62...


 34%|███▍      | 62/180 [03:44<07:44,  3.94s/it]

Scraping page 63...


 35%|███▌      | 63/180 [03:48<07:42,  3.95s/it]

Scraping page 64...


 36%|███▌      | 64/180 [03:53<07:56,  4.11s/it]

Scraping page 65...


 36%|███▌      | 65/180 [03:57<07:52,  4.11s/it]

Scraping page 66...


 37%|███▋      | 66/180 [04:01<07:43,  4.07s/it]

Scraping page 67...


 37%|███▋      | 67/180 [04:05<07:30,  3.99s/it]

Scraping page 68...


 38%|███▊      | 68/180 [04:09<07:23,  3.96s/it]

Scraping page 69...


 38%|███▊      | 69/180 [04:12<07:12,  3.90s/it]

Scraping page 70...


 39%|███▉      | 70/180 [04:16<07:10,  3.92s/it]

Scraping page 71...


 39%|███▉      | 71/180 [04:21<07:30,  4.14s/it]

Scraping page 72...


 40%|████      | 72/180 [04:25<07:22,  4.09s/it]

Scraping page 73...


 41%|████      | 73/180 [04:29<07:17,  4.09s/it]

Scraping page 74...


 41%|████      | 74/180 [04:33<07:15,  4.11s/it]

Scraping page 75...


 42%|████▏     | 75/180 [04:37<07:06,  4.07s/it]

Scraping page 76...


 42%|████▏     | 76/180 [04:41<06:58,  4.02s/it]

Scraping page 77...


 43%|████▎     | 77/180 [04:45<06:52,  4.00s/it]

Scraping page 78...


 43%|████▎     | 78/180 [04:49<06:49,  4.02s/it]

Scraping page 79...


 44%|████▍     | 79/180 [04:53<06:54,  4.10s/it]

Scraping page 80...


 44%|████▍     | 80/180 [04:57<06:46,  4.06s/it]

Scraping page 81...


 45%|████▌     | 81/180 [05:02<06:45,  4.10s/it]

Scraping page 82...


 46%|████▌     | 82/180 [05:08<07:35,  4.64s/it]

Scraping page 83...


 46%|████▌     | 83/180 [05:11<07:09,  4.43s/it]

Scraping page 84...


 47%|████▋     | 84/180 [05:15<06:49,  4.26s/it]

Scraping page 85...


 47%|████▋     | 85/180 [05:19<06:38,  4.20s/it]

Scraping page 86...


 48%|████▊     | 86/180 [05:24<06:38,  4.23s/it]

Scraping page 87...


 48%|████▊     | 87/180 [05:28<06:32,  4.22s/it]

Scraping page 88...


 49%|████▉     | 88/180 [05:32<06:32,  4.27s/it]

Scraping page 89...


 49%|████▉     | 89/180 [05:37<06:30,  4.29s/it]

Scraping page 90...


 50%|█████     | 90/180 [05:41<06:24,  4.27s/it]

Scraping page 91...


 51%|█████     | 91/180 [05:45<06:19,  4.27s/it]

Scraping page 92...


 51%|█████     | 92/180 [05:49<06:10,  4.21s/it]

Scraping page 93...


 52%|█████▏    | 93/180 [05:54<06:25,  4.44s/it]

Scraping page 94...


 52%|█████▏    | 94/180 [05:58<06:14,  4.35s/it]

Scraping page 95...


 53%|█████▎    | 95/180 [06:03<06:06,  4.32s/it]

Scraping page 96...


 53%|█████▎    | 96/180 [06:07<06:01,  4.30s/it]

Scraping page 97...


 54%|█████▍    | 97/180 [06:11<05:54,  4.27s/it]

Scraping page 98...


 54%|█████▍    | 98/180 [06:15<05:46,  4.22s/it]

Scraping page 99...


 55%|█████▌    | 99/180 [06:19<05:36,  4.15s/it]

Scraping page 100...


 56%|█████▌    | 100/180 [06:24<05:40,  4.25s/it]

Scraping page 101...


 56%|█████▌    | 101/180 [06:28<05:45,  4.37s/it]

Scraping page 102...


 57%|█████▋    | 102/180 [06:33<05:46,  4.44s/it]

Scraping page 103...


 57%|█████▋    | 103/180 [06:37<05:39,  4.41s/it]

Scraping page 104...


 58%|█████▊    | 104/180 [06:42<05:44,  4.53s/it]

Scraping page 105...


 58%|█████▊    | 105/180 [06:46<05:35,  4.48s/it]

Scraping page 106...


 59%|█████▉    | 106/180 [06:51<05:32,  4.49s/it]

Scraping page 107...


 59%|█████▉    | 107/180 [06:55<05:25,  4.46s/it]

Scraping page 108...


 60%|██████    | 108/180 [07:00<05:19,  4.44s/it]

Scraping page 109...


 61%|██████    | 109/180 [07:06<05:51,  4.95s/it]

Scraping page 110...


 61%|██████    | 110/180 [07:10<05:23,  4.62s/it]

Scraping page 111...


 62%|██████▏   | 111/180 [07:13<04:58,  4.32s/it]

Scraping page 112...


 62%|██████▏   | 112/180 [07:17<04:48,  4.25s/it]

Scraping page 113...


 63%|██████▎   | 113/180 [07:21<04:37,  4.14s/it]

Scraping page 114...


 63%|██████▎   | 114/180 [07:25<04:28,  4.07s/it]

Scraping page 115...


 64%|██████▍   | 115/180 [07:29<04:20,  4.00s/it]

Scraping page 116...


 64%|██████▍   | 116/180 [07:33<04:14,  3.98s/it]

Scraping page 117...


 65%|██████▌   | 117/180 [07:37<04:12,  4.01s/it]

Scraping page 118...


 66%|██████▌   | 118/180 [07:41<04:07,  4.00s/it]

Scraping page 119...


 66%|██████▌   | 119/180 [07:45<04:09,  4.09s/it]

Scraping page 120...


 67%|██████▋   | 120/180 [07:49<04:03,  4.06s/it]

Scraping page 121...


 67%|██████▋   | 121/180 [07:53<04:02,  4.12s/it]

Scraping page 122...


 68%|██████▊   | 122/180 [07:57<03:55,  4.07s/it]

Scraping page 123...


 68%|██████▊   | 123/180 [08:02<04:05,  4.31s/it]

Scraping page 124...


 69%|██████▉   | 124/180 [08:06<03:54,  4.18s/it]

Scraping page 125...


 69%|██████▉   | 125/180 [08:10<03:48,  4.15s/it]

Scraping page 126...


 70%|███████   | 126/180 [08:14<03:41,  4.11s/it]

Scraping page 127...


 71%|███████   | 127/180 [08:18<03:37,  4.11s/it]

Scraping page 128...


 71%|███████   | 128/180 [08:22<03:30,  4.05s/it]

Scraping page 129...


 72%|███████▏  | 129/180 [08:26<03:29,  4.10s/it]

Scraping page 130...


 72%|███████▏  | 130/180 [08:31<03:26,  4.13s/it]

Scraping page 131...


 73%|███████▎  | 131/180 [08:35<03:24,  4.16s/it]

Scraping page 132...


 73%|███████▎  | 132/180 [08:39<03:19,  4.16s/it]

Scraping page 133...


 74%|███████▍  | 133/180 [08:43<03:15,  4.15s/it]

Scraping page 134...


 74%|███████▍  | 134/180 [08:47<03:12,  4.18s/it]

Scraping page 135...


 75%|███████▌  | 135/180 [08:52<03:06,  4.14s/it]

Scraping page 136...


 76%|███████▌  | 136/180 [08:56<03:07,  4.26s/it]

Scraping page 137...


 76%|███████▌  | 137/180 [09:00<03:02,  4.24s/it]

Scraping page 138...


 77%|███████▋  | 138/180 [09:05<03:01,  4.32s/it]

Scraping page 139...


 77%|███████▋  | 139/180 [09:09<02:56,  4.31s/it]

Scraping page 140...


 78%|███████▊  | 140/180 [09:13<02:49,  4.25s/it]

Scraping page 141...


 78%|███████▊  | 141/180 [09:18<02:47,  4.28s/it]

Scraping page 142...


 79%|███████▉  | 142/180 [09:22<02:43,  4.29s/it]

Scraping page 143...


 79%|███████▉  | 143/180 [09:26<02:39,  4.32s/it]

Scraping page 144...


 80%|████████  | 144/180 [09:31<02:37,  4.37s/it]

Scraping page 145...


 81%|████████  | 145/180 [09:35<02:31,  4.33s/it]

Scraping page 146...


 81%|████████  | 146/180 [09:39<02:27,  4.34s/it]

Scraping page 147...


 82%|████████▏ | 147/180 [09:44<02:22,  4.32s/it]

Scraping page 148...


 82%|████████▏ | 148/180 [09:48<02:20,  4.39s/it]

Scraping page 149...


 83%|████████▎ | 149/180 [09:52<02:15,  4.37s/it]

Scraping page 150...


 83%|████████▎ | 150/180 [09:57<02:13,  4.45s/it]

Scraping page 151...


 84%|████████▍ | 151/180 [10:02<02:09,  4.48s/it]

Scraping page 152...


 84%|████████▍ | 152/180 [10:06<02:05,  4.48s/it]

Scraping page 153...


 85%|████████▌ | 153/180 [10:11<02:02,  4.52s/it]

Scraping page 154...


 86%|████████▌ | 154/180 [10:15<01:58,  4.56s/it]

Scraping page 155...


 86%|████████▌ | 155/180 [10:20<01:56,  4.66s/it]

Scraping page 156...


 87%|████████▋ | 156/180 [10:25<01:51,  4.64s/it]

Scraping page 157...


 87%|████████▋ | 157/180 [10:29<01:46,  4.62s/it]

Scraping page 158...


 88%|████████▊ | 158/180 [10:34<01:40,  4.55s/it]

Scraping page 159...


 88%|████████▊ | 159/180 [10:39<01:37,  4.64s/it]

Scraping page 160...


 89%|████████▉ | 160/180 [10:43<01:31,  4.58s/it]

Scraping page 161...


 89%|████████▉ | 161/180 [10:48<01:28,  4.66s/it]

Scraping page 162...


 90%|█████████ | 162/180 [10:52<01:22,  4.56s/it]

Scraping page 163...


 91%|█████████ | 163/180 [10:57<01:18,  4.61s/it]

Scraping page 164...


 91%|█████████ | 164/180 [11:02<01:15,  4.69s/it]

Scraping page 165...


 92%|█████████▏| 165/180 [11:08<01:18,  5.23s/it]

Scraping page 166...


 92%|█████████▏| 166/180 [11:13<01:10,  5.06s/it]

Scraping page 167...


 93%|█████████▎| 167/180 [11:18<01:05,  5.00s/it]

Scraping page 168...


 93%|█████████▎| 168/180 [11:22<00:58,  4.86s/it]

Scraping page 169...


 94%|█████████▍| 169/180 [11:27<00:52,  4.79s/it]

Scraping page 170...


 94%|█████████▍| 170/180 [11:32<00:48,  4.81s/it]

Scraping page 171...


 95%|█████████▌| 171/180 [11:36<00:42,  4.69s/it]

Scraping page 172...


 96%|█████████▌| 172/180 [11:41<00:37,  4.70s/it]

Scraping page 173...


 96%|█████████▌| 173/180 [11:46<00:33,  4.77s/it]

Scraping page 174...


 97%|█████████▋| 174/180 [11:51<00:28,  4.76s/it]

Scraping page 175...


 97%|█████████▋| 175/180 [11:55<00:23,  4.75s/it]

Scraping page 176...


 98%|█████████▊| 176/180 [12:01<00:19,  4.85s/it]

Scraping page 177...


 98%|█████████▊| 177/180 [12:07<00:16,  5.46s/it]

Scraping page 178...


 99%|█████████▉| 178/180 [12:12<00:10,  5.28s/it]

Scraping page 179...


 99%|█████████▉| 179/180 [12:17<00:05,  5.16s/it]

Scraping page 180...


100%|██████████| 180/180 [12:21<00:00,  4.12s/it]


Export the scraped data to a CSV file

In [16]:
print(f"Total recordings scraped: {len(all_data)}")
save_to_parquet(all_data, file_save_location)

Total recordings scraped: 89546
Data saved to ../data/raw/xeno-canto.parquet


Also download all the audio files to a folder. **Note**: This takes a long time (5-6 hours) and a lot of space (~50GB).

In [17]:
for recording_data in tqdm(all_data):
    audio_url = recording_data["file"]
    original_audio_file_name = recording_data["file-name"]
    file_extension = original_audio_file_name.split(".")[-1]
    new_audio_file_name = f"{recording_data['id']}.{file_extension}"

    download_audio_file(audio_url, new_audio_file_name)

 43%|████▎     | 38326/89546 [2:27:38<2:28:41,  5.74it/s] 

Error downloading 309870.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 43%|████▎     | 38367/89546 [2:27:49<2:18:21,  6.16it/s]

Error downloading 335904.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 335903.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 49%|████▉     | 44276/89546 [2:51:10<1:52:25,  6.71it/s] 

Failed to download 357351.mp3: 500


 63%|██████▎   | 56472/89546 [3:39:04<1:37:27,  5.66it/s] 

Failed to download 441473.mp3: 500


 63%|██████▎   | 56632/89546 [3:39:49<1:25:01,  6.45it/s]

Failed to download 516953.mp3: 500


 65%|██████▍   | 57948/89546 [3:45:09<22:20, 23.56it/s]  

Error downloading 692902.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 577467.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 351080.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 941228.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 919299.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 481312.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 473934.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 394336.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 309892.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 347475.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 347474.: Invalid URL '': No scheme supplied. Perhaps you meant https://?

 68%|██████▊   | 60940/89546 [3:56:12<2:37:43,  3.02it/s]

Error downloading 167588.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 78%|███████▊  | 70118/89546 [4:30:12<56:08,  5.77it/s]  

Error downloading 500378.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 81%|████████▏ | 72949/89546 [4:41:58<21:22:35,  4.64s/it]

Error downloading 246962.mp3: HTTPSConnectionPool(host='xeno-canto.org', port=443): Max retries exceeded with url: /246962/download (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fddb81f2810>: Failed to resolve 'xeno-canto.org' ([Errno -3] Temporary failure in name resolution)"))


 85%|████████▌ | 76313/89546 [4:54:30<41:39,  5.29it/s]   

Error downloading 527580.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 489553.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 489416.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 483752.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 464986.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 359196.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 90%|█████████ | 80732/89546 [5:11:24<55:21,  2.65it/s]  

Failed to download 825922.mp3: 500


 95%|█████████▌| 85078/89546 [5:28:03<14:02,  5.30it/s]  

Error downloading 107400.: Invalid URL '': No scheme supplied. Perhaps you meant https://?
Error downloading 107397.: Invalid URL '': No scheme supplied. Perhaps you meant https://?


 99%|█████████▉| 89018/89546 [5:42:44<19:44,  2.24s/it]

Error downloading 619072.mp3: HTTPSConnectionPool(host='xeno-canto.org', port=443): Max retries exceeded with url: /619072/download (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x7fddb81f32c0>: Failed to resolve 'xeno-canto.org' ([Errno -3] Temporary failure in name resolution)"))


100%|██████████| 89546/89546 [5:44:32<00:00,  4.33it/s]


In [18]:
audio_files_total_size = sum(file.stat().st_size for file in Path(audio_files_dir).rglob('*'))
print(f"Total size of downloaded audio files: {audio_files_total_size / 1e9:.2f} GB")
print(f"Number of metadata records: {len(all_data)}")

Total size of downloaded audio files: 50.17 GB
Number of metadata records: 89546
