# Data collection
This file contains the code to collect a list of bird species from GBIF in the Ede area and download related bird song audio from Xeno Canto.

GBIF.org (01 March 2023) GBIF Occurrence Download  https://doi.org/10.15468/dl.hzws49

In [164]:
import cgi
import logging
import os
import shutil
from urllib.request import urlopen, urlretrieve, HTTPError, URLError

import duckdb
import pandas as pd

# Send log records to app.log (opened in write mode) as "<logger> - <level> - <message>"
logging.basicConfig(
    format='%(name)s - %(levelname)s - %(message)s',
    filemode='w',
    filename='app.log',
)

In [None]:
# Connect to the DuckDB database file. The parent directory is created first
# because opening the database fails when that directory does not exist yet
# (e.g. on a fresh checkout).
DB_PATH = 'data/db/collection.db'
os.makedirs(os.path.dirname(DB_PATH), exist_ok=True)
con = duckdb.connect(DB_PATH)

## Functions

In [120]:
def fetch_bird_recordings(search_key, page_number):
    """Fetch xeno-canto recordings for one search key and store them in DuckDB.

    Starting at ``page_number``, every remaining result page of the query
    ``q:A+<search_key>`` is requested. The recordings of each page whose
    ``also`` field equals ``'[]'`` (presumably: no other species audible --
    verify against the API docs) are appended to the existing
    ``bird_recordings`` table, with ``search_key`` added as the last column.

    The ``bird_recordings`` table must already exist; this function only
    inserts into it.

    Args:
        search_key: Species name in query form, e.g. ``'parus+major'``.
        page_number: Result page to start from.
    """
    # Loop over the pages instead of recursing once per page.
    while True:
        print(f'Fetching page {page_number} of {search_key}')
        url = f'https://xeno-canto.org/api/2/recordings?query=q:A+{search_key}&page={page_number}'
        response = pd.read_json(url)
        current_page = response['page'].max()
        number_of_pages = response['numPages'].max()
        recordings = pd.DataFrame(response.recordings.values.tolist())

        # A page without recordings produces a frame without columns, which
        # the query below could not bind against, so skip the insert then.
        if not recordings.empty:
            # Carry the key as a regular column instead of formatting it into
            # the SQL text, so quotes in the key cannot break the statement.
            recordings = recordings.assign(search_key=search_key)

            # DuckDB resolves `recordings` to the local DataFrame above.
            con.sql('''
                INSERT INTO bird_recordings
                SELECT rec.*
                FROM recordings AS rec
                WHERE rec.also = '[]'
            ''')

        if not current_page < number_of_pages:
            break
        page_number += 1

In [161]:
def download_audio_data(dest_folder, file_url):
    """Download one audio file into ``data/audio/<dest_folder>/``.

    The file name is taken from the response's ``Content-Disposition``
    header. HTTP errors, URL errors and a missing file name are printed and
    logged instead of raised, so a loop over many files keeps going.

    Args:
        dest_folder: Sub-folder of ``data/audio`` to store the file in.
        file_url: URL of the audio file to download.
    """
    try:
        # One request serves both the header lookup and the payload; the
        # context manager closes the response when done.
        with urlopen(file_url) as remotefile:
            remote_name = remotefile.info().get_filename()
            if not remote_name:
                log_msg = f'Failed to download file at {file_url}, because the response has no filename'
                print(log_msg)
                logging.error(log_msg)
                return

            # Keep only the last path component so the header value cannot
            # point outside the destination folder.
            filename = f'data/audio/{dest_folder}/{os.path.basename(remote_name)}'

            os.makedirs(os.path.dirname(filename), exist_ok=True)
            with open(filename, 'wb') as audio_file:
                shutil.copyfileobj(remotefile, audio_file)

        print(f'Downloaded file: {filename}')
    except HTTPError as err:
        # HTTPError is a subclass of URLError, so it has to be handled first.
        log_msg = f'Failed to download file at {file_url}, because of error code: {err.code}'
        print(log_msg)
        logging.error(log_msg)

    except URLError as err:
        log_msg = f'Failed to download file at {file_url}, because of error reason: {err.reason}'
        print(log_msg)
        logging.error(log_msg)
    

## Pipeline

In [48]:
# Load observed species based on GBIF data from Ede into a grouped pandas dataframe
df = pd.read_csv('data/csv/ede_bird_occurrence_pp.csv', sep='\t')
species = (
    df[['species', 'individualCount']]
    # A missing count is treated as one individual. Only that column is
    # filled, so rows without a species name are not turned into a species
    # called 1; groupby drops those rows instead.
    .fillna({'individualCount': 1})
    .groupby('species')
    .agg(['sum', 'count'])
    # Flatten the (column, aggregate) header to 'individualCount_sum' / 'individualCount_count'
    .pipe(lambda x: x.set_axis(x.columns.map('_'.join), axis=1))
    .reset_index()
)

In [58]:
# Store the dataframe data into the database.
# CREATE OR REPLACE keeps this cell re-runnable: the database file persists
# between kernel sessions, so a plain CREATE TABLE fails on the second run.
con.sql('''
    CREATE OR REPLACE TABLE bird_observations_ede AS
    SELECT
        species,
        REPLACE(LOWER(species), ' ', '+') AS search_key,
        individualCount_sum               AS observation_sum,
        individualCount_count             AS observation_cnt
    FROM
        species
    ORDER BY
        individualCount_count DESC
''')

In [122]:
# Take the 50 most frequently observed species and fetch their search keys to loop over.
# The explicit ORDER BY makes "top 50" independent of the table's storage order.
search_keys = con.sql('''
    SELECT search_key
    FROM bird_observations_ede
    ORDER BY observation_cnt DESC, species
    LIMIT 50
''').fetchnumpy()

# Show the same selection with all columns for reference
con.sql('''
    SELECT *
    FROM bird_observations_ede
    ORDER BY observation_cnt DESC, species
    LIMIT 50
''').to_df()

Unnamed: 0,species,search_key,observation_sum,observation_cnt
0,Parus major,parus+major,1351.0,729
1,Fringilla coelebs,fringilla+coelebs,3835.0,575
2,Dryocopus martius,dryocopus+martius,493.0,452
3,Buteo buteo,buteo+buteo,548.0,434
4,Sitta europaea,sitta+europaea,559.0,427
5,Dendrocopos major,dendrocopos+major,483.0,383
6,Lanius excubitor,lanius+excubitor,344.0,336
7,Emberiza citrinella,emberiza+citrinella,670.0,331
8,Turdus merula,turdus+merula,503.0,321
9,Alauda arvensis,alauda+arvensis,784.0,282


In [124]:
# Fetch the recordings of every selected search key, starting at result page 1
for search_key in search_keys['search_key']:
    fetch_bird_recordings(search_key=search_key, page_number=1)

Fetching page 1 of parus+major
Fetching page 2 of parus+major
Fetching page 3 of parus+major
Fetching page 4 of parus+major
Fetching page 1 of fringilla+coelebs
Fetching page 2 of fringilla+coelebs
Fetching page 3 of fringilla+coelebs
Fetching page 4 of fringilla+coelebs
Fetching page 1 of dryocopus+martius
Fetching page 1 of buteo+buteo
Fetching page 1 of sitta+europaea
Fetching page 2 of sitta+europaea
Fetching page 1 of dendrocopos+major
Fetching page 2 of dendrocopos+major
Fetching page 1 of lanius+excubitor
Fetching page 1 of emberiza+citrinella
Fetching page 2 of emberiza+citrinella
Fetching page 1 of turdus+merula
Fetching page 2 of turdus+merula
Fetching page 3 of turdus+merula
Fetching page 4 of turdus+merula
Fetching page 1 of alauda+arvensis
Fetching page 1 of turdus+philomelos
Fetching page 2 of turdus+philomelos
Fetching page 3 of turdus+philomelos
Fetching page 1 of cyanistes+caeruleus
Fetching page 2 of cyanistes+caeruleus
Fetching page 1 of lanius+collurio
Fetching page

In [126]:
# Number of stored recordings per search key, largest first
recordings_per_key = con.sql('''
    SELECT search_key, COUNT(*) AS count
    FROM bird_recordings
    GROUP BY search_key
    ORDER BY count DESC
''')
recordings_per_key.to_df()

Unnamed: 0,search_key,count
0,parus+major,1173
1,turdus+merula,923
2,erithacus+rubecula,872
3,fringilla+coelebs,844
4,troglodytes+troglodytes,798
5,phylloscopus+collybita,738
6,turdus+philomelos,689
7,sylvia+atricapilla,641
8,cyanistes+caeruleus,565
9,dendrocopos+major,520


In [167]:
# Build the download list: per stored recording, the destination folder
# (search key with '+' replaced by '_') and the file URL
download_list = con.sql('''
    SELECT REPLACE(search_key, '+', '_') AS dest_folder, file
    FROM bird_recordings
''')
df = download_list.to_df()

In [168]:
# Display the dataframe built in the previous cell
df

Unnamed: 0,dest_folder,file
0,corvus_corax,https://xeno-canto.org/782652/download
1,corvus_corax,https://xeno-canto.org/782651/download
2,corvus_corax,https://xeno-canto.org/780911/download
3,corvus_corax,https://xeno-canto.org/779842/download
4,corvus_corax,https://xeno-canto.org/779216/download
...,...,...
6034,phoenicurus_phoenicurus,https://xeno-canto.org/76763/download
6035,phoenicurus_phoenicurus,https://xeno-canto.org/76762/download
6036,phoenicurus_phoenicurus,https://xeno-canto.org/35035/download
6037,phoenicurus_phoenicurus,https://xeno-canto.org/33363/download


In [None]:
# Download every listed file into its destination folder
for dest_folder, file_url in zip(df['dest_folder'], df['file']):
    download_audio_data(dest_folder, file_url)

Downloaded file: data/audio/corvus_corax/XC782652 - Northern Raven - Corvus corax.mp3
Downloaded file: data/audio/corvus_corax/XC782651 - Northern Raven - Corvus corax.mp3
Downloaded file: data/audio/corvus_corax/XC780911 - Northern Raven - Corvus corax.mp3
Downloaded file: data/audio/corvus_corax/XC779842 - Northern Raven - Corvus corax corax.mp3
Downloaded file: data/audio/corvus_corax/XC779216 - Northern Raven - Corvus corax.wav
Downloaded file: data/audio/corvus_corax/XC779086 - Northern Raven - Corvus corax.wav
Downloaded file: data/audio/corvus_corax/XC778352 - Northern Raven - Corvus corax.wav
Downloaded file: data/audio/corvus_corax/XC778341 - Northern Raven - Corvus corax.wav
Downloaded file: data/audio/corvus_corax/XC776638 - Northern Raven - Corvus corax.mp3
Downloaded file: data/audio/corvus_corax/XC774442 - Northern Raven - Corvus corax.mp3
Downloaded file: data/audio/corvus_corax/XC771478 - Northern Raven - Corvus corax.mp3
Downloaded file: data/audio/corvus_corax/XC76871

In [116]:
# con.sql('''
#     SELECT id
#     FROM bird_recordings
# ''').to_csv('data/csv/ids.csv')

## Experimental code snippets
These snippets were used for experimentation and are kept for future use.

In [89]:
# Example request: result page 1 for the search key 'parus+major'
search_key, page_number = 'parus+major', 1
url = f'https://xeno-canto.org/api/2/recordings?query=q:A+{search_key}&page={page_number}'

# urlretrieve(url, f'data/json/{search_key}_{page_number}.json')

('data/json/parus+major_1.json', <http.client.HTTPMessage at 0x172e16ac0>)

In [94]:
# Collect recordings: read the paging info first, then expand the list of
# recordings into one row per recording
df = pd.read_json(url)
current_page, number_of_pages = df['page'].max(), df['numPages'].max()
df = pd.DataFrame(df['recordings'].tolist())

In [152]:
# Download audio file
dest_folder = 'test'
file_url = 'https://xeno-canto.org/783426/download'

# This first request is only used to read the file name from the headers;
# the context manager closes the response so it is not left open.
with urlopen(file_url) as remotefile:
    contentdisposition = remotefile.info()['Content-Disposition']
_, params = cgi.parse_header(contentdisposition)
filename = f'data/audio/{dest_folder}/{params["filename"]}'

os.makedirs(os.path.dirname(filename), exist_ok=True)
urlretrieve(file_url, filename)

('data/audio/test/XC783426 - Australian Masked Owl - Tyto novaehollandiae castanops.wav',
 <http.client.HTTPMessage at 0x1777feca0>)

In [165]:
# Logging check: print a message and send it to the root logger at ERROR level
log_msg = 'Test message'
print(log_msg)
logging.error(log_msg)

Test message


In [123]:
# con.sql(f'''
#     CREATE TABLE bird_recordings AS
#     SELECT rec.*, '{search_key}' AS search_key
#     FROM df_rec AS rec
#     WHERE also = '[]'
# ''')

# con.sql('''
#     DELETE
#     FROM bird_recordings
# ''')

In [None]:
# Close the database connection held in `con`
con.close()