# Purpose
This file is used to generate the metadata files for diverse datasets.

## Storing Data
Before running this notebook make shure that the data is in folders corresponding to the dataset names. The folder structure of the datafolder should be as follows:
- data_path
    - Data
        - OekoFor
            - 2016_Scotland
                - *audio_files*
            - ...
        - Zenodo
            - *audio_files*
        - Powdermill
            - Recording_1
                - *audio_files*
            - ...



In [1]:
#The path to the folder containing all audiofiles, this folder should be named Data
#If Data is in the same directory as this notebook the data_path should be the empty string
data_path = "/data-project/DeepBirdDetect/BirdSet/"
#The path to where annotation files are stored
annotations_path = "Annotations/"
#The path to where the gernerated metadata should be stored
metadata_path = "Metadata/"

In [2]:
"""
Build an empty dictionary with the colmun-names as keys. For each dataset (i.e. Oeko4, Powdermill, etc.) the empty_dataset is deepcopied and filled.
The resulting dataset can then be converted into a HuggingFace Dataset object.
Also build dictionarys to translate different bird-codes
"""

columns=["id", "filepath", "start_time", "end_time", "low_freq", "high_freq", "ebird_code"
         , "call_type", "sex", "lat", "long", "microphone", "license", "source", "local_time"]
"""
Read the taxonomy from the csv file and convert it to the e_bird_codes dictionary.
common_to_ebird maps common_names to e-bird-codes
"""
import csv
common_to_ebird = {} #This dict saves the e-bird-code for each common name
with open('ebird_taxonomy_v2022.csv', newline='') as csvfile:
    taxonomy = csv.reader(csvfile, delimiter=',', quotechar='|')
    first = True
    for row in taxonomy:
        if first:
            first = False
            continue
        common_name = row[3]
        e_bird_code = row[2]
        common_to_ebird[common_name]=e_bird_code

"""
The sci_to_ebird dictionary maps each scientific (latin) name to the corresponding e-bird-code.
"""
import csv
sci_to_ebird = {} #This dict saves the e-bird-code for each common name
with open('ebird_taxonomy_v2022.csv', newline='') as csvfile:
    taxonomy = csv.reader(csvfile, delimiter=',', quotechar='|')
    first = True
    for row in taxonomy:
        if first:
            first = False
            continue
        sci_name = row[4]
        e_bird_code = row[2]
        sci_to_ebird[sci_name]=e_bird_code

"""
The ebird_to_common dictionary maps each e-bird-code to the corresponding common name.
"""
ebird_to_common = {} #This dict saves the e-bird-code for each common name
with open('ebird_taxonomy_v2022.csv', newline='') as csvfile:
    taxonomy = csv.reader(csvfile, delimiter=',', quotechar='|')
    first = True
    for row in taxonomy:
        if first:
            first = False
            continue
        common_name = row[3]
        e_bird_code = row[2]
        ebird_to_common[e_bird_code]=common_name



"""
The alpha_to_ebird dictionary maps each alpha-code to the corresponding ebird-code.
"""
alpha_to_ebird = {} #This dict saves the alpha-code for each e_bird_code
not_found = []
non_bird_sound = []
with open('AlphaCodes.csv', newline='') as csvfile:
    taxonomy = csv.reader(csvfile, delimiter=',', quotechar='|')
    first = True
    for row in taxonomy:
        if first:
            first = False
            continue
        common_name = row[3]
        alpha_code = row[1]
        sci_name = row[4]

        #I am pretty shure that AMGP is a typo in the AlphaCodes.csv
        #AMGO does not appear as a label for any sample
        #AMGO does appear as a label for 62 samples but is not in AlphaCodes.csv
        if alpha_code == "AMGP":
            alpha_code = "AMGO"

        if sci_name in sci_to_ebird.keys():
            alpha_to_ebird[alpha_code]=sci_to_ebird[sci_name]
        elif common_name in common_to_ebird.keys():
            #If the scientific name can not be found try the common name
            alpha_to_ebird[alpha_code]=common_to_ebird[common_name]
        else:
            not_found.append(f"{common_name}/ {sci_name}")
print(f"There are {len(not_found)} out of {len(not_found)+len(alpha_to_ebird.keys())} entrys in the Alpha codes which I could not translate to ebirdcodes")


"""
The NIPS4BPlus dataset uses it's verry own bird names. The nips_to_ebird dictionary maps these bird names to the corresponding ebird-codes.
"""
nips_to_ebird = {} #This dict saves the nips4bplus names for each e_bird_code
nips_to_ebird["Human"]="not_a_bird"
not_found = []
with open('nips4b_birdchallenge_espece_list.csv', newline='') as csvfile:
    taxonomy = csv.reader(csvfile, delimiter=',', quotechar='|')
    first = True
    for row in taxonomy:
        if first:
            first = False
            continue

        common_name = row[2]
        nips_code = row[1]
        sci_name = row[3]
        type = row[4]

        #fixing a misspelled name
        if sci_name == "lullula arborea":
            sci_name = "Lullula arborea"
        
        #fixing synonimous scientific names
        if sci_name == "Sylvia cantillans":
            sci_name = "Curruca cantillans"
        if sci_name == "Sylvia melanocephala":
            sci_name = "Curruca melanocephala"

        if type != "bird":
            #This is not a bird sound
            nips_to_ebird[nips_code]="not_a_bird"
        elif sci_name in sci_to_ebird.keys():
            nips_to_ebird[nips_code]=sci_to_ebird[sci_name]
        elif common_name in common_to_ebird.keys():
            #If the scientific name can not be found try the common name
            nips_to_ebird[nips_code]=common_to_ebird[common_name]    
        else:
            not_found.append(f"{common_name}/ {sci_name}")
print(f"There are {len(not_found)} out of {len(not_found)+len(alpha_to_ebird.keys())} entrys in the NIPS4BPlus codes which I could not translate to ebirdcodes")
for x in not_found:
    print(x)

There are 79 out of 2326 entrys in the Alpha codes which I could not translate to ebirdcodes
There are 0 out of 2247 entrys in the NIPS4BPlus codes which I could not translate to ebirdcodes


## Zenodo
The following Blocks build the Zenodo dataset and adds it to the list of datasets. The Zenodo dataset consists of 5 Datasets, namely HSN, SNE, UHH, PER, SSW and NES. These are stored as individual metadata-files.

#### Define Constants
The following codeblock defines some required constants. These include the folder names for the 5 Datasets, their respective sources, as well as their respective cooridinates (Latitude, Longitude).

In [3]:
subsets = ["HSN", "SNE", "UHH", "PER", "SSW", "NES"]

sources={"HSN": "https://zenodo.org/record/7525805",
        "SNE": "https://zenodo.org/record/7050014",
        "UHH": "https://zenodo.org/record/7078499",
        "PER": "https://zenodo.org/record/7079124",
        "SSW": "https://zenodo.org/record/7079380",
        "NES": "https://zenodo.org/record/7525349"}
"""
Coordinates are saved as [latitude, logitude]. Some of the datasets contain multiple recording sights.
I filled these in by hand, as the different datasets had used different formats which had to be 
converted to (Lat,Lon).
"""
coordinates={"HSN": {0:[37.0, -118.5]},
             "SNE": {0:[38.49, -119.95]},
             "UHH": {1:[19.801668, -155.609444],
                     2:[19.792975, -155.321332],
                     3:[19.46647, -155.582011],
                     4:[19.820609, -155.468097]},
             "PER": {1:[-12.542578, -69.062050],
                     2:[-12.541925, -69.058642],
                     4:[-12.537814, -69.054308],
                     5:[-12.535539, -69.06674],
                     6:[-12.532981, -69.049864],
                     8:[-12.529858, -69.046164],
                     10:[-12.522983, -69.046822]},
             "SSW": {0:[42.4768, -76.4527]},
             "NES": {1:[5.59,-75.85],
                     2:[10.11,-84.52]}
} 

#### Build Dataset
The following block builds a dictionary from the data and converts it into a HuggingFace Dataset Object

In [4]:
import csv
from datetime import date, datetime, time, timedelta

for subset in subsets:
    ID=0
    with open(f'{annotations_path}Zenodo/{subset}_annotations.csv', newline='') as annotations:
        reader = csv.DictReader(annotations) 
        with open(f"{metadata_path}{subset}_metadata.csv","w",newline="") as file:
            writer = csv.writer(file)
            writer.writerow(columns)
            for sample in reader:
                #filter out unknown birds (in the case of zenodo these are all marked as ????)
                if sample["Species eBird Code"] == "????":
                    continue

                # Some of the subsets used multiple recording sights
                if subset in ["UHH","PER","NES"]:
                    #This part of the filename denotes the Recording Sight
                    sight = int(sample['Filename'][9:11])
                else:
                    sight = 0

                #start time of the recording
                #note that the date is irrelevant
                if subset in ["HSN","SNE","SSW"]:
                    t = sample['Filename'][17:23]
                else:
                    t = sample['Filename'][21:27]
                start_time = time(int(t[0:2]),int(t[2:4]),int(t[4:6]))
                local_time = datetime.combine(date.today(),start_time)
                #the start of the actual bird sound is relative to the audiofile
                local_time+=timedelta(seconds=int(float(sample['Start Time (s)'])))
                row = []
                row.append(str(ID))#id
                row.append(f"Data/Zenodo/{sample['Filename']}")#filepath
                row.append(sample["Start Time (s)"])#start_time
                row.append(sample["End Time (s)"])#end_time
                row.append(sample["Low Freq (Hz)"])#low_freq
                row.append(sample["High Freq (Hz)"])#high_freq
                row.append(sample["Species eBird Code"])#ebird_code
                row.append(None)#call_type
                row.append(None)#sex
                row.append(coordinates[subset][sight][0])#lat
                row.append(coordinates[subset][sight][1])#long
                row.append("Soundscape")#microphone
                row.append("Creative Commons Attribution 4.0 International Public License")#license
                row.append(sources[subset])#source
                row.append(str(local_time.time()))#local_time
                writer.writerow(row)
                ID += 1
print(row)

['6951', 'Data/Zenodo/NES_034_S02_20191009_170000.flac', '3585.0', '3585.6', '2881', '5423', 'yeceup1', None, None, 10.11, -84.52, 'Soundscape', 'Creative Commons Attribution 4.0 International Public License', 'https://zenodo.org/record/7525349', '17:59:45']


## Powdermill
The powdermill dataset is also a **Zenodo** dataset, but it must be processed differently.

In [5]:
import csv
import os
from datetime import date, datetime, time, timedelta

start_times = {"Recording_1":time(5,32), "Recording_2":time(5,32), 
               "Recording_3":time(5,17), "Recording_4":time(6,19)}
folders = ["Recording_1", "Recording_2", "Recording_3", "Recording_4"]

ID=0
other_sounds=[]
with open(f"{metadata_path}POW_metadata.csv","w",newline="") as file:
    writer = csv.writer(file)
    writer.writerow(columns)
    for folder in folders:
        path = f"{annotations_path}Powdermill/{folder}/"
        annotations = [name for name in os.listdir(path)]
        for annotation_file in annotations:
            with open(f'{path}{annotation_file}', newline='') as annotations:
                reader = csv.DictReader(annotations, delimiter='\t') 
                for sample in reader:                         
                    #Samples that are not Birds are removed
                    bird = sample["Species"]
                    if bird in alpha_to_ebird:
                        bird = alpha_to_ebird[bird]
                    else:
                        other_sounds.append(bird)
                        continue
                    
                    #For uniformity replace NA and "" with None
                    for key in sample.keys():
                        if sample[key] == "NA" or sample[key]=="":
                            sample[key] = None
                    
                    #start time of the recording
                    #note that the date is irrelevant
                    local_time = datetime.combine(date.today(),start_times[folder])
                    #each recording is split into segments in 5 minute intervals
                    segment = int(annotation_file[20:22])
                    local_time+=timedelta(minutes=segment*5)
                    #the start of the actual bird sound is relative to the audiofile
                    local_time+=timedelta(seconds=int(float(sample['Begin Time (s)'])))

                    row = []
                    row.append(str(ID))#id
                    audio_name = f"{folder}_Segment_{annotation_file[20:22]}.WAV"
                    row.append(f"Data/Powdermill/{audio_name}")#filepath
                    row.append(sample['Begin Time (s)'])#start_time
                    row.append(sample['End Time (s)'])#end_time
                    row.append(sample['Low Freq (Hz)'])#low_freq
                    row.append(sample['High Freq (Hz)'])#high_freq
                    row.append(bird)#ebird_code
                    row.append(None)#call_type
                    row.append(None)#sex
                    row.append(40.1602)#lat
                    row.append(-79.2719)#long
                    row.append("Soundscape (AudioMoths)")#microphone
                    row.append("Creative Commons Zero v1.0 Universal")#license
                    row.append("https://zenodo.org/record/4656848")#source
                    row.append(str(local_time.time()))#local_time
                    writer.writerow(row)
                    ID += 1
print(row)

['16051', 'Data/Powdermill/Recording_4_Segment_10.WAV', '293.635281287', '295.32619025', '1982.2', '4870.6', 'eastow', None, None, 40.1602, -79.2719, 'Soundscape (AudioMoths)', 'Creative Commons Zero v1.0 Universal', 'https://zenodo.org/record/4656848', '07:13:53']


## NIPS4BPlus

In [6]:
import csv, os
from datetime import date, datetime, time, timedelta

annotation_files = [name for name in os.listdir(f"{annotations_path}NIPS4BPlus/")]
ID=0
unknown = 0
with open(f"{metadata_path}NIPS4BPlus_metadata.csv","w",newline="") as file:
    writer = csv.writer(file)
    writer.writerow(columns)
    for annotation_file in annotation_files:
        with open(f'{annotations_path}NIPS4BPlus/{annotation_file}', newline='') as annotation_f:
            reader = csv.DictReader(annotation_f, fieldnames = ["start_time","duration","label"]) 
            for sample in reader:
                
                if sample["label"]=="Unknown":
                    unknown+=1
                    continue

                row = []
                row.append(str(ID))#id
                row.append(f"Data/NIPS4BPlus/nips4b_birds_trainfile{annotation_file[-7:-4]}.wav")#filepath
                row.append(sample["start_time"])#start_time
                row.append(float(sample["start_time"])+float(sample["duration"]))#end_time
                row.append(None)#low_freq
                row.append(None)#high_freq
                row.append(nips_to_ebird[sample["label"]])#ebird_code
                row.append(sample["label"][-4:])#call_type
                row.append(None)#sex
                row.append(None)#lat
                row.append(None)#long
                row.append("Soundscape (SMX-US)")#microphone
                row.append(None)#license
                row.append("https://figshare.com/articles/dataset/Transcriptions_of_NIPS4B_2013_Bird_Challenge_Training_Dataset/6798548")#source
                row.append(None)#local_time
                writer.writerow(row)
                ID += 1
print(row)
print(f"{ID} rows created")
print(f"{unknown} unknown labels")

['5492', 'Data/NIPS4BPlus/nips4b_birds_trainfile344.wav', '0.391836735', 0.59138322, None, None, 'spofly1', 'call', None, None, None, 'Soundscape (SMX-US)', None, 'https://figshare.com/articles/dataset/Transcriptions_of_NIPS4B_2013_Bird_Challenge_Training_Dataset/6798548', None]
5493 rows created
282 unknown labels


# Bird-DB
The following blocks build the Bird-DB metadata. The Bird-DB.pkl file is required and can be build in usefull_code.ipynb
The columns of the bird_db are:
"track_name","microphone","sample_rate","recording_date","recording_time","recording_length",
"audio_file", "importance","quality_rating", "common_name", "sex", "age_class", 
"certainty_of_species","lat_deg","lat_min","lat_sec","lat_orientation","long_deg",
"long_min","long_sec","long_orientation", "country", "number_of_phrases", "textgrid_file"

In [7]:
%pip install textgrid

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pickle, textgrid, csv
from pathlib import Path
from datetime import time, timedelta

# Converts degrees, minuts seconds to lat/long
def dms2dd(degrees, minutes, seconds, direction):
    dd = float(degrees) + float(minutes)/60 + float(seconds)/(60*60)
    if direction == 'W' or direction == 'S':
        dd *= -1
    return dd

with open(f"{annotations_path}Bird-DB.pkl","rb") as f:
    bird_db = pickle.load(f)


#Change audio-paths to audio name
bird_db["audio_file"]=bird_db["audio_file"].apply(lambda path : Path(path).name)

#Change textgrid-paths to local paths
bird_db["textgrid_file"]=bird_db["textgrid_file"].apply(lambda path : f"./{annotations_path}Bird-DB/{Path(path).name}")

#Some textgrid-files did not exist (404 error when opening link)
count = 0
for index, row in bird_db.iterrows():
    with open(row["textgrid_file"],"r") as f:
        if '<title>404 Not Found</title>' in f.read():
            row["textgrid_file"]=None 
            count += 1
print(f"{count} textgrid-files where not downloadable")

with open(f"{metadata_path}BirdDB_metadata.csv","w",newline="") as file:
    writer = csv.writer(file)
    writer.writerow(columns)
    #Each row in the bird_db corresponds to one species in a specific audio-file
    #Each row has a textgrid-file which specifes, where the bird-calls occur
    ID=0
    for index, input_row in bird_db.iterrows():
        if input_row["textgrid_file"] is None:
            continue
        
        if input_row["audio_file"] in ["1438.WAV", "1454.WAV", "1617.WAV", "TRK15a-12.WAV",
                                        "GTNP609SATHi.wav", "GTNP609SATHj.wav"]:
            #A few audio-files are broken
            continue


        #each input_row anotates a specific bird
        bird = input_row["common_name"]
        if bird not in common_to_ebird:
            #Unknown Bird
            print(f"ebird code for {bird} not known")
            continue
        bird = common_to_ebird[bird]

        # For eficiency some calclulations can be done before iteration over the textgrid entries
        t = input_row["recording_time"].split(":")
        t = map(int, t)
        #start time of the recording
        #note that the date is irrelevant
        t = datetime.combine(date.today(),time(*t))
        #The annotators seem to have gotten the orientations of the locations wrong at times
        #All recordings where done in the USA (California and Wyoming)
        #Therefore lat_orientation must be N and long_orientation must be W
        lat_dms = [input_row[key] for key in ["lat_deg","lat_min","lat_sec"]]
        lat_dms.append("N")
        long_dms = [input_row[key] for key in ["long_deg","long_min","long_sec"]]
        long_dms.append("W")
        if "" in lat_dms or "" in long_dms:
            lat = None
            long = None
        else:
            lat = round(dms2dd(*lat_dms),5)
            long = round(dms2dd(*long_dms),5)

        #Now iterate over each item and interval in the given textgrid file
        tg = textgrid.TextGrid.fromFile(input_row["textgrid_file"])
        for items in tg:
            for interval in items:
                if interval.mark == "":
                    # the bird is not detected in this interval
                    continue
                elif interval.mark == "~1":
                    #I assume this means that the bird is hardly audible here
                    #I therefore think, that this should be removed.
                    continue
                row = []
                row.append(str(ID))#id
                row.append(f"Data/BirdDB/{input_row['audio_file']}")#filepath
                row.append(interval.minTime)#start_time
                row.append(interval.maxTime)#end_time
                row.append(None)#low_freq
                row.append(None)#high_freq
                row.append(bird)#ebird_code
                row.append(None)#call_type
                if "male" in str.lower(input_row["sex"]):
                    row.append("male")#sex
                elif "female" in str.lower(input_row["sex"]):
                    row.append("female")#sex
                else:
                    row.append(None)#sex
                row.append(lat)#lat
                row.append(long)#long
                row.append(input_row["microphone"])#microphone
                row.append("Attribution-NonCommercial-NoDerivs 4.0 International")#license
                row.append("https://doi.org/10.1016/j.ecoinf.2015.01.007")#source
                local_time=t+timedelta(seconds=int(interval.minTime))
                row.append(local_time.time())#local_time
                writer.writerow(row)
                ID += 1
print(row)



24 textgrid-files where not downloadable
['106146', 'Data/BirdDB/SMMJAN13TRK9.WAV', 167.59684, 168.40132, None, None, 'calthr', None, None, 34.08619, -118.64531, 'Sennheiser omnidirectional with Telinga parabolic reflector', 'Attribution-NonCommercial-NoDerivs 4.0 International', 'https://doi.org/10.1016/j.ecoinf.2015.01.007', datetime.time(13, 6, 47)]


## OekoFor
The oekofor dataset still has some dificulties:
- time can be extracted from the filenames (in Greenwich Mean Time)
- the rough locantion can be extracted from the folder names