# Purpose
This file is used to generate the metadata files for diverse datasets.

## Preparation

In [1]:
# Installing dependencies
#%pip install datasets
#%pip install librosa
%pip install -U matplotlib

Requirement already up-to-date: matplotlib in /home/jlange/.local/lib/python3.8/site-packages (3.7.2)
Note: you may need to restart the kernel to use updated packages.


Change the path variables defined below if the Data or Annotations folders are not in the same directory as this notebook.

## Storing Data
Before running this notebook, make sure that the data is stored in folders corresponding to the dataset names. The folder structure of the data folder should be as follows:
- data_path
    - Data
        - OekoFor
            - 2016_Scotland
                - *audio_files*
            - ...
        - Zenodo
            - *audio_files*
        - Powdermill
            - Recording_1
                - *audio_files*
            - ...



In [5]:
#Root folder that contains the "Data" folder with all audio files (see the folder
#structure described above).
#If Data is in the same directory as this notebook the data_path should be the empty string
data_path = "/data-project/DeepBirdDetect/BirdSet/"
#The path to where annotation files are stored.
#The cells below build file names by plain string concatenation
#(e.g. f"{annotations_path}Zenodo/..."), so a non-empty path must end with "/".
annotations_path = "Annotations/"
#The path to where the generated metadata CSV files are written (same trailing "/" rule)
metadata_path = "Metadata/"

In [2]:
"""
Build an empty dictionary with the column names as keys. For each dataset (i.e. Oeko4, Powdermill, etc.) the empty_dataset is deep-copied and filled.
The resulting dataset can then be converted into a HuggingFace Dataset object.
Also build dictionaries to translate between different bird codes.
"""
import copy
import csv

# Column layout shared by all generated metadata CSV files. The dataset
# sections write this list as the header row and append the values of every
# row in exactly this order.
columns = ["id", "filepath", "start_time", "end_time", "low_freq", "high_freq", "ebird_code",
           "call_type", "sex", "lat", "long", "microphone", "license", "source", "local_time"]


def _read_data_rows(csv_path):
    """Return every row of `csv_path` after the header row as a list of strings.

    The file is parsed with ',' as delimiter and '|' as quote character, so
    double quotes are NOT interpreted as quoting.
    NOTE(review): if any column read below contains a double-quoted value with
    an embedded comma, it is split into several fields - confirm that the
    input files are not affected.
    """
    with open(csv_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(reader, None)  # skip the header row (no-op for an empty file)
        return list(reader)


# Translation tables built from the eBird taxonomy. The file is read once;
# per row, index 2 is used as the e-bird-code, index 3 as the common name and
# index 4 as the scientific (latin) name.
common_to_ebird = {}  # common name     -> e-bird-code
sci_to_ebird = {}     # scientific name -> e-bird-code
ebird_to_common = {}  # e-bird-code     -> common name
for row in _read_data_rows('ebird_taxonomy_v2022.csv'):
    e_bird_code, common_name, sci_name = row[2], row[3], row[4]
    common_to_ebird[common_name] = e_bird_code
    sci_to_ebird[sci_name] = e_bird_code
    ebird_to_common[e_bird_code] = common_name

# The alpha_to_ebird dictionary maps each alpha-code to the corresponding
# e-bird-code. Per row of AlphaCodes.csv, index 1 is used as the alpha-code,
# index 3 as the common name and index 4 as the scientific name.
alpha_to_ebird = {}  # alpha-code -> e-bird-code
not_found = []       # "common/ scientific" names that could not be translated
for row in _read_data_rows('AlphaCodes.csv'):
    alpha_code, common_name, sci_name = row[1], row[3], row[4]
    if common_name in common_to_ebird:
        alpha_to_ebird[alpha_code] = common_to_ebird[common_name]
    elif sci_name in sci_to_ebird:
        # Sometimes different common names are used, so fall back to the
        # scientific name.
        alpha_to_ebird[alpha_code] = sci_to_ebird[sci_name]
    else:
        not_found.append(f"{common_name}/ {sci_name}")
print(f"There are {len(not_found)} out of {len(not_found)+len(alpha_to_ebird)} entrys in the Alpha codes which I could not translate to ebirdcodes")



There are 79 out of 2326 entrys in the Alpha codes which I could not translate to ebirdcodes


## Zenodo
The following blocks build the Zenodo dataset and add it to the list of datasets. The Zenodo dataset consists of 6 datasets, namely HSN, SNE, UHH, PER, SSW and NES. These are stored as individual metadata files.

#### Define Constants
The following code block defines some required constants. These include the folder names for the 6 datasets, their respective sources, as well as their respective coordinates (latitude, longitude).

In [6]:
# Zenodo record each subset was published under.
sources = {
    "HSN": "https://zenodo.org/record/7525805",
    "SNE": "https://zenodo.org/record/7050014",
    "UHH": "https://zenodo.org/record/7078499",
    "PER": "https://zenodo.org/record/7079124",
    "SSW": "https://zenodo.org/record/7079380",
    "NES": "https://zenodo.org/record/7525349",
}

# Subset names, in the same order as the entries of `sources`.
subsets = list(sources)

# Coordinates are saved as [latitude, longitude], keyed by recording-site
# number. Some of the datasets contain multiple recording sites; subsets with
# a single site use the key 0.
# These were filled in by hand, as the different datasets had used different
# formats which had to be converted to (Lat, Lon).
coordinates = {
    "HSN": {
        0: [37.0, -118.5],
    },
    "SNE": {
        0: [38.49, -119.95],
    },
    "UHH": {
        1: [19.801668, -155.609444],
        2: [19.792975, -155.321332],
        3: [19.46647, -155.582011],
        4: [19.820609, -155.468097],
    },
    "PER": {
        1: [-12.542578, -69.062050],
        2: [-12.541925, -69.058642],
        4: [-12.537814, -69.054308],
        5: [-12.535539, -69.06674],
        6: [-12.532981, -69.049864],
        8: [-12.529858, -69.046164],
        10: [-12.522983, -69.046822],
    },
    "SSW": {
        0: [42.4768, -76.4527],
    },
    "NES": {
        1: [5.59, -75.85],
        2: [10.11, -84.52],
    },
}

#### Build Dataset
The following block reads the annotation file of each subset and writes the corresponding rows to a metadata CSV file (`<subset>_metadata.csv`) in the metadata folder.

In [25]:
import csv
import os
from datetime import date, datetime, time, timedelta

# open(..., "w") does not create missing directories, so make sure the output
# folder exists. An empty metadata_path means "current directory".
if metadata_path:
    os.makedirs(metadata_path, exist_ok=True)

# Write one metadata CSV per Zenodo subset.
for subset in subsets:
    # ids restart at 0 because every subset is written to its own file
    ID = 0
    annotations_file = f'{annotations_path}Zenodo/{subset}_annotations.csv'
    metadata_file = f"{metadata_path}{subset}_metadata.csv"
    with open(annotations_file, newline='') as annotations, \
            open(metadata_file, "w", newline="") as file:
        reader = csv.DictReader(annotations)
        writer = csv.writer(file)
        writer.writerow(columns)
        for sample in reader:
            #filter out unknown birds (in the case of zenodo these are all marked as ????)
            if sample["Species eBird Code"] == "????":
                continue

            filename = sample['Filename']

            # Some of the subsets used multiple recording sites; for those,
            # characters [9:11] of the filename are read as the site number.
            # All other subsets only have the single site 0.
            if subset in ["UHH", "PER", "NES"]:
                site = int(filename[9:11])
            else:
                site = 0
            lat, long = coordinates[subset][site]

            # Start time of the recording, read as HHMMSS from a
            # subset-dependent position in the filename.
            if subset in ["HSN", "SNE", "SSW"]:
                t = filename[17:23]
            else:
                t = filename[21:27]
            start_time = time(int(t[0:2]), int(t[2:4]), int(t[4:6]))
            # The date is irrelevant: it is only needed because a timedelta
            # cannot be added to a bare time object.
            local_time = datetime.combine(date.today(), start_time)
            # The start of the actual bird sound is relative to the audiofile
            # (fractional seconds are truncated).
            local_time += timedelta(seconds=int(float(sample['Start Time (s)'])))

            # Values in the order given by `columns`.
            row = [
                str(ID),                        # id
                f"Data/Zenodo/{filename}",      # filepath
                sample["Start Time (s)"],       # start_time
                sample["End Time (s)"],         # end_time
                sample["Low Freq (Hz)"],        # low_freq
                sample["High Freq (Hz)"],       # high_freq
                sample["Species eBird Code"],   # ebird_code
                None,                           # call_type
                None,                           # sex
                lat,                            # lat
                long,                           # long
                "Soundscape",                   # microphone
                "Creative Commons Attribution 4.0 International Public License",  # license
                sources[subset],                # source
                # Only the time of day is stored. Writing the whole datetime
                # would leak the (arbitrary) run date into the metadata and
                # disagree with the Powdermill metadata, which stores
                # str(local_time.time()).
                str(local_time.time()),         # local_time
            ]
            writer.writerow(row)
            ID += 1


## Powdermill
The powdermill dataset is also a **Zenodo** dataset, but it must be processed differently.

In [16]:
import csv
import os
from datetime import date, datetime, time, timedelta

# Start time (hour, minute) assigned to each Powdermill recording.
start_times = {"Recording_1":time(5,32), "Recording_2":time(5,32), 
               "Recording_3":time(5,17), "Recording_4":time(6,19)}
folders = ["Recording_1", "Recording_2", "Recording_3", "Recording_4"]

# open(..., "w") does not create missing directories, so make sure the output
# folder exists. An empty metadata_path means "current directory".
if metadata_path:
    os.makedirs(metadata_path, exist_ok=True)

ID=0
other_sounds=[]  # species labels that could not be translated to an e-bird-code
with open(f"{metadata_path}POW_metadata.csv","w",newline="") as file:
    writer = csv.writer(file)
    writer.writerow(columns)
    for folder in folders:
        path = f"{annotations_path}Powdermill/{folder}/"
        # os.listdir returns the names in arbitrary order; sorting keeps the
        # row order and the assigned ids reproducible between runs and machines.
        for annotation_file in sorted(os.listdir(path)):
            # Characters [20:22] of the annotation file name are read as the
            # segment number.
            segment_tag = annotation_file[20:22]
            audio_name = f"{folder}_Segment_{segment_tag}.WAV"
            with open(f'{path}{annotation_file}', newline='') as annotation_handle:
                reader = csv.DictReader(annotation_handle, delimiter='\t')
                for sample in reader:
                    #Samples that are not Birds are removed
                    bird = sample["Species"]
                    if bird not in alpha_to_ebird:
                        other_sounds.append(bird)
                        continue
                    bird = alpha_to_ebird[bird]

                    #For uniformity replace NA and "" with None
                    for key in list(sample):
                        if sample[key] == "NA" or sample[key] == "":
                            sample[key] = None

                    #start time of the recording
                    #note that the date is irrelevant
                    local_time = datetime.combine(date.today(), start_times[folder])
                    #each recording is split into segments in 5 minute intervals
                    # NOTE(review): a segment numbered N is shifted by N*5
                    # minutes, so a segment "01" starts 5 minutes after the
                    # recording start - confirm this matches the numbering.
                    segment = int(segment_tag)
                    local_time += timedelta(minutes=segment*5)
                    #the start of the actual bird sound is relative to the audiofile
                    local_time += timedelta(seconds=int(float(sample['Begin Time (s)'])))

                    # Values in the order given by `columns`.
                    row = [
                        str(ID),                                  # id
                        # NOTE(review): the folder layout described at the top
                        # of this notebook places the audio files in
                        # Data/Powdermill/<Recording_N>/, but this path has no
                        # recording sub-folder - confirm which one is correct.
                        f"Data/Powdermill/{audio_name}",          # filepath
                        sample['Begin Time (s)'],                 # start_time
                        sample['End Time (s)'],                   # end_time
                        sample['Low Freq (Hz)'],                  # low_freq
                        sample['High Freq (Hz)'],                 # high_freq
                        bird,                                     # ebird_code
                        None,                                     # call_type
                        None,                                     # sex
                        40.1602,                                  # lat
                        -79.2719,                                 # long
                        "Soundscape (AudioMoths)",                # microphone
                        "Creative Commons Zero v1.0 Universal",   # license
                        "https://zenodo.org/record/4656848",      # source
                        str(local_time.time()),                   # local_time
                    ]
                    writer.writerow(row)
                    ID += 1
print(f"Was unable to lable {len(other_sounds)} samples")
print(f"Could not translate the following alpha-codes to ebird: {list(dict.fromkeys(other_sounds))}")

Was unable to lable 62 samples
Could not translate the following alpha-codes to ebird: ['AMGO']


## OekoFor
The OekoFor dataset still has some difficulties:
- time can be extracted from the filenames (in Greenwich Mean Time)
- the rough location can be extracted from the folder names