In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
gbif_path = 'xeno_canto_bsfatw'

occurrence_df = pd.read_csv(os.path.join(gbif_path, 'occurrence.txt'), sep = '\t')

In [4]:
occurrence_df.columns

Index(['gbifID', 'abstract', 'accessRights', 'accrualMethod',
       'accrualPeriodicity', 'accrualPolicy', 'alternative', 'audience',
       'available', 'bibliographicCitation',
       ...
       'acceptedScientificName', 'verbatimScientificName', 'typifiedName',
       'protocol', 'lastParsed', 'lastCrawled', 'repatriated',
       'relativeOrganismQuantity', 'recordedByID', 'identifiedByID'],
      dtype='object', length=241)

In [5]:
occurrence_df.isnull().sum()

gbifID                           0
abstract                    435578
accessRights                435578
accrualMethod               435578
accrualPeriodicity          435578
                             ...  
lastCrawled                      5
repatriated                      5
relativeOrganismQuantity    435578
recordedByID                435578
identifiedByID              435578
Length: 241, dtype: int64

Drop all columns that do not have any values

In [6]:
occurrence_columns_with_no_values = []

for column in occurrence_df.columns:
    if occurrence_df[column].isnull().all():
        occurrence_columns_with_no_values.append(column)

print("Columns with no values:", occurrence_columns_with_no_values)

occurrence_df = occurrence_df.drop(occurrence_columns_with_no_values, axis=1)

Columns with no values: ['abstract', 'accessRights', 'accrualMethod', 'accrualPeriodicity', 'accrualPolicy', 'alternative', 'audience', 'available', 'bibliographicCitation', 'conformsTo', 'contributor', 'coverage', 'created', 'creator', 'date', 'dateAccepted', 'dateCopyrighted', 'dateSubmitted', 'description', 'educationLevel', 'extent', 'format', 'hasFormat', 'hasPart', 'hasVersion', 'instructionalMethod', 'isFormatOf', 'isPartOf', 'isReferencedBy', 'isReplacedBy', 'isRequiredBy', 'isVersionOf', 'issued', 'language', 'mediator', 'medium', 'modified', 'provenance', 'relation', 'replaces', 'requires', 'rights', 'source', 'spatial', 'subject', 'tableOfContents', 'temporal', 'title', 'type', 'valid', 'institutionID', 'collectionID', 'datasetID', 'institutionCode', 'ownerInstitutionCode', 'informationWithheld', 'dataGeneralizations', 'dynamicProperties', 'recordNumber', 'individualCount', 'organismQuantity', 'organismQuantityType', 'sex', 'lifeStage', 'reproductiveCondition', 'establishmen

List all columns with a single value

In [7]:
occurrence_columns_with_one_value = []

for column in occurrence_df.columns:
    if occurrence_df[column].unique().size == 1:
        occurrence_columns_with_one_value.append(column)

print("Columns with only one value:", occurrence_columns_with_one_value)

occurrence_df = occurrence_df.drop(occurrence_columns_with_one_value, axis=1)



Columns with only one value: ['license', 'publisher', 'collectionCode', 'datasetName', 'basisOfRecord']


In [8]:
occurrence_df

Unnamed: 0,gbifID,identifier,references,rightsHolder,occurrenceID,catalogNumber,recordedBy,behavior,associatedTaxa,occurrenceRemarks,...,genusKey,speciesKey,species,genericName,acceptedScientificName,verbatimScientificName,protocol,lastParsed,lastCrawled,repatriated
0,2432439881,485806@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485806,Stanislas Wroza,song,,,...,5429330.0,2492537.0,Luscinia calliope,Calliope,"Luscinia calliope (Pallas, 1776)",Calliope calliope,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
1,2432440798,485824@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485824,Stanislas Wroza,alarm call,,,...,2490241.0,2490244.0,Anthus hodgsoni,Anthus,"Anthus hodgsoni Richmond, 1907",Anthus hodgsoni,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
2,2432439707,485842@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485842,Stanislas Wroza,"call, song",,,...,2493047.0,2493071.0,Phylloscopus borealis,Phylloscopus,"Phylloscopus borealis (J.H.Blasius, 1858)",Phylloscopus borealis,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
3,2432440942,485860@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485860,Stanislas Wroza,"begging call, juvenile",,,...,5429347.0,5231223.0,Phoenicurus auroreus,Phoenicurus,"Phoenicurus auroreus (Pallas, 1776)",Phoenicurus auroreus,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
4,2432440206,485879@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485879,Stanislas Wroza,flight call,,,...,2494150.0,9629160.0,Loxia curvirostra,Loxia,"Loxia curvirostra Linnaeus, 1758","Loxia curvirostra N4, N8",DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435573,2432439860,485707@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485707,Stanislas Wroza,"display, song",,,...,2481808.0,2481819.0,Gallinago gallinago,Gallinago,"Gallinago gallinago (Linnaeus, 1758)",Gallinago gallinago,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
435574,2432439243,485725@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485725,Stanislas Wroza,flight call,,,...,2491468.0,2491472.0,Emberiza leucocephalos,Emberiza,"Emberiza leucocephalos S.G.Gmelin, 1771",Emberiza leucocephalos,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
435575,2432439694,485743@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485743,Stanislas Wroza,flight call,,,...,2492321.0,5231198.0,Passer montanus,Passer,"Passer montanus (Linnaeus, 1758)",Passer montanus,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
435576,2432439085,485761@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485761,Stanislas Wroza,flight call,,,...,2490241.0,2490244.0,Anthus hodgsoni,Anthus,"Anthus hodgsoni Richmond, 1907",Anthus hodgsoni,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True


In [9]:
occurrence_df['species'].isnull().sum()

15819

In [10]:
unqiue_taxa = occurrence_df['associatedTaxa'].unique()

for taxa in unqiue_taxa:
    print(taxa)

a palumbus|Turdus philomelos|Prunella modularis|Emberiza citrinella
has background sounds: Batis pririt|Pycnonotus nigricans|Apalis thoracica|Urocolius indicus
has background sounds: Parus major|Certhia brachydactyla|Lophophanes cristatus|Columba palumbus
has background sounds: Columba palumbus|Certhia brachydactyla
has background sounds: Phylloscopus collybita|Sylvia melanocephala
has background sounds: Amazona amazonica|Cyclarhis gujanensis|Pheugopedius rutilus|Myrmotherula axillaris
has background sounds: Xiphorhynchus susurrans|Psarocolius decumanus|Amazilia tobaci|Amazona amazonica
has background sounds: Cuculus solitarius|Laniarius ferrugineus
has background sounds: Chrysococcyx cupreus|Camaroptera brachyura|Dryoscopus cubla|Cuculus solitarius|Columba delegorguei
has background sounds: Vireo olivaceus|Parkesia motacilla|Setophaga chrysoparia
has background sounds: Baeolophus atricristatus|Polioptila caerulea
has background sounds: Charadrius semipalmatus|Pluvialis squatarola
has 

In [11]:
occurrence_df.columns

Index(['gbifID', 'identifier', 'references', 'rightsHolder', 'occurrenceID',
       'catalogNumber', 'recordedBy', 'behavior', 'associatedTaxa',
       'occurrenceRemarks', 'associatedOccurrences', 'previousIdentifications',
       'eventDate', 'eventTime', 'year', 'month', 'day', 'verbatimEventDate',
       'fieldNotes', 'countryCode', 'locality', 'verbatimElevation',
       'decimalLatitude', 'decimalLongitude', 'georeferencedBy',
       'geologicalContextID', 'earliestEpochOrLowestSeries',
       'latestEpochOrHighestSeries', 'earliestAgeOrLowestStage',
       'latestAgeOrHighestStage', 'lowestBiostratigraphicZone',
       'highestBiostratigraphicZone', 'lithostratigraphicTerms', 'formation',
       'bed', 'identificationQualifier', 'typeStatus', 'identifiedBy',
       'identificationVerificationStatus', 'identificationRemarks', 'taxonID',
       'acceptedNameUsageID', 'taxonConceptID', 'scientificName',
       'acceptedNameUsage', 'parentNameUsage', 'originalNameUsage',
       'nam

In [38]:
occurrence_df['datasetKey'].unique()

array(['b1047888-ae52-4179-9dd5-5448ea342a24', nan], dtype=object)

# species info columns

* genus 
* scientific name 
* name

In [13]:
#occurrence_df['decimal_latitude'].unique()

In [14]:
occurrence_df['eventTime'].iloc[0]

'05:00'

In [15]:
occurrence_df['year'].iloc[0]

2019.0

In [16]:
unqiue_behaviors = occurrence_df['behavior'].fillna("").unique()

behaviors_arr = []

for behaviors in unqiue_behaviors:
    #seperate sample behaviors and append to array
    split_behaviors = [behavior.strip().lower() for behavior in behaviors.split(',')] # remove leading and trailinf white spaces and set to lower case
    #remove double quotes
    split_behaviors = [behavior.replace('"', '') for behavior in split_behaviors]
    #remove single quotes    
    split_behaviors = [behavior.replace("'", '') for behavior in split_behaviors]
    behaviors_arr += split_behaviors



def unclear_behavior(behavior):
    if '?' in behavior or not behavior:
        return True
    if "uncertain" in behavior:
        return True
    #check if behavior has only alphabetical letters or spaces 
    if not all(char.isalpha() or char.isspace() for char in behavior):
        return True

    return False

cleaned_behaviors_arr = []
for behavior in behaviors_arr:
    # add if not question mark and not empty
    if not unclear_behavior(behavior):
        cleaned_behaviors_arr.append(behavior)

unique_cleaned_behaviors_arr = np.unique(cleaned_behaviors_arr)

cleaned_behaviors_arr
len(unique_cleaned_behaviors_arr)
#unique_cleaned_behaviors_arr
#cleaned_behaviors_arr

5719

In [17]:
cleaned_behaviors_arr

['song',
 'alarm call',
 'call',
 'song',
 'begging call',
 'juvenile',
 'flight call',
 'call',
 'flight call',
 'chatter',
 'flight call',
 'female',
 'male',
 'song',
 'call',
 'male',
 'song',
 'call',
 'male',
 'huit call',
 'tak call',
 'call',
 'flight call',
 'two types of calls in flight',
 'call',
 'juvenile',
 'alarm call',
 'call',
 'flight call',
 'juvenile',
 'begging call',
 'call',
 'juvenile',
 'flight call',
 'song',
 'wing flapping',
 'allarm calls',
 'alarm call',
 'call',
 'juvenile',
 'rattle',
 'calls',
 'song',
 'song in display',
 'begging call',
 'hatchling or nestling',
 'alarm call',
 'call',
 'flight call',
 'song',
 'flight call',
 'nocturnal flight call',
 'alarm call',
 'song',
 'alarm call',
 'call',
 'metallic call',
 'bill clapping',
 'alarm call',
 'call',
 'female',
 'call',
 'female',
 'alarm call',
 'begging call',
 'call',
 'female',
 'flight call',
 'juvenile',
 'male',
 'song',
 'display',
 'song',
 'begging call',
 'call',
 'female',
 'juvenil

# Begin creation of species info df

In [18]:
#speciesKey	species

species_name_key_df = occurrence_df[['speciesKey', 'species']]
species_name_key_df['species'] = occurrence_df['species'].str.lower()

species_key_dict = dict(zip(species_name_key_df['species'], species_name_key_df['speciesKey']))

#sample dict template
sample_dict = {
    'gbifID' : None,
    'recording_link' : None,
    'decimal_latitude' : None,
    'decimal_longitude': None,
    'date' : None,
    'behavior' : [],
    'background_birds' : [],
}

def get_behavior(behaviors):
    #check if behaviors string is empty, None or nan
    if not behaviors or behaviors is None or pd.isnull(behaviors):
        return []
    #seperate sample behaviors and append to array
    split_behaviors = [behavior.strip().lower() for behavior in behaviors.split(',')] # remove leading and trailinf white spaces and set to lower case
    #remove double quotes
    split_behaviors = [behavior.replace('"', '') for behavior in split_behaviors]
    #remove single quotes    
    split_behaviors = [behavior.replace("'", '') for behavior in split_behaviors]

    return [behavior for behavior in split_behaviors if not unclear_behavior(behavior)]

def get_species_key(sci_sepecies_name):
    pass

def get_associated_birds(associated_taxa):
    #check if associated_taxa string is empty, None or nan
    if not associated_taxa or associated_taxa is None or pd.isnull(associated_taxa):
        return []
    associated_taxa = associated_taxa.replace('has background sounds:', '')
    split_taxa = [taxa.strip().lower() for taxa in associated_taxa.split('|')]
    bird_species_keys = [species_key_dict[taxa] for taxa in split_taxa if species_key_dict.get(taxa) is not None]
    return bird_species_keys

samples_per_species = occurrence_df.groupby(['speciesKey'])

data_dict = {}
for species_key, samples in samples_per_species:
    data_dict[species_key] = []
    for index, sample in samples.iterrows():
        gbifID = sample['gbifID']
        decimal_latitude = sample['decimalLatitude']
        decimal_longitude = sample['decimalLongitude']
        date = sample['eventDate']
        behavior = get_behavior(sample['behavior'])
        associated_birds = get_associated_birds(sample['associatedTaxa'])

        data_dict[species_key].append({
            'gbifID' : gbifID,
            'recording_link' : None,
            'recording_time_sec' : None,
            'decimal_latitude' : decimal_latitude,
            'decimal_longitude': decimal_longitude,
            'date' : date,
            'behavior' : [],
            'background_birds' : [],
        })



In [None]:
samples_per_species = samples_per_species.size().sort_values(ascending=False)

In [21]:
#samples_per_species.size()

In [22]:
data_dictionary_df = pd.DataFrame()

In [23]:
data_dictionary_df['gbifID'] = occurrence_df['gbifID']

In [24]:
occurrence_df.isnull().sum()

gbifID                    0
identifier                0
references                0
rightsHolder              0
occurrenceID              0
                         ..
verbatimScientificName    5
protocol                  5
lastParsed                5
lastCrawled               5
repatriated               5
Length: 89, dtype: int64

In [25]:
verbatim_df = pd.read_csv(os.path.join(gbif_path, 'verbatim.txt'), sep = '\t')


In [26]:
verbatim_df

Unnamed: 0,gbifID,abstract,accessRights,accrualMethod,accrualPeriodicity,accrualPolicy,alternative,audience,available,bibliographicCitation,...,taxonRank,verbatimTaxonRank,scientificNameAuthorship,vernacularName,nomenclaturalCode,taxonomicStatus,nomenclaturalStatus,taxonRemarks,recordedByID,identifiedByID
0,2432439881,,,,,,,,,,...,species,,,Siberian Rubythroat,ICZN,,,,,
1,2432440798,,,,,,,,,,...,species,,,Olive-backed Pipit,ICZN,,,,,
2,2432439707,,,,,,,,,,...,species,,,Arctic Warbler,ICZN,,,,,
3,2432440942,,,,,,,,,,...,species,,,Daurian Redstart,ICZN,,,,,
4,2432440206,,,,,,,,,,...,subspecies,,,Red Crossbill,ICZN,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435573,2432439860,,,,,,,,,,...,species,,,Common Snipe,ICZN,,,,,
435574,2432439243,,,,,,,,,,...,species,,,Pine Bunting,ICZN,,,,,
435575,2432439694,,,,,,,,,,...,species,,,Eurasian Tree Sparrow,ICZN,,,,,
435576,2432439085,,,,,,,,,,...,species,,,Olive-backed Pipit,ICZN,,,,,


In [27]:
verbatim_df['vernacularName'] = verbatim_df['vernacularName'].fillna('Unkown').str.lower()

grouped_common_name = occurrence_df.groupby(['vernacularName']) #group samples of each 

In [28]:
species_arr = []
for name, samples in grouped_common_name:
    sample_ids = samples['gbifID']

    species_in_samples = occurrence_df[occurrence_df['gbifID'].isin(sample_ids)]

    species_arr.append(len(species_in_samples['species'].unique()))

unique_elements, counts_elements = np.unique(species_arr, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))


Frequency of unique values of the said array:
[[   1    2    4]
 [9742  110    1]]


In [30]:
verbatim_df['vernacularName'].value_counts()

soundscape                         7246
identity unknown                   5000
great tit                          4466
common blackbird                   3295
common chaffinch                   3242
                                   ... 
jamaican lizard cuckoo                1
brown-backed flowerpecker             1
great-billed kingfisher               1
calabrian speckled bush-cricket       1
pinsker's hawk-eagle                  1
Name: vernacularName, Length: 9849, dtype: int64

In [31]:
verbatim_df['taxonRank'].value_counts()

species       344525
subspecies     91048
Name: taxonRank, dtype: int64

In [32]:
multimedia_df = pd.read_csv(os.path.join(gbif_path, 'multimedia.txt'), sep = '\t')

In [33]:
multimedia_df

Unnamed: 0,gbifID,type,format,identifier,references,title,description,source,audience,created,creator,contributor,publisher,license,rightsHolder
0,2243549888,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/ZNC...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Stichting Xeno-canto voor Natuurgeluiden
1,2243549888,Sound,audio/mpeg,https://www.xeno-canto.org/sounds/uploaded/ZNC...,,,141 s,,,,Jarek Matusiak,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Jarek Matusiak
2,2243549893,Sound,audio/mpeg,https://www.xeno-canto.org/sounds/uploaded/ZNC...,,,115 s,,,,Jarek Matusiak,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Jarek Matusiak
3,2243549893,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/ZNC...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Stichting Xeno-canto voor Natuurgeluiden
4,2243549898,Sound,audio/mpeg,https://www.xeno-canto.org/sounds/uploaded/PWD...,,,34 s,,,,Mike Nelson,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Mike Nelson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871717,2609693247,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/AZX...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-sa/3.0/,Stichting Xeno-canto voor Natuurgeluiden
871718,2609693252,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/AZX...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-sa/3.0/,Stichting Xeno-canto voor Natuurgeluiden
871719,2609693252,Sound,audio/mpeg,https://www.xeno-canto.org/sounds/uploaded/AZX...,,,4 s,,,,Allen T. Chartier,,,http://creativecommons.org/licenses/by-nc-sa/3.0/,Allen T. Chartier
871720,2609693257,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/AZX...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-sa/3.0/,Stichting Xeno-canto voor Natuurgeluiden


In [34]:
desc_df = multimedia_df.loc[multimedia_df['format'] == 'audio/mpeg']
desc_df['format'].unique()

array(['audio/mpeg'], dtype=object)