In [58]:
import xenocanto
import pandas as pd
import numpy as np
import os

In [59]:
# Build the path to the species summary CSV and load it into a DataFrame.
species_path = os.path.join('dataset', 'species_info.csv')
species_info_df = pd.read_csv(filepath_or_buffer=species_path)

Drop species that have fewer than `rec_min` forefront recordings

In [60]:
# Minimum number of 'forefront recs' a species needs in order to be kept.
rec_min = 250

print("Total number of species:", len(species_info_df))

# The threshold is inclusive (>=): a species with exactly rec_min recordings
# "meets the minimum" and is kept; only species with fewer than rec_min
# recordings are dropped. The previous strict '>' also discarded species
# sitting exactly on the threshold.
meets_min = species_info_df['forefront recs'] >= rec_min
min_species_info_df = species_info_df[meets_min]

print("Number of species meeting min:", len(min_species_info_df))

Total number of species: 10963
Number of species meeting min: 312


In [61]:
# Display the filtered species table (rows of species_info_df that passed the
# rec_min threshold); as the last expression it is rendered as the cell output.
min_species_info_df

Unnamed: 0,common name,scientific name,extinct,forefront recs,background recs,africa,americas,asia,australasia,europe
23,Little Tinamou,Crypturellus soui,False,317,432,False,True,False,False,False
76,Canada Goose,Branta canadensis,False,363,964,False,True,True,True,True
83,Greylag Goose,Anser anser,False,475,1378,True,True,True,True,True
88,Greater White-fronted Goose,Anser albifrons,False,257,116,True,True,True,False,True
93,Mute Swan,Cygnus olor,False,272,112,True,True,True,True,True
...,...,...,...,...,...,...,...,...,...,...
10498,Northern Cardinal,Cardinalis cardinalis,False,593,2477,False,True,False,False,False
10811,Slate-colored Grosbeak,Saltator grossus,False,257,466,False,True,False,False,False
10814,Buff-throated Saltator,Saltator maximus,False,260,384,False,True,False,False,False
10817,Greyish Saltator,Saltator coerulescens,False,374,556,False,True,False,False,False


In [62]:
# Trigger a download through the imported xenocanto module.
# NOTE(review): the argument looks like a xeno-canto search query for
# "bearded bellbird" restricted to quality rating A — confirm against
# xenocanto.download. Presumably this hits the network and writes files on
# every run; verify before using Restart & Run All.
xenocanto.download('bearded+bellbird+q:A')

# Using the GBIF dataset

In [63]:
# Directory containing the tab-separated GBIF text files read in this section.
gbif_path = 'gbif_xeno_canto'

# Load the occurrence table from its tab-separated file.
occurrence_file = os.path.join(gbif_path, 'occurrence.txt')
occurrence_df = pd.read_csv(occurrence_file, sep='\t')

In [52]:
# Show the column labels of the occurrence table.
occurrence_df.columns

Index(['gbifID', 'abstract', 'accessRights', 'accrualMethod',
       'accrualPeriodicity', 'accrualPolicy', 'alternative', 'audience',
       'available', 'bibliographicCitation',
       ...
       'acceptedScientificName', 'verbatimScientificName', 'typifiedName',
       'protocol', 'lastParsed', 'lastCrawled', 'repatriated',
       'relativeOrganismQuantity', 'recordedByID', 'identifiedByID'],
      dtype='object', length=241)

In [54]:
# Count the missing entries in each column of the occurrence table.
occurrence_df.isna().sum()

gbifID                           0
abstract                    435578
accessRights                435578
accrualMethod               435578
accrualPeriodicity          435578
                             ...  
lastCrawled                      5
repatriated                      5
relativeOrganismQuantity    435578
recordedByID                435578
identifiedByID              435578
Length: 241, dtype: int64

Drop all columns that do not have any values

In [55]:
# Collect the name of every column whose entries are all missing.
occurrence_columns_with_no_values = [
    name
    for name in occurrence_df.columns
    if occurrence_df[name].isnull().all()
]

print("Columns with no values:", occurrence_columns_with_no_values)

# Rebind occurrence_df to the table without those empty columns.
occurrence_df = occurrence_df.drop(columns=occurrence_columns_with_no_values)

Columns with no values: ['abstract', 'accessRights', 'accrualMethod', 'accrualPeriodicity', 'accrualPolicy', 'alternative', 'audience', 'available', 'bibliographicCitation', 'conformsTo', 'contributor', 'coverage', 'created', 'creator', 'date', 'dateAccepted', 'dateCopyrighted', 'dateSubmitted', 'description', 'educationLevel', 'extent', 'format', 'hasFormat', 'hasPart', 'hasVersion', 'instructionalMethod', 'isFormatOf', 'isPartOf', 'isReferencedBy', 'isReplacedBy', 'isRequiredBy', 'isVersionOf', 'issued', 'language', 'mediator', 'medium', 'modified', 'provenance', 'relation', 'replaces', 'requires', 'rights', 'source', 'spatial', 'subject', 'tableOfContents', 'temporal', 'title', 'type', 'valid', 'institutionID', 'collectionID', 'datasetID', 'institutionCode', 'ownerInstitutionCode', 'informationWithheld', 'dataGeneralizations', 'dynamicProperties', 'recordNumber', 'individualCount', 'organismQuantity', 'organismQuantityType', 'sex', 'lifeStage', 'reproductiveCondition', 'establishmen

List and drop all columns that contain only a single value

In [57]:
# Collect the name of every column that holds exactly one distinct entry.
# NOTE(review): Series.unique() appears to report a missing value as its own
# entry, so a column with one value plus gaps would not be listed here —
# confirm that this is the intended behaviour.
occurrence_columns_with_one_value = [
    name
    for name in occurrence_df.columns
    if len(occurrence_df[name].unique()) == 1
]

print("Columns with only one value:", occurrence_columns_with_one_value)

# Rebind occurrence_df to the table without those constant columns.
occurrence_df = occurrence_df.drop(columns=occurrence_columns_with_one_value)



Columns with only one value: []


In [50]:
# Display the occurrence table; the rendered output is truncated to the first
# and last rows/columns.
occurrence_df

Unnamed: 0,gbifID,identifier,references,rightsHolder,occurrenceID,catalogNumber,recordedBy,behavior,associatedTaxa,occurrenceRemarks,...,genusKey,speciesKey,species,genericName,acceptedScientificName,verbatimScientificName,protocol,lastParsed,lastCrawled,repatriated
0,2432439881,485806@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485806,Stanislas Wroza,song,,,...,5429330.0,2492537.0,Luscinia calliope,Calliope,"Luscinia calliope (Pallas, 1776)",Calliope calliope,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
1,2432440798,485824@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485824,Stanislas Wroza,alarm call,,,...,2490241.0,2490244.0,Anthus hodgsoni,Anthus,"Anthus hodgsoni Richmond, 1907",Anthus hodgsoni,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
2,2432439707,485842@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485842,Stanislas Wroza,"call, song",,,...,2493047.0,2493071.0,Phylloscopus borealis,Phylloscopus,"Phylloscopus borealis (J.H.Blasius, 1858)",Phylloscopus borealis,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
3,2432440942,485860@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485860,Stanislas Wroza,"begging call, juvenile",,,...,5429347.0,5231223.0,Phoenicurus auroreus,Phoenicurus,"Phoenicurus auroreus (Pallas, 1776)",Phoenicurus auroreus,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
4,2432440206,485879@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485879,Stanislas Wroza,flight call,,,...,2494150.0,9629160.0,Loxia curvirostra,Loxia,"Loxia curvirostra Linnaeus, 1758","Loxia curvirostra N4, N8",DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435573,2432439860,485707@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485707,Stanislas Wroza,"display, song",,,...,2481808.0,2481819.0,Gallinago gallinago,Gallinago,"Gallinago gallinago (Linnaeus, 1758)",Gallinago gallinago,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
435574,2432439243,485725@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485725,Stanislas Wroza,flight call,,,...,2491468.0,2491472.0,Emberiza leucocephalos,Emberiza,"Emberiza leucocephalos S.G.Gmelin, 1771",Emberiza leucocephalos,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
435575,2432439694,485743@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485743,Stanislas Wroza,flight call,,,...,2492321.0,5231198.0,Passer montanus,Passer,"Passer montanus (Linnaeus, 1758)",Passer montanus,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True
435576,2432439085,485761@XC,https://data.biodiversitydata.nl/xeno-canto/ob...,Stanislas Wroza,https://data.biodiversitydata.nl/xeno-canto/ob...,XC485761,Stanislas Wroza,flight call,,,...,2490241.0,2490244.0,Anthus hodgsoni,Anthus,"Anthus hodgsoni Richmond, 1907",Anthus hodgsoni,DWC_ARCHIVE,2020-06-08T17:16:09.759Z,2020-06-02T09:48:40.848Z,True


In [8]:
# Load the verbatim table from its tab-separated file in the GBIF directory.
verbatim_file = os.path.join(gbif_path, 'verbatim.txt')
verbatim_df = pd.read_csv(verbatim_file, sep='\t')


In [35]:
# Display the verbatim table loaded from verbatim.txt.
verbatim_df

Unnamed: 0,gbifID,abstract,accessRights,accrualMethod,accrualPeriodicity,accrualPolicy,alternative,audience,available,bibliographicCitation,...,taxonRank,verbatimTaxonRank,scientificNameAuthorship,vernacularName,nomenclaturalCode,taxonomicStatus,nomenclaturalStatus,taxonRemarks,recordedByID,identifiedByID
0,2432439881,,,,,,,,,,...,species,,,Siberian Rubythroat,ICZN,,,,,
1,2432440798,,,,,,,,,,...,species,,,Olive-backed Pipit,ICZN,,,,,
2,2432439707,,,,,,,,,,...,species,,,Arctic Warbler,ICZN,,,,,
3,2432440942,,,,,,,,,,...,species,,,Daurian Redstart,ICZN,,,,,
4,2432440206,,,,,,,,,,...,subspecies,,,Red Crossbill,ICZN,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
435573,2432439860,,,,,,,,,,...,species,,,Common Snipe,ICZN,,,,,
435574,2432439243,,,,,,,,,,...,species,,,Pine Bunting,ICZN,,,,,
435575,2432439694,,,,,,,,,,...,species,,,Eurasian Tree Sparrow,ICZN,,,,,
435576,2432439085,,,,,,,,,,...,species,,,Olive-backed Pipit,ICZN,,,,,


In [10]:
# Load the multimedia table from its tab-separated file in the GBIF directory.
multimedia_file = os.path.join(gbif_path, 'multimedia.txt')
multimedia_df = pd.read_csv(multimedia_file, sep='\t')

In [11]:
# Display the multimedia table loaded from multimedia.txt.
multimedia_df

Unnamed: 0,gbifID,type,format,identifier,references,title,description,source,audience,created,creator,contributor,publisher,license,rightsHolder
0,2243549888,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/ZNC...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Stichting Xeno-canto voor Natuurgeluiden
1,2243549888,Sound,audio/mpeg,https://www.xeno-canto.org/sounds/uploaded/ZNC...,,,141 s,,,,Jarek Matusiak,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Jarek Matusiak
2,2243549893,Sound,audio/mpeg,https://www.xeno-canto.org/sounds/uploaded/ZNC...,,,115 s,,,,Jarek Matusiak,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Jarek Matusiak
3,2243549893,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/ZNC...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Stichting Xeno-canto voor Natuurgeluiden
4,2243549898,Sound,audio/mpeg,https://www.xeno-canto.org/sounds/uploaded/PWD...,,,34 s,,,,Mike Nelson,,,http://creativecommons.org/licenses/by-nc-nd/2.5/,Mike Nelson
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
871717,2609693247,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/AZX...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-sa/3.0/,Stichting Xeno-canto voor Natuurgeluiden
871718,2609693252,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/AZX...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-sa/3.0/,Stichting Xeno-canto voor Natuurgeluiden
871719,2609693252,Sound,audio/mpeg,https://www.xeno-canto.org/sounds/uploaded/AZX...,,,4 s,,,,Allen T. Chartier,,,http://creativecommons.org/licenses/by-nc-sa/3.0/,Allen T. Chartier
871720,2609693257,StillImage,image/png,https://www.xeno-canto.org/sounds/uploaded/AZX...,,,Sonogram of the first ten seconds of the sound...,,,,Stichting Xeno-canto voor Natuurgeluiden,,,http://creativecommons.org/licenses/by-nc-sa/3.0/,Stichting Xeno-canto voor Natuurgeluiden
