In [79]:
import requests
#from bs4 import BeautifulSoup
import pandas as pd
import os

In [80]:
# Root directory that receives one sub-folder of audio files per species.
# Set the ALA_DATA_DIR environment variable to point somewhere else; the
# original location remains the default when it is unset or empty.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# file paths are later built from DIR_PATH by plain string concatenation.
DIR_PATH = os.path.join(
    os.environ.get('ALA_DATA_DIR') or 'C:/Users/Tony/Documents/Data/',
    '',
)

## Dataset Origin
The following dataset was generated from the occurrence records of the Atlas of Living Australia (ALA), filtered by "machine observation" and "sound". The search results were exported and saved as a CSV file. This service is available to ALA members with free registration. https://biocache.ala.org.au/occurrence/search?q=data_resource_uid%3Adr341&disableAllQualityFilters=true&qualityProfile=ALA&fq=multimedia%3A%22Sound%22&fq=basis_of_record%3A%22MACHINE_OBSERVATION%22

In [81]:
# Load the exported occurrence records into a DataFrame.
df = pd.read_csv('records-2022-11-25.csv')

# List every column name, one per line, to see which fields are available.
for column_name in df.columns.tolist():
    print(column_name)

dataResourceUid
images
raw_recordedBy
dcterms:modified
dcterms:language
dcterms:license
rightsHolder
dcterms:accessRights
dcterms:bibliographicCitation
references
institutionID
collectionID
datasetID
institutionCode
collectionCode
datasetName
ownerInstitutionCode
basisOfRecord
informationWithheld
dataGeneralizations
dynamicProperties
occurrenceID
catalogNumber
recordNumber
recordedBy
individualCount
organismQuantity
organismQuantityType
sex
lifeStage
reproductiveCondition
behavior
establishmentMeans
occurrenceStatus
preparations
disposition
associatedMedia
associatedReferences
associatedSequences
associatedTaxa
otherCatalogNumbers
occurrenceRemarks
organismID
organismName
organismScope
associatedOccurrences
associatedOrganisms
previousIdentifications
organismRemarks
materialSampleID
eventID
parentEventID
fieldNumber
eventDate
eventTime
startDayOfYear
endDayOfYear
year
month
day
verbatimEventDate
habitat
samplingProtocol
samplingEffort
sampleSizeValue
sampleSizeUnit
fieldNotes
eventRema

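Upon manual inspection of the dataset, we can identify the relevant columns: `sounds` holds the media ID used to build the URL for pulling the audio, `species` is the classification tag, `dcterms:bibliographicCitation` holds the identifier (e.g. X01150) used as the audio file name, and `decimalLatitude` and `decimalLongitude` are kept for future reference.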

In [101]:
# Keep only the columns needed downstream and give them shorter names.
# rename() applies the whole mapping in a single pass, so the original
# 'sounds' column becomes 'sound_link' while 'dcterms:bibliographicCitation'
# takes over the name 'sounds'.
wanted_columns = [
    'sounds',
    'species',
    'decimalLatitude',
    'decimalLongitude',
    'dcterms:bibliographicCitation',
]
new_names = {
    'sounds': 'sound_link',
    'decimalLatitude': 'latitude',
    'decimalLongitude': 'longitude',
    'dcterms:bibliographicCitation': 'sounds',
}

df2 = df.filter(items=wanted_columns, axis=1).rename(columns=new_names)
df2.head()

Unnamed: 0,sound_link,species,latitude,longitude,sounds
0,f54cfdfa-1397-4756-8a1a-e5a3a77aa194,Eopsaltria australis,-29.15,153.2333,X01150
1,6b347b90-6ac9-47c2-b87b-2ccdd6be7442,Manorina melanophrys,-33.7122,150.9575,X00864
2,d7d7d686-9fbb-4269-a24c-eaac07f845f0,Dacelo novaeguineae,-33.1171,150.1566,X02335
3,6a3767c3-4dc1-45bc-94e5-f3ac173a06cc,Climacteris picumnus,-33.1519,150.1279,X03143
4,1f624db7-a06b-4d44-a1cc-65ae9adadff1,Ptilotula fusca,-33.1384,150.1269,X03518


In [102]:
# Records without a species label cannot be used for classification, so they
# are dropped. Report how many are affected first; a bare expression in the
# middle of a cell is evaluated but never displayed.
n_missing_species = df2['species'].isnull().sum()
print(f'records without a species label: {n_missing_species}')

# Reset the index after filtering so the rows are numbered 0..n-1 again.
# Without this, the removed rows leave gaps in the index and label lookups
# such as df2.species[i] fail for the missing labels.
df2 = df2[df2['species'].notna()].reset_index(drop=True)
len(df2.index)

2517

In [103]:
# Make one directory for each unique species under DIR_PATH.
unique_species = df2.species.unique()
print(f'number of unique species: {len(unique_species)}')
parent_dir = DIR_PATH

for specie in unique_species:
    path = os.path.join(parent_dir, specie)
    # makedirs(exist_ok=True) also creates parent_dir when it does not exist
    # yet and does nothing when the species directory is already there, so
    # the cell can be re-run safely.
    os.makedirs(path, exist_ok=True)

# Count only the species directories themselves. Walking the whole tree would
# also count unrelated or nested directories that happen to live under
# parent_dir.
totalDir = sum(
    os.path.isdir(os.path.join(parent_dir, specie)) for specie in unique_species
)
print(f'total number of directories created: {totalDir}')

number of unique species: 260
total number of directories created: 260


In [104]:
# Pull the audio for the first few records. Each URL is generated by
# concatenating the root address with the sound ID of the record.
# Set n_records to len(df2.index) to download the complete dataset.
n_records = 5

# head() + itertuples() walks the rows by position, so this works even when
# the index has gaps (e.g. after rows with a missing species were dropped).
for record in df2.head(n_records).itertuples(index=False):

    url = 'https://images.ala.org.au/image/proxyImage?imageId=' + record.sound_link
    print(url)

    try:
        # The timeout keeps a stalled connection from hanging the notebook.
        audio = requests.get(url, timeout=60)
        # Fail on HTTP errors so an error page is never saved as an .mp3 file.
        audio.raise_for_status()
    except requests.RequestException as err:
        # Skip this record and carry on with the remaining downloads.
        print(f"Failed to download {url}: {err}")
        continue

    species = record.species
    file_path = f'{DIR_PATH}{species}/{record.sounds}.mp3'

    with open(file_path, 'wb') as audio_file:
        audio_file.write(audio.content)

    print(f"Saved file: {file_path}")

https://images.ala.org.au/image/proxyImage?imageId=f54cfdfa-1397-4756-8a1a-e5a3a77aa194
Saved file: C:/Users/Tony/Documents/Data/Eopsaltria australis/X01150.mp3
https://images.ala.org.au/image/proxyImage?imageId=6b347b90-6ac9-47c2-b87b-2ccdd6be7442
Saved file: C:/Users/Tony/Documents/Data/Manorina melanophrys/X00864.mp3
https://images.ala.org.au/image/proxyImage?imageId=d7d7d686-9fbb-4269-a24c-eaac07f845f0
Saved file: C:/Users/Tony/Documents/Data/Dacelo novaeguineae/X02335.mp3
https://images.ala.org.au/image/proxyImage?imageId=6a3767c3-4dc1-45bc-94e5-f3ac173a06cc
Saved file: C:/Users/Tony/Documents/Data/Climacteris picumnus/X03143.mp3
https://images.ala.org.au/image/proxyImage?imageId=1f624db7-a06b-4d44-a1cc-65ae9adadff1
Saved file: C:/Users/Tony/Documents/Data/Ptilotula fusca/X03518.mp3
