In [147]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os
import re

In [80]:
DIR_PATH = 'C:/Users/Tony/Documents/Data/'

## Dataset Origin
The following dataset is generated from the occurence records obtained from the Atlas of Living Australia with filters "machine obcervation" and "sound". https://biocache.ala.org.au/occurrence/search?q=data_resource_uid%3Adr341&disableAllQualityFilters=true&qualityProfile=ALA&fq=multimedia%3A%22Sound%22&fq=basis_of_record%3A%22MACHINE_OBSERVATION%22


Upon manual inspection of the dataset, we can find the relevant columns: recordID for the URL link to pull the audio, species for classification tag, and latitude and longitude for future reference. There's also a sound ID "sounds" to link it with the existing database.

In [239]:
# Empty dataframe to store metadata
df = pd.DataFrame(columns=['sounds','species','latitude','longitude'])

# Empty list to store metadata
rows_list = []

In [None]:
# Default url after searching with machine sounds as filters
url = 'https://biocache.ala.org.au/occurrence/search?q=data_resource_uid%3Adr341&disableAllQualityFilters=true&qualityProfile=ALA&fq=multimedia%3A%22Sound%22&fq=basis_of_record%3A%22MACHINE_OBSERVATION%22'
page = requests.get(url)
soup = BeautifulSoup(page.content, 'html.parser')
# Get the link of the next results page
link = soup.find('a', {'class': 'nextLink'})

while link is not None:
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    
    # Go thorugh all the records of the current page
    for link in soup.findAll('div', {'class': 'recordRow'}):
        # Go into each entry of the results
        url_id = (link['id']) 
        nexturl = 'https://biocache.ala.org.au/occurrence/' + url_id
        nextpage = requests.get(nexturl)
        nextsoup = BeautifulSoup(nextpage.content, 'html.parser')
        # Scrape relevant metadata
        species = nextsoup.find('tr', {'id': 'species'}).find('i').text
        lat = nextsoup.find('tr', {'id': 'latitude'}).find('td', {'class': 'value'}).text.split()[-1]
        long = nextsoup.find('tr', {'id': 'longitude'}).find('td', {'class': 'value'}).text.split()[-1]
        cat_no = nextsoup.find('tr', {'id': 'catalogNumber'}).find('td', {'class': 'value'}).text.split()[-1]
        # Scrape the audio
        audio_url = nextsoup.audio['src']
        audio = requests.get(audio_url)
        
        # update a dict to store metadata to a dataframe afterwards.
        dict1 = {}
        dict1.update({'sounds': cat_no, 'species': species, 'latitude': lat, 
                     'longitude': long})
        rows_list.append(dict1)


        # make directory if not already present
        parent_dir = DIR_PATH
        path = os.path.join(parent_dir, species)
        if not os.path.isdir(path):
            os.mkdir(path)

        file_path = file_path = f'{DIR_PATH}{species}/{cat_no}.mp3'
        # Write the audio file into directory
        with open(file_path,'wb') as audio_file:
            audio_file.write(audio.content)

        print(f"Saved file: {file_path}")

    # Find the link to the next results page
    link = soup.find('a', {'class': 'nextLink'})
    url = 'https://biocache.ala.org.au' + link['href']
    print(url)


Saved file: C:/Users/Tony/Documents/Data/Ninox boobook/X02937.mp3
Saved file: C:/Users/Tony/Documents/Data/Menura novaehollandiae/X01769.mp3
Saved file: C:/Users/Tony/Documents/Data/Chalcites basalis/X00489.mp3
Saved file: C:/Users/Tony/Documents/Data/Froggattina australis/X00213.mp3
Saved file: C:/Users/Tony/Documents/Data/Podargus strigoides/X02955.mp3
Saved file: C:/Users/Tony/Documents/Data/Philemon corniculatus/X00763.mp3
Saved file: C:/Users/Tony/Documents/Data/Myiagra rubecula/X01274.mp3
Saved file: C:/Users/Tony/Documents/Data/Corvus coronoides/X02275.mp3
Saved file: C:/Users/Tony/Documents/Data/Menura novaehollandiae/X03063.mp3
Saved file: C:/Users/Tony/Documents/Data/Corcorax melanorhamphos/X03916.mp3
Saved file: C:/Users/Tony/Documents/Data/Psophodes nigrogularis/X04156.mp3
Saved file: C:/Users/Tony/Documents/Data/Gerygone olivacea/X02463.mp3
Saved file: C:/Users/Tony/Documents/Data/Meliphaga lewinii/X02534.mp3
Saved file: C:/Users/Tony/Documents/Data/Litoria serrata/X04272.

Saved file: C:/Users/Tony/Documents/Data/Psophodes olivaceus/X03639.mp3
Saved file: C:/Users/Tony/Documents/Data/Menura novaehollandiae/X01768.mp3
Saved file: C:/Users/Tony/Documents/Data/Scythrops novaehollandiae/X02920.mp3
Saved file: C:/Users/Tony/Documents/Data/Anthochaera phrygia/X03388.mp3
Saved file: C:/Users/Tony/Documents/Data/Struthidea cinerea/X01473.mp3
Saved file: C:/Users/Tony/Documents/Data/Gryllidae sp./X03970.mp3
Saved file: C:/Users/Tony/Documents/Data/Pachycephala rufiventris/X02172.mp3
Saved file: C:/Users/Tony/Documents/Data/Fulica atra/X00320.mp3
Saved file: C:/Users/Tony/Documents/Data/Coracina papuensis/X03743.mp3
Saved file: C:/Users/Tony/Documents/Data/Rhipidura albiscapa/X03727.mp3
Saved file: C:/Users/Tony/Documents/Data/Colluricincla harmonica/X03689.mp3
Saved file: C:/Users/Tony/Documents/Data/Ptilotula fusca/X02439.mp3
Saved file: C:/Users/Tony/Documents/Data/Limnodynastes dumerilii/X00136.mp3
Saved file: C:/Users/Tony/Documents/Data/Aegotheles cristatus/

Saved file: C:/Users/Tony/Documents/Data/Litoria serrata/X04260.mp3
Saved file: C:/Users/Tony/Documents/Data/Crinia signifera/X00152.mp3
Saved file: C:/Users/Tony/Documents/Data/Fulica atra/X02782.mp3
Saved file: C:/Users/Tony/Documents/Data/Gryllotalpidae sp./X03974.mp3
Saved file: C:/Users/Tony/Documents/Data/Gryllotalpidae sp./X03978.mp3
Saved file: C:/Users/Tony/Documents/Data/Pachycephala rufiventris/X02426.mp3
Saved file: C:/Users/Tony/Documents/Data/Litoria myola/X04221.mp3
Saved file: C:/Users/Tony/Documents/Data/Corvus coronoides/X02252.mp3
Saved file: C:/Users/Tony/Documents/Data/Henicopsaltria eydouxii/X03956.mp3
Saved file: C:/Users/Tony/Documents/Data/Colluricincla harmonica/X02302.mp3
Saved file: C:/Users/Tony/Documents/Data/Heteroscenes pallidus/X00453.mp3
Saved file: C:/Users/Tony/Documents/Data/Zosterops lateralis/X01605.mp3
Saved file: C:/Users/Tony/Documents/Data/Anthochaera carunculata/X02612.mp3
Saved file: C:/Users/Tony/Documents/Data/Menura novaehollandiae/X01694

In [234]:
# Write the dataframe using the list of stored metadata
df = pd.DataFrame(rows_list)         
df.head()

Unnamed: 0,sounds,species,latitude,longitude
0,X02937,Ninox boobook,"""-33.1449""","""149.9814"""
1,X01769,Menura novaehollandiae,"""-33.1738""","""149.9977"""
2,X00489,Chalcites basalis,"""-33.1449""","""149.9814"""


In [238]:
totalDir = 0
totalFile = 0
parent_dir = DIR_PATH
for base, dirs, files in os.walk(parent_dir):
    for directories in dirs:
        totalDir += 1
    for file in files:
        totalFile += 1
print(f'total number of directories: {totalDir}')
print(f'total number of files: {totalFile}')

total number of directories: 262
total number of files: 218
