In [48]:
SLICE_SECONDS = 10 # Length of input slices for model.
FFT_HOP_LENGTH = 512 # How many time domain samples per spectrogram frame

In [2]:
# -*- coding: utf-8 -*-

# don
import pandas as pd
import numpy as np
import librosa
import os
import zipfile
import csv
import shutil
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.metrics import (
    roc_auc_score,        
)


In [3]:
"""
Script for extracting data from https://figshare.com/articles/dataset/Sounds_of_the_Eleutherodactylus_frog_community_from_Puerto_Rico/806302?file=3104183
Unzips all the zips from root
Simply Unzip downloaded file and provide path root of folder
"""


def extract_zip_file(file_path, extract_to):
    """
    Extracts a single zip file to a specified directory.

    Args:
        file_path (str): Path to the zip file.
        extract_to (str): Path to the directory where the zip file will be extracted.
    """
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
        # print(f"{os.path.basename(file_path)} extracted to {os.path.abspath(extract_to)}")


def extract_zip_files(zip_folder, extract_to):
    """
    Extracts all zip files from a folder to a specified directory using threading.

    Args:
        zip_folder (str): Path to the folder containing zip files.
        extract_to (str): Path to the directory where zip files will be extracted.
    """
    # Make sure the extraction directory exists
    os.makedirs(extract_to, exist_ok=True)

    data_file = "FrequencyRange_by_species_and_site_Averages.csv"
    shutil.copyfile(os.path.join(zip_folder, data_file) , os.path.join(extract_to, data_file))

    # List all zip files in the folder
    zip_files = [
        os.path.join(zip_folder, item)
        for item in os.listdir(zip_folder)
        if item.endswith(".zip")
    ]

    # Use ThreadPoolExecutor to extract zip files concurrently
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(extract_zip_file, zip_file, extract_to)
            for zip_file in zip_files
        ]

        # Wait for all futures to complete
        for future in futures:
            future.result()
    
    print("Done.")

In [4]:
filepath = "/home/edwinc/Downloads/806302"
ExtractTo = "/home/edwinc/Downloads/806302/Extracted"
extract_zip_files(filepath, ExtractTo)

Done.


In [5]:
def readAveragesData(path: str):

    data = []
    # Read the CSV file and store the data in a list of dictionaries
    with open(path, "r") as file:
        reader = csv.DictReader(file)
        for row in reader:
            data.append(row)

    return data


def prepare_csv(data_dir, output) -> None:

    
    averagesData = readAveragesData(
        os.path.join(data_dir, "FrequencyRange_by_species_and_site_Averages.csv")
    )

    data = []

    # Iterate through each subfolder
    for siteDataSet in os.listdir(data_dir):
        site_folder = os.path.join(data_dir, siteDataSet)
        if os.path.isdir(site_folder):
            # example siteId  "Site01-1" such that the 4-6 index represents the site id; in this case 01
            siteId = int(siteDataSet[4:6])
            SiteData = [
                averageClassification
                for averageClassification in averagesData
                if int(averageClassification["SiteID"]) == siteId
            ]

            classifications = ", ".join(
                [classification["Species"] for classification in SiteData]
            )
            for audio_recording in os.listdir(site_folder):
                if audio_recording.endswith(".wav"):
                    audio_recording_abs_path = os.path.abspath(
                        os.path.join(site_folder, audio_recording)
                    )

                    data.append([siteId, audio_recording_abs_path, classifications])

    # Create DataFrame
    df = pd.DataFrame(
        data,
        columns=[
            "siteId",
            "filename",
            "species",
        ],
    )

    df.to_csv(output, index=False)

In [6]:
data_dir = ExtractTo
output = "machine_learning/processed/processed.csv"
prepare_csv(data_dir, output)

In [None]:
def extract_features(file_path):
    """
    Extract spectrogram from audio file using librosa. 

    Args:
        file_path (str): Path to the audio file.

    Returns:
        np.array: Extracted features.
        int: Sample rate in hertz
    """
    audio, sr = librosa.load(file_path)

    # MFCC
    return librosa.feature.mfcc(y=audio, sr=sr, hop_length = FFT_HOP_LENGTH), sr


def process_data(data_csv_path):
    """Read Csv with filename and generate spectrogram for each sample

    Returns:
        DataFrame: dataframe with all data
    """
    # data_csv_path = sys.argv[1]

    df = pd.read_csv(data_csv_path)

    # Initialize a list to store the results
    spectrograms = []
    sample_rates = []

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(extract_features, row["filename"])
            for _, row in df.iterrows()
        ]

        for future in as_completed(futures):
            try:
                spectrogram, sr = future.result()
                spectrograms.append(spectrogram)
                sample_rates.append(sr)
            except Exception as exc:
                print(f"Generated an exception: {exc}")

    # Convert the list of spectrograms into a DataFrame
    assert(min(sample_rates) == max(sample_rates))
    sr = min(sample_rates)
    slice_width = sr * SLICE_SECONDS // FFT_HOP_LENGTH
    # Slice them into fixed widths
    spectrogram_slices = []
    for spectrogram in spectrograms:    
        spectrogram = spectrogram[:, :-(spectrogram.shape[1] % slice_width)] # Take only the section of the spectrogram that will split into fixed slices
        n_slices = spectrogram.shape[1] / slice_width
        result = np.hsplit(spectrogram, n_slices)
        spectrogram_slices.append(result)

    # Group up the rows with slices. Since we have to duplicate the rows some number of times so they can each go with their own slice, we'll make a new df
    master = []
    for df_row, slices in zip(df.itertuples(index=False), spectrogram_slices):
        for spectrogram in slices:
            new_row = list(df_row) + list(spectrogram.flatten())
            master.append(new_row)
    # spectrogram_df = pd.DataFrame(spectrogram_slices)

    # Concatenate the original DataFrame with the new DataFrame containing spectrograms
    # df = pd.concat([df, spectrogram_df], axis=1)

    return pd.DataFrame(master)

In [102]:
csv_dir = output
df = process_data(csv_dir)
df

Pandas(siteId=3, filename='/home/edwinc/Downloads/806302/Extracted/Site03-1/LINE_2004-04-29_05_59_59.wav', species='E. coqui - co, E. coqui - qui, E. gryllus, E. portoricensis - co, E. portoricensis - qui, E. unicolor')
Pandas(siteId=3, filename='/home/edwinc/Downloads/806302/Extracted/Site03-1/LINE_2004-04-29_18_29_53.wav', species='E. coqui - co, E. coqui - qui, E. gryllus, E. portoricensis - co, E. portoricensis - qui, E. unicolor')
Pandas(siteId=3, filename='/home/edwinc/Downloads/806302/Extracted/Site03-1/LINE_2004-04-29_18_59_52.wav', species='E. coqui - co, E. coqui - qui, E. gryllus, E. portoricensis - co, E. portoricensis - qui, E. unicolor')
Pandas(siteId=3, filename='/home/edwinc/Downloads/806302/Extracted/Site03-1/LINE_2004-04-29_22_29_50.wav', species='E. coqui - co, E. coqui - qui, E. gryllus, E. portoricensis - co, E. portoricensis - qui, E. unicolor')
Pandas(siteId=3, filename='/home/edwinc/Downloads/806302/Extracted/Site03-1/LINE_2004-04-29_03_00_01.wav', species='E. c

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8593,8594,8595,8596,8597,8598,8599,8600,8601,8602
0,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-258.907990,-232.094894,-226.508392,-235.727097,-234.146576,-222.760300,-215.504379,...,8.196312,5.553250,-0.008759,0.873199,1.839963,-1.285686,-4.960220,-7.230106,-4.734491,-1.136608
1,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-227.534851,-216.711395,-212.192535,-220.333374,-227.439392,-225.514084,-232.357635,...,6.730955,4.329302,-0.511441,-0.429305,-1.218568,-3.745374,0.277401,-1.694374,-4.156485,-2.869764
2,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-233.158813,-235.836945,-242.486938,-238.739594,-238.480042,-234.335876,-237.143066,...,10.046548,8.860480,9.022156,5.986008,1.708223,1.206275,1.483954,0.579580,1.845343,-1.616910
3,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-163.214706,-207.436996,-226.848114,-225.573227,-225.489899,-225.918289,-207.760330,...,-3.567730,-2.323535,1.231682,0.492969,-3.255579,-2.373746,-0.490466,-1.610283,-3.388490,-0.296241
4,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-225.464142,-229.984940,-233.509644,-224.003693,-221.250275,-229.671143,-228.857971,...,2.844812,-1.682932,1.587868,3.429075,2.604915,7.762749,9.843101,11.419058,16.426598,16.311155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8750,9,/home/edwinc/Downloads/806302/Extracted/Site09...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-250.666367,-200.190491,-182.154938,-193.562836,-114.092621,-111.169029,-145.175293,...,2.692854,5.744445,6.874275,7.866467,0.357853,-6.557953,-7.766747,-8.440030,-7.955336,-1.260153
8751,9,/home/edwinc/Downloads/806302/Extracted/Site09...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-222.126297,-195.695038,-202.494644,-224.102524,-218.942642,-221.509262,-239.688904,...,-16.312403,-9.355195,-5.709651,-3.017781,1.106369,1.698544,-3.062351,-11.644132,-8.977406,-4.557225
8752,9,/home/edwinc/Downloads/806302/Extracted/Site09...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-207.091354,-211.098602,-218.797302,-224.587448,-228.803619,-221.248810,-218.629517,...,3.831399,3.494405,3.827552,6.249031,5.517063,-0.217222,1.556711,1.385989,3.714207,4.064235
8753,9,/home/edwinc/Downloads/806302/Extracted/Site09...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-188.595673,-180.631775,-186.508102,-195.934265,-198.946472,-195.002380,-185.674332,...,2.192450,0.922413,9.565634,10.239399,5.896046,0.963325,0.091774,6.596493,12.898052,9.674455


In [120]:
# Set up readable index for df
df.columns = pd.MultiIndex.from_arrays([   
    ['metadata'] * 3 + ['spectral'] * (df.shape[1] - 3),
    ['siteId', 'filename', 'species'] +
    list(range(df.shape[1] - 3))
])
df

Unnamed: 0_level_0,metadata,metadata,metadata,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral
Unnamed: 0_level_1,siteId,filename,species,0,1,2,3,4,5,6,...,8590,8591,8592,8593,8594,8595,8596,8597,8598,8599
0,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-258.907990,-232.094894,-226.508392,-235.727097,-234.146576,-222.760300,-215.504379,...,8.196312,5.553250,-0.008759,0.873199,1.839963,-1.285686,-4.960220,-7.230106,-4.734491,-1.136608
1,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-227.534851,-216.711395,-212.192535,-220.333374,-227.439392,-225.514084,-232.357635,...,6.730955,4.329302,-0.511441,-0.429305,-1.218568,-3.745374,0.277401,-1.694374,-4.156485,-2.869764
2,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-233.158813,-235.836945,-242.486938,-238.739594,-238.480042,-234.335876,-237.143066,...,10.046548,8.860480,9.022156,5.986008,1.708223,1.206275,1.483954,0.579580,1.845343,-1.616910
3,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-163.214706,-207.436996,-226.848114,-225.573227,-225.489899,-225.918289,-207.760330,...,-3.567730,-2.323535,1.231682,0.492969,-3.255579,-2.373746,-0.490466,-1.610283,-3.388490,-0.296241
4,3,/home/edwinc/Downloads/806302/Extracted/Site03...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-225.464142,-229.984940,-233.509644,-224.003693,-221.250275,-229.671143,-228.857971,...,2.844812,-1.682932,1.587868,3.429075,2.604915,7.762749,9.843101,11.419058,16.426598,16.311155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8750,9,/home/edwinc/Downloads/806302/Extracted/Site09...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-250.666367,-200.190491,-182.154938,-193.562836,-114.092621,-111.169029,-145.175293,...,2.692854,5.744445,6.874275,7.866467,0.357853,-6.557953,-7.766747,-8.440030,-7.955336,-1.260153
8751,9,/home/edwinc/Downloads/806302/Extracted/Site09...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-222.126297,-195.695038,-202.494644,-224.102524,-218.942642,-221.509262,-239.688904,...,-16.312403,-9.355195,-5.709651,-3.017781,1.106369,1.698544,-3.062351,-11.644132,-8.977406,-4.557225
8752,9,/home/edwinc/Downloads/806302/Extracted/Site09...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-207.091354,-211.098602,-218.797302,-224.587448,-228.803619,-221.248810,-218.629517,...,3.831399,3.494405,3.827552,6.249031,5.517063,-0.217222,1.556711,1.385989,3.714207,4.064235
8753,9,/home/edwinc/Downloads/806302/Extracted/Site09...,"E. coqui - co, E. coqui - qui, E. gryllus, E. ...",-188.595673,-180.631775,-186.508102,-195.934265,-198.946472,-195.002380,-185.674332,...,2.192450,0.922413,9.565634,10.239399,5.896046,0.963325,0.091774,6.596493,12.898052,9.674455


In [122]:
x = df.drop(
    columns=["metadata"]
)  # Adjust this to include only feature columns
# Convert all column names to strings

x

Unnamed: 0_level_0,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral,spectral
Unnamed: 0_level_1,0,1,2,3,4,5,6,7,8,9,...,8590,8591,8592,8593,8594,8595,8596,8597,8598,8599
0,-258.907990,-232.094894,-226.508392,-235.727097,-234.146576,-222.760300,-215.504379,-217.754272,-218.839447,-222.334579,...,8.196312,5.553250,-0.008759,0.873199,1.839963,-1.285686,-4.960220,-7.230106,-4.734491,-1.136608
1,-227.534851,-216.711395,-212.192535,-220.333374,-227.439392,-225.514084,-232.357635,-231.624298,-227.497879,-230.746841,...,6.730955,4.329302,-0.511441,-0.429305,-1.218568,-3.745374,0.277401,-1.694374,-4.156485,-2.869764
2,-233.158813,-235.836945,-242.486938,-238.739594,-238.480042,-234.335876,-237.143066,-235.792816,-235.290771,-237.057266,...,10.046548,8.860480,9.022156,5.986008,1.708223,1.206275,1.483954,0.579580,1.845343,-1.616910
3,-163.214706,-207.436996,-226.848114,-225.573227,-225.489899,-225.918289,-207.760330,-204.464294,-212.940002,-213.305298,...,-3.567730,-2.323535,1.231682,0.492969,-3.255579,-2.373746,-0.490466,-1.610283,-3.388490,-0.296241
4,-225.464142,-229.984940,-233.509644,-224.003693,-221.250275,-229.671143,-228.857971,-219.340378,-197.523438,-198.334854,...,2.844812,-1.682932,1.587868,3.429075,2.604915,7.762749,9.843101,11.419058,16.426598,16.311155
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8750,-250.666367,-200.190491,-182.154938,-193.562836,-114.092621,-111.169029,-145.175293,-161.936905,-207.811676,-226.778641,...,2.692854,5.744445,6.874275,7.866467,0.357853,-6.557953,-7.766747,-8.440030,-7.955336,-1.260153
8751,-222.126297,-195.695038,-202.494644,-224.102524,-218.942642,-221.509262,-239.688904,-244.561768,-233.908905,-216.524414,...,-16.312403,-9.355195,-5.709651,-3.017781,1.106369,1.698544,-3.062351,-11.644132,-8.977406,-4.557225
8752,-207.091354,-211.098602,-218.797302,-224.587448,-228.803619,-221.248810,-218.629517,-224.379517,-223.217987,-223.325439,...,3.831399,3.494405,3.827552,6.249031,5.517063,-0.217222,1.556711,1.385989,3.714207,4.064235
8753,-188.595673,-180.631775,-186.508102,-195.934265,-198.946472,-195.002380,-185.674332,-190.254440,-202.722778,-206.122589,...,2.192450,0.922413,9.565634,10.239399,5.896046,0.963325,0.091774,6.596493,12.898052,9.674455


In [124]:
y = df["metadata", "species"]

# Encode the target labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

le_bron = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
le_bron


{'E. coqui - co, E. coqui - qui': np.int64(0),
 'E. coqui - co, E. coqui - qui, E. gryllus, E. locustus': np.int64(1),
 'E. coqui - co, E. coqui - qui, E. gryllus, E. portoricensis - co, E. portoricensis - qui, E. unicolor': np.int64(2),
 'E. coqui - co, E. coqui - qui, E. hedricki': np.int64(3),
 'E. coqui - co, E. coqui - qui, E. hedricki, E. portoricensis - co, E. portoricensis - qui': np.int64(4),
 'E. coqui - co, E. coqui - qui, E. hedricki, E. portoricensis - co, E. portoricensis - qui, E. unicolor': np.int64(5),
 'E. coqui - co, E. coqui - qui, E. portoricensis - co, E. portoricensis - qui, E. richmondi': np.int64(6),
 'E. coqui - co, E. coqui - qui, E. portoricensis - co, E. portoricensis - qui, E. unicolor': np.int64(7),
 'E. coqui - co, E. coqui - qui, E. richmondi': np.int64(8),
 'E. coqui - co, E. coqui - qui, E. richmondi, E. wightmanae': np.int64(9),
 'E. coqui - co, E. coqui - qui, E. wightmanae': np.int64(10)}

In [125]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

classifier = RandomForestClassifier(n_estimators=600, max_depth=18, min_samples_leaf=3)

classifier.fit(x_train, y_train)

y_pred = classifier.predict_proba(
    x_test,
)

accuracy = roc_auc_score(y_test, y_pred, multi_class="ovr")
print("Accuracy :", accuracy)




Accuracy : 0.9938702039928109


In [127]:
import pickle
with open("./backend/trainedRF.pkl", "wb") as f:
    pickle.dump(classifier, f)

In [128]:
discrete_pred = np.argmax(y_pred, axis=1)

In [129]:
sum(y_test != discrete_pred) / len(y_test)

np.float64(0.054254711593375214)