In [1]:
# -*- coding: utf-8 -*-

# don
import pandas as pd
import numpy as np
import librosa
import os
import zipfile
import csv
import shutil
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.metrics import (
    roc_auc_score,        
)


In [2]:
"""
Script for extracting data from https://figshare.com/articles/dataset/Sounds_of_the_Eleutherodactylus_frog_community_from_Puerto_Rico/806302?file=3104183
Unzips all the zips from root
Simply Unzip downloaded file and provide path root of folder
"""


def extract_zip_file(file_path, extract_to):
    """
    Extracts a single zip file to a specified directory.

    Args:
        file_path (str): Path to the zip file.
        extract_to (str): Path to the directory where the zip file will be extracted.
    """
    with zipfile.ZipFile(file_path, "r") as zip_ref:
        zip_ref.extractall(extract_to)
        # print(f"{os.path.basename(file_path)} extracted to {os.path.abspath(extract_to)}")


def extract_zip_files(zip_folder, extract_to):
    """
    Extracts all zip files from a folder to a specified directory using threading.

    Args:
        zip_folder (str): Path to the folder containing zip files.
        extract_to (str): Path to the directory where zip files will be extracted.
    """
    # Make sure the extraction directory exists
    os.makedirs(extract_to, exist_ok=True)

    data_file = "FrequencyRange_by_species_and_site_Averages.csv"
    shutil.copyfile(os.path.join(zip_folder, data_file) , os.path.join(extract_to, data_file))

    # List all zip files in the folder
    zip_files = [
        os.path.join(zip_folder, item)
        for item in os.listdir(zip_folder)
        if item.endswith(".zip")
    ]

    # Use ThreadPoolExecutor to extract zip files concurrently
    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(extract_zip_file, zip_file, extract_to)
            for zip_file in zip_files
        ]

        # Wait for all futures to complete
        for future in futures:
            future.result()
    
    print("Done.")

In [3]:
filepath = "/home/edwin/Downloads/806302"
ExtractTo = "/home/edwin/Downloads/806302/Extracted"
extract_zip_files(filepath, ExtractTo)

Done.


In [4]:
def readAveragesData(path: str):

    data = []
    # Read the CSV file and store the data in a list of dictionaries
    with open(path, "r") as file:
        reader = csv.DictReader(file)
        for row in reader:
            data.append(row)

    return data


def prepare_csv(data_dir, output) -> None:

    
    averagesData = readAveragesData(
        os.path.join(data_dir, "FrequencyRange_by_species_and_site_Averages.csv")
    )

    data = []

    # Iterate through each subfolder
    for siteDataSet in os.listdir(data_dir):
        site_folder = os.path.join(data_dir, siteDataSet)
        if os.path.isdir(site_folder):
            # example siteId  "Site01-1" such that the 4-6 index represents the site id; in this case 01
            siteId = int(siteDataSet[4:6])
            SiteData = [
                averageClassification
                for averageClassification in averagesData
                if int(averageClassification["SiteID"]) == siteId
            ]

            classifications = ", ".join(
                [classification["Species"] for classification in SiteData]
            )
            for audio_recording in os.listdir(site_folder):
                if audio_recording.endswith(".wav"):
                    audio_recording_abs_path = os.path.abspath(
                        os.path.join(site_folder, audio_recording)
                    )

                    data.append([siteId, audio_recording_abs_path, classifications])

    # Create DataFrame
    df = pd.DataFrame(
        data,
        columns=[
            "siteId",
            "filename",
            "species",
        ],
    )

    df.to_csv(output, index=False)

In [5]:
data_dir = ExtractTo
output = "processed/processed.csv"
prepare_csv(data_dir, output)

In [6]:
def extract_features(file_path):
    """
    Extract features from audio file using librosa.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        np.array: Extracted features.
    """
    audio, sr = librosa.load(file_path)
    result = np.array([])

    # MFCC
    mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sr).T, axis=0)
    result = np.hstack((result, mfccs))

    # Chroma
    stft = np.abs(librosa.stft(audio))
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sr).T, axis=0)
    result = np.hstack((result, chroma))

    # Mel-scaled spectrogram
    mel = np.mean(librosa.feature.melspectrogram(y=audio, sr=sr).T, axis=0)
    result = np.hstack((result, mel))

    return result


def process_data(data_csv_path):
    """Read Csv with fileanme and generate spectrogram for each sample

    Returns:
        DataFrame: dataframe with all data
    """
    # data_csv_path = sys.argv[1]

    df = pd.read_csv(data_csv_path)

    # Initialize a list to store the results
    spectrograms = []

    with ThreadPoolExecutor() as executor:
        futures = [
            executor.submit(extract_features, row["filename"])
            for _, row in df.iterrows()
        ]

        for future in as_completed(futures):
            try:
                spectrogram = future.result()
                spectrograms.append(spectrogram)
            except Exception as exc:
                print(f"Generated an exception: {exc}")

    # Convert the list of spectrograms into a DataFrame
    spectrogram_df = pd.DataFrame(spectrograms)

    # Concatenate the original DataFrame with the new DataFrame containing spectrograms
    df = pd.concat([df, spectrogram_df], axis=1)

    return df

In [7]:
csv_dir = output
df = process_data(csv_dir)
df.head()

Unnamed: 0,siteId,filename,species,0,1,2,3,4,5,6,...,150,151,152,153,154,155,156,157,158,159
0,13,/home/edwin/Downloads/806302/Extracted/Site13-...,"E. coqui - co, E. coqui - qui",-365.364166,86.90493,-106.172852,68.06057,17.61063,6.652588,15.586069,...,2e-06,1.033863e-06,1.005551e-06,8.521368e-07,7.499838e-07,6.814324e-07,4.886843e-07,3.613497e-07,1.820131e-07,1.423889e-08
1,13,/home/edwin/Downloads/806302/Extracted/Site13-...,"E. coqui - co, E. coqui - qui",-406.500671,113.842438,-68.206215,47.862782,20.65132,-6.530857,3.600087,...,2e-06,9.99036e-07,1.103059e-06,9.592569e-07,8.830082e-07,7.530214e-07,4.835236e-07,3.708429e-07,1.902824e-07,1.955865e-08
2,13,/home/edwin/Downloads/806302/Extracted/Site13-...,"E. coqui - co, E. coqui - qui",-347.871185,98.270226,-113.713997,52.658756,28.086157,13.775445,3.790692,...,2e-06,1.093613e-06,9.336469e-07,7.941216e-07,7.298338e-07,6.177173e-07,4.224469e-07,3.427497e-07,1.762806e-07,2.857463e-08
3,13,/home/edwin/Downloads/806302/Extracted/Site13-...,"E. coqui - co, E. coqui - qui",-431.293304,125.512405,-56.79377,27.191978,32.1227,7.789864,-18.500711,...,9e-06,5.364567e-06,4.622537e-06,3.462598e-06,2.657325e-06,1.527835e-06,7.68389e-07,3.773702e-07,1.70811e-07,1.311635e-08
4,13,/home/edwin/Downloads/806302/Extracted/Site13-...,"E. coqui - co, E. coqui - qui",-327.020691,108.122726,-100.805252,60.04599,17.855522,7.025489,4.331455,...,2e-06,1.298369e-06,1.175133e-06,1.145827e-06,8.843793e-07,6.959663e-07,5.277243e-07,4.115347e-07,2.027749e-07,1.60782e-08


In [8]:
x = df.drop(
    columns=["filename", "species"]
)  # Adjust this to include only feature columns
# Convert all column names to strings
x.columns = x.columns.astype(str)

x

Unnamed: 0,siteId,0,1,2,3,4,5,6,7,8,...,150,151,152,153,154,155,156,157,158,159
0,13,-365.364166,86.904930,-106.172852,68.060570,17.610630,6.652588,15.586069,-11.991188,2.748386,...,1.644769e-06,1.033863e-06,1.005551e-06,8.521368e-07,7.499838e-07,6.814324e-07,4.886843e-07,3.613497e-07,1.820131e-07,1.423889e-08
1,13,-406.500671,113.842438,-68.206215,47.862782,20.651320,-6.530857,3.600087,14.337640,-13.014687,...,1.526512e-06,9.990360e-07,1.103059e-06,9.592569e-07,8.830082e-07,7.530214e-07,4.835236e-07,3.708429e-07,1.902824e-07,1.955865e-08
2,13,-347.871185,98.270226,-113.713997,52.658756,28.086157,13.775445,3.790692,-19.096933,14.272011,...,1.697720e-06,1.093613e-06,9.336469e-07,7.941216e-07,7.298338e-07,6.177173e-07,4.224469e-07,3.427497e-07,1.762806e-07,2.857463e-08
3,13,-431.293304,125.512405,-56.793770,27.191978,32.122700,7.789864,-18.500711,12.831899,-2.560224,...,9.499254e-06,5.364567e-06,4.622537e-06,3.462598e-06,2.657325e-06,1.527835e-06,7.683890e-07,3.773702e-07,1.708110e-07,1.311635e-08
4,13,-327.020691,108.122726,-100.805252,60.045990,17.855522,7.025489,4.331455,-15.363570,7.281242,...,2.074947e-06,1.298369e-06,1.175133e-06,1.145827e-06,8.843793e-07,6.959663e-07,5.277243e-07,4.115347e-07,2.027749e-07,1.607820e-08
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1545,14,-360.793213,135.355591,-79.874069,33.914722,24.455133,29.985666,5.089327,-3.236809,30.435177,...,8.581591e-07,6.824343e-07,6.312418e-07,5.631853e-07,5.520564e-07,6.555168e-07,6.910793e-07,5.193340e-07,2.318115e-07,1.759385e-08
1546,14,-341.601776,124.877823,-81.960869,25.944866,34.249691,27.945740,4.721941,-2.066292,27.809549,...,1.606695e-06,1.240373e-06,1.033492e-06,1.022801e-06,7.829952e-07,8.138248e-07,1.468499e-06,1.201710e-06,4.714741e-07,3.109947e-08
1547,14,-348.720245,117.477097,-104.238144,19.341476,37.335217,33.531551,-1.152712,-8.220524,25.460920,...,1.160665e-06,9.019577e-07,8.050399e-07,1.117119e-06,1.589559e-06,1.560947e-06,2.207701e-06,1.978113e-06,4.632507e-07,2.647404e-08
1548,14,-442.768585,138.157593,-34.087162,21.701490,16.534599,11.656564,-1.711867,5.866610,7.741029,...,1.440193e-06,8.323494e-07,5.406529e-07,4.368491e-07,3.851308e-07,3.373848e-07,2.802447e-07,2.373542e-07,1.209391e-07,1.044739e-08


In [10]:
y = df["species"]

# Encode the target labels as integers
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

le_bron = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))
le_bron


{'E. coqui - co, E. coqui - qui': 0,
 'E. coqui - co, E. coqui - qui, E. gryllus, E. locustus': 1,
 'E. coqui - co, E. coqui - qui, E. gryllus, E. portoricensis - co, E. portoricensis - qui, E. unicolor': 2,
 'E. coqui - co, E. coqui - qui, E. hedricki': 3,
 'E. coqui - co, E. coqui - qui, E. hedricki, E. portoricensis - co, E. portoricensis - qui': 4,
 'E. coqui - co, E. coqui - qui, E. hedricki, E. portoricensis - co, E. portoricensis - qui, E. unicolor': 5,
 'E. coqui - co, E. coqui - qui, E. portoricensis - co, E. portoricensis - qui, E. richmondi': 6,
 'E. coqui - co, E. coqui - qui, E. portoricensis - co, E. portoricensis - qui, E. unicolor': 7,
 'E. coqui - co, E. coqui - qui, E. richmondi': 8,
 'E. coqui - co, E. coqui - qui, E. richmondi, E. wightmanae': 9,
 'E. coqui - co, E. coqui - qui, E. wightmanae': 10}

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

classifier = RandomForestClassifier(n_estimators=600, max_depth=18, min_samples_leaf=3)

classifier.fit(x_train, y_train)

y_pred = classifier.predict_proba(
    x_test,
)

accuracy = roc_auc_score(y_test, y_pred, multi_class="ovr")
print("Accuracy :", accuracy)




In [None]:
import pickle
with open("../Backend/trainedRF.pkl", "wb") as f:
    pickle.dump(classifier, f)