# Model Training Notebook

This notebook can be used to download the [coqui acoustic dataset](https://figshare.com/articles/dataset/Sounds_of_the_Eleutherodactylus_frog_community_from_Puerto_Rico/806302) provided by researcher Luis Villanueva-Rivera and to train a machine learning model on it using the Random Forest classification algorithm.

# Requirements

A Python virtual environment with the dependencies from `backend\requirements.txt` installed.

# Usage

Run the cells one at a time, in order. You may skip a step if you have already completed it previously.

In [None]:
# -*- coding: utf-8 -*-

# don
import pandas as pd
import numpy as np
import librosa
import os
import zipfile
import csv
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.metrics import (
    roc_auc_score,        
)

## Download The Dataset

If you have not downloaded the dataset yet, uncomment the `urlretrieve` line in the cell below.

If you already have the zip file, place it at `machine_learning\dataset\coqui_dataset_Luis_Villanueva-Rivera.zip`, renaming it to match if necessary. To unpack the archive, also uncomment the `extract_zip_files` and `remove_zipped_files` lines.


In [None]:
from urllib.request import urlretrieve


def extract_zip_file(file_path, extract_to):
    """
    Unpack one zip archive into a target directory.

    Args:
        file_path (str): Location of the zip archive to read.
        extract_to (str): Directory that receives the archive's contents.
    """
    archive = zipfile.ZipFile(file_path, "r")
    try:
        archive.extractall(extract_to)
    finally:
        # Always release the file handle, even if extraction fails.
        archive.close()


def extract_zip_files(source_data_zip, destination_folder):
    """
    Unpack the dataset archive, then unpack every zip file it contained.

    The source archive is extracted into the destination folder first; every
    ``.zip`` entry found directly in that folder afterwards is then extracted
    into the same folder, using a thread pool.

    Args:
        source_data_zip (str): Path to the top-level zip archive.
        destination_folder (str): Directory that receives all extracted
            content. It is created if it does not exist.
    """
    os.makedirs(destination_folder, exist_ok=True)

    # First pass: the outer archive.
    extract_zip_file(source_data_zip, destination_folder)

    # Second pass: collect the zip files now sitting in the destination.
    nested_archives = []
    for entry in os.listdir(destination_folder):
        if entry.endswith(".zip"):
            nested_archives.append(os.path.join(destination_folder, entry))

    with ThreadPoolExecutor() as pool:
        pending = []
        for archive_path in nested_archives:
            pending.append(
                pool.submit(extract_zip_file, archive_path, destination_folder)
            )

        # result() blocks until the task is done and re-raises its exception.
        for task in pending:
            task.result()

    print("Done.")


def remove_zipped_files(extraction_destination_filepath):
    """
    Delete every ``.zip`` file found directly inside a directory.

    Args:
        extraction_destination_filepath (str): Directory to clean up; only
            its immediate entries are examined.
    """
    for entry in os.listdir(extraction_destination_filepath):
        if not entry.endswith(".zip"):
            continue
        os.remove(os.path.join(extraction_destination_filepath, entry))



# Download URL of the dataset, the local path the archive is stored at, and
# the directory the archive is unpacked into.
dataset_source = "https://figshare.com/ndownloader/articles/806302/versions/18"
source_data_filepath = os.path.join(
    "dataset", "coqui_dataset_Luis_Villanueva-Rivera.zip"
)
extraction_destination_filepath = os.path.join("dataset", "extracted")


# Uncomment to download the archive (skip if the zip file is already in place).
# urlretrieve(dataset_source, source_data_filepath)

# Uncomment to unpack the archive and then delete the zip files left in the
# extraction directory.
# extract_zip_files(source_data_filepath, extraction_destination_filepath)
# remove_zipped_files(extraction_destination_filepath)

# Data Preparation 

Generate a CSV that associates each audio sample with the coqui species listed for its recording site. A single sample can be associated with multiple species. The `FrequencyRange_by_species_and_site_Averages.csv` file is used to build this report.

In [None]:
def readClassifiedReportData(path: str) -> list:
    """
    Load a CSV report into a list of row dictionaries.

    Args:
        path (str): Path to the CSV file. Its first row is used as the header.

    Returns:
        list: One dictionary per data row, keyed by column name. Values are
        the raw strings read from the file; no type conversion is done.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; without it, line breaks embedded in quoted
    # fields are rewritten by universal-newline translation.
    with open(path, "r", newline="") as file:
        return list(csv.DictReader(file))

# Preview the per-site, per-species frequency averages shipped with the dataset.
averagesData = readClassifiedReportData(
    os.path.join(
        extraction_destination_filepath,
        "FrequencyRange_by_species_and_site_Averages.csv",
    )
)
pd.options.display.max_columns = 250  # Changes the number of columns displayed (default is 20)
pd.options.display.max_rows = 250  # Changes the number of rows displayed (default is 60)
df = pd.DataFrame(averagesData)
# The report values are read as strings, so sort on their numeric value; a
# plain string sort would place e.g. "1000" before "900". Values that cannot
# be parsed become NaN and are sorted last.
df = df.sort_values(
    by=["MinFreq (Hz)"],
    key=lambda column: pd.to_numeric(column, errors="coerce"),
)

df

In [None]:
def prepare_csv(extraction_destination_filepath, output) -> None:
    """
    Write an index CSV with one row per ``.wav`` recording.

    Each row holds the recording's site id, its absolute path, and the
    comma-separated species listed for that site in
    ``FrequencyRange_by_species_and_site_Averages.csv``.

    Args:
        extraction_destination_filepath (str): Directory containing the
            averages CSV and one sub-folder of recordings per site.
        output (str): Path of the CSV file to write. Its parent directory is
            created if it does not exist.
    """
    averagesData = readClassifiedReportData(
        os.path.join(
            extraction_destination_filepath,
            "FrequencyRange_by_species_and_site_Averages.csv",
        )
    )

    # Group the species by site once, instead of rescanning the whole report
    # for every site folder. Report order is preserved within each site.
    species_by_site = {}
    for averageClassification in averagesData:
        species_by_site.setdefault(
            int(averageClassification["SiteID"]), []
        ).append(averageClassification["Species"])

    data = []

    # sorted() makes the row order reproducible; os.listdir order is arbitrary.
    for siteDataSet in sorted(os.listdir(extraction_destination_filepath)):
        site_folder = os.path.join(extraction_destination_filepath, siteDataSet)
        if not os.path.isdir(site_folder):
            continue

        # Folder names look like "Site01-1": characters 4-5 hold the site id
        # (01 in this example).
        try:
            siteId = int(siteDataSet[4:6])
        except ValueError:
            # Not a site folder; skip it rather than abort the whole run.
            continue

        classifications = ", ".join(species_by_site.get(siteId, []))

        for audio_recording in sorted(os.listdir(site_folder)):
            if audio_recording.endswith(".wav"):
                audio_recording_abs_path = os.path.abspath(
                    os.path.join(site_folder, audio_recording)
                )
                data.append([siteId, audio_recording_abs_path, classifications])

    df = pd.DataFrame(data, columns=["siteId", "filename", "species"])

    # to_csv does not create missing directories, so make sure the target
    # folder exists when `output` is a filesystem path.
    if isinstance(output, (str, os.PathLike)):
        output_dir = os.path.dirname(output)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

    df.to_csv(output, index=False)



# Path of the index CSV that prepare_csv writes.
output = "processed/processed.csv"

prepare_csv(extraction_destination_filepath, output)

# Data Processing

Extract a feature vector (averaged MFCC, chroma, and mel-spectrogram values) for every audio sample.

In [None]:
def extract_features(file_path):
    """
    Build a one-dimensional feature vector for an audio file using librosa.

    The vector is the concatenation, in this order, of the per-row means of
    the file's MFCC matrix, its chroma matrix (computed from the STFT
    magnitude) and its mel-scaled spectrogram.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        np.array: The concatenated features.
    """
    waveform, sample_rate = librosa.load(file_path)

    def row_means(feature_matrix):
        # Averaging the transpose over axis 0 leaves one value per row.
        return np.mean(feature_matrix.T, axis=0)

    # MFCC
    mfcc_part = row_means(librosa.feature.mfcc(y=waveform, sr=sample_rate))

    # Chroma, derived from the magnitude of the short-time Fourier transform
    magnitude = np.abs(librosa.stft(waveform))
    chroma_part = row_means(
        librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)
    )

    # Mel-scaled spectrogram
    mel_part = row_means(
        librosa.feature.melspectrogram(y=waveform, sr=sample_rate)
    )

    # The leading empty array keeps the result's dtype identical to building
    # the vector up from np.array([]).
    return np.hstack((np.array([]), mfcc_part, chroma_part, mel_part))


def process_data(data_csv_path):
    """
    Read the index CSV and attach a feature vector to every listed recording.

    Features are extracted concurrently, but each vector is joined back onto
    the row it was computed from. A file whose extraction fails is reported
    and keeps its row, with its feature columns left as NaN.

    Args:
        data_csv_path (str): Path to a CSV with a ``filename`` column.

    Returns:
        DataFrame: The CSV's columns followed by one column per feature.
    """
    df = pd.read_csv(data_csv_path)

    with ThreadPoolExecutor() as executor:
        # Key every future by the index label of its source row. Collecting
        # results in completion order would pair features with the wrong
        # filename/species.
        futures = {
            index: executor.submit(extract_features, filename)
            for index, filename in df["filename"].items()
        }

    # Leaving the executor block has waited for all tasks; gather the results
    # in row order and remember which row each one belongs to.
    features = {}
    for index, future in futures.items():
        try:
            features[index] = future.result()
        except Exception as exc:
            # Best effort: report the failure and carry on with the rest.
            print(f"Generated an exception: {exc}")

    # Index the feature frame by the original row labels so the column-wise
    # concat aligns on them; rows without features get NaN.
    spectrogram_df = pd.DataFrame(
        list(features.values()), index=list(features.keys())
    )

    return pd.concat([df, spectrogram_df], axis=1)


# Despite its name, csv_dir holds the path of the CSV file stored in `output`,
# not a directory.
csv_dir = output

# Build the table of recordings plus their extracted features and preview it.
df = process_data(csv_dir)
df.head()

In [None]:
# Feature matrix: every column of df except the file path and the label.
# NOTE(review): "siteId" is not dropped here, so it is used as a feature. In
# prepare_csv the "species" label is built from the species listed for the
# recording's site id, so siteId alone determines the label — this looks like
# target leakage. Confirm whether it should be dropped as well, and whether
# the backend supplies a siteId at prediction time.
x = df.drop(
    columns=["filename", "species"]
)  # Adjust this to include only feature columns
# Convert all column names to strings (the feature columns added by
# process_data carry pandas' default integer labels).
x.columns = x.columns.astype(str)

x

In [None]:
# Turn the species strings into integer class ids for the classifier.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["species"])

# Lookup table from each species label to the integer id assigned to it.
le_bron = {
    label: code
    for label, code in zip(
        label_encoder.classes_,
        label_encoder.transform(label_encoder.classes_),
    )
}
le_bron


In [None]:
# Hold out 20% of the samples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

classifier = RandomForestClassifier(n_estimators=600, max_depth=18, min_samples_leaf=3)

classifier.fit(x_train, y_train)

# Per-class probabilities for the held-out samples; roc_auc_score is given
# these scores rather than hard predictions.
y_pred = classifier.predict_proba(
    x_test,
)

# roc_auc_score returns the area under the ROC curve (one-vs-rest here), which
# is a different quantity from classification accuracy.
roc_auc = roc_auc_score(y_test, y_pred, multi_class="ovr")
print("ROC AUC (one-vs-rest) :", roc_auc)




In [None]:
import pickle
# Serialize the fitted classifier into the backend folder. Only `classifier`
# is written here; `label_encoder` (the species <-> id mapping) is not saved
# by this cell.
with open("../backend/trainedRF.pkl", "wb") as f:
    pickle.dump(classifier, f)