# Model Training Notebook

This notebook downloads the [coqui acoustic dataset](https://figshare.com/articles/dataset/Sounds_of_the_Eleutherodactylus_frog_community_from_Puerto_Rico/806302) provided by researcher Luis Villanueva-Rivera and trains a machine learning model on it using the Random Forest classification algorithm.

# Requirements

A Python virtual environment with the dependencies listed in `backend\requirements.txt` installed.

# Usage

Run the cells one by one, in order. You may skip a step if you have already completed it in a previous session.

In [None]:
# -*- coding: utf-8 -*-

# don
import pandas as pd
import numpy as np
import librosa
import os
import zipfile
import csv
import functools
import librosa
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.metrics import (
    roc_auc_score,
)




## Download The Dataset

The cell below downloads the dataset automatically when the zip file is not already present. If the zip file already exists, you are asked whether to replace it.

If you already have the zip file, place it at `machine_learning\dataset\coqui_dataset_Luis_Villanueva-Rivera.zip`, renaming it if necessary so that it matches that filename.


In [None]:
from urllib.request import urlretrieve


def extract_zip_file(file_path, extract_to):
    """
    Unpack one zip archive into a directory and report where it went.

    Args:
        file_path (str): Path to the zip file.
        extract_to (str): Path to the directory that receives the archive content.
    """
    with zipfile.ZipFile(file_path, "r") as archive:
        archive.extractall(extract_to)
        archive_name = os.path.basename(file_path)
        target_directory = os.path.abspath(extract_to)
        print(f"{archive_name} extracted to {target_directory}")


def extract_zip_files(source_data_zip, destination_folder):
    """
    Unpack the dataset archive, then unpack the zip files it leaves behind.

    The source archive is extracted first. Every `.zip` file found directly
    inside the destination folder afterwards is then extracted into that same
    folder, using a thread pool.

    Args:
        source_data_zip (str): Path to the dataset zip file.
        destination_folder (str): Directory that receives all extracted
            content. It is created when it does not exist yet.
    """
    os.makedirs(destination_folder, exist_ok=True)

    # First pass: the outer archive.
    extract_zip_file(source_data_zip, destination_folder)

    # Second pass: whatever zip files now sit at the top of the destination.
    nested_archives = []
    for entry in os.listdir(destination_folder):
        if entry.endswith(".zip"):
            nested_archives.append(os.path.join(destination_folder, entry))

    with ThreadPoolExecutor() as pool:
        pending = []
        for archive in nested_archives:
            pending.append(pool.submit(extract_zip_file, archive, destination_folder))

        # result() blocks until the task is done and re-raises its exception.
        for task in pending:
            task.result()

    print("Done.")


def remove_zipped_files(extraction_destination_filepath):
    """
    Delete every `.zip` file located directly inside a directory.

    Args:
        extraction_destination_filepath (str): Directory to clean up. Only its
            immediate entries are inspected; sub-directories are not searched.
    """
    for entry in os.listdir(extraction_destination_filepath):
        if not entry.endswith(".zip"):
            continue
        os.remove(os.path.join(extraction_destination_filepath, entry))


def download_if_necessary(dataset_source, source_data_filepath):
    """
    Download the dataset archive unless the user chooses to keep an existing one.

    When the target file already exists the user is prompted; answering "n"
    or "no" (any case, surrounding whitespace ignored) keeps the existing file.
    Any other answer downloads again and overwrites it.

    Args:
        dataset_source (str): URL the archive is fetched from.
        source_data_filepath (str): Path the archive is written to. Missing
            parent directories are created.
    """
    if os.path.exists(source_data_filepath):
        should_replace = str(
            input("Zipped file already downloaded, Do you wish to replace it? y/n")
        )
        should_download = should_replace.strip().lower() not in ("n", "no")
    else:
        should_download = True
        # Create the directory the archive is actually written to, rather than
        # a hard-coded "dataset" folder that may not match the target path.
        target_directory = os.path.dirname(source_data_filepath)
        if target_directory:
            os.makedirs(target_directory, exist_ok=True)
        print("Donwloading...")

    if should_download:
        print("Donwloading Dataset to ", os.path.abspath(source_data_filepath))
        urlretrieve(dataset_source, source_data_filepath)
    else:
        print("Skipping Download since it's already downloaded")
        



# Where the archive comes from, where it is stored, and where it is unpacked.
dataset_source = "https://figshare.com/ndownloader/articles/806302/versions/18"
source_data_filepath = os.path.join(
    "dataset", "coqui_dataset_Luis_Villanueva-Rivera.zip"
)
extraction_destination_filepath = os.path.join("dataset", "extracted")

# Step 1: fetch the archive (the user is prompted when it already exists).
download_if_necessary(dataset_source, source_data_filepath)
print("Donwloaded Dataset!")

# Step 2: unpack the archive and the zip files it contains.
extract_zip_files(source_data_filepath, extraction_destination_filepath)
print("Extracted Zip Files!")

# Step 3: delete the zip files left in the extraction folder.
remove_zipped_files(extraction_destination_filepath)
print("done!")

# Data Preparation 

Generate a CSV that associates each audio sample with the coqui species identified in it. A single file can contain more than one identified species. We use `FrequencyRange_by_species_and_site_AllData.csv` to create this report.

In [None]:

def clean_all_data_file(all_data_csv_path) -> pd.DataFrame:
    """
    Load the all-data CSV into a DataFrame with a numeric `SoundID` column.

    Rows that carry more fields than the header declares have their surplus
    fields discarded (`csv.DictReader` collects those under the key `None`).
    A file in which no row has surplus fields is handled as well.

    Args:
        all_data_csv_path (str): Path to the CSV file.

    Returns:
        pd.DataFrame: One row per CSV record. All columns hold the raw strings
        read from the file, except `SoundID`, which is converted to numbers.

    Raises:
        KeyError: If the file has no `SoundID` column.
    """
    # newline="" lets the csv module handle line endings itself.
    with open(all_data_csv_path, mode="r", newline="") as all_data_file:
        reader = csv.DictReader(all_data_file)
        records = [
            {column: value for column, value in row.items() if column is not None}
            for row in reader
        ]

    root_df = pd.DataFrame.from_records(records)
    root_df["SoundID"] = pd.to_numeric(root_df["SoundID"])
    return root_df


def match_sound_id_to_filename(zip_walk: list) -> pd.DataFrame:
    """
    Collect the SoundID / SiteID / Filename rows of every `data.csv` in a walk.

    Args:
        zip_walk (list): Materialised `os.walk` output, i.e. entries whose
            first item is a directory path and whose third item is the list of
            file names in that directory.

    Returns:
        pd.DataFrame: The de-duplicated `SoundID`, `SiteID` and `Filename`
        columns of each `data.csv`, concatenated in walk order. Each fragment
        keeps its own row index.
    """
    wanted_columns = ["SoundID", "SiteID", "Filename"]

    pieces = []
    for entry in zip_walk:
        if "data.csv" not in entry[2]:
            continue
        table = pd.read_csv(os.path.join(entry[0], "data.csv"))
        pieces.append(table[wanted_columns].drop_duplicates())

    return pd.concat(pieces)


def update_return(d1: pd.DataFrame, d2: pd.DataFrame) -> pd.DataFrame:
    """
    Update `d1` in place with `d2` and hand `d1` back.

    Calling `.update()` alone returns nothing, which makes it unusable as a
    folding function; returning the mutated first argument fixes that. Works
    for any object exposing `.update()` (in this file it is folded over
    dictionaries).
    """
    target = d1
    target.update(d2)
    return target


def apply_abosolute_path_to_dataframe(
    dataframe: pd.DataFrame, zip_walk: list
) -> pd.DataFrame:
    """
    Replace bare `.wav` file names in `dataframe.Filename` with absolute paths.

    Args:
        dataframe (pd.DataFrame): Frame with a `Filename` column holding bare
            file names. The column is overwritten in place.
        zip_walk (list): Materialised `os.walk` output. The first entry (the
            walk root) is skipped; `.wav` files are looked up in the remaining
            entries only. When two directories contain the same file name, the
            one that appears later in the walk wins.

    Returns:
        pd.DataFrame: The same `dataframe` object, with `Filename` rewritten.

    Raises:
        KeyError: If a name in `Filename` matches no `.wav` file in the walk.
    """
    # Built with a plain loop so that a walk without sub-directories yields an
    # empty mapping instead of failing on an empty reduce().
    wav_paths = {}
    for trace in zip_walk[1:]:
        directory = trace[0]
        for filename in trace[2]:
            if filename.endswith(".wav"):
                wav_paths[filename] = str(
                    os.path.abspath(os.path.join(directory, filename))
                )

    def _absolute_path(filename):
        """Look up one file name, naming the missing file when it is unknown."""
        try:
            return wav_paths[filename]
        except KeyError:
            raise KeyError(
                f"No .wav file named {filename!r} was found in the walked directories"
            ) from None

    dataframe["Filename"] = dataframe["Filename"].apply(_absolute_path)

    return dataframe


def merge_and_export_to_csv(
    root_df: pd.DataFrame, processed_df_with_filename: pd.DataFrame
):
    """
    Build the per-file species table and write it to `processed/processed.csv`.

    The two frames are joined on `SoundID`. The output has one row per distinct
    (siteId, filename) pair and one boolean column per species, telling whether
    that species is recorded for the file.

    Args:
        root_df (pd.DataFrame): Frame with `SoundID`, `SiteID` and `Species`
            columns. Several rows may share a `SoundID`.
        processed_df_with_filename (pd.DataFrame): Frame with `SoundID`,
            `SiteID` and `Filename` columns; `SoundID` must be unique.

    Returns:
        str: Absolute path of the CSV file that was written.

    Raises:
        pandas.errors.MergeError: If `SoundID` is not unique in
            `processed_df_with_filename` (enforced by `validate="many_to_one"`).
    """
    # Both inputs carry a SiteID column, so the merge suffixes them; the one
    # coming from root_df ("SiteID_x") is kept.
    preprocess_final = pd.merge(
        root_df, processed_df_with_filename, on="SoundID", validate="many_to_one"
    )[["SiteID_x", "Filename", "Species"]]
    preprocess_final.columns = pd.Index(["siteId", "filename", "species"])

    preprocess_final["siteId"] = pd.to_numeric(preprocess_final["siteId"])

    # Column order follows the order in which species first appear.
    species_classes = list(preprocess_final["species"].unique())

    # Collect each file's species in a single pass, instead of re-filtering
    # the whole frame once per (species, file) pair.
    species_by_file = {}
    for filename, species in zip(
        preprocess_final["filename"], preprocess_final["species"]
    ):
        species_by_file.setdefault(filename, set()).add(species)

    final_output = preprocess_final[["siteId", "filename"]].drop_duplicates()
    for species in species_classes:
        final_output[species] = [
            species in species_by_file.get(filename, ())
            for filename in final_output["filename"]
        ]

    os.makedirs("processed", exist_ok=True)

    output_path = os.path.join("processed", "processed.csv")
    final_output.to_csv(output_path)

    return os.path.abspath(output_path)

In [None]:
# Locate the extracted dataset and the CSV listing every identified call.
extraction_destination_filepath = os.path.join("dataset", "extracted")
all_data_file_path: str = os.path.join(
    extraction_destination_filepath,
    "FrequencyRange_by_species_and_site_AllData.csv",
)

# Species records keyed by SoundID.
root_df: pd.DataFrame = clean_all_data_file(all_data_file_path)

# Walk the extracted tree once and reuse the listing for both lookups below.
zip_walk = list(os.walk(extraction_destination_filepath))

# SoundID -> file name, then file name -> absolute path.
soundid_to_filename: pd.DataFrame = match_sound_id_to_filename(zip_walk)
dataframe_with_absoule_path_in_filename: pd.DataFrame = (
    apply_abosolute_path_to_dataframe(soundid_to_filename, zip_walk)
)

# Join everything and write the per-file species table to disk.
pre_processed_data_csv_filepath = merge_and_export_to_csv(
    root_df, dataframe_with_absoule_path_in_filename
)

# Data Processing

Compute MFCC features (a compact, spectrogram-like representation) for every audio sample and cut them into fixed-length slices.

In [None]:
SLICE_SECONDS = 10  # Length, in seconds, of each input slice fed to the model.
FFT_HOP_LENGTH = 512  # How many time domain samples per spectrogram frame
SAMPLE_RATE = 22050  # Audio sample rate in hertz assumed by the size computation below.
Y_RESOLUTION = 20  # Number of feature rows per spectrogram frame.

# Frames per slice, times rows per frame. The frame count multiplies before it
# floor-divides, exactly like `slice_width` in the slicing step, so the two can
# never disagree (dividing first loses a frame once slices get long enough).
n_model_input_parameters = (
    SAMPLE_RATE * SLICE_SECONDS // FFT_HOP_LENGTH
) * Y_RESOLUTION

f"Model takes {n_model_input_parameters} parameters"

In [None]:
# Reload the per-file species table; "Unnamed: 0" is the row index that was
# written out together with the data.
df = pd.read_csv(str(pre_processed_data_csv_filepath)).drop(columns="Unnamed: 0")

# For the two species that come with a "- co" and a "- qui" column, the "- co"
# column becomes the species label and the "- qui" column is discarded.
df = df.rename(
    columns={
        "E. coqui - co": "E. coqui",
        "E. portoricensis - co": "E. portoricensis",
    }
).drop(columns=["E. coqui - qui", "E. portoricensis - qui"])

# Metadata columns first, then the species labels in a fixed order.
metadata_columns = ["siteId", "filename"]
species_columns = [
    "E. coqui",
    "E. wightmanae",
    "E. gryllus",
    "E. portoricensis",
    "E. unicolor",
    "E. hedricki",
    "E. locustus",
    "E. richmondi",
]
df = df[metadata_columns + species_columns]
df

In [None]:
def extract_features(file_path):
    """
    Load an audio file and compute its MFCC matrix using librosa.

    The sample rate and the number of coefficients are taken from the
    notebook constants SAMPLE_RATE and Y_RESOLUTION instead of being left to
    librosa's defaults, so the features keep matching the declared model input
    size when those constants are changed.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        np.array: MFCC matrix (presumably Y_RESOLUTION rows by one column per
            frame; confirm against the librosa documentation).
        int: Sample rate in hertz
    """
    audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)

    mfcc = librosa.feature.mfcc(
        y=audio, sr=sr, n_mfcc=Y_RESOLUTION, hop_length=FFT_HOP_LENGTH
    )
    return mfcc, sr


# Extract features for every file concurrently. Results are read back in
# submission order (not completion order) and the filename is recorded next to
# each successful result, so features can never be paired with the wrong file.
filenames = list(df["filename"])
spectrograms = []
sample_rates = []
processed_filenames = []

with ThreadPoolExecutor() as executor:
    futures = [executor.submit(extract_features, filename) for filename in filenames]

    for filename, future in zip(filenames, futures):
        try:
            spectrogram, sr = future.result()
        except Exception as exc:
            # Best effort: report the failure and leave this file out entirely.
            print(f"Generated an exception: {exc}")
            continue
        spectrograms.append(spectrogram)
        sample_rates.append(sr)
        processed_filenames.append(filename)

# All files must share one sample rate, otherwise a fixed number of frames
# would not correspond to a fixed duration.
if not sample_rates:
    raise RuntimeError("No audio file could be processed.")
if min(sample_rates) != max(sample_rates):
    raise ValueError("Audio files were loaded with differing sample rates.")
sr = sample_rates[0]
slice_width = int(sr * SLICE_SECONDS // FFT_HOP_LENGTH)

# Slice each feature matrix into fixed-width pieces along the time axis.
spectrogram_slices = []
for spectrogram in spectrograms:
    n_slices = spectrogram.shape[1] // slice_width
    if n_slices == 0:
        # Clip shorter than a single slice: it contributes no rows.
        spectrogram_slices.append([])
        continue
    # Keep only the leading frames that divide evenly. An explicit end index
    # is used because a negative-remainder slice (`[:, :-0]`) would discard
    # everything when the length is already an exact multiple.
    usable = spectrogram[:, : n_slices * slice_width]
    spectrogram_slices.append(np.hsplit(usable, n_slices))

# One row per slice: the filename followed by the flattened slice values.
spectral_data = pd.DataFrame(
    [
        (filename, *piece.flatten())
        for pieces, filename in zip(spectrogram_slices, processed_filenames)
        for piece in pieces
    ]
)

In [None]:
# Every column other than the two metadata columns is a species label.
species_names = df.columns.drop(["siteId", "filename"]).tolist()
# Attach the labels to every slice row. Column 0 of spectral_data holds the
# filename; it is redundant after the join and is removed.
df = pd.merge(
    df,
    spectral_data,
    how="right",
    left_on="filename",
    right_on=0,
).drop(columns=0)
df

In [None]:
# Give the columns a two-level header: a group ("metadata", "classes" or
# "spectral") on top, and the individual column name underneath.
df.columns = pd.MultiIndex.from_arrays(
    [
        [
            "metadata",
            "metadata",
            *["classes"] * len(species_names),
            *["spectral"] * n_model_input_parameters,
        ],
        [
            "siteId",
            "filename",
            *species_names,
            *range(n_model_input_parameters),
        ],
    ]
)
df

In [None]:
# Feature matrix: every column grouped under the top-level "spectral" label.
x = df["spectral"]

# Display the feature matrix.
x

In [None]:
# Label matrix: one 0/1 column per species. `astype(int)` converts the whole
# frame at once and replaces the deprecated element-wise `applymap(int)`.
y = df["classes"].astype(int)
y

In [None]:
# Fixed seed so that the train/test split and the trained forest are the same
# on every run, which makes the reported scores comparable between runs.
RANDOM_STATE = 42

# NOTE(review): the split is made per row, and each row is one slice of a
# recording, so slices of the same recording can land in both the training
# and the test set. Consider a group-aware split keyed on the filename.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_STATE
)

classifier = RandomForestClassifier(
    n_estimators=600,
    max_depth=25,
    min_samples_leaf=3,
    n_jobs=-1,  # use every available CPU core
    random_state=RANDOM_STATE,
)

classifier.fit(x_train, y_train)

y_pred = classifier.predict(
    x_test,
)

In [None]:
# Display the held-out labels the predictions are compared against.
y_test

In [None]:
# Wrap the raw predictions in a frame that shares the row index and species
# columns of y_test, so the two can be compared cell by cell.
prediction_df = pd.DataFrame(
    data=y_pred,
    index=y_test.index,
    columns=y_test.columns,
)

prediction_df

In [None]:
# True where the prediction matches the label.
accuracy_df = prediction_df == y_test
# Per-species accuracy: the share of matching rows. Taking the mean divides by
# the actual number of test rows rather than by a hard-coded count.
accuracy_df.mean()

In [None]:
import pickle

# The model is written to the "backend" folder that sits next to the current
# working directory (i.e. inside the parent of the working directory).
model_backend_path = os.path.abspath(
    os.path.join(os.getcwd(), os.pardir, "backend", "trainedRF.pkl")
)
with open(model_backend_path, "wb") as f:
    pickle.dump(classifier, f)