In [None]:
SLICE_SECONDS = 10 # Length of input slices for model.
FFT_HOP_LENGTH = 512 # How many time domain samples per spectrogram frame
SAMPLE_RATE = 22050 # Audio sample rate in hertz
Y_RESOLUTION = 20 # Rows per frame; must equal the number of coefficients extract_features produces

# Number of frames in one slice. The multiplication has to happen BEFORE the
# floor division: flooring SAMPLE_RATE // FFT_HOP_LENGTH first throws away the
# fractional frames of every second, so the result drifts away from the slice
# width computed during slicing (sr * SLICE_SECONDS // FFT_HOP_LENGTH) as soon
# as SLICE_SECONDS or the other constants change.
SLICE_WIDTH = SAMPLE_RATE * SLICE_SECONDS // FFT_HOP_LENGTH

# Length of one flattened slice, i.e. the number of features the model sees
n_model_input_parameters = SLICE_WIDTH * Y_RESOLUTION

f"Model takes {n_model_input_parameters} parameters"

In [None]:
# -*- coding: utf-8 -*-

# don
import pandas as pd
import numpy as np
import librosa
import os
import csv
import shutil
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from concurrent.futures import ThreadPoolExecutor, as_completed
from sklearn.metrics import (
    roc_auc_score,        
)


In [None]:
# Raw label table, exactly as written by the preprocessing step
labels_raw = pd.read_csv("machine_learning/processed/processed.csv")

# Final column layout: two metadata columns followed by one column per species
output_columns = [
    'siteId',
    'filename',
    'E. coqui',
    'E. wightmanae',
    'E. gryllus',
    'E. portoricensis',
    'E. unicolor',
    'E. hedricki',
    'E. locustus',
    'E. richmondi',
]

df = (
    labels_raw
    # Leftover index column from the CSV export
    .drop(columns='Unnamed: 0')
    # The "- co" column becomes the label of the species as a whole ...
    .assign(**{
        'E. coqui': lambda frame: frame['E. coqui - co'],
        'E. portoricensis': lambda frame: frame['E. portoricensis - co'],
    })
    # ... and the per-note columns ("- co" / "- qui") are discarded
    .drop(columns=[
        'E. coqui - co',
        'E. coqui - qui',
        'E. portoricensis - co',
        'E. portoricensis - qui',
    ])
    .loc[:, output_columns]
)
df

In [None]:
def extract_features(file_path):
    """
    Extract MFCC features from an audio file using librosa.

    The audio is loaded at SAMPLE_RATE and the number of coefficients is set
    to Y_RESOLUTION, so the shape of the output stays in sync with the
    notebook's configuration constants instead of depending on librosa's
    defaults.

    Args:
        file_path (str): Path to the audio file.

    Returns:
        np.ndarray: MFCC matrix with Y_RESOLUTION rows and one column per
            frame (frames are FFT_HOP_LENGTH samples apart).
        int: Sample rate in hertz that the audio was loaded at.
    """
    # librosa resamples to the requested rate on load, so `sr` equals SAMPLE_RATE
    audio, sr = librosa.load(file_path, sr=SAMPLE_RATE)

    # MFCC
    mfcc = librosa.feature.mfcc(
        y=audio,
        sr=sr,
        n_mfcc=Y_RESOLUTION,
        hop_length=FFT_HOP_LENGTH,
    )
    return mfcc, sr

# Extract the features of every recording in parallel.
#
# Results are read back in SUBMISSION order and stored next to the filename
# they were computed from. Reading them in completion order (as_completed) and
# pairing them with df.filename afterwards attaches labels to the wrong audio,
# and a single failed file shifts every pairing that follows it.
filenames = df["filename"].tolist()
loaded_filenames = []  # filenames whose features were extracted successfully
spectrograms = []
sample_rates = []

with ThreadPoolExecutor() as executor:
    futures = [executor.submit(extract_features, filename) for filename in filenames]

    for filename, future in zip(filenames, futures):
        try:
            spectrogram, sr = future.result()
        except Exception as exc:
            # Best effort: a file that cannot be decoded is reported and skipped
            print(f"Generated an exception: {exc}")
            continue
        loaded_filenames.append(filename)
        spectrograms.append(spectrogram)
        sample_rates.append(sr)

if not sample_rates:
    raise ValueError("No audio file could be loaded, so there is nothing to slice")

# Process the spectrograms
# A single slice width only makes sense if every file shares one sample rate
assert min(sample_rates) == max(sample_rates), "recordings have differing sample rates"
sr = min(sample_rates)
slice_width = sr * SLICE_SECONDS // FFT_HOP_LENGTH

# Slice them into fixed widths
spectrogram_slices = []
for spectrogram in spectrograms:
    n_slices = spectrogram.shape[1] // slice_width
    if n_slices == 0:
        # Shorter than one slice: contributes no rows instead of crashing hsplit
        spectrogram_slices.append([])
        continue
    # Keep only the leading part that divides evenly. Slicing with an explicit
    # end index (rather than a negative remainder) stays correct when the
    # remainder is 0, where `[:, :-0]` would select nothing at all.
    usable = spectrogram[:, :n_slices * slice_width]
    spectrogram_slices.append(np.hsplit(usable, n_slices))

# Group up the filenames with their corresponding spectrogram slices:
# one row per slice, column 0 is the filename, the rest is the flattened slice
spectral_data = pd.DataFrame([
    (filename,) + tuple(piece.flatten())
    for filename, pieces in zip(loaded_filenames, spectrogram_slices)
    for piece in pieces
])
        

In [None]:
# Get the names of the species
species_names = list(df.columns.drop(['siteId', 'filename']))
# Join the new spectrogram data with the existing dataframe
df = df.merge(spectral_data, left_on='filename', right_on=0, how='right').drop(columns=0)
df

In [None]:
# Set up readable index for df
df.columns = pd.MultiIndex.from_arrays([   
    ['metadata'] * 2 + ['classes'] * len(species_names) + ['spectral'] * (n_model_input_parameters),
    ['siteId', 'filename'] + species_names + list(range(n_model_input_parameters))
])
df

In [None]:
# Feature matrix: every column grouped under the top-level 'spectral' label of df
x = df['spectral']

x

In [None]:
# Label matrix: every column grouped under the top-level 'classes' label of df,
# cast to integers. astype(int) is the vectorised replacement for
# applymap(int), which is deprecated in recent pandas releases.
y = df["classes"].astype(int)
y


In [None]:
# One fixed seed for both the split and the forest, so that a fresh
# Restart & Run All reproduces the same model and the same scores
RANDOM_SEED = 42

# NOTE(review): rows are individual slices, so slices cut from one recording can
# land in both the training and the test set. Consider a group-aware split
# keyed on the filename if the test score should reflect unseen recordings.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=RANDOM_SEED
)

classifier = RandomForestClassifier(
    n_estimators=600,
    max_depth=25,
    min_samples_leaf=3,
    n_jobs=-1,  # use every available core
    random_state=RANDOM_SEED,
)

classifier.fit(x_train, y_train)

# Predictions for the held-out rows, in the same row order as x_test
y_pred = classifier.predict(x_test)



In [None]:
# Show the held-out labels returned by train_test_split
y_test

In [None]:
# Wrap the raw prediction array in a frame that reuses the row index and the
# column labels of y_test, so the two line up cell by cell
prediction_df = pd.DataFrame(
    data=y_pred,
    index=y_test.index,
    columns=y_test.columns,
)

prediction_df

In [None]:
# True wherever the predicted label matches the held-out label
accuracy_df = prediction_df == y_test

# Per-column accuracy: the mean of a boolean column is the fraction of correct
# rows, whatever the size of the test set (no hard-coded row count needed)
accuracy_df.mean()

In [None]:
import pickle

# Persist the fitted classifier to disk.
# Pickle files execute code when loaded: only unpickle this file from a trusted source.
model_path = "./backend/trainedRF.pkl"
with open(model_path, "wb") as model_file:
    pickle.dump(classifier, model_file)