## General Imports for the scripts

In [217]:
import os
import pandas as pd
import numpy as np
import librosa
import plotly.express as px
from datetime import datetime
from maad import sound, features, util

## Utility functions

In [218]:
def _gps_for_folder(gps_data, folder_num):
    """Return (latitude, longitude, altitude) for a folder number, None where unknown."""
    if folder_num is None or 'folder' not in gps_data.columns:
        return None, None, None

    rows = gps_data.loc[gps_data['folder'] == folder_num]
    return tuple(
        rows[col].values[0] if col in rows.columns and len(rows) > 0 else None
        for col in ('latitude', 'longitude', 'altitude')
    )


def build_audio_dataframe(root_folder):
    """
    Recursively scan audio folders and build one row per .WAV recording.

    For every recording the function extracts:
    - Temporal metadata from the filename (expected: YYYYMMDD_HHMMSS.WAV)
    - The folder name, whose second '_'-separated token is used as the
      point number to look up coordinates in '<root_folder>/gps_coord.csv'

    Parameters
    ----------
    root_folder : str
        Path to the root folder containing the audio subfolders and the
        'gps_coord.csv' file (columns: folder, latitude, longitude, altitude).

    Returns
    -------
    df : pandas.DataFrame
        Sorted by 'datetime', with columns:
        ['id', 'folder', 'path', 'file', 'datetime', 'year', 'month', 'day',
         'hour', 'minute', 'second', 'latitude', 'longitude', 'altitude'].
        Coordinates are None/NaN when the folder name has no usable point
        number or when the number is absent from the GPS table.
        The frame is empty (no columns) when no valid recording is found.
    """
    gps_data = pd.read_csv(os.path.join(root_folder, 'gps_coord.csv'))

    data = []

    for folder, _, files in os.walk(root_folder):
        # Current folder name (last part of the path)
        folder_name = os.path.basename(folder)

        try:
            folder_num = int(folder_name.split('_')[1])
        except (IndexError, ValueError):
            # Reset explicitly: otherwise the number of the previously visited
            # folder would leak into this one and attach wrong coordinates.
            folder_num = None
            print(f"⚠️ Folder {folder_name} has an invalid format")

        # The coordinates only depend on the folder, so look them up once here
        latitude, longitude, altitude = _gps_for_folder(gps_data, folder_num)

        for file in files:
            if not file.lower().endswith('.wav'):
                continue

            # Example filename: 20251001_170000.WAV
            base = os.path.splitext(file)[0]
            try:
                date_str, time_str = base.split('_')
                dt = datetime.strptime(date_str + time_str, "%Y%m%d%H%M%S")
            except ValueError as e:
                # Wrong number of '_' parts or an unparsable timestamp
                print(f"⚠️ Error with file {file} in {folder_name}: {e}")
                continue

            data.append({
                'id': folder_name + '_' + base,
                'folder': folder_name,
                'path': os.path.join(folder, file),
                'file': file,
                'datetime': dt,
                'year': dt.year,
                'month': dt.month,
                'day': dt.day,
                'hour': dt.hour,
                'minute': dt.minute,
                'second': dt.second,
                'latitude': latitude,
                'longitude': longitude,
                'altitude': altitude,
            })

    # Create DataFrame
    df = pd.DataFrame(data)

    # Sort chronologically
    if not df.empty:
        df.sort_values(by="datetime", inplace=True)
        df.reset_index(drop=True, inplace=True)

    return df


def save_dataframe_csv(df, filename):
    """
    Write a DataFrame to `filename` as CSV (without the index).

    If the target file already exists, the user is prompted on stdin and the
    file is only overwritten after an affirmative answer ('y', 'yes', 'o' or
    'oui', case-insensitive). Any other answer leaves the file untouched.
    """
    already_exists = os.path.exists(filename)

    if already_exists:
        prompt = f"⚠️ The file '{filename}' already exists. Do you want to overwrite it? (y/n): "
        answer = input(prompt).strip().lower()
        confirmed = answer in ('y', 'yes', 'o', 'oui')
        if not confirmed:
            print("❌ Save cancelled.")
            return

    # Either the file is new or the overwrite was confirmed
    df.to_csv(filename, index=False)
    print(f"✅ File saved: {filename}")



## Old data analysis functions (ACI) — deprecated, kept for reference only

In [None]:
## Old homemade function to compute ACI
# 
def calculer_ACI(chemin_wav, n_fft=1024, hop_length=512, custom_computing =False):
    """
    Compute the Acoustic Complexity Index (ACI) of one audio file.

    Parameters
    ----------
    chemin_wav : str
        Path to the audio file.
    n_fft : int
        FFT window size; only used when `custom_computing` is True.
    hop_length : int
        Hop between STFT frames; only used when `custom_computing` is True.
    custom_computing : bool
        If True, use the homemade librosa/numpy computation; otherwise use
        scikit-maad (default).

    Returns
    -------
    ACI : float
        The summed ACI value for the whole file.
    """
    if custom_computing:
        y, sr = librosa.load(chemin_wav, sr=None)
        S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
        # Absolute amplitude change between consecutive frames, per frequency bin
        diff = np.abs(np.diff(S))
        # The small epsilon avoids a division by zero on silent bins
        aci_freq = np.sum(diff, axis=1) / (np.sum(S, axis=1) + 1e-10)
        return np.sum(aci_freq)

    # The file imports `sound` and `features` directly from maad, so the bare
    # name `maad` is not bound: go through the imported submodules.
    s, fs = sound.load(chemin_wav)
    Sxx, tn, fn, ext = sound.spectrogram(s, fs, mode='amplitude')
    _, _, ACI = features.acoustic_complexity_index(Sxx)
    return ACI


## Old homemade function to compute ACI dataframe
#    
def ACI_dataframe(df):
    """
    Compute the ACI for every audio file listed in the DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an 'id' column and the audio file paths in a 'path'
        column (or in the legacy 'chemin' column).

    Returns
    -------
    res : pandas.DataFrame
        One row per input row with columns ['id', 'ACI']. 'ACI' is None/NaN
        for files whose computation failed.

    Raises
    ------
    KeyError
        If the DataFrame has neither a 'path' nor a 'chemin' column.
    """
    # 'path' is the column produced by build_audio_dataframe; 'chemin' is the
    # older French name, still accepted for previously built frames.
    if 'path' in df.columns:
        path_col = 'path'
    elif 'chemin' in df.columns:
        path_col = 'chemin'
    else:
        raise KeyError("ACI_dataframe expects a 'path' (or legacy 'chemin') column")

    aci_values = []
    for id_, chemin in zip(df['id'], df[path_col]):
        print(f"Calcul ACI pour {chemin}...")
        try:
            aci = calculer_ACI(chemin)
        except Exception as e:
            # Best effort: keep processing the remaining files
            print(f"⚠️ Erreur de calcul ACI pour {chemin} : {e}")
            aci = None

        # Always record the id so failed files stay identifiable after a merge
        aci_values.append({'id': id_, 'ACI': aci})

    return pd.DataFrame(aci_values, columns=['id', 'ACI'])


## Data Analysis function to compute bioacoustic indices

In [219]:
def _compute_indices_for_file(path, indices):
    """Return {index name: value} for the supported indices requested for one file."""
    s, fs = sound.load(path)
    Sxx, tn, fn, ext = sound.spectrogram(s, fs, mode='amplitude')

    values = {}

    if 'ACI' in indices:
        _, _, values['ACI'] = features.acoustic_complexity_index(Sxx)

    if 'ADI' in indices:
        values['ADI'] = features.acoustic_diversity_index(Sxx, fn)

    if 'NDSI' in indices:
        # NOTE(review): scikit-maad documents soundscape_index (and
        # avg_power_spectro below) as taking a *power* spectrogram, whereas
        # Sxx is computed with mode='amplitude' — confirm this is intended.
        values['NDSI'], _, _, _ = features.soundscape_index(Sxx, fn)

    if 'BI' in indices:
        values['BI'] = features.bioacoustics_index(Sxx, fn)

    if 'H' in indices:
        # Only needed for the entropy, so it is not computed for other requests
        S_power = sound.avg_power_spectro(Sxx)
        _, _, Hf_pairedShannon, _, _ = features.more_entropy(S_power, order=3)
        values['H'] = Hf_pairedShannon

    return values


def calculate_bioacoustic_indices(df, indices=None):
    """
    Calculates bioacoustic indices for each audio file in the DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame containing an 'id' column and the audio file paths in 'path'.
    indices : list, optional
        Indices to compute among 'ACI', 'ADI', 'NDSI', 'BI' and 'H'.
        Default is ['ACI']. Duplicates are ignored; names that are not
        supported yield a column filled with None.

    Returns
    -------
    df_indices : pandas.DataFrame
        One row per input row, with one column per requested index followed
        by 'id'. If anything fails for a file, all of its index values are
        None so that a partially computed row never misaligns the columns.
    """
    # Drop duplicates while keeping the requested order
    requested = list(dict.fromkeys(['ACI'] if indices is None else indices))

    index_values = {index: [] for index in requested}
    index_values['id'] = []

    for id_, path in zip(df['id'], df['path']):
        try:
            values = _compute_indices_for_file(path, requested)
        except Exception as e:
            # Best effort: report the file and keep going with the others
            print(f"⚠️ Error computing indices for {path}: {e}")
            values = {}

        # Exactly one append per column and per file keeps all lists aligned
        index_values['id'].append(id_)
        for index in requested:
            index_values[index].append(values.get(index))

    df_indices = pd.DataFrame(index_values)
    return df_indices


## Dataframe construction (Reading raw files + computing indices)

In [226]:
save_file = False  # Set to True to save results to CSV (asks before overwriting an existing file)
read_file = True   # Set to True to read the existing CSV file instead of recomputing

DATA_PATH = r"./Ecoacoustics_Longitudinal_Altitude_Project"
bioac_indices = ['ACI', 'ADI', 'NDSI', 'BI', 'H']

if read_file:
    # Alternative results file: 'bioacoustic_indices_results.csv'
    # Bind df_res in this branch too: the save step below and the later cells
    # all work on df_res, which was previously left undefined when reading.
    df_res = pd.read_csv('bioacoustic_indices_results_test.csv')
    # Alias kept because a later cell still reads `df_detec`.
    df_detec = df_res
else:
    # Scan the raw recordings, compute the indices, then join them on the recording id
    df_audio = build_audio_dataframe(DATA_PATH)
    df_bioac_indices = calculate_bioacoustic_indices(df_audio, indices=bioac_indices)
    df_res = pd.merge(df_audio, df_bioac_indices, on='id', how='left')

if save_file:
    save_dataframe_csv(df_res, 'bioacoustic_indices_results_test.csv')



In [222]:
# Manual save of the merged results; asks for confirmation if the CSV already exists
save_dataframe_csv(df_res, 'bioacoustic_indices_results_test.csv')

✅ File saved: bioacoustic_indices_results_test.csv


## Correction of values (gps, dates, altitude) if needed 

### With hardcoded values

In [99]:
# Corrected altitude for each recording folder (folder name -> altitude)
corr_dict = {
    'Point_1': 1250,
    'Point_2': 1250,
    'Point_3': 1250,
    'Point_4': 1200,
    'Point_7': 1200,
    'Point_10': 1200,
    'Point_5': 1150,
    'Point_8': 1150,
    'Point_11': 1150,
    'Point_6': 1100,
    'Point_9': 1100,
    'Point_12': 1100,
}

# Same corrections in tabular form
corrections_alt = pd.DataFrame(list(corr_dict.items()), columns=['folder', 'alt_corr'])

# Work on a copy so df_res keeps its original altitudes
df_test = df_res.copy()
df_test['altitude'] = df_test['folder'].map(corr_dict)

# Discard the 'Point_1_test' folder and renumber the remaining rows
rows_to_keep = df_test['folder'] != 'Point_1_test'
df_test = df_test.loc[rows_to_keep].reset_index(drop=True)

array([1100., 1250., 1200., 1150.])

### With values from a file

In [211]:

# --- Load the GPS coordinates table ---
gps_coord = pd.read_csv("./Ecoacoustics_Longitudinal_Altitude_Project/gps_coord.csv")

# The point number extracted from the folder name is a string,
# so the lookup keys are converted to strings as well
gps_coord['folder'] = gps_coord['folder'].astype(str)

# --- One {point number: value} lookup per coordinate ---
alt_dict, lat_dict, lon_dict = (
    dict(zip(gps_coord['folder'], gps_coord[column]))
    for column in ('altitude', 'latitude', 'longitude')
)

df_test = df_detec.copy()

# Temporary key: the digits following "Point_" in the folder name
df_test['point_num'] = df_test['folder'].str.extract(r'Point_(\d+)')

# --- Overwrite the coordinates with the values from the GPS file ---
for column, lookup in (('altitude', alt_dict), ('latitude', lat_dict), ('longitude', lon_dict)):
    df_test[column] = df_test['point_num'].map(lookup)


def _recording_datetime(file_name):
    """Parse the 'YYYYMMDD_HHMMSS' prefix of a file name into a datetime."""
    parts = file_name.split('_')
    return datetime.strptime(parts[0] + parts[1].split('.')[0], "%Y%m%d%H%M%S")


df_test["date"] = df_test["file_name"].apply(_recording_datetime)

# --- The temporary key is no longer needed ---
del df_test['point_num']

print("✅ Coordinates mapped successfully using the gps file.")


✅ Coordonnées corrigées selon gps_coord.csv


In [None]:
# Save the frame with corrected coordinates; asks for confirmation if the CSV already exists
save_dataframe_csv(df_test, 'birdnet_detections_with_gps.csv')

✅ Fichier sauvegardé : birdnet_detections_with_gps.csv


## Dataframe manipulations and shaping

In [133]:
# Sort the DataFrame by several columns: the first column is compared first,
# then the second, and so on. The names match the columns produced by
# build_audio_dataframe ('folder', 'month', 'day', 'hour', 'minute').
df_res.sort_values(by=['folder', 'month', 'day', 'hour', 'minute'], inplace=True)

# Reset the index of the original DataFrame (no new copy) so that it goes
# from 0 to len(df_res) - 1 after sorting
df_res.reset_index(drop=True, inplace=True)

# Filtered copy of df_res without the rows of the 'Point_1_test' folder,
# with its own clean sequential index
df_res_test = df_res[df_res['folder'] != 'Point_1_test'].reset_index(drop=True)

# Reshape from wide to long format: one row per (recording, index) pair.
# Only the index columns actually present are melted, so a results file
# computed with a subset of the indices does not raise a KeyError.
index_columns = [col for col in bioac_indices if col in df_res.columns]
df_long = pd.melt(df_res,
                  id_vars=[col for col in df_res.columns if col not in index_columns],
                  value_vars=index_columns,
                  var_name="index",
                  value_name="value")


## Compute metrics on data

In [None]:
# Example: median of an index within each (folder, day, hour, altitude) group
group_keys = ['folder', 'day', 'hour', 'altitude']
grouped_recordings = df_res.groupby(group_keys, as_index=False)

df_median_ACI = grouped_recordings['ACI'].median()
df_median_NDSI = grouped_recordings['NDSI'].median()


## Plotting data

### With the long-format DataFrame, to compare the indices with one another

In [None]:

# One line per altitude, one facet row per bioacoustic index
line_options = dict(
    x='datetime',
    y='value',
    color='altitude',
    facet_row='index',
    log_y=False,
)
fig = px.line(df_long, **line_options)

# Unlink the y axes so that every facet row uses its own range
fig.update_yaxes(matches=None)
fig.show()

### Focusing on one specific index

In [208]:
indice = 'ACI'  # choose a specific index: ACI, ADI, NDSI, BI or H

fig = px.scatter(
    df_res,
    x='datetime',
    y=indice,
    color='folder',        # one colour per recording folder (df_res has no 'dossier' column)
    # hover_data=['latitude', 'longitude'],
    title=f'Evolution de l\'indice {indice} en fonction du temps',
    log_y=False,
    facet_col='altitude',  # one panel per altitude
    range_y=[295, 320]     # fixed y range, hard-coded alongside indice = 'ACI'; adjust for other indices
)
fig.show()

### Specific metrics (median, mean)

In [55]:
# Index to display. The plotted frame is df_median_ACI, which only holds the
# ACI medians; switch the frame as well to plot another index.
indice = 'ACI'

fig = px.line(
    df_median_ACI,
    x='hour',              # df_median_ACI is grouped by 'folder', 'day', 'hour', 'altitude'
    y=indice,
    color='folder',
    title=f'Evolution de l\'indice {indice} en fonction du temps',
    log_y=False,
    facet_col='altitude',  # one panel per altitude
)
fig.show()

### Data normalization to be able to plot every index on the same graph

In [60]:
## Normalization

from sklearn.preprocessing import MinMaxScaler

# Index columns rescaled with MinMaxScaler (default settings)
cols_to_norm = ['ACI', 'ADI', 'NDSI', 'BI', 'H']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df_res[cols_to_norm])

# df_res is left untouched; only the copy receives the rescaled columns
df_norm = df_res.copy()
df_norm[cols_to_norm] = scaled_values


In [None]:
# Scatter of every index value against the hour of day, one panel per
# (folder, altitude) pair. The column names match df_long: 'hour' and
# 'folder' come from df_res, 'index' and 'value' from pd.melt.
# NOTE(review): df_long holds the raw (non-normalized) values; melt df_norm
# instead if the indices are meant to share a common scale here.
fig = px.scatter(
    df_long,
    x='hour',
    y='value',
    color='index',
    log_y=False,
    facet_col='altitude',
    facet_row='folder',
    )
fig.show()