<a href="https://colab.research.google.com/github/DataBytes-Organisation/Project-Echo/blob/ee%2Fjaje%2Fonboarding/Project_echo_ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ensemble Learning

## Importing Libraries

In [None]:
import warnings
warnings.filterwarnings('ignore')
import os
import pathlib
import glob
import subprocess
import threading
import time
from IPython.display import clear_output
import matplotlib.pyplot as plt



In [None]:
from google.colab import drive

# Mount your Google Drive account
drive.mount('/content/drive')

# Update your input parameters
source_folder = "/content/drive/MyDrive/Test dataset"
output_file_type = ".wav"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# input parameters
source_folder = "/content/drive/MyDrive/Test dataset"

# options are currently ".mp3" or ".wav" (there is scope to add more)
output_file_type = ".wav"

## Generate list of all folders in the source path

In [None]:
# Lazily walk everything below source_folder; the generator is consumed by the
# directory filter on the next line.
sub_folders = pathlib.Path(source_folder).glob("**/*")
# Keep only the directories (at any depth): each one later gets its own worker thread.
sub_folder_paths = list(filter(lambda entry: entry.is_dir(), sub_folders))
print(len(sub_folder_paths))
sub_folder_paths[:4]

96


[PosixPath('/content/drive/MyDrive/Test dataset/Uperoleia altissima'),
 PosixPath('/content/drive/MyDrive/Test dataset/Vulpes vulpes'),
 PosixPath('/content/drive/MyDrive/Test dataset/Strepera versicolor'),
 PosixPath('/content/drive/MyDrive/Test dataset/Uperoleia mimula')]

## Generate list of all audio file paths in the sub-folder

In [None]:
# this creates a new version of the input file, converted to the output format
def convert_file(input_file_path):
    """Convert one media file to ``output_file_type`` using ffmpeg.

    The converted file is written next to the input file, using the input's
    stem plus the module-level ``output_file_type`` as its name. An existing
    file at that location is overwritten (ffmpeg ``-y`` flag).

    Args:
        input_file_path: Path (``str`` or pathlib path) of the file to convert.

    Returns:
        ``True`` when ffmpeg exits with status 0, ``False`` otherwise. Callers
        that ignore the return value behave exactly as before.
    """
    source_path = pathlib.Path(input_file_path)
    output_file_path = source_path.parent / (source_path.stem + output_file_type)
    # ffmpeg's console output is discarded; its exit status is the only signal
    # of success, so it is surfaced to the caller instead of being dropped.
    exit_code = subprocess.call(
        ['ffmpeg', '-y', '-i', str(source_path), str(output_file_path)],
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
    return exit_code == 0

In [None]:
# check each file type and convert it if necessary
# it creates a new version of the file, then deletes the old one
# it skips over the file if it is already in the output format

def check_file_type(audio_file_path):
    """Convert one file to ``output_file_type`` and remove the original.

    Files that already carry the target suffix are skipped. Files whose suffix
    is not a supported audio/video type are left untouched. The original file
    is deleted only after the converted file is confirmed to exist and be
    non-empty, so a failed ffmpeg run never destroys the source recording.

    Args:
        audio_file_path: Path (``str`` or pathlib path) of the file to check.
    """
    source_path = pathlib.Path(audio_file_path)
    suffix = source_path.suffix

    # already in the requested format: nothing to do
    if suffix == output_file_type:
        return

    # audio types first, then video containers that carry an audio track
    convertible_suffixes = {".wav", ".mp3", ".flac", ".ogg", ".m4a", ".mp4"}
    if suffix not in convertible_suffixes:
        # Unknown file type: it was never converted, so it must not be deleted.
        return

    convert_file(audio_file_path)

    # Same naming rule convert_file uses for its output.
    converted_path = source_path.parent / (source_path.stem + output_file_type)
    if converted_path.is_file() and converted_path.stat().st_size > 0:
        # delete the old file only once its replacement exists
        os.remove(audio_file_path)
    else:
        print("conversion failed, keeping original: {}".format(audio_file_path))
    return

In [None]:
# this function is called by each thread
# it takes a subfolder within the input directory and iterates through every file within it
# to convert to output format

def threaded_function(sub_folder_path):
    """Convert every file below one sub-folder; used as a thread target.

    Lists all regular files under ``sub_folder_path`` (recursively), passes
    each one to ``check_file_type``, then prints a completion message, sleeps
    for one second and clears the notebook cell output.

    Args:
        sub_folder_path: pathlib path of the folder to process; its ``.stem``
            is used in the progress message.
    """
    folder_entries = pathlib.Path(sub_folder_path).glob("**/*")
    # The listing is fully built before any file is processed.
    files_to_check = [entry for entry in folder_entries if entry.is_file()]
    for audio_file in files_to_check:
        check_file_type(audio_file)
    print("folder {} done".format(sub_folder_path.stem))
    time.sleep(1)
    clear_output()
    return

In [None]:
# create a list of threads
# the list is currently set to the number of folders

def create_threads(sub_folder_paths):
    """Create (but do not start) one conversion thread per sub-folder.

    Args:
        sub_folder_paths: Iterable of folder paths; each is handed to
            ``threaded_function`` as its single argument.

    Returns:
        A list of unstarted ``threading.Thread`` objects, in the same order as
        ``sub_folder_paths``.
    """
    # One thread per folder. The previous code computed a cap of 10 threads and
    # then immediately overwrote it with the folder count, so the cap was dead
    # code. Capping the count here would skip folders, because each thread
    # handles exactly one folder.
    return [
        threading.Thread(target=threaded_function, args=(sub_folder_path,))
        for sub_folder_path in sub_folder_paths
    ]

## Main function that converts all files to the specified format

In [None]:
# create a thread for every folder
# Build one worker per sub-folder.
thread_list = create_threads(sub_folder_paths)
# Kick off every worker so the folders are converted concurrently.
for worker in thread_list:
    worker.start()
# Block until each worker has finished.
for worker in thread_list:
    worker.join()
print("done all")
print("Conversion of {}\n to format \"{}\" complete!".format(source_folder, output_file_type))

done all
Conversion of /content/drive/MyDrive/Test dataset
 to format ".wav" complete!


In [None]:
import os
import librosa

def check_sampling_rate(directory, expected_sr=16000):
    """Report which ``.wav`` files under ``directory`` have an unexpected sampling rate.

    The directory is searched recursively. The previous implementation used
    ``os.listdir`` and so looked only at the top level; when the recordings sit
    in sub-folders it checked nothing and still reported success.

    Args:
        directory: Root folder to search.
        expected_sr: Sampling rate in Hz that every file should have
            (defaults to 16000).

    Returns:
        None. Results are printed: one line per mismatching file, followed by a
        summary line. A separate message is printed when no ``.wav`` file was
        found, so an empty search is never reported as "all correct".
    """
    checked_files = 0
    mismatched_files = 0
    for root, _, files in os.walk(directory):
        for file_name in files:
            if not file_name.endswith('.wav'):  # only WAV files are inspected
                continue
            file_path = os.path.join(root, file_name)
            sr = librosa.get_samplerate(file_path)
            checked_files += 1
            if sr != expected_sr:
                # Full path is printed because file names can repeat across sub-folders.
                print(f"File {file_path} has incorrect sampling rate: {sr} Hz")
                mismatched_files += 1

    if checked_files == 0:
        print(f"No .wav files found under {directory}")
    elif mismatched_files == 0:
        print(f"All audio files have correct sampling rate ({expected_sr / 1000:g} kHz)")
    else:
        print("Some audio files have incorrect sampling rate")

# Example usage
directory_path = "/content/drive/MyDrive/Test dataset"
check_sampling_rate(directory_path)


All audio files have correct sampling rate (16 kHz)


## EDA

In [None]:
import os

def count_audio_files(folder_path):
    """Count audio files under ``folder_path``, including all sub-folders.

    A file counts as audio when its name ends with ``.mp3``, ``.flac`` or
    ``.wav`` (case-sensitive).

    Args:
        folder_path: Root folder to walk.

    Returns:
        Total number of matching files as an ``int``.
    """
    recognised_suffixes = ('.mp3', '.flac', '.wav')
    total = 0
    for _dirpath, _dirnames, filenames in os.walk(folder_path):
        for name in filenames:
            # str.endswith accepts a tuple and matches any of its entries
            if name.endswith(recognised_suffixes):
                total += 1
    return total

# Specified source folder with subfolders
source_folder = "/content/drive/MyDrive/Test dataset"

# Counting audio files in the source folder and its subfolders
num_audio_files = count_audio_files(source_folder)

print(f"Number of audio files in the source folder and its subfolders: {num_audio_files}")



Number of audio files in the source folder and its subfolders: 5119


In [None]:
def print_audio_files(sub_folder_path):
    """Print how many ``.wav`` files exist under ``sub_folder_path`` (recursively).

    Args:
        sub_folder_path: Folder to search; it is echoed verbatim in the message.
    """
    wav_paths = list(pathlib.Path(sub_folder_path).glob("**/*.wav"))
    print(f"Number of .wav files in {sub_folder_path}: {len(wav_paths)}")

# Modify the threaded_function to call the print_audio_files function
def threaded_function(sub_folder_path):
    """Convert every file below one sub-folder, then report its ``.wav`` count.

    Intended as a thread target. Each regular file found under
    ``sub_folder_path`` (recursively) is passed to ``check_file_type``. A
    completion message is then printed, the cell output is cleared after a
    one-second pause, and ``print_audio_files`` reports the folder's final
    ``.wav`` count.

    Args:
        sub_folder_path: Folder to process (``str`` or pathlib path).
    """
    folder = pathlib.Path(sub_folder_path)
    # glob() returns a generator, which supports neither len() nor indexing, so
    # the listing is materialised first. Directories are filtered out because
    # check_file_type operates on files only.
    audio_file_paths = [entry for entry in folder.glob("**/*") if entry.is_file()]
    for audio_file_path in audio_file_paths:
        check_file_type(audio_file_path)
    print("Folder {} done".format(folder.stem))
    time.sleep(1)
    clear_output()
    print_audio_files(sub_folder_path)  # Print the audio file count for the folder
    return

# Wait for each folder's worker thread (join() returns immediately if the
# thread has already finished), then print that folder's .wav count.
for worker_thread, worker_folder in zip(thread_list, sub_folder_paths):
    worker_thread.join()
    print_audio_files(worker_folder)

print("Done all")
print("Conversion of {}\n to format \"{}\" complete!".format(source_folder, output_file_type))


Number of .wav files in /content/drive/MyDrive/Test dataset/Uperoleia altissima: 67
Number of .wav files in /content/drive/MyDrive/Test dataset/Vulpes vulpes: 103
Number of .wav files in /content/drive/MyDrive/Test dataset/Strepera versicolor: 18
Number of .wav files in /content/drive/MyDrive/Test dataset/Uperoleia mimula: 47
Number of .wav files in /content/drive/MyDrive/Test dataset/Symposiachrus trivirgatus: 14
Number of .wav files in /content/drive/MyDrive/Test dataset/Tregellasia capito: 17
Number of .wav files in /content/drive/MyDrive/Test dataset/Vanellus miles: 58
Number of .wav files in /content/drive/MyDrive/Test dataset/Sus Scrofa: 40
Number of .wav files in /content/drive/MyDrive/Test dataset/Trichosurus vulpecula: 7
Number of .wav files in /content/drive/MyDrive/Test dataset/Spilopelia chinensis: 7
Number of .wav files in /content/drive/MyDrive/Test dataset/Rattus Norvegicus: 102
Number of .wav files in /content/drive/MyDrive/Test dataset/Scythrops novaehollandiae: 7
Numb

## Making a dataframe for data manipulation

In [None]:
import os
import librosa
import pandas as pd
from pathlib import Path

def load_wav_files_into_dataframe(folder_path):
    """Load basic metadata for every ``.wav`` file under ``folder_path``.

    The folder is walked recursively. Files that fail to load are reported on
    stdout and skipped.

    Args:
        folder_path: Root folder to search.

    Returns:
        A DataFrame with the columns ``file_path``, ``filename``,
        ``sampling_rate`` and ``duration`` (one row per loaded file). The
        columns are present even when no file was loaded, so downstream column
        access does not fail on an empty result.
    """
    columns = ['file_path', 'filename', 'sampling_rate', 'duration']
    data = []
    for root, _, files in os.walk(folder_path):
        for filename in files:
            if not filename.endswith('.wav'):
                continue
            file_path = os.path.join(root, filename)
            try:
                # sr=None keeps the file's native sampling rate
                audio, sr = librosa.load(file_path, sr=None)
                duration = librosa.get_duration(y=audio, sr=sr)
            except Exception as e:
                # Best effort: report the unreadable file and carry on.
                print(f"Error loading {filename}: {str(e)}")
                continue
            data.append({'file_path': file_path, 'filename': filename, 'sampling_rate': sr, 'duration': duration})

    df = pd.DataFrame(data, columns=columns)
    return df

# Load ".wav" files into a DataFrame
source_folder = "/content/drive/MyDrive/Test dataset"
df = load_wav_files_into_dataframe(source_folder)

# Display the DataFrame
print(df)


                                              file_path  \
0     /content/drive/MyDrive/Test dataset/Uperoleia ...   
1     /content/drive/MyDrive/Test dataset/Uperoleia ...   
2     /content/drive/MyDrive/Test dataset/Uperoleia ...   
3     /content/drive/MyDrive/Test dataset/Uperoleia ...   
4     /content/drive/MyDrive/Test dataset/Uperoleia ...   
...                                                 ...   
5114  /content/drive/MyDrive/Test dataset/Acanthiza ...   
5115  /content/drive/MyDrive/Test dataset/Acanthiza ...   
5116  /content/drive/MyDrive/Test dataset/Acanthiza ...   
5117  /content/drive/MyDrive/Test dataset/Acanthiza ...   
5118  /content/drive/MyDrive/Test dataset/Acanthiza ...   

                        filename  sampling_rate  duration  
0         region_1.600-2.950.wav          44100  1.398617  
1     region_101.650-103.650.wav          44100  2.051678  
2     region_105.650-107.650.wav          44100  2.051678  
3     region_103.650-105.650.wav          44100  2.

In [None]:
import os
import librosa
import pandas as pd
from pathlib import Path

def extract_species_name(file_path):
    """
    Return the name of the folder that directly contains the audio file.

    Args:
    - file_path: Path to the audio file.

    Returns:
    - The second-to-last component of the path, used as the species name.
    """
    path_components = Path(file_path).parts
    # [-1] is the file name itself, [-2] is its parent folder
    return path_components[-2]

def load_wav_files_into_dataframe(folder_path):
    """
    Load WAV files into a DataFrame, including the species name of each file.

    The folder is walked recursively. Files that fail to load are reported on
    stdout and skipped.

    Args:
    - folder_path: Path to the folder containing audio files.

    Returns:
    - DataFrame with the columns file_path, species_name, filename,
      sampling_rate and duration (one row per loaded file). The columns are
      present even when no file was loaded, so downstream column access does
      not fail on an empty result.
    """
    columns = ['file_path', 'species_name', 'filename', 'sampling_rate', 'duration']
    data = []
    for root, _, files in os.walk(folder_path):
        for filename in files:
            if not filename.endswith('.wav'):
                continue
            file_path = os.path.join(root, filename)
            try:
                # Load audio file at its native sampling rate
                audio, sr = librosa.load(file_path, sr=None)
                duration = librosa.get_duration(y=audio, sr=sr)
                # Extract species name from the file path
                species_name = extract_species_name(file_path)
            except Exception as e:
                # Best effort: report the unreadable file and carry on.
                print(f"Error loading {filename}: {str(e)}")
                continue
            data.append({
                'file_path': file_path,
                'species_name': species_name,
                'filename': filename,
                'sampling_rate': sr,
                'duration': duration
            })

    df = pd.DataFrame(data, columns=columns)
    return df

# Load ".wav" files into a DataFrame
source_folder = "/content/drive/MyDrive/Test dataset"
df = load_wav_files_into_dataframe(source_folder)

# Display the DataFrame
print(df)


                                              file_path         species_name  \
0     /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
1     /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
2     /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
3     /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
4     /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
...                                                 ...                  ...   
5114  /content/drive/MyDrive/Test dataset/Acanthiza ...    Acanthiza pusilla   
5115  /content/drive/MyDrive/Test dataset/Acanthiza ...    Acanthiza pusilla   
5116  /content/drive/MyDrive/Test dataset/Acanthiza ...    Acanthiza pusilla   
5117  /content/drive/MyDrive/Test dataset/Acanthiza ...    Acanthiza pusilla   
5118  /content/drive/MyDrive/Test dataset/Acanthiza ...    Acanthiza pusilla   

                        filename  sampl

In [None]:
df

Unnamed: 0,file_path,species_name,filename,sampling_rate,duration
0,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_1.600-2.950.wav,44100,1.398617
1,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_101.650-103.650.wav,44100,2.051678
2,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_105.650-107.650.wav,44100,2.051678
3,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_103.650-105.650.wav,44100,2.051678
4,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_11.700-13.700.wav,44100,2.051678
...,...,...,...,...,...
5114,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_9.350-11.350.wav,44100,2.051678
5115,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_9.500-11.500.wav,44100,2.051678
5116,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_95.900-97.900.wav,48000,2.028979
5117,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_96.000-98.000.wav,48000,2.028979


## Pitch Extraction from Audio Files using Librosa

**Code defines a function extract_min_max_pitch that takes an audio time series y and its sampling rate sr as input and returns the minimum and maximum pitch frequencies detected in the audio, excluding zero values. It uses the librosa library for audio processing.**

In [None]:
import librosa
import numpy as np
import pandas as pd

def extract_min_max_pitch(y, sr):
    """
    Find the lowest and highest non-zero pitch estimate in an audio signal.

    Pitch candidates come from ``librosa.piptrack``; entries equal to zero are
    ignored.

    NOTE(review): piptrack is called with its default frequency limits, and the
    recorded outputs cluster near fixed lower/upper values. The results may be
    bounded by those defaults rather than by the recordings - confirm.

    Args:
    - y: Audio time series.
    - sr: Sampling rate of y.

    Returns:
    - (min_pitch, max_pitch): the smallest and largest non-zero pitch values,
      or (0, 0) when no non-zero pitch is present.
    """
    pitch_grid, _magnitudes = librosa.piptrack(y=y, sr=sr)
    # Boolean-mask indexing already yields a 1-D array of the positive entries
    voiced = pitch_grid[pitch_grid > 0]

    if voiced.size == 0:  # nothing detected
        return (0, 0)

    return (np.min(voiced), np.max(voiced))

# Per-file pitch extremes, collected in DataFrame row order
min_pitches, max_pitches = [], []

# Visit every recording listed in df
for audio_path in df['file_path']:
    # Load at the file's native sampling rate
    signal, rate = librosa.load(audio_path, sr=None)

    lowest, highest = extract_min_max_pitch(signal, rate)

    min_pitches.append(lowest)
    max_pitches.append(highest)

# Attach the results to the DataFrame as two new columns
df['min_pitch'] = min_pitches
df['max_pitch'] = max_pitches

# Show the first rows to check the new columns
print(df.head())


                                           file_path         species_name  \
0  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
1  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
2  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
3  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
4  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   

                     filename  sampling_rate  duration   min_pitch  \
0      region_1.600-2.950.wav          44100  1.398617  145.629547   
1  region_101.650-103.650.wav          44100  2.051678  251.557739   
2  region_105.650-107.650.wav          44100  2.051678  217.134094   
3  region_103.650-105.650.wav          44100  2.051678  180.630020   
4    region_11.700-13.700.wav          44100  2.051678  145.699783   

     max_pitch  
0  3973.529297  
1  3992.867676  
2  3992.621338  
3  3988.047852  
4  3988.889893  


## Mean Spectral Bandwidth Calculation

**The code calculates the mean spectral bandwidth of audio files stored in a DataFrame and filters them based on a threshold for voice recognition. It adds a new column with mean spectral bandwidth values, filters files within a specific bandwidth range, and prints the count of valid files along with the total number of files. This process helps identify suitable audio files for voice recognition based on spectral characteristics.**

In [None]:
def calculate_mean_spectral_bandwidth(file_path, sr=16000):
    """
    Compute the average spectral bandwidth of one audio file.

    Args:
    - file_path: Path to the audio file.
    - sr: Sampling rate passed to librosa.load (default 16000).

    Returns:
    - Mean of librosa's spectral bandwidth values for the file.
    """
    samples, loaded_sr = librosa.load(file_path, sr=sr)
    bandwidth_values = librosa.feature.spectral_bandwidth(y=samples, sr=loaded_sr)
    return np.mean(bandwidth_values)

# Store each file's mean spectral bandwidth in a new column of df
audio_paths = df['file_path']
df['mean_spectral_bandwidth'] = audio_paths.apply(calculate_mean_spectral_bandwidth)

# Upper limit on mean spectral bandwidth used for voice recognition
max_bw_for_voice_recognition = 4000  # Hz

# Keep only the rows whose mean spectral bandwidth does not exceed the limit
within_limit = df['mean_spectral_bandwidth'] <= max_bw_for_voice_recognition
valid_bw_files_df = df.loc[within_limit]

# Number of files that pass the bandwidth criterion
num_valid_bw_files = valid_bw_files_df.shape[0]

print(f"Number of audio files within the required spectral bandwidth range for voice recognition: {num_valid_bw_files}")
print(f"Total number of audio files: {len(df)}")

Number of audio files within the required spectral bandwidth range for voice recognition: 5119
Total number of audio files: 5119


In [None]:
df

Unnamed: 0,file_path,species_name,filename,sampling_rate,duration,min_pitch,max_pitch,mean_spectral_bandwidth
0,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_1.600-2.950.wav,44100,1.398617,145.629547,3973.529297,1527.923501
1,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_101.650-103.650.wav,44100,2.051678,251.557739,3992.867676,994.649829
2,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_105.650-107.650.wav,44100,2.051678,217.134094,3992.621338,1057.277848
3,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_103.650-105.650.wav,44100,2.051678,180.630020,3988.047852,1102.745193
4,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_11.700-13.700.wav,44100,2.051678,145.699783,3988.889893,1035.548732
...,...,...,...,...,...,...,...,...
5114,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_9.350-11.350.wav,44100,2.051678,149.041901,3524.452881,1892.090204
5115,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_9.500-11.500.wav,44100,2.051678,143.310974,3994.210205,1854.505000
5116,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_95.900-97.900.wav,48000,2.028979,153.909592,3993.556152,1855.511728
5117,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_96.000-98.000.wav,48000,2.028979,153.158005,3987.334961,1810.934309


In [None]:
df_new1 = df.copy()

## RMS Loudness Level Calculation in dBFS

**The code computes the RMS loudness level in dBFS for audio files in a DataFrame (df_new1). It adds a new column with these values and displays the DataFrame with the added column. This enables the analysis of audio loudness levels in decibels relative to full scale.**

In [None]:
def calculate_rms_loudness_dbfs(file_path):
    """
    Calculate the RMS loudness level of an audio file in dBFS.

    The file is loaded at its native sampling rate. Frame-wise RMS values are
    computed from the STFT magnitude and averaged, and the mean is converted to
    decibels relative to an amplitude of 1.0.

    Args:
    - file_path: Path to the audio file.

    Returns:
    - RMS loudness level of the audio file in dBFS.
    """
    y, _sr = librosa.load(file_path, sr=None)
    # Calculate the mean frame-wise RMS value from the magnitude spectrogram
    S, _phase = librosa.magphase(librosa.stft(y))
    rms = librosa.feature.rms(S=S).mean()
    # RMS is an amplitude, so the dB conversion is 20*log10(rms). The previous
    # power_to_db call computed 10*log10(rms), which halves every value.
    # A reference amplitude of 1.0 corresponds to 0 dBFS.
    rms_dbfs = librosa.amplitude_to_db(rms, ref=1.0)
    return rms_dbfs

# Calculate the RMS loudness in dBFS for each audio file and add it as a new column
df_new1['rms_loudness_db'] = df_new1['file_path'].apply(calculate_rms_loudness_dbfs)

# Display the DataFrame with the new 'rms_loudness_db' column
print(df_new1.head())

                                           file_path         species_name  \
0  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
1  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
2  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
3  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   
4  /content/drive/MyDrive/Test dataset/Uperoleia ...  Uperoleia altissima   

                     filename  sampling_rate  duration   min_pitch  \
0      region_1.600-2.950.wav          44100  1.398617  145.629547   
1  region_101.650-103.650.wav          44100  2.051678  251.557739   
2  region_105.650-107.650.wav          44100  2.051678  217.134094   
3  region_103.650-105.650.wav          44100  2.051678  180.630020   
4    region_11.700-13.700.wav          44100  2.051678  145.699783   

     max_pitch  mean_spectral_bandwidth  rms_loudness_db  
0  3973.529297              1527.923501       -20.571272 

In [None]:
df_new1

Unnamed: 0,file_path,species_name,filename,sampling_rate,duration,min_pitch,max_pitch,mean_spectral_bandwidth,rms_loudness_db
0,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_1.600-2.950.wav,44100,1.398617,145.629547,3973.529297,1527.923501,-20.571272
1,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_101.650-103.650.wav,44100,2.051678,251.557739,3992.867676,994.649829,-16.949379
2,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_105.650-107.650.wav,44100,2.051678,217.134094,3992.621338,1057.277848,-17.570379
3,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_103.650-105.650.wav,44100,2.051678,180.630020,3988.047852,1102.745193,-17.456577
4,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_11.700-13.700.wav,44100,2.051678,145.699783,3988.889893,1035.548732,-17.530997
...,...,...,...,...,...,...,...,...,...
5114,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_9.350-11.350.wav,44100,2.051678,149.041901,3524.452881,1892.090204,-20.283746
5115,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_9.500-11.500.wav,44100,2.051678,143.310974,3994.210205,1854.505000,-15.437894
5116,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_95.900-97.900.wav,48000,2.028979,153.909592,3993.556152,1855.511728,-18.074503
5117,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_96.000-98.000.wav,48000,2.028979,153.158005,3987.334961,1810.934309,-17.599400


## Evaluation of RMS Loudness Levels

**The code defines ideal RMS loudness levels for voice recognition and assesses the number of audio files needing amplification (too quiet) or gain reduction (too loud) based on these ideals. It computes the counts and percentages of files requiring adjustment and prints the results. This analysis helps in identifying files that may need volume adjustments to meet optimal loudness criteria for voice recognition.**

In [None]:
# Target RMS loudness window (in dBFS) used here for voice recognition
ideal_min_rms_dbfs = -23  # lower bound of the window
ideal_max_rms_dbfs = -20  # upper bound of the window

# 'rms_loudness_db' is the df_new1 column holding each file's RMS loudness in dBFS
loudness = df_new1['rms_loudness_db']

# Files below the window are too quiet and need amplification
files_needing_amplification = int((loudness < ideal_min_rms_dbfs).sum())

# Files above the window are too loud and need gain reduction
files_needing_gain_reduction = int((loudness > ideal_max_rms_dbfs).sum())

# Report the raw counts
print(f"Number of files needing amplification (too quiet): {files_needing_amplification}")
print(f"Number of files needing gain reduction (too loud): {files_needing_gain_reduction}")

# Every row of df_new1 was evaluated
total_files_evaluated = len(df_new1)

# Express both counts as a share of all evaluated files
percentage_needing_amplification = (files_needing_amplification / total_files_evaluated) * 100
percentage_needing_gain_reduction = (files_needing_gain_reduction / total_files_evaluated) * 100

print(f"Percentage of files needing amplification: {percentage_needing_amplification:.2f}%")
print(f"Percentage of files needing gain reduction: {percentage_needing_gain_reduction:.2f}%")

Number of files needing amplification (too quiet): 251
Number of files needing gain reduction (too loud): 3540
Percentage of files needing amplification: 4.90%
Percentage of files needing gain reduction: 69.15%


In [None]:
df_new1

Unnamed: 0,species_name,filename,sampling_rate,duration,min_pitch,max_pitch,mean_spectral_bandwidth,rms_loudness_db
0,Uperoleia altissima,region_1.600-2.950.wav,44100,1.398617,145.629547,3973.529297,1527.923501,-20.571272
1,Uperoleia altissima,region_101.650-103.650.wav,44100,2.051678,251.557739,3992.867676,994.649829,-16.949379
2,Uperoleia altissima,region_105.650-107.650.wav,44100,2.051678,217.134094,3992.621338,1057.277848,-17.570379
3,Uperoleia altissima,region_103.650-105.650.wav,44100,2.051678,180.630020,3988.047852,1102.745193,-17.456577
4,Uperoleia altissima,region_11.700-13.700.wav,44100,2.051678,145.699783,3988.889893,1035.548732,-17.530997
...,...,...,...,...,...,...,...,...
5114,Acanthiza pusilla,region_9.350-11.350.wav,44100,2.051678,149.041901,3524.452881,1892.090204,-20.283746
5115,Acanthiza pusilla,region_9.500-11.500.wav,44100,2.051678,143.310974,3994.210205,1854.505000,-15.437894
5116,Acanthiza pusilla,region_95.900-97.900.wav,48000,2.028979,153.909592,3993.556152,1855.511728,-18.074503
5117,Acanthiza pusilla,region_96.000-98.000.wav,48000,2.028979,153.158005,3987.334961,1810.934309,-17.599400


In [None]:
# Round every numeric feature column to one decimal place.
for column in ['duration', 'min_pitch', 'max_pitch', 'mean_spectral_bandwidth', 'rms_loudness_db']:
    df_new1[column] = df_new1[column].round(1)

In [None]:
df_new1

Unnamed: 0,file_path,species_name,filename,sampling_rate,duration,min_pitch,max_pitch,mean_spectral_bandwidth,rms_loudness_db
0,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_1.600-2.950.wav,44100,1.4,145.6,3973.5,1527.9,-20.6
1,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_101.650-103.650.wav,44100,2.1,251.6,3992.9,994.6,-16.9
2,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_105.650-107.650.wav,44100,2.1,217.1,3992.6,1057.3,-17.6
3,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_103.650-105.650.wav,44100,2.1,180.6,3988.0,1102.7,-17.5
4,/content/drive/MyDrive/Test dataset/Uperoleia ...,Uperoleia altissima,region_11.700-13.700.wav,44100,2.1,145.7,3988.9,1035.5,-17.5
...,...,...,...,...,...,...,...,...,...
5114,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_9.350-11.350.wav,44100,2.1,149.0,3524.5,1892.1,-20.3
5115,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_9.500-11.500.wav,44100,2.1,143.3,3994.2,1854.5,-15.4
5116,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_95.900-97.900.wav,48000,2.0,153.9,3993.6,1855.5,-18.1
5117,/content/drive/MyDrive/Test dataset/Acanthiza ...,Acanthiza pusilla,region_96.000-98.000.wav,48000,2.0,153.2,3987.3,1810.9,-17.6


In [None]:
missing_counts = df_new1.isna().sum()
for feature, count in missing_counts.items():
    print(f"Feature: {feature}, Number of Missing Entries: {count}")


Feature: file_path, Number of Missing Entries: 0
Feature: species_name, Number of Missing Entries: 0
Feature: filename, Number of Missing Entries: 0
Feature: sampling_rate, Number of Missing Entries: 0
Feature: duration, Number of Missing Entries: 0
Feature: min_pitch, Number of Missing Entries: 0
Feature: max_pitch, Number of Missing Entries: 0
Feature: mean_spectral_bandwidth, Number of Missing Entries: 0
Feature: rms_loudness_db, Number of Missing Entries: 0


In [None]:
from collections import Counter


c = Counter(df_new1['species_name'])

# Print the number of recordings for each species
print(c)


Counter({'Colluricincla harmonica': 607, 'Acanthiza pusilla': 246, 'Litoria inermis': 211, 'Cisticola exilis': 211, 'Barnardius zonarius': 202, 'Cophixalus infacetus': 171, 'Philemon citreogularis': 168, 'Acanthiza reguloides': 157, 'Cincloramphus mathewsi': 147, 'Acanthiza nana': 144, 'Petroica goodenovii': 123, 'Haliastur sphenurus': 116, 'Philemon corniculatus': 109, 'Vulpes vulpes': 103, 'Rattus Norvegicus': 102, 'Acanthorhynchus tenuirostris': 100, 'Melithreptus gularis': 98, 'Accipiter cirrocephalus': 86, 'Carterornis leucotis': 76, 'Cinclosoma punctatum': 71, 'Uperoleia altissima': 67, 'Parvipsitta pusilla': 63, 'Acanthiza uropygialis': 62, 'Manorina melanophrys': 61, 'Carduelis carduelis': 59, 'Vanellus miles': 58, 'Capra Hircus': 54, 'Falco berigora': 48, 'Cormobates leucophaea': 48, 'Uperoleia mimula': 47, 'Plectorhyncha lanceolata': 47, 'Petrochelidon nigricans': 46, 'Eurystomus orientalis': 46, 'Dicaeum hirundinaceum': 45, 'Felis Catus': 44, 'Falco peregrinus': 43, 'Daphoen

In [None]:
df_new1['species_name'].value_counts()

Unnamed: 0_level_0,count
species_name,Unnamed: 1_level_1
Colluricincla harmonica,607
Acanthiza pusilla,246
Litoria inermis,211
Cisticola exilis,211
Barnardius zonarius,202
Cophixalus infacetus,171
Philemon citreogularis,168
Acanthiza reguloides,157
Cincloramphus mathewsi,147
Acanthiza nana,144


In [None]:
# Remove the path and file-name columns, which are not model features.
# Reassigning (instead of inplace=True) together with errors='ignore' makes the
# cell safe to re-run: a second run no longer raises KeyError for columns that
# are already gone.
df_new1 = df_new1.drop(columns=['file_path', 'filename'], errors='ignore')

In [None]:
# Work on an independent copy: a plain assignment would only alias df_new1, so
# later in-place column changes to df_new2 would silently modify df_new1 too.
df_new2 = df_new1.copy()

In [None]:
df_new2

Unnamed: 0,species_name,sampling_rate,duration,min_pitch,max_pitch,mean_spectral_bandwidth,rms_loudness_db
0,Uperoleia altissima,44100,1.4,145.6,3973.5,1527.9,-20.6
1,Uperoleia altissima,44100,2.1,251.6,3992.9,994.6,-16.9
2,Uperoleia altissima,44100,2.1,217.1,3992.6,1057.3,-17.6
3,Uperoleia altissima,44100,2.1,180.6,3988.0,1102.7,-17.5
4,Uperoleia altissima,44100,2.1,145.7,3988.9,1035.5,-17.5
...,...,...,...,...,...,...,...
5114,Acanthiza pusilla,44100,2.1,149.0,3524.5,1892.1,-20.3
5115,Acanthiza pusilla,44100,2.1,143.3,3994.2,1854.5,-15.4
5116,Acanthiza pusilla,48000,2.0,153.9,3993.6,1855.5,-18.1
5117,Acanthiza pusilla,48000,2.0,153.2,3987.3,1810.9,-17.6


In [None]:
from sklearn.preprocessing import LabelEncoder

# Create an instance of LabelEncoder
labelencoder = LabelEncoder()

# Fit and transform the 'species_name' column in the DataFrame
df_new2['species_name'] = labelencoder.fit_transform(df_new2['species_name'])

In [None]:
df_new2['species_name'].value_counts()

Unnamed: 0_level_0,count
species_name,Unnamed: 1_level_1
27,607
3,246
59,211
25,211
13,202
30,171
75,168
4,157
23,147
2,144


In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report

# Features and target variable
X = df_new2.drop('species_name', axis=1)
y = df_new2['species_name']

In [None]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
xgb_mod = xgb.XGBClassifier(booster = 'gbtree', learning_rate = 0.1, max_depth = 19, n_estimators = 100)

# Fit the model
xgb_mod.fit(X_train, y_train)

# Make predictions
y_pred = xgb_mod.predict(X_test)

In [None]:
# Evaluate the model
from sklearn.metrics import classification_report, confusion_matrix

print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[ 1  0  0 ...  0  0  0]
 [ 0  2  0 ...  0  0  0]
 [ 0  0 12 ...  1  0  0]
 ...
 [ 0  0  0 ...  4  0  0]
 [ 0  0  0 ...  0  1  0]
 [ 0  0  0 ...  0  0 25]]
              precision    recall  f1-score   support

           0       0.25      0.14      0.18         7
           1       0.50      0.29      0.36         7
           2       0.48      0.50      0.49        24
           3       0.17      0.24      0.20        46
           4       0.38      0.50      0.43        20
           5       0.00      0.00      0.00        13
           6       0.22      0.08      0.12        25
           7       0.31      0.20      0.24        20
           8       0.00      0.00      0.00         0
           9       0.33      0.29      0.31         7
          10       0.00      0.00      0.00         1
          11       0.50      0.38      0.43         8
          12       0.46      0.86      0.60         7
          13       0.19      0.27      0.22        37
          14       0.00      0.00