<a href="https://colab.research.google.com/github/DataBytes-Organisation/Project-Echo/blob/EE%2Fpd%2Fonboarding/Project_echo_Ensem_Learn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Ensemble Learning

## Importing Libraries

In [17]:
import warnings
warnings.filterwarnings('ignore')
import os
import pathlib
import glob
import subprocess
import threading
import time
from IPython.display import clear_output
import matplotlib.pyplot as plt



In [18]:
from google.colab import drive

# Mount your Google Drive account
drive.mount('drive')

# Update your input parameters
source_folder = "drive/MyDrive/Test dataset"
output_file_type = ".wav"

Drive already mounted at drive; to attempt to forcibly remount, call drive.mount("drive", force_remount=True).


## Generate list of all folders in the source path

In [19]:
sub_folders = pathlib.Path(source_folder).glob("**/*") #generator object
sub_folders
sub_folder_paths = [x for x in sub_folders if x.is_dir()]
print(len(sub_folder_paths))
sub_folder_paths[:4]

96


[PosixPath('drive/MyDrive/Test dataset/Pezoporus wallicus'),
 PosixPath('drive/MyDrive/Test dataset/Corvus mellori'),
 PosixPath('drive/MyDrive/Test dataset/Dama Dama'),
 PosixPath('drive/MyDrive/Test dataset/Pitta iris')]

## Generate list of all audio files paths in the sub-folder

In [20]:
# this creates a new version of the input file, converted to the output format
def convert_file(input_file_path):
    parent_path = pathlib.PurePath(input_file_path).parent
    file_name = pathlib.Path(input_file_path).stem
    output_file_path = str(parent_path) + "/" + file_name + output_file_type
    # write the new wav file forcing overwrite
    subprocess.call(['ffmpeg', '-y', '-i', input_file_path,
                 output_file_path],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT)
    return

In [21]:
# check each file type and convert it if necessary
# if creates a new version of the file, then deletes the old
# it skips over the file if it is already in the output format

def check_file_type(audio_file_path):
    # check this file name and return if it already exists
    if (pathlib.Path(audio_file_path).suffix == output_file_type): return
# audio file type conversions
    if (pathlib.Path(audio_file_path).suffix == ".wav"): convert_file(audio_file_path); # print("wav")
    if (pathlib.Path(audio_file_path).suffix == ".mp3"): convert_file(audio_file_path); # print("mp3")
    if (pathlib.Path(audio_file_path).suffix == ".flac"): convert_file(audio_file_path); # print("flac")
    if (pathlib.Path(audio_file_path).suffix == ".ogg"): convert_file(audio_file_path); # print("ogg")
# video file type conversions:
    if (pathlib.Path(audio_file_path).suffix == ".m4a"): convert_file(audio_file_path)
    if (pathlib.Path(audio_file_path).suffix == ".mp4"): convert_file(audio_file_path)
# delete the old file
    os.remove(audio_file_path)
    return

In [22]:
# this function is called by each thread
# it takes a subfolder within the input directory and iterates through every file within it
# to convert to output format

def threaded_function(sub_folder_path):
    audio_files = pathlib.Path(sub_folder_path).glob("**/*") #generator object
    audio_files
    audio_file_paths =  [x for x in audio_files if x.is_file()] #PosixPath list
    for x in range(len(audio_file_paths)):
        check_file_type(audio_file_paths[x])
    print("folder {} done".format(sub_folder_path.stem))
    time.sleep(1)
    clear_output()
    return

In [23]:
# create a list of threads
# the list is currently set to the number of folders

def create_threads(sub_folder_paths):
    max_threads = lambda x : 10 if (x > 10) else x
    num_threads =  max_threads(len(sub_folder_paths))
    num_threads = len(sub_folder_paths)
    thread_list = []
    for i in range(num_threads):
        thread_list.append(threading.Thread(target=threaded_function, args=([sub_folder_paths[i]])))
    return thread_list

## Main function that converts all files to the specified format

In [24]:
# create a thread for every folder
thread_list = create_threads(sub_folder_paths)
# start converting files in each folder concurrently
for thread in thread_list:
    thread.start()
# wait for all threads to complete
for thread in thread_list:
    thread.join()
print("done all")
print("Conversion of {}\n to format \"{}\" complete!".format(source_folder, output_file_type))

done all
Conversion of drive/MyDrive/Test dataset
 to format ".wav" complete!


# CHECKING SAMPLING RATE

In [25]:
import os
import librosa

def check_sampling_rate(directory):
    all_files_correct = True
    for file_name in os.listdir(directory):
        if file_name.endswith('.wav'):  # Assuming audio files are in WAV format
            file_path = os.path.join(directory, file_name)
            sr = librosa.get_samplerate(file_path)
            if sr != 16000:
                print(f"File {file_name} has incorrect sampling rate: {sr} Hz")
                all_files_correct = False
    if all_files_correct:
        print("All audio files have correct sampling rate (16 kHz)")
    else:
        print("Some audio files have incorrect sampling rate")

# Example usage
directory_path = "/content/drive/MyDrive/Test dataset"
check_sampling_rate(directory_path)


All audio files have correct sampling rate (16 kHz)


## EDA

In [26]:
import os

def count_audio_files(folder_path):
    audio_extensions = ['.mp3', '.flac', '.wav']
    num_files = 0
    for root, _, files in os.walk(folder_path):
        num_files += len([filename for filename in files
                          if any(filename.endswith(ext) for ext in audio_extensions)])
    return num_files

# Specified source folder with subfolders
source_folder = "drive/MyDrive/Test dataset"

# Counting audio files in the source folder and its subfolders
num_audio_files = count_audio_files(source_folder)

print(f"Number of audio files in the source folder and its subfolders: {num_audio_files}")



Number of audio files in the source folder and its subfolders: 5022


In [27]:
def print_audio_files(sub_folder_path):
    audio_files = pathlib.Path(sub_folder_path).glob("**/*.wav")
    wav_files_count = sum(1 for _ in audio_files)
    print(f"Number of .wav files in {sub_folder_path}: {wav_files_count}")

# Modify the threaded_function to call the print_audio_files function
def threaded_function(sub_folder_path):
    audio_files = pathlib.Path(sub_folder_path).glob("**/*")
    for x in range(len(audio_files)):
        check_file_type(audio_files[x])
    print("Folder {} done".format(sub_folder_path.stem))
    time.sleep(1)
    clear_output()
    print_audio_files(sub_folder_path)  # Print the audio files in the folder
    return

# After the threads are joined, print the count and list of audio files in each folder
for thread, folder_path in zip(thread_list, sub_folder_paths):
    thread.join()
    print_audio_files(folder_path)

print("Done all")
print("Conversion of {}\n to format \"{}\" complete!".format(source_folder, output_file_type))


Number of .wav files in drive/MyDrive/Test dataset/Pezoporus wallicus: 14
Number of .wav files in drive/MyDrive/Test dataset/Corvus mellori: 30
Number of .wav files in drive/MyDrive/Test dataset/Dama Dama: 36
Number of .wav files in drive/MyDrive/Test dataset/Pitta iris: 16
Number of .wav files in drive/MyDrive/Test dataset/Pelecanus conspicillatus: 5
Number of .wav files in drive/MyDrive/Test dataset/Sus Scrofa: 38
Number of .wav files in drive/MyDrive/Test dataset/Parvipsitta pusilla: 62
Number of .wav files in drive/MyDrive/Test dataset/Strepera versicolor: 18
Number of .wav files in drive/MyDrive/Test dataset/Falco cenchroides: 8
Number of .wav files in drive/MyDrive/Test dataset/Platycercus elegans: 18
Number of .wav files in drive/MyDrive/Test dataset/Barnardius zonarius: 197
Number of .wav files in drive/MyDrive/Test dataset/Elseyornis melanops: 16
Number of .wav files in drive/MyDrive/Test dataset/Plectorhyncha lanceolata: 45
Number of .wav files in drive/MyDrive/Test dataset/C

## Making a dataframe for data manipulation Making a dataframe for data manipulation

**The code is designed to load .wav files from a specified folder into a pandas DataFrame, extracting the file path, filename, sampling rate, and duration of each audio file using the librosa library.**

In [33]:
import os
import librosa
import pandas as pd
from pathlib import Path

def load_wav_files_into_dataframe(folder_path):
    data = []
    for root, _, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.wav'):
                file_path = os.path.join(root, filename)
                try:
                    audio, sr = librosa.load(file_path, sr=None)
                    duration = librosa.get_duration(y=audio, sr=sr)
                    data.append({'file_path': file_path, 'filename': filename, 'sampling_rate': sr, 'duration': duration})
                except Exception as e:
                    print(f"Error loading {filename}: {str(e)}")

    df = pd.DataFrame(data)
    return df

# Load ".wav" files into a DataFrame
source_folder = "/drive/MyDrive/Test dataset"
df = load_wav_files_into_dataframe(source_folder)

# Display the DataFrame
print(df)



Empty DataFrame
Columns: []
Index: []


In [29]:
import os
import librosa
import pandas as pd
from pathlib import Path

def extract_species_name(file_path):
    """
    Extract the species name from the file path.

    Args:
    - file_path: Path to the audio file.

    Returns:
    - Species name extracted from the path.
    """
    # Split the path and extract the species name
    return Path(file_path).parts[-2]

def load_wav_files_into_dataframe(folder_path):
    """
    Load WAV files into a DataFrame and extract additional features.

    Args:
    - folder_path: Path to the folder containing audio files.

    Returns:
    - DataFrame containing file paths, species names, sampling rates, durations, and other features.
    """
    data = []
    for root, _, files in os.walk(folder_path):
        for filename in files:
            if filename.endswith('.wav'):
                file_path = os.path.join(root, filename)
                try:
                    # Load audio file
                    audio, sr = librosa.load(file_path, sr=None)
                    duration = librosa.get_duration(y=audio, sr=sr)
                    # Extract species name from the file path
                    species_name = extract_species_name(file_path)
                    data.append({
                        'file_path': file_path,
                        'species_name': species_name,
                        'filename': filename,
                        'sampling_rate': sr,
                        'duration': duration
                    })
                except Exception as e:
                    print(f"Error loading {filename}: {str(e)}")

    df = pd.DataFrame(data)
    return df

# Load ".wav" files into a DataFrame
source_folder = "/drive/MyDrive/Test dataset"
df = load_wav_files_into_dataframe(source_folder)

# Display the DataFrame
print(df)


Empty DataFrame
Columns: []
Index: []


In [None]:
df

## Pitch Extraction from Audio Files using Librosa

**Code defines a function extract_min_max_pitch that takes an audio time series y and its sampling rate sr as input and returns the minimum and maximum pitch frequencies detected in the audio, excluding zero values. It uses the librosa library for audio processing.**

In [30]:
import librosa
import numpy as np
import pandas as pd

def extract_min_max_pitch(y, sr):
    """
    Extract the minimum and maximum pitch frequencies from an audio signal, excluding zeros.

    Args:
    - y: Audio time series.
    - sr: Sampling rate of y.

    Returns:
    - (min_pitch, max_pitch): A tuple containing the minimum and maximum pitch frequencies.
    """
    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
    # Flatten the array and filter out 0 values
    non_zero_pitches = pitches[pitches > 0].flatten()

    if len(non_zero_pitches) == 0:  # If no pitches detected, return zeros
        return (0, 0)

    min_pitch = np.min(non_zero_pitches)
    max_pitch = np.max(non_zero_pitches)

    return (min_pitch, max_pitch)

# Initialize lists to store min and max pitches for each file
min_pitches = []
max_pitches = []

# Iterate through each row in the DataFrame
for index, row in df.iterrows():
    # Load the audio file
    y, sr = librosa.load(row['file_path'], sr=None)

    # Extract min and max pitch
    min_pitch, max_pitch = extract_min_max_pitch(y, sr)

    # Append to lists
    min_pitches.append(min_pitch)
    max_pitches.append(max_pitch)

# Add the min and max pitches as new columns to the DataFrame
df['min_pitch'] = min_pitches
df['max_pitch'] = max_pitches

# Display the first few rows to verify the new columns
print(df.head())


Empty DataFrame
Columns: [min_pitch, max_pitch]
Index: []


## Mean Spectral Bandwidth Calculation

**The code calculates the mean spectral bandwidth of audio files stored in a DataFrame and filters them based on a threshold for voice recognition. It adds a new column with mean spectral bandwidth values, filters files within a specific bandwidth range, and prints the count of valid files along with the total number of files. This process helps identify suitable audio files for voice recognition based on spectral characteristics.**

In [31]:
def calculate_mean_spectral_bandwidth(file_path, sr=16000):
    """
    Calculate the mean spectral bandwidth of an audio file.

    Args:
    - file_path: Path to the audio file.
    - sr: Sampling rate to use for loading the audio.

    Returns:
    - Mean spectral bandwidth of the audio file.
    """
    y, sr = librosa.load(file_path, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    mean_spec_bw = np.mean(spec_bw)
    return mean_spec_bw

# Add a new column to CREMA_df for the mean spectral bandwidths
df['mean_spectral_bandwidth'] = df['file_path'].apply(calculate_mean_spectral_bandwidth)

# Define the maximum spectral bandwidth for voice recognition
max_bw_for_voice_recognition = 4000  # Hz

# Filter the DataFrame for audio files that lie within the required spectral bandwidth range
valid_bw_files_df = df[df['mean_spectral_bandwidth'] <= max_bw_for_voice_recognition]

# Count the number of files that meet the criterion
num_valid_bw_files = len(valid_bw_files_df)

print(f"Number of audio files within the required spectral bandwidth range for voice recognition: {num_valid_bw_files}")
print(f"Total number of audio files: {len(df)}")

KeyError: 'file_path'

In [None]:
df

In [None]:
df_new1 = df.copy()

## RMS Loudness Level Calculation in dBFS

**The code computes the RMS loudness level in dBFS for audio files in a DataFrame (df_new). It adds a new column with these values and displays the DataFrame with the added column. This enables the analysis of audio loudness levels in decibels relative to full scale.**

In [None]:
def calculate_rms_loudness_dbfs(file_path):
    """
    Calculate the RMS loudness level of an audio file in dBFS.

    Args:
    - file_path: Path to the audio file.

    Returns:
    - RMS loudness level of the audio file in dBFS.
    """
    y, sr = librosa.load(file_path, sr=None)
    # Calculate the RMS value
    S, phase = librosa.magphase(librosa.stft(y))
    rms = librosa.feature.rms(S=S).mean()
    # Convert to dBFS
    rms_dbfs = librosa.power_to_db(rms, ref=1.0)  # Assuming max amplitude of 1 is 0 dBFS
    return rms_dbfs

# Calculate the RMS loudness in dBFS for each audio file and add it as a new column
df_new1['rms_loudness_db'] = df_new1['file_path'].apply(calculate_rms_loudness_dbfs)

# Display the DataFrame with the new 'rms_loudness_db' column
print(df_new1.head())

In [None]:
df_new1

## Evaluation of RMS Loudness Levels

**The code defines ideal RMS loudness levels for voice recognition and assesses the number of audio files needing amplification (too quiet) or gain reduction (too loud) based on these ideals. It computes the counts and percentages of files requiring adjustment and prints the results. This analysis helps in identifying files that may need volume adjustments to meet optimal loudness criteria for voice recognition.**

In [None]:
# Define ideal RMS loudness levels (in dBFS) for voice recognition
ideal_min_rms_dbfs = -23  # Minimum ideal RMS loudness level
ideal_max_rms_dbfs = -20  # Maximum ideal RMS loudness level

# Assuming 'rms_loudness_db' is a column in CREMA_df representing RMS loudness in dBFS
# Count files needing amplification (too quiet)
files_needing_amplification = df_new1[df_new1['rms_loudness_db'] < ideal_min_rms_dbfs].shape[0]

# Count files needing gain reduction (too loud)
files_needing_gain_reduction = df_new1[df_new1['rms_loudness_db'] > ideal_max_rms_dbfs].shape[0]

# Print the counts
print(f"Number of files needing amplification (too quiet): {files_needing_amplification}")
print(f"Number of files needing gain reduction (too loud): {files_needing_gain_reduction}")

# Total number of files evaluated
total_files_evaluated = df_new1.shape[0]

# Printing the percentage of files needing adjustment
percentage_needing_amplification = (files_needing_amplification / total_files_evaluated) * 100
percentage_needing_gain_reduction = (files_needing_gain_reduction / total_files_evaluated) * 100

print(f"Percentage of files needing amplification: {percentage_needing_amplification:.2f}%")
print(f"Percentage of files needing gain reduction: {percentage_needing_gain_reduction:.2f}%")

In [None]:
df_new1

In [None]:
df_new1

In [None]:
missing_counts = df_new1.isna().sum()
for feature, count in missing_counts.items():
    print(f"Feature: {feature}, Number of Missing Entries: {count}")


In [None]:
from collections import Counter


c = Counter(df_new1['species_name'])

# Print the counts of each location
print(c)


In [None]:
df_new1['species_name'].value_counts()

In [None]:
df_new1.drop(columns=['file_path', 'filename'], inplace=True)

In [None]:

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler



label_encoder = LabelEncoder()
df['species_label'] = label_encoder.fit_transform(df['species_name'])

# Separate features and labels
X = df[['sampling_rate', 'duration', 'min_pitch', 'max_pitch', 'mean_spectral_bandwidth', 'rms_loudness_db']]
y = df['species_label']

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Step 3: Train the AdaBoost model
# Using a DecisionTreeClassifier as the base estimator
base_estimator = DecisionTreeClassifier(max_depth=1, random_state=42)
ada_model = AdaBoostClassifier(base_estimator=base_estimator, n_estimators=50, random_state=42)

# Train the model
ada_model.fit(X_train, y_train)

# Step 4: Evaluate the model
# Predict on the test set
y_pred = ada_model.predict(X_test)

# Print accuracy and classification report
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Step 5: (Optional) Feature Importance
# Feature importance for AdaBoost model
importances = ada_model.feature_importances_
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': importances
}).sort_values(by='importance', ascending=False)

print("Feature Importances:")
print(feature_importance_df)

# Step 6: Save the model (if needed)
import joblib
joblib.dump(ada_model, 'ada_boost_model.pkl')

# Save the scaler and label encoder for future use (optional)
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')