# LDED Audiovisual Fusion 

Author: Chen Lequn.
Created on 13 Sep 2023.

- Material: Maraging Steel 300
- Process: Robotic Llser-directed energy deposition
- Recorded data: position, veolocity, coaxial ccd features, acoustic feature
- Quality labels generated: keyhole pores, cracks, defect-free

### Notebook 2: Feature extraction
- Extract handcrafted features from video and audio stream
- Vision features: melt pool geometric features, including width, length, moment of area, convex hull, etc.
- Audio features: spectral centroid, spectral bandwidth, flux, etc.

### System setup

In [1]:
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

import os
# Scikit learn
#from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle, resample, class_weight
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit
from collections import defaultdict

## plot
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
%matplotlib inline
import seaborn as sns

In [2]:
import librosa
import essentia.standard as es
from essentia.standard import Spectrum, Windowing, SpectralCentroidTime, SpectralComplexity, SpectralContrast
from essentia.standard import Decrease, Energy, EnergyBandRatio, FlatnessDB, Flux, RollOff, StrongPeak, CentralMoments
from essentia.standard import DistributionShape, Crest, MelBands, MFCC
import soundfile as sf  # for reading audio files

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


https://essentia.upf.edu/algorithms_reference.html

In [3]:
PROJECT_ROOT_DIR = "../"
IMAGE_PATH = os.path.join(PROJECT_ROOT_DIR, "result_images", 'feature_extraction')
os.makedirs(IMAGE_PATH, exist_ok=True)

Multimodal_dataset_PATH = "/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset"
Dataset_path = os.path.join(Multimodal_dataset_PATH, f'25Hz')
                            

## function for automatically save the diagram/graph into the folder 
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGE_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 2.50

In [4]:
def get_sample_directories(base_path, sample_numbers):
    sample_directories = []
    for sample_number in sample_numbers:
        sample_directories.append(os.path.join(base_path, f'{sample_number}'))
    return sample_directories


samples = [21, 22, 23, 24, 26, 32]
sample_directories = get_sample_directories(Dataset_path, samples)

# Get lists of image and audio directories for each sample
image_directories = [os.path.join(sample_dir, 'images') for sample_dir in sample_directories]
audio_directories = [os.path.join(sample_dir, 'raw_audio') for sample_dir in sample_directories]

In [5]:
image_directories

['/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/21/images',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/22/images',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/23/images',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/24/images',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/26/images',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/32/images']

In [6]:
audio_directories

['/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/21/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/22/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/23/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/24/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/26/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/32/raw_audio']

In [36]:
# Combine all annotation files into one DataFrame
all_annotation_dfs = []
for sample_dir, sample_number in zip(sample_directories, samples):
    annotation_file = os.path.join(sample_dir, f'annotations_{sample_number}.csv')  # Update the file name
    annotation_df = pd.read_csv(annotation_file)
    all_annotation_dfs.append(annotation_df)
combined_annotation_df = pd.concat(all_annotation_dfs)
combined_annotation_df

Unnamed: 0,sample index,audio_file_name,image_file_name,class_name,class_name_v2,Layer number,Sample number
0,1,sample_21_1.wav,sample_21_1.jpg,Laser-off,Laser-off,1.0,21
1,2,sample_21_2.wav,sample_21_2.jpg,Defect-free,Defect-free,1.0,21
2,3,sample_21_3.wav,sample_21_3.jpg,Defect-free,Defect-free,1.0,21
3,4,sample_21_4.wav,sample_21_4.jpg,Defect-free,Defect-free,1.0,21
4,5,sample_21_5.wav,sample_21_5.jpg,Defect-free,Defect-free,1.0,21
...,...,...,...,...,...,...,...
13523,13524,sample_32_13524.wav,sample_32_13524.jpg,,,,32
13524,13525,sample_32_13525.wav,sample_32_13525.jpg,,,,32
13525,13526,sample_32_13526.wav,sample_32_13526.jpg,,,,32
13526,13527,sample_32_13527.wav,sample_32_13527.jpg,,,,32


## Extracting melt pool visual features

In [8]:
def general_contour_extraction(image, threshold=100):
    """
    Extract general contour features from a given image.
    
    Parameters:
        image (ndarray): The input image.
        threshold (int): The threshold value for image processing.
    
    Returns:
        dict: A dictionary containing the extracted features.
    """
    # Initialize the result dictionary with zeros
    result = {
        'max_contour_area': 0,
        'rectangle_angle': 0,
        'rectangle_width': 0,
        'rectangle_height': 0,
        'ellipse_angle': 0,
        'ellipse_width': 0,
        'ellipse_height': 0
    }
    
    # Convert the image to grayscale
    src_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # Apply blur
    src_gray = cv2.blur(src_gray, (3, 3))
    
    # Apply threshold
    _, threshold_output = cv2.threshold(src_gray, threshold, 255, cv2.THRESH_BINARY)
    
    # Find contours
    contours, _ = cv2.findContours(threshold_output, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    
    if not contours:
        return result  # Return result with zeros if no contours are found
    
    # Find the rotated rectangles and ellipses for each contour
    min_rects = [cv2.minAreaRect(np.array(contour)) for contour in contours]
    contour_areas = [cv2.contourArea(np.array(contour)) for contour in contours]
    
    # Get the index of the max contour area
    max_contour_area_index = np.argmax(contour_areas)
    max_contour_area = contour_areas[max_contour_area_index]
    
    # Store the max contour area
    result['max_contour_area'] = max_contour_area
    
    # Store rectangle features
    rect = min_rects[max_contour_area_index]
    result['rectangle_angle'] = rect[-1]
    result['rectangle_width'] = rect[1][0]
    result['rectangle_height'] = rect[1][1]
    
    # Store ellipse features if enough points for fitEllipse
    if len(contours[max_contour_area_index]) > 5:
        ellipse = cv2.fitEllipse(np.array(contours[max_contour_area_index]))
        result['ellipse_angle'] = ellipse[-1]
        result['ellipse_width'] = ellipse[1][0]
        result['ellipse_height'] = ellipse[1][1]
    
    return result

In [9]:
def convex_hull_extract(frame, threshold=100):
    """
    Extract convex hull features from a given image.
    
    Parameters:
        image_path (str): The path to the image file.
        threshold (int): The threshold value for binary conversion.
    
    Returns:
        max_hull_area (float): The maximum area among all convex hulls.
    """
    
    # Convert to grayscale if the image is colored
    if frame.shape[-1] > 1:
        src_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    else:
        src_gray = frame

    # Blur the image
    src_gray = cv2.blur(src_gray, (3, 3))
    
    # Apply threshold
    ret, threshold_output = cv2.threshold(src_gray, threshold, 255, cv2.THRESH_BINARY)
    
    # Find contours
    contours, _ = cv2.findContours(threshold_output, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    
    # Initialize return values
    max_hull_area = 0.0

    # Check if any contour is detected
    if contours:
        # Find the convex hull object for each contour
        hull = [cv2.convexHull(cnt) for cnt in contours]
        
        # Find the bounding convex hull area for each contour
        hull_area = [cv2.contourArea(h) for h in hull]
        
        # Get the maximum convex hull area
        max_hull_area = max(hull_area)
        
#         # Draw contours and convex hull on the original image (for visualization)
#         drawing = np.zeros((threshold_output.shape[0], threshold_output.shape[1], 3), dtype=np.uint8)
#         for i in range(len(contours)):
#             color = (np.random.randint(0,256), np.random.randint(0,256), np.random.randint(0,256))
#             cv2.drawContours(drawing, contours, i, color)
#             cv2.drawContours(drawing, hull, i, color, 2)
        
#         # Show the output image with contours and convex hull
#         plt.imshow(cv2.cvtColor(drawing, cv2.COLOR_BGR2RGB))
#         plt.title('Contours and Convex Hull')
#         plt.axis('off')
#         plt.show()
        
    return max_hull_area

In [10]:
# Feature extraction for moments
def moment_extract(image, threshold):
    # Initialize moments as zeros
    features = {
        'm00': 0,
        'm10': 0,
        'm01': 0,
        'm20': 0,
        'm11': 0,
        'm02': 0,
        'm30': 0,
        'm21': 0,
        'm12': 0,
        'm03': 0,
        'mu20': 0,
        'mu11': 0,
        'mu02': 0,
        'mu30': 0,
        'mu21': 0,
        'mu12': 0,
        'mu03': 0,
        'nu20': 0,
        'nu11': 0,
        'nu02': 0,
        'nu30': 0,
        'nu21': 0,
        'nu12': 0,
        'nu03': 0,
        'center_x': 0,
        'center_y': 0,
        'contour_area': 0,
        'contour_length': 0
    }
    
    # Convert to grayscale if the image is colored
    if len(image.shape) > 2:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image

    # Thresholding
    _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    # Find contours
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    
    # Sort contours by area
    contours = sorted(contours, key=cv2.contourArea, reverse=True)
    
    if contours:
        largest_contour = max(contours, key=cv2.contourArea)
        moments = cv2.moments(largest_contour)
        
        # Avoid division by zero
        if moments['m00'] != 0:
            for moment_name, moment_value in moments.items():
                features[moment_name] = moment_value
                
            features['center_x'] = moments['m10'] / moments['m00']
            features['center_y'] = moments['m01'] / moments['m00']
            features['contour_area'] = cv2.contourArea(largest_contour)
            features['contour_length'] = cv2.arcLength(largest_contour, True)
            
    return features

### Extract all visual features

In [11]:
def extract_visual_features(image_directories, threshold=100):
    all_features_list = []
    total_images = sum([len(os.listdir(img_dir)) for img_dir in image_directories if os.path.isdir(img_dir)])
    pbar = tqdm(total=total_images, desc="Processing images")

    for img_dir in image_directories:
        if os.path.isdir(img_dir):
            for img_name in os.listdir(img_dir):
                if img_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                    img_path = os.path.join(img_dir, img_name)
                    img = cv2.imread(img_path)
                    
                    features_contour = general_contour_extraction(img, threshold=threshold)
                    max_hull = convex_hull_extract(img, threshold=threshold)
                    features_moments = moment_extract(img, threshold=threshold)
                    
                    # Merge all dictionaries into one
                    merged_features = {'image_file_name': img_name, **features_contour, 'max_hull': max_hull, **features_moments}
                    all_features_list.append(merged_features)
                    
                    pbar.update(1)
    
    pbar.close()
    return pd.DataFrame(all_features_list)

In [12]:
df_visual = extract_visual_features(image_directories)
df_visual.head()

Processing images: 100%|█████████████████| 61995/61995 [06:39<00:00, 155.35it/s]


Unnamed: 0,image_file_name,max_contour_area,rectangle_angle,rectangle_width,rectangle_height,ellipse_angle,ellipse_width,ellipse_height,max_hull,m00,...,nu11,nu02,nu30,nu21,nu12,nu03,center_x,center_y,contour_area,contour_length
0,sample_21_183.jpg,305116.5,90.0,478.999939,638.999878,89.741119,486.003723,2792.482422,305694.0,305168.0,...,-0.000209,0.062289,1.5e-05,-7.3e-05,-5.7e-05,3e-06,319.227245,238.790615,305168.0,2346.485281
1,sample_21_1448.jpg,244869.5,90.0,479.0,547.0,0.807755,625.024231,768.526367,249194.0,245077.5,...,-0.004759,0.07532,0.000586,-0.001628,-0.001046,0.000924,257.498587,233.704246,245077.5,2296.080295
2,sample_21_4538.jpg,213872.5,0.0,496.999939,478.999939,138.959488,562.858459,628.942139,217121.0,213961.5,...,0.001328,0.082037,0.000933,-0.00061,-0.00194,7e-06,229.475871,235.544848,213961.5,2049.678274
3,sample_21_3876.jpg,145008.0,90.0,477.0,431.0,6.471483,388.942871,542.194519,180298.5,145029.5,...,-0.010655,0.130111,-0.001434,-0.00165,0.002597,-0.009541,240.855165,245.231884,145029.5,3417.201343
4,sample_21_3050.jpg,173090.0,90.0,477.0,443.0,2.054851,430.469025,540.491028,177690.0,173210.0,...,-0.001045,0.092186,-0.000136,0.000465,0.000231,-0.001076,238.826297,237.025172,173210.0,1952.704746


## Extract Audio Features

In [13]:
audio_path = os.path.join(audio_directories[1], "sample_22_9.wav")
audio_signal, sample_rate = sf.read(audio_path, dtype='float32')
# print(sample_rate)
# print (len(audio_signal))
# print (len(audio_signal)/sample_rate)
# plt.plot(audio_signal)

In [14]:
def check_audio_lengths(audio_file_paths):
    length_dict = defaultdict(list)
    
    for file_path in audio_file_paths:
        audio_signal, sr = librosa.load(file_path, sr=None)
        length_in_seconds = len(audio_signal) / sr
        length_dict[length_in_seconds].append(file_path)
        
    if len(length_dict) == 1:
        print(f"All audio files have the same length: {list(length_dict.keys())[0]} seconds.")
        return True
    else:
        print("Not all audio files have the same length.")
        for length, files in length_dict.items():
            print(f"Length: {length} seconds -> Files: {files}")
        return False

In [15]:
# !pip install librosa==0.9.2 timit-utils==0.9.0 torchaudio

In [17]:
import os

def example_usage_check_audio_lengths(audio_directories):
    # Initialize an empty list to store audio file paths
    audio_file_paths = []
    
    # Iterate over each directory in audio_directories to collect audio file paths
    for directory in audio_directories:
        for file_name in os.listdir(directory):
            if file_name.endswith(".wav"):
                audio_file_paths.append(os.path.join(directory, file_name))
    
    # Call the check_audio_lengths function
    return check_audio_lengths(audio_file_paths)


# Uncomment the line below to run the function
example_usage_check_audio_lengths(audio_directories)


KeyboardInterrupt



In [18]:
def extract_time_domain_features(audio_signal, sample_rate=44100):
    """
    Extract time domain features from an audio signal using Essentia.
    
    Parameters:
    - audio_signal: numpy array, the audio signal from which to extract features
    - sample_rate: int, the sample rate of the audio signal
    
    Returns:
    - features: dict, a dictionary containing the extracted features
    """
    
    features = {}
    
    # RMS Energy
    rms_algo = es.RMS()
    rms_energy = rms_algo(audio_signal)
    features['rms_energy'] = rms_energy
    
    # Amplitude Envelope
    envelope_algo = es.Envelope()
    amplitude_envelope = envelope_algo(audio_signal)
    features['amplitude_envelope_mean'] = amplitude_envelope.mean()
    features['amplitude_envelope_std'] = amplitude_envelope.std()
    
    # Zero Crossing Rate
    zcr_algo = es.ZeroCrossingRate()
    zero_crossing_rate = zcr_algo(audio_signal)
    features['zero_crossing_rate'] = zero_crossing_rate
    
    # Dynamic Complexity and Loudness
    dyn_algo = es.DynamicComplexity()
    dynamic_complexity, loudness = dyn_algo(audio_signal)
    features['dynamic_complexity'] = dynamic_complexity
    features['loudness'] = loudness

    # Loudness Vickers
    loudness_algo = es.LoudnessVickers()
    loudness_vickers = loudness_algo(audio_signal)
    features['loudness_vickers'] = loudness_vickers

    return features

Essentia provides a variety of spectral descriptors that you can use for feature extraction:

1. **Spectral Centroid**: Computes the center of mass of the spectrum.
2. **Spectral Complexity**: Measures the amount of peak-like components in the spectrum.
3. **Spectral Contrast**: Computes the spectral contrast features from an audio signal.
4. **Spectral Decrease**: Computes the decrease of the spectrum.
5. **Spectral Energy**: Computes the energy of the frequency domain signal.
6. **Spectral Energy Band Ratio**: Computes the ratio of energy in specific bands to the total energy.
7. **Spectral Flatness**: Computes the flatness of a spectrum.
8. **Spectral Flux**: Computes the flux of the spectrum.
9. **Spectral Rolloff**: Computes the rolloff frequency of an audio signal.
10. **Spectral Strong Peak**: Computes the strong peak of the spectrum.
12. **Spectral Variance, skewness, kurtosis**: Computes the variance of the spectral peaks.
14. **MFCC (Mel Frequency Cepstral Coefficients)**: Widely used spectral feature in audio and speech processing.


In [19]:
def extract_spectral_features(audio_signal, sample_rate, frame_size=1024, hop_size=512):
    # Initialize the algorithms
    window_algo = Windowing(type='hann')
    spectrum_algo = Spectrum()
    centroid_algo = SpectralCentroidTime(sampleRate=sample_rate)
    complexity_algo = SpectralComplexity(sampleRate=sample_rate)
    contrast_algo = SpectralContrast(frameSize=frame_size, highFrequencyBound=sample_rate/2, lowFrequencyBound=200, sampleRate=sample_rate)
    decrease_algo = Decrease()
    energy_algo = Energy()
    energy_band_ratio_algo = EnergyBandRatio(sampleRate=sample_rate, stopFrequency=7000)
    flatness_algo = FlatnessDB()
    spectral_flux = Flux()
    rolloff_algo = RollOff(sampleRate=sample_rate)
    strong_peak_algo = StrongPeak()
    central_moment_algo = CentralMoments()
    distrubution_shape = DistributionShape()
    spectral_crest_factor = Crest()
    mel_bands_algo = MelBands()
    mfcc_algo = MFCC(inputSize=hop_size+1, highFrequencyBound=sample_rate/2, numberCoefficients=13, sampleRate=sample_rate)
    
    # Initialize features dictionary with defaultdict to store lists
    # features = {}
    features = defaultdict(list)
    
    for frame in es.FrameGenerator(audio_signal, frameSize=frame_size, hopSize=hop_size):
        windowed_frame = window_algo(frame)
        spectrum = spectrum_algo(windowed_frame)

        features['spectral_centroid'].append(centroid_algo(spectrum))
        features['spectral_complexity'].append(complexity_algo(spectrum))
        spectral_contrast, spectral_valley = contrast_algo(spectrum)
        for i, val in enumerate(spectral_contrast):
            features[f'spectral_contrast_{i}'].append(val)
        for i, val in enumerate(spectral_valley):
            features[f'spectral_valley_{i}'].append(val)
        features['spectral_decrease'].append(decrease_algo(spectrum))
        features['spectral_energy'].append(energy_algo(spectrum))
        features['spectral_energy_band_ratio'].append(energy_band_ratio_algo(spectrum))
        features['spectral_flatness'].append(flatness_algo(spectrum))
        features['spectral_flux'].append(spectral_flux(spectrum))
        features['spectral_rolloff'].append(rolloff_algo(spectrum))
        features['spectral_strong_peak'].append(strong_peak_algo(spectrum))
        central_moments = central_moment_algo(spectrum)
        features['spectral_variance'].append(distrubution_shape(central_moments)[0])
        features['spectral_skewness'].append(distrubution_shape(central_moments)[1])
        features['spectral_kurtosis'].append(distrubution_shape(central_moments)[2])
        features['spectral_crest_factor'].append(spectral_crest_factor(spectrum))

        mfcc_bands, mfcc_coeffs = mfcc_algo(spectrum)
        for i, coeff in enumerate(mfcc_coeffs):
            features[f'mfcc_{i}'].append(coeff)
            
    # Prepare a dictionary to store mean and std separately
    features_separated = {}
    for key, value in features.items():
        mean_val = np.mean(value)
        std_val = np.std(value)
        features_separated[f"{key}_mean"] = mean_val
        features_separated[f"{key}_std"] = std_val
    
    return features_separated

In [20]:
# Example usage
sample_rate = 44100
audio_signal = np.random.rand(4410).astype(np.float32)  
features = extract_spectral_features(audio_signal, sample_rate, frame_size=1024)
features

{'spectral_centroid_mean': 6229.11337890625,
 'spectral_centroid_std': 1974.0776033242607,
 'spectral_complexity_mean': 27.6,
 'spectral_complexity_std': 5.919459434779497,
 'spectral_contrast_0_mean': -0.75605726,
 'spectral_contrast_0_std': 0.020388855,
 'spectral_contrast_1_mean': -0.75854343,
 'spectral_contrast_1_std': 0.052207116,
 'spectral_contrast_2_mean': -0.76933986,
 'spectral_contrast_2_std': 0.01737803,
 'spectral_contrast_3_mean': -0.7793497,
 'spectral_contrast_3_std': 0.03202435,
 'spectral_contrast_4_mean': -0.77245694,
 'spectral_contrast_4_std': 0.030848444,
 'spectral_contrast_5_mean': -0.7731403,
 'spectral_contrast_5_std': 0.019982597,
 'spectral_valley_0_mean': -4.546207,
 'spectral_valley_0_std': 0.2448793,
 'spectral_valley_1_mean': -4.757674,
 'spectral_valley_1_std': 0.30818248,
 'spectral_valley_2_mean': -4.726373,
 'spectral_valley_2_std': 0.3610033,
 'spectral_valley_3_mean': -4.7453737,
 'spectral_valley_3_std': 0.39807102,
 'spectral_valley_4_mean': -4.

### Extract all audio features

In [21]:
def extract_all_audio_features(audio_directories, frame_size=1024, hop_size=512):
    all_features_list = []
    
    # Count total audio files for progress bar
    total_audio_files = sum([len(os.listdir(audio_dir)) for audio_dir in audio_directories if os.path.isdir(audio_dir)])
    
    pbar = tqdm(total=total_audio_files, desc="Processing audio files")

    for audio_dir in audio_directories:
        if os.path.isdir(audio_dir):
            for audio_name in os.listdir(audio_dir):
                if audio_name.lower().endswith(('.wav', '.flac', '.mp3')):
                    audio_path = os.path.join(audio_dir, audio_name)
                    
                    # Read audio file
                    audio_signal, sample_rate = sf.read(audio_path, dtype='float32')
                    
                    # Extract features
                    time_domain_features = extract_time_domain_features(audio_signal, sample_rate)
                    spectral_features = extract_spectral_features(audio_signal, sample_rate, frame_size, hop_size)
                    
                    # Merge all dictionaries into one
                    merged_features = {'audio_file_name': audio_name, **time_domain_features, **spectral_features}
                    all_features_list.append(merged_features)
                    
                    pbar.update(1)
    
    pbar.close()
    return pd.DataFrame(all_features_list)


In [22]:
audio_directories

['/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/21/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/22/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/23/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/24/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/26/raw_audio',
 '/home/lequn/Dataset/LDED_acoustic_visual_monitoring_dataset/25Hz/32/raw_audio']

In [23]:
time_domain_features = extract_time_domain_features(audio_signal, sample_rate)
time_domain_features

{'rms_energy': 0.5720088481903076,
 'amplitude_envelope_mean': 0.6366074,
 'amplitude_envelope_std': 0.1890899,
 'zero_crossing_rate': 0.0,
 'dynamic_complexity': 0.0,
 'loudness': -100.0,
 'loudness_vickers': -11.100326538085938}

In [24]:
audio_features_df = extract_all_audio_features(audio_directories, frame_size=1024, hop_size=512)

Processing audio files: 100%|████████████| 61996/61996 [02:47<00:00, 369.59it/s]


## Save extracted features

In [25]:
audio_features_df

Unnamed: 0,audio_file_name,rms_energy,amplitude_envelope_mean,amplitude_envelope_std,zero_crossing_rate,dynamic_complexity,loudness,loudness_vickers,spectral_centroid_mean,spectral_centroid_std,...,mfcc_8_mean,mfcc_8_std,mfcc_9_mean,mfcc_9_std,mfcc_10_mean,mfcc_10_std,mfcc_11_mean,mfcc_11_std,mfcc_12_mean,mfcc_12_std
0,sample_21_5061.wav,0.021425,0.016344,0.006435,0.234127,0.0,-100.0,-37.311604,3158.075635,1194.677833,...,18.972294,6.530155,2.002637,4.283215,-3.746114,3.515599,5.357268,4.389891,-8.488627,5.216426
1,sample_21_1609.wav,0.027732,0.019372,0.011031,0.109977,0.0,-100.0,-39.758411,3707.981836,1907.854511,...,6.696869,2.616619,-6.531511,7.269816,2.900550,3.867695,3.060268,10.179080,-4.805249,5.627741
2,sample_21_3832.wav,0.044808,0.032082,0.012797,0.061224,0.0,-100.0,-37.084881,4039.971851,1840.789898,...,23.996601,5.227235,2.212011,5.448719,9.179332,6.913830,12.587943,4.534578,-1.173351,6.376263
3,sample_21_4991.wav,0.039034,0.030804,0.013613,0.155329,0.0,-100.0,-32.731792,2965.390076,1206.620565,...,20.003372,7.728793,-2.622741,4.897816,7.086169,8.972412,8.160907,3.901221,-12.042410,7.861053
4,sample_21_1801.wav,0.024328,0.018567,0.005932,0.112245,0.0,-100.0,-41.143456,3259.826697,1293.981594,...,11.501012,6.039043,3.603982,1.894414,5.184237,3.480808,8.518371,4.137726,-5.157437,5.720777
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61991,sample_32_4428.wav,0.053498,0.035133,0.019140,0.012472,0.0,-100.0,-40.145836,4354.579663,2429.486481,...,18.121904,6.061812,11.664889,6.051739,6.184272,3.629905,7.306276,5.094743,-2.412226,4.795838
61992,sample_32_689.wav,0.024218,0.023381,0.005685,0.113379,0.0,-100.0,-41.865253,3604.107678,1915.216847,...,5.061664,7.360607,10.557548,3.593134,-3.723825,3.388584,1.159705,5.967342,-2.912736,1.962973
61993,sample_32_6496.wav,0.021276,0.015564,0.005265,0.099206,0.0,-100.0,-41.056362,3669.805859,1334.831746,...,21.437021,6.894727,13.434031,4.689430,4.055672,4.363562,16.391119,1.564742,-7.109515,3.260695
61994,sample_32_8802.wav,0.048664,0.034854,0.015590,0.028345,0.0,-100.0,-41.444977,3902.260352,1801.351172,...,15.170557,9.445569,1.604835,6.415935,11.255735,5.765014,5.028805,3.346834,2.104390,2.568861


In [26]:
df_visual

Unnamed: 0,image_file_name,max_contour_area,rectangle_angle,rectangle_width,rectangle_height,ellipse_angle,ellipse_width,ellipse_height,max_hull,m00,...,nu11,nu02,nu30,nu21,nu12,nu03,center_x,center_y,contour_area,contour_length
0,sample_21_183.jpg,305116.5,90.0,478.999939,638.999878,89.741119,486.003723,2792.482422,305694.0,305168.0,...,-0.000209,0.062289,0.000015,-0.000073,-0.000057,0.000003,319.227245,238.790615,305168.0,2346.485281
1,sample_21_1448.jpg,244869.5,90.0,479.000000,547.000000,0.807755,625.024231,768.526367,249194.0,245077.5,...,-0.004759,0.075320,0.000586,-0.001628,-0.001046,0.000924,257.498587,233.704246,245077.5,2296.080295
2,sample_21_4538.jpg,213872.5,0.0,496.999939,478.999939,138.959488,562.858459,628.942139,217121.0,213961.5,...,0.001328,0.082037,0.000933,-0.000610,-0.001940,0.000007,229.475871,235.544848,213961.5,2049.678274
3,sample_21_3876.jpg,145008.0,90.0,477.000000,431.000000,6.471483,388.942871,542.194519,180298.5,145029.5,...,-0.010655,0.130111,-0.001434,-0.001650,0.002597,-0.009541,240.855165,245.231884,145029.5,3417.201343
4,sample_21_3050.jpg,173090.0,90.0,477.000000,443.000000,2.054851,430.469025,540.491028,177690.0,173210.0,...,-0.001045,0.092186,-0.000136,0.000465,0.000231,-0.001076,238.826297,237.025172,173210.0,1952.704746
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61990,sample_32_13333.jpg,230316.5,90.0,479.000000,508.000000,163.139206,577.749756,785.149780,233123.5,230326.5,...,0.001704,0.079386,0.000148,-0.000453,-0.000640,-0.000073,244.976237,237.135437,230326.5,2111.653889
61991,sample_32_9903.jpg,33880.5,90.0,186.000000,236.000000,110.776207,176.589127,249.155396,35248.0,33839.0,...,0.019571,0.064730,-0.003217,-0.002232,0.001154,0.001448,193.509018,326.474059,33839.0,1072.430580
61992,sample_32_3401.jpg,244477.5,90.0,479.000000,548.000000,147.688782,627.643372,724.354858,248611.0,244549.5,...,0.001333,0.074695,0.000561,0.000320,-0.001494,-0.000222,256.647856,240.107323,244549.5,2388.281306
61993,sample_32_12232.jpg,283534.5,0.0,628.999878,478.999939,170.896729,673.700378,734.218933,290550.5,283752.0,...,-0.005144,0.064730,0.001002,-0.001605,-0.001299,0.000656,297.783671,233.663629,283752.0,2730.465071


In [27]:
df_visual.columns

Index(['image_file_name', 'max_contour_area', 'rectangle_angle',
       'rectangle_width', 'rectangle_height', 'ellipse_angle', 'ellipse_width',
       'ellipse_height', 'max_hull', 'm00', 'm10', 'm01', 'm20', 'm11', 'm02',
       'm30', 'm21', 'm12', 'm03', 'mu20', 'mu11', 'mu02', 'mu30', 'mu21',
       'mu12', 'mu03', 'nu20', 'nu11', 'nu02', 'nu30', 'nu21', 'nu12', 'nu03',
       'center_x', 'center_y', 'contour_area', 'contour_length'],
      dtype='object')

In [28]:
audio_features_df.columns

Index(['audio_file_name', 'rms_energy', 'amplitude_envelope_mean',
       'amplitude_envelope_std', 'zero_crossing_rate', 'dynamic_complexity',
       'loudness', 'loudness_vickers', 'spectral_centroid_mean',
       'spectral_centroid_std', 'spectral_complexity_mean',
       'spectral_complexity_std', 'spectral_contrast_0_mean',
       'spectral_contrast_0_std', 'spectral_contrast_1_mean',
       'spectral_contrast_1_std', 'spectral_contrast_2_mean',
       'spectral_contrast_2_std', 'spectral_contrast_3_mean',
       'spectral_contrast_3_std', 'spectral_contrast_4_mean',
       'spectral_contrast_4_std', 'spectral_contrast_5_mean',
       'spectral_contrast_5_std', 'spectral_valley_0_mean',
       'spectral_valley_0_std', 'spectral_valley_1_mean',
       'spectral_valley_1_std', 'spectral_valley_2_mean',
       'spectral_valley_2_std', 'spectral_valley_3_mean',
       'spectral_valley_3_std', 'spectral_valley_4_mean',
       'spectral_valley_4_std', 'spectral_valley_5_mean',
       's

In [30]:
combined_annotation_df

Unnamed: 0,sample index,audio_file_name,image_file_name,class_name,class_name_v2,Layer number,Sample number,X,Y,Z
0,1,sample_21_1.wav,sample_21_1.jpg,Laser-off,Laser-off,1.0,21,0.136620,0.265042,9.588183
1,2,sample_21_2.wav,sample_21_2.jpg,Defect-free,Defect-free,1.0,21,0.128219,0.275541,3.966295
2,3,sample_21_3.wav,sample_21_3.jpg,Defect-free,Defect-free,1.0,21,0.126358,0.276802,2.995122
3,4,sample_21_4.wav,sample_21_4.jpg,Defect-free,Defect-free,1.0,21,0.124890,0.278469,2.108443
4,5,sample_21_5.wav,sample_21_5.jpg,Defect-free,Defect-free,1.0,21,0.124157,0.278818,1.804407
...,...,...,...,...,...,...,...,...,...,...
13523,13524,sample_32_13524.wav,sample_32_13524.jpg,,,,32,,,
13524,13525,sample_32_13525.wav,sample_32_13525.jpg,,,,32,,,
13525,13526,sample_32_13526.wav,sample_32_13526.jpg,,,,32,,,
13526,13527,sample_32_13527.wav,sample_32_13527.jpg,,,,32,,,


In [37]:
# Merge the annotation dataframe with the audio and visual dataframes
df_audiovisual = combined_annotation_df.merge(audio_features_df, how='left', on='audio_file_name')
df_audiovisual = df_audiovisual.merge(df_visual, how='left', on='image_file_name')

# Show the first few rows of the merged dataframe
df_audiovisual

Unnamed: 0,sample index,audio_file_name,image_file_name,class_name,class_name_v2,Layer number,Sample number,rms_energy,amplitude_envelope_mean,amplitude_envelope_std,...,nu11,nu02,nu30,nu21,nu12,nu03,center_x,center_y,contour_area,contour_length
0,1,sample_21_1.wav,sample_21_1.jpg,Laser-off,Laser-off,1.0,21,0.009018,0.003034,0.003803,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
1,2,sample_21_2.wav,sample_21_2.jpg,Defect-free,Defect-free,1.0,21,0.019286,0.012109,0.006188,...,0.000000,0.062337,0.000000,0.000000,0.000000,0.000000,319.500000,239.000000,305442.0,2234.000000
2,3,sample_21_3.wav,sample_21_3.jpg,Defect-free,Defect-free,1.0,21,0.019593,0.015114,0.005868,...,0.000000,0.062337,0.000000,0.000000,0.000000,0.000000,319.500000,239.000000,305442.0,2234.000000
3,4,sample_21_4.wav,sample_21_4.jpg,Defect-free,Defect-free,1.0,21,0.030937,0.021501,0.010559,...,-0.008049,0.062216,0.001940,-0.002304,-0.001599,0.000891,307.440996,230.264496,291865.5,2279.781744
4,5,sample_21_5.wav,sample_21_5.jpg,Defect-free,Defect-free,1.0,21,0.038329,0.029851,0.013688,...,-0.009879,0.065910,0.001738,-0.003345,-0.000898,0.001388,296.636877,229.109962,281970.0,2407.847760
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61989,13524,sample_32_13524.wav,sample_32_13524.jpg,,,,32,0.048554,0.036195,0.015806,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
61990,13525,sample_32_13525.wav,sample_32_13525.jpg,,,,32,0.040282,0.032946,0.013378,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
61991,13526,sample_32_13526.wav,sample_32_13526.jpg,,,,32,0.052285,0.042469,0.018343,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
61992,13527,sample_32_13527.wav,sample_32_13527.jpg,,,,32,0.023477,0.016409,0.006989,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [38]:
for col in ['audio_file_name', 'image_file_name', 'class_name', 'class_name_v2']:
    df_audiovisual[col] = df_audiovisual[col].astype('category')

In [39]:
df_audiovisual.to_hdf(os.path.join(Dataset_path, 'data_audio_visual_with_annotations.h5'), key='df', mode='w', format='table')