# LDED Audiovisual Fusion 

Author: Chen Lequn.
Created on 13 Sep 2023.

- Material: Maraging Steel 300
- Process: Robotic Llser-directed energy deposition
- Recorded data: position, veolocity, coaxial ccd features, acoustic feature
- Quality labels generated: keyhole pores, cracks, defect-free

### Notebook 2: Feature extraction
- Extract handcrafted features from video and audio stream
- Vision features: melt pool geometric features, including width, length, moment of area, convex hull, etc.
- Audio features: spectral centroid, spectral bandwidth, flux, etc.

### System setup

In [1]:
import cv2
import numpy as np
import pandas as pd
from tqdm import tqdm

import os
# Scikit learn
#from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle, resample, class_weight
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

## plot
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib as mpl
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)
%matplotlib inline
import seaborn as sns

In [2]:
import essentia.standard as es
from essentia.standard import Spectrum, Windowing, SpectralCentroidTime, SpectralComplexity, SpectralContrast
from essentia.standard import Decrease, Energy, EnergyBandRatio, FlatnessDB, Flux, RollOff, StrongPeak, CentralMoments
from essentia.standard import DistributionShape, Crest, MelBands, MFCC
import soundfile as sf  # for reading audio files

[   INFO   ] MusicExtractorSVM: no classifier models were configured by default


https://essentia.upf.edu/algorithms_reference.html

In [26]:
PROJECT_ROOT_DIR = "../"
IMAGE_PATH = os.path.join(PROJECT_ROOT_DIR, "result_images", 'feature_extraction')
os.makedirs(IMAGE_PATH, exist_ok=True)

Multimodal_dataset_PATH = "/home/chenlequn/Dataset/LDED_acoustic_visual_monitoring_dataset"
Dataset_path = os.path.join(Multimodal_dataset_PATH, f'segmented_25Hz_buffered')
                            

## function for automatically save the diagram/graph into the folder 
def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    path = os.path.join(IMAGE_PATH, fig_id + "." + fig_extension)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

plt.rcParams["axes.edgecolor"] = "black"
plt.rcParams["axes.linewidth"] = 2.50

In [32]:
def get_sample_directories(base_path, sample_numbers):
    sample_directories = []
    for sample_number in sample_numbers:
        sample_directories.append(os.path.join(base_path, f'{sample_number}'))
    return sample_directories


samples = [21, 22, 23, 26]
sample_directories = get_sample_directories(Dataset_path, samples)

# Get lists of image and audio directories for each sample
image_directories = [os.path.join(sample_dir, 'images') for sample_dir in sample_directories]
audio_directories = [os.path.join(sample_dir, 'raw_audio') for sample_dir in sample_directories]

In [41]:
sample_directories

['/home/chenlequn/Dataset/LDED_acoustic_visual_monitoring_dataset/segmented_25Hz_buffered/21',
 '/home/chenlequn/Dataset/LDED_acoustic_visual_monitoring_dataset/segmented_25Hz_buffered/22',
 '/home/chenlequn/Dataset/LDED_acoustic_visual_monitoring_dataset/segmented_25Hz_buffered/23',
 '/home/chenlequn/Dataset/LDED_acoustic_visual_monitoring_dataset/segmented_25Hz_buffered/26']

In [42]:
# Combine all annotation files into one DataFrame
all_annotation_dfs = []
for sample_dir, sample_number in zip(sample_directories, samples):
    annotation_file = os.path.join(sample_dir, f'annotations_{sample_number}.csv')  # Update the file name
    annotation_df = pd.read_csv(annotation_file)
    all_annotation_dfs.append(annotation_df)
combined_annotation_df = pd.concat(all_annotation_dfs)
combined_annotation_df

Unnamed: 0,sample index,audio_file_name,image_file_name,name,class_ID,class_name_v2,class_ID_2,Layer number,Sample number,class_ID_v2
0,1,sample_21_1.wav,sample_21_1.jpg,Laser-off,0.0,Laser-off,0.0,1.0,21,
1,2,sample_21_2.wav,sample_21_2.jpg,Crack,2.0,Defective,2.0,1.0,21,
2,3,sample_21_3.wav,sample_21_3.jpg,Crack,2.0,Defective,2.0,1.0,21,
3,4,sample_21_4.wav,sample_21_4.jpg,Crack,2.0,Defective,2.0,1.0,21,
4,5,sample_21_5.wav,sample_21_5.jpg,Defect-free,1.0,Defect-free,1.0,1.0,21,
...,...,...,...,...,...,...,...,...,...,...
10981,10982,sample_26_10982.wav,sample_26_10982.jpg,Laser-off,,,,50.0,26,
10982,10983,sample_26_10983.wav,sample_26_10983.jpg,Laser-off,,,,50.0,26,
10983,10984,sample_26_10984.wav,sample_26_10984.jpg,Laser-off,,,,50.0,26,
10984,10985,sample_26_10985.wav,sample_26_10985.jpg,Laser-off,,,,50.0,26,


## Extracting melt pool visual features

In [6]:
def general_contour_extraction(image, threshold=100):
    """
    Extract general contour features from a given image.
    
    Parameters:
        image (ndarray): The input image.
        threshold (int): The threshold value for image processing.
    
    Returns:
        dict: A dictionary containing the extracted features.
    """
    # Initialize the result dictionary with zeros
    result = {
        'max_contour_area': 0,
        'rectangle_angle': 0,
        'rectangle_width': 0,
        'rectangle_height': 0,
        'ellipse_angle': 0,
        'ellipse_width': 0,
        'ellipse_height': 0
    }
    
    # Convert the image to grayscale
    src_gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # Apply blur
    src_gray = cv2.blur(src_gray, (3, 3))
    
    # Apply threshold
    _, threshold_output = cv2.threshold(src_gray, threshold, 255, cv2.THRESH_BINARY)
    
    # Find contours
    contours, _ = cv2.findContours(threshold_output, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    
    if not contours:
        return result  # Return result with zeros if no contours are found
    
    # Find the rotated rectangles and ellipses for each contour
    min_rects = [cv2.minAreaRect(np.array(contour)) for contour in contours]
    contour_areas = [cv2.contourArea(np.array(contour)) for contour in contours]
    
    # Get the index of the max contour area
    max_contour_area_index = np.argmax(contour_areas)
    max_contour_area = contour_areas[max_contour_area_index]
    
    # Store the max contour area
    result['max_contour_area'] = max_contour_area
    
    # Store rectangle features
    rect = min_rects[max_contour_area_index]
    result['rectangle_angle'] = rect[-1]
    result['rectangle_width'] = rect[1][0]
    result['rectangle_height'] = rect[1][1]
    
    # Store ellipse features if enough points for fitEllipse
    if len(contours[max_contour_area_index]) > 5:
        ellipse = cv2.fitEllipse(np.array(contours[max_contour_area_index]))
        result['ellipse_angle'] = ellipse[-1]
        result['ellipse_width'] = ellipse[1][0]
        result['ellipse_height'] = ellipse[1][1]
    
    return result

In [7]:
def convex_hull_extract(frame, threshold=100):
    """
    Extract convex hull features from a given image.
    
    Parameters:
        image_path (str): The path to the image file.
        threshold (int): The threshold value for binary conversion.
    
    Returns:
        max_hull_area (float): The maximum area among all convex hulls.
    """
    
    # Convert to grayscale if the image is colored
    if frame.shape[-1] > 1:
        src_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    else:
        src_gray = frame

    # Blur the image
    src_gray = cv2.blur(src_gray, (3, 3))
    
    # Apply threshold
    ret, threshold_output = cv2.threshold(src_gray, threshold, 255, cv2.THRESH_BINARY)
    
    # Find contours
    contours, _ = cv2.findContours(threshold_output, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    
    # Initialize return values
    max_hull_area = 0.0

    # Check if any contour is detected
    if contours:
        # Find the convex hull object for each contour
        hull = [cv2.convexHull(cnt) for cnt in contours]
        
        # Find the bounding convex hull area for each contour
        hull_area = [cv2.contourArea(h) for h in hull]
        
        # Get the maximum convex hull area
        max_hull_area = max(hull_area)
        
#         # Draw contours and convex hull on the original image (for visualization)
#         drawing = np.zeros((threshold_output.shape[0], threshold_output.shape[1], 3), dtype=np.uint8)
#         for i in range(len(contours)):
#             color = (np.random.randint(0,256), np.random.randint(0,256), np.random.randint(0,256))
#             cv2.drawContours(drawing, contours, i, color)
#             cv2.drawContours(drawing, hull, i, color, 2)
        
#         # Show the output image with contours and convex hull
#         plt.imshow(cv2.cvtColor(drawing, cv2.COLOR_BGR2RGB))
#         plt.title('Contours and Convex Hull')
#         plt.axis('off')
#         plt.show()
        
    return max_hull_area

In [8]:
# Feature extraction for moments
def moment_extract(image, threshold):
    # Initialize moments as zeros
    features = {
        'm00': 0,
        'm10': 0,
        'm01': 0,
        'm20': 0,
        'm11': 0,
        'm02': 0,
        'm30': 0,
        'm21': 0,
        'm12': 0,
        'm03': 0,
        'mu20': 0,
        'mu11': 0,
        'mu02': 0,
        'mu30': 0,
        'mu21': 0,
        'mu12': 0,
        'mu03': 0,
        'nu20': 0,
        'nu11': 0,
        'nu02': 0,
        'nu30': 0,
        'nu21': 0,
        'nu12': 0,
        'nu03': 0,
        'center_x': 0,
        'center_y': 0,
        'contour_area': 0,
        'contour_length': 0
    }
    
    # Convert to grayscale if the image is colored
    if len(image.shape) > 2:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image

    # Thresholding
    _, thresh = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    # Find contours
    contours, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    
    if contours:
        largest_contour = max(contours, key=cv2.contourArea)
        moments = cv2.moments(largest_contour)
        
        # Avoid division by zero
        if moments['m00'] != 0:
            for moment_name, moment_value in moments.items():
                features[moment_name] = moment_value
                
            features['center_x'] = moments['m10'] / moments['m00']
            features['center_y'] = moments['m01'] / moments['m00']
            features['contour_area'] = cv2.contourArea(largest_contour)
            features['contour_length'] = cv2.arcLength(largest_contour, True)
            
    return features

### Extract all visual features

In [9]:
image_directories

['/home/chenlequn/Dataset/LDED_acoustic_visual_monitoring_dataset/segmented_25Hz_buffered/22/images']

In [10]:
def extract_visual_features(image_directories, threshold=100):
    all_features_list = []
    total_images = sum([len(os.listdir(img_dir)) for img_dir in image_directories if os.path.isdir(img_dir)])
    pbar = tqdm(total=total_images, desc="Processing images")

    for img_dir in image_directories:
        if os.path.isdir(img_dir):
            for img_name in os.listdir(img_dir):
                if img_name.lower().endswith(('.png', '.jpg', '.jpeg')):
                    img_path = os.path.join(img_dir, img_name)
                    img = cv2.imread(img_path)
                    
                    features_contour = general_contour_extraction(img, threshold=threshold)
                    max_hull = convex_hull_extract(img, threshold=threshold)
                    features_moments = moment_extract(img, threshold=threshold)
                    
                    # Merge all dictionaries into one
                    merged_features = {'image_name': img_name, **features_contour, 'max_hull': max_hull, **features_moments}
                    all_features_list.append(merged_features)
                    
                    pbar.update(1)
    
    pbar.close()
    return pd.DataFrame(all_features_list)

In [33]:
df_visual = extract_visual_features(image_directories)
df_visual.head()

Processing images: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32149/32149 [02:57<00:00, 180.79it/s]


Unnamed: 0,image_name,max_contour_area,rectangle_angle,rectangle_width,rectangle_height,ellipse_angle,ellipse_width,ellipse_height,max_hull,m00,...,nu11,nu02,nu30,nu21,nu12,nu03,center_x,center_y,contour_area,contour_length
0,sample_21_2038.jpg,231979.0,90.0,479.0,528.0,2.577053,574.159973,818.167725,236628.0,232057.0,...,-0.006111,0.078487,0.000851,-0.002744,-0.000768,0.001435,247.358734,229.456223,232057.0,2326.891477
1,sample_21_2716.jpg,247762.5,90.0,451.0,639.0,81.376549,537.173401,825.702881,260181.0,247724.0,...,-0.007437,0.053936,0.004801,-0.003457,-0.001749,0.001352,309.548025,195.965763,247724.0,2372.04877
2,sample_21_2351.jpg,214554.0,90.0,479.0,501.0,3.983349,522.245422,614.861877,219719.5,214637.0,...,-0.00185,0.081376,0.000582,-0.000139,-0.00097,-4e-05,234.949085,237.435629,214637.0,2272.71485
3,sample_21_1539.jpg,211149.5,90.0,478.999939,483.999939,3.726851,513.294983,621.059265,215107.5,211283.0,...,-0.001874,0.083399,0.000717,-0.00086,-0.001375,0.000327,228.26165,234.260897,211283.0,2079.376759
4,sample_21_4902.jpg,258557.5,0.0,565.999878,478.999939,178.969452,587.517456,886.574585,263607.5,258438.0,...,-0.002776,0.072466,0.00032,-0.000871,-0.000617,0.000196,270.647355,236.184484,258438.0,2342.783832


## Extract Audio Features

In [12]:
def extract_time_domain_features(audio_signal, sample_rate=44100):
    """
    Extract time domain features from an audio signal using Essentia.
    
    Parameters:
    - audio_signal: numpy array, the audio signal from which to extract features
    - sample_rate: int, the sample rate of the audio signal
    
    Returns:
    - features: dict, a dictionary containing the extracted features
    """
    
    features = {}
    
    # RMS Energy
    rms_algo = es.RMS()
    rms_energy = rms_algo(audio_signal)
    features['rms_energy'] = rms_energy
    
    # Amplitude Envelope
    envelope_algo = es.Envelope()
    amplitude_envelope = envelope_algo(audio_signal)
    features['amplitude_envelope_mean'] = amplitude_envelope.mean()
    features['amplitude_envelope_std'] = amplitude_envelope.std()
    
    # Zero Crossing Rate
    zcr_algo = es.ZeroCrossingRate()
    zero_crossing_rate = zcr_algo(audio_signal)
    features['zero_crossing_rate'] = zero_crossing_rate
    
    # Dynamic Complexity and Loudness
    dyn_algo = es.DynamicComplexity()
    dynamic_complexity, loudness = dyn_algo(audio_signal)
    features['dynamic_complexity'] = dynamic_complexity
    features['loudness'] = loudness

    # Loudness Vickers
    loudness_algo = es.LoudnessVickers()
    loudness_vickers = loudness_algo(audio_signal)
    features['loudness_vickers'] = loudness_vickers

    return features

Essentia provides a variety of spectral descriptors that you can use for feature extraction:

1. **Spectral Centroid**: Computes the center of mass of the spectrum.
2. **Spectral Complexity**: Measures the amount of peak-like components in the spectrum.
3. **Spectral Contrast**: Computes the spectral contrast features from an audio signal.
4. **Spectral Decrease**: Computes the decrease of the spectrum.
5. **Spectral Energy**: Computes the energy of the frequency domain signal.
6. **Spectral Energy Band Ratio**: Computes the ratio of energy in specific bands to the total energy.
7. **Spectral Flatness**: Computes the flatness of a spectrum.
8. **Spectral Flux**: Computes the flux of the spectrum.
9. **Spectral Rolloff**: Computes the rolloff frequency of an audio signal.
10. **Spectral Strong Peak**: Computes the strong peak of the spectrum.
12. **Spectral Variance, skewness, kurtosis**: Computes the variance of the spectral peaks.
14. **MFCC (Mel Frequency Cepstral Coefficients)**: Widely used spectral feature in audio and speech processing.


In [13]:
def extract_spectral_features(audio_signal, sample_rate, frame_size=1024, hop_size=512):
    # Initialize the algorithms
    window_algo = Windowing(type='hann')
    spectrum_algo = Spectrum()
    centroid_algo = SpectralCentroidTime(sampleRate=sample_rate)
    complexity_algo = SpectralComplexity(sampleRate=sample_rate)
    contrast_algo = SpectralContrast(frameSize=frame_size, highFrequencyBound=sample_rate/2, lowFrequencyBound=200, sampleRate=sample_rate)
    decrease_algo = Decrease()
    energy_algo = Energy()
    energy_band_ratio_algo = EnergyBandRatio(sampleRate=sample_rate, stopFrequency=7000)
    flatness_algo = FlatnessDB()
    spectral_flux = Flux()
    rolloff_algo = RollOff(sampleRate=sample_rate)
    strong_peak_algo = StrongPeak()
    central_moment_algo = CentralMoments()
    distrubution_shape = DistributionShape()
    spectral_crest_factor = Crest()
    mel_bands_algo = MelBands()
    mfcc_algo = MFCC(highFrequencyBound=sample_rate/2, numberCoefficients=13, sampleRate=sample_rate)
    
    features = {}
    for frame in es.FrameGenerator(audio_signal, frameSize=frame_size, hopSize=hop_size):
        windowed_frame = window_algo(frame)
        spectrum = spectrum_algo(windowed_frame)
        
        features['spectral_centroid'] = centroid_algo(spectrum)
        features['spectral_complexity'] = complexity_algo(spectrum)

        spectral_contrast, spectral_valley = contrast_algo(spectrum)
        # Store spectral contrast and valley values separately
        for i, val in enumerate(spectral_contrast):
            features[f'spectral_contrast_{i}'] = val
        for i, val in enumerate(spectral_valley):
            features[f'spectral_valley_{i}'] = val
            
        features['spectral_decrease'] = decrease_algo(spectrum)
        features['spectral_energy'] = energy_algo(spectrum)
        features['spectral_energy_band_ratio'] = energy_band_ratio_algo(spectrum)
        features['spectral_flatness'] = flatness_algo(spectrum)
        features['spectral_flux'] = spectral_flux(spectrum)
        features['spectral_rolloff'] = rolloff_algo(spectrum)
        features['spectral_strong_peak'] = strong_peak_algo(spectrum)
        central_moments = central_moment_algo(spectrum)
        features['spectral_variance'], features['spectral_skewness'], features['spectral_kurtosis'] = distrubution_shape(central_moments)
        features['spectral_crest_factor'] = spectral_crest_factor(spectrum)
        
        mfcc_bands, mfcc_coeffs = mfcc_algo(spectrum)
        for i, coeff in enumerate(mfcc_coeffs):
            features[f'mfcc_{i}'] = coeff
    
    return features

In [14]:
# Example usage
sample_rate = 44100
audio_signal = np.random.rand(22050).astype(np.float32)  # 0.5 seconds of audio
features = extract_spectral_features(audio_signal, sample_rate)
features

[   INFO   ] TriangularBands: input spectrum size (513) does not correspond to the "inputSize" parameter (1025). Recomputing the filter bank.


{'spectral_centroid': 146.51573181152344,
 'spectral_complexity': 0.0,
 'spectral_contrast_0': -0.9784076,
 'spectral_contrast_1': -0.9550213,
 'spectral_contrast_2': -0.9579363,
 'spectral_contrast_3': -0.98199797,
 'spectral_contrast_4': -0.9356236,
 'spectral_contrast_5': -0.9099213,
 'spectral_valley_0': -8.804347,
 'spectral_valley_1': -9.463403,
 'spectral_valley_2': -10.160862,
 'spectral_valley_3': -10.557283,
 'spectral_valley_4': -10.771971,
 'spectral_valley_5': -11.456935,
 'spectral_decrease': -7.899501360952854e-05,
 'spectral_energy': 1.3164477650207118e-06,
 'spectral_energy_band_ratio': 0.8209258913993835,
 'spectral_flatness': 0.02195845916867256,
 'spectral_flux': 0.8212764859199524,
 'spectral_rolloff': 8828.61328125,
 'spectral_strong_peak': 0.0,
 'spectral_variance': 0.0809355229139328,
 'spectral_skewness': 0.6429382562637329,
 'spectral_kurtosis': -0.8099098205566406,
 'spectral_crest_factor': 5.275919437408447,
 'mfcc_0': -1078.7957,
 'mfcc_1': 86.83628,
 'mfcc

### Extract all audio features

In [15]:
def extract_all_audio_features(audio_directories, frame_size=1024, hop_size=512):
    all_features_list = []
    
    # Count total audio files for progress bar
    total_audio_files = sum([len(os.listdir(audio_dir)) for audio_dir in audio_directories if os.path.isdir(audio_dir)])
    
    pbar = tqdm(total=total_audio_files, desc="Processing audio files")

    for audio_dir in audio_directories:
        if os.path.isdir(audio_dir):
            for audio_name in os.listdir(audio_dir):
                if audio_name.lower().endswith(('.wav', '.flac', '.mp3')):
                    audio_path = os.path.join(audio_dir, audio_name)
                    
                    # Read audio file
                    audio_signal, sample_rate = sf.read(audio_path, dtype='float32')
                    
                    # Extract features
                    time_domain_features = extract_time_domain_features(audio_signal, sample_rate)
                    spectral_features = extract_spectral_features(audio_signal, sample_rate, frame_size, hop_size)
                    
                    # Merge all dictionaries into one
                    merged_features = {'audio_name': audio_name, **time_domain_features, **spectral_features}
                    all_features_list.append(merged_features)
                    
                    pbar.update(1)
    
    pbar.close()
    return pd.DataFrame(all_features_list)


In [16]:
audio_directories

['/home/chenlequn/Dataset/LDED_acoustic_visual_monitoring_dataset/segmented_25Hz_buffered/22/raw_audio']

In [40]:
# audio_path = os.path.join(audio_directories[0], "sample_22_9.wav")
# audio_signal, sample_rate = sf.read(audio_path, dtype='float32')
# print(sample_rate)
# print (len(audio_signal))
# print (len(audio_signal)/sample_rate)
# plt.plot(audio_signal)

In [18]:
time_domain_features = extract_time_domain_features(audio_signal, sample_rate)
time_domain_features

{'rms_energy': 0.03374376893043518,
 'amplitude_envelope_mean': 0.03788913,
 'amplitude_envelope_std': 0.013862286,
 'zero_crossing_rate': 0.07800453156232834,
 'dynamic_complexity': 0.0,
 'loudness': -100.0,
 'loudness_vickers': -37.499725341796875}

In [34]:
audio_features_df = extract_all_audio_features(audio_directories, frame_size=2048, hop_size=512)

Processing audio files: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 32149/32149 [02:45<00:00, 193.76it/s]


## Save extracted features

In [35]:
audio_features_df

Unnamed: 0,audio_name,rms_energy,amplitude_envelope_mean,amplitude_envelope_std,zero_crossing_rate,dynamic_complexity,loudness,loudness_vickers,spectral_centroid,spectral_complexity,...,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12
0,sample_21_1833.wav,0.025188,0.028686,0.010267,0.103175,0.0,-100.0,-39.429451,2119.552734,0.0,...,71.922470,-34.273552,31.318687,2.432793,6.501900,18.845898,-4.195328,7.091858,21.177139,4.567207
1,sample_21_1023.wav,0.028820,0.030098,0.013072,0.128345,0.0,-100.0,-37.971367,1536.433960,1.0,...,61.760509,-27.052120,21.286774,7.355404,-7.647884,26.487389,-0.237617,16.874611,26.828526,5.217075
2,sample_21_2382.wav,0.027742,0.028501,0.013807,0.107483,0.0,-100.0,-38.194988,2001.102539,0.0,...,60.653625,-42.837547,21.599880,2.448238,0.156937,12.066170,3.084717,3.786457,6.362461,-8.522699
3,sample_21_4248.wav,0.034371,0.042144,0.014231,0.158503,0.0,-100.0,-33.375977,1914.181396,0.0,...,65.590942,-63.772644,34.912823,-5.447128,-4.629456,18.778435,-5.988262,8.716049,6.765575,-5.733231
4,sample_21_2127.wav,0.028227,0.029618,0.012488,0.075283,0.0,-100.0,-38.530449,1643.389282,1.0,...,82.495544,-14.387405,33.749207,4.664982,13.819672,15.083054,-2.832371,16.566986,6.316761,-4.962399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32144,sample_26_3313.wav,0.020677,0.025701,0.008086,0.105896,0.0,-100.0,-40.225834,1637.311523,0.0,...,85.804420,-40.780308,31.178024,21.258568,-5.776703,27.944260,-3.502533,6.058319,11.870369,-6.087738
32145,sample_26_10270.wav,0.015368,0.016434,0.005804,0.100680,0.0,-100.0,-37.397079,1883.484375,1.0,...,43.054379,-46.340614,24.456303,1.901657,-13.114761,3.147602,-5.107796,-5.707840,-4.514523,-13.145458
32146,sample_26_3100.wav,0.024893,0.026770,0.010226,0.097732,0.0,-100.0,-39.785934,2041.216064,0.0,...,73.438194,-30.142529,30.548000,19.773670,-7.278645,9.469398,-2.542385,0.128513,10.017262,-4.405937
32147,sample_26_5308.wav,0.016844,0.020840,0.005775,0.140816,0.0,-100.0,-40.912750,1738.634155,0.0,...,72.537567,-40.020996,28.771599,17.327736,-6.821022,32.054031,6.572159,2.466095,-2.497440,-12.773994


In [36]:
df_visual

Unnamed: 0,image_name,max_contour_area,rectangle_angle,rectangle_width,rectangle_height,ellipse_angle,ellipse_width,ellipse_height,max_hull,m00,...,nu11,nu02,nu30,nu21,nu12,nu03,center_x,center_y,contour_area,contour_length
0,sample_21_2038.jpg,231979.0,90.000000,479.000000,528.000000,2.577053,574.159973,818.167725,236628.0,232057.0,...,-0.006111,0.078487,0.000851,-0.002744,-0.000768,0.001435,247.358734,229.456223,232057.0,2326.891477
1,sample_21_2716.jpg,247762.5,90.000000,451.000000,639.000000,81.376549,537.173401,825.702881,260181.0,247724.0,...,-0.007437,0.053936,0.004801,-0.003457,-0.001749,0.001352,309.548025,195.965763,247724.0,2372.048770
2,sample_21_2351.jpg,214554.0,90.000000,479.000000,501.000000,3.983349,522.245422,614.861877,219719.5,214637.0,...,-0.001850,0.081376,0.000582,-0.000139,-0.000970,-0.000040,234.949085,237.435629,214637.0,2272.714850
3,sample_21_1539.jpg,211149.5,90.000000,478.999939,483.999939,3.726851,513.294983,621.059265,215107.5,211283.0,...,-0.001874,0.083399,0.000717,-0.000860,-0.001375,0.000327,228.261650,234.260897,211283.0,2079.376759
4,sample_21_4902.jpg,258557.5,0.000000,565.999878,478.999939,178.969452,587.517456,886.574585,263607.5,258438.0,...,-0.002776,0.072466,0.000320,-0.000871,-0.000617,0.000196,270.647355,236.184484,258438.0,2342.783832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32144,sample_26_2049.jpg,31829.0,78.366371,241.983490,164.808044,162.237259,166.149384,246.016617,33086.5,31828.5,...,0.018856,0.111874,-0.000220,0.000450,0.000942,-0.000186,215.128606,204.293511,31828.5,1015.903670
32145,sample_26_7293.jpg,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
32146,sample_26_5969.jpg,42572.5,4.236394,205.917343,253.823135,174.426727,211.302887,259.703735,43347.5,42558.5,...,0.002776,0.098630,-0.000280,-0.001273,0.000400,0.001620,227.732004,214.190385,42558.5,957.335128
32147,sample_26_10114.jpg,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [37]:
# Combine DataFrames horizontally
df_audiovisual = pd.concat([df_visual, audio_features_df], axis=1)
df_audiovisual

Unnamed: 0,image_name,max_contour_area,rectangle_angle,rectangle_width,rectangle_height,ellipse_angle,ellipse_width,ellipse_height,max_hull,m00,...,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12
0,sample_21_2038.jpg,231979.0,90.000000,479.000000,528.000000,2.577053,574.159973,818.167725,236628.0,232057.0,...,71.922470,-34.273552,31.318687,2.432793,6.501900,18.845898,-4.195328,7.091858,21.177139,4.567207
1,sample_21_2716.jpg,247762.5,90.000000,451.000000,639.000000,81.376549,537.173401,825.702881,260181.0,247724.0,...,61.760509,-27.052120,21.286774,7.355404,-7.647884,26.487389,-0.237617,16.874611,26.828526,5.217075
2,sample_21_2351.jpg,214554.0,90.000000,479.000000,501.000000,3.983349,522.245422,614.861877,219719.5,214637.0,...,60.653625,-42.837547,21.599880,2.448238,0.156937,12.066170,3.084717,3.786457,6.362461,-8.522699
3,sample_21_1539.jpg,211149.5,90.000000,478.999939,483.999939,3.726851,513.294983,621.059265,215107.5,211283.0,...,65.590942,-63.772644,34.912823,-5.447128,-4.629456,18.778435,-5.988262,8.716049,6.765575,-5.733231
4,sample_21_4902.jpg,258557.5,0.000000,565.999878,478.999939,178.969452,587.517456,886.574585,263607.5,258438.0,...,82.495544,-14.387405,33.749207,4.664982,13.819672,15.083054,-2.832371,16.566986,6.316761,-4.962399
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32144,sample_26_2049.jpg,31829.0,78.366371,241.983490,164.808044,162.237259,166.149384,246.016617,33086.5,31828.5,...,85.804420,-40.780308,31.178024,21.258568,-5.776703,27.944260,-3.502533,6.058319,11.870369,-6.087738
32145,sample_26_7293.jpg,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,43.054379,-46.340614,24.456303,1.901657,-13.114761,3.147602,-5.107796,-5.707840,-4.514523,-13.145458
32146,sample_26_5969.jpg,42572.5,4.236394,205.917343,253.823135,174.426727,211.302887,259.703735,43347.5,42558.5,...,73.438194,-30.142529,30.548000,19.773670,-7.278645,9.469398,-2.542385,0.128513,10.017262,-4.405937
32147,sample_26_10114.jpg,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,72.537567,-40.020996,28.771599,17.327736,-6.821022,32.054031,6.572159,2.466095,-2.497440,-12.773994


In [38]:
df_audiovisual.to_hdf(os.path.join(Dataset_path, 'data_audio_visual.h5'), key='df', mode='w')