In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
import librosa
import soundfile as sf
import scipy.stats

# Metadata table driving the batch feature extraction further down: each row is
# processed as one audio file, using the row's `path` and `label` fields.
data = pd.read_csv('filtered_data_labeled_cleaned_working_samples.csv')

In [8]:
def extract_features(y, sr):
    """Summarise one audio signal as a flat dictionary of scalar features.

    Parameters
    ----------
    y : np.ndarray
        Audio time series (e.g. the first value returned by ``librosa.load``).
    sr : int or float
        Sampling rate of ``y`` in Hz.

    Returns
    -------
    dict
        Feature name -> scalar, in this insertion order:

        * ``mfcc{k}_mean`` / ``mfcc{k}_std`` for every row ``k`` of the stacked
          MFCC matrix. With ``n_mfcc=13`` rows 1-13 are the static MFCCs,
          14-26 their first-order deltas and 27-39 their second-order deltas.
        * ``spectral_centroid``, ``spectral_bandwidth``, ``spectral_rolloff``,
          ``spectral_flatness`` - frame-wise values averaged over time.
        * ``rms_energy``, ``zcr`` - frame-wise values averaged over time.
        * ``pitch_mean``, ``pitch_std`` - statistics of the ``piptrack``
          frequencies whose magnitude exceeds the median magnitude
          (both ``0.0`` when no bin is selected).
    """
    features = {}

    # ================== 1. MFCCs ==================
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    delta_mfcc = librosa.feature.delta(mfccs)
    delta2_mfcc = librosa.feature.delta(mfccs, order=2)
    all_mfcc = np.vstack([mfccs, delta_mfcc, delta2_mfcc])

    # Per-coefficient mean and standard deviation across frames
    for i, coeff in enumerate(all_mfcc, start=1):
        features[f'mfcc{i}_mean'] = np.mean(coeff)
        features[f'mfcc{i}_std'] = np.std(coeff)

    # ================== 2. Spectral Features ==================
    # Compute the magnitude spectrogram once and hand it to every consumer via
    # `S=`; otherwise each call below would redo the same default STFT of `y`.
    magnitude_spec = np.abs(librosa.stft(y))

    features['spectral_centroid'] = np.mean(
        librosa.feature.spectral_centroid(S=magnitude_spec, sr=sr))
    features['spectral_bandwidth'] = np.mean(
        librosa.feature.spectral_bandwidth(S=magnitude_spec, sr=sr))
    features['spectral_rolloff'] = np.mean(
        librosa.feature.spectral_rolloff(S=magnitude_spec, sr=sr))
    features['spectral_flatness'] = np.mean(
        librosa.feature.spectral_flatness(S=magnitude_spec))

    # ================== 3. Energy & ZCR ==================
    # Both stay on the raw waveform: they are time-domain measurements.
    features['rms_energy'] = np.mean(librosa.feature.rms(y=y))
    features['zcr'] = np.mean(librosa.feature.zero_crossing_rate(y))

    # ================== 4. Pitch ==================
    pitches, magnitudes = librosa.piptrack(S=magnitude_spec, sr=sr)
    # NOTE(review): piptrack is documented to return 0 for non-peak bins, so
    # this median is presumably 0 for most signals and the mask then keeps
    # every detected peak - confirm this is the intended "pitch" definition.
    pitches = pitches[magnitudes > np.median(magnitudes)]
    if pitches.size > 0:
        features['pitch_mean'] = np.mean(pitches)
        features['pitch_std'] = np.std(pitches)
    else:
        # Float fallbacks keep the column dtype consistent with the normal path
        features['pitch_mean'] = 0.0
        features['pitch_std'] = 0.0

    return features


# Smoke test: run the extractor on a single clip and print the resulting
# one-row feature table.
example_path = "trimmed_padded/common_voice_en_1463.wav"
y, sr = librosa.load(example_path, sr=None)  # sr=None: keep the file's native sampling rate
features = extract_features(y, sr)

features_df = pd.DataFrame(data=[features])
preview = features_df.head()
print(preview)


   mfcc1_mean  mfcc1_std  mfcc2_mean  mfcc2_std  mfcc3_mean  mfcc3_std  \
0 -404.964752   87.18145  132.553467  75.476418   27.972475  47.279434   

   mfcc4_mean  mfcc4_std  mfcc5_mean  mfcc5_std  ...  mfcc39_mean  mfcc39_std  \
0   -1.643079  35.354286    5.687986  24.201862  ...     0.017239    0.839563   

   spectral_centroid  spectral_bandwidth  spectral_rolloff  spectral_flatness  \
0        1316.820369         1389.683462        2150.14129           0.030038   

   rms_energy       zcr   pitch_mean  pitch_std  
0    0.070999  0.046292  1147.252808  878.04718  

[1 rows x 86 columns]


In [10]:
from joblib import Parallel, delayed
from tqdm import tqdm



def process_file(file_path, class_label=None):
    """Build the feature record for a single audio file.

    The file is loaded with ``librosa.load(..., sr=None)``, passed through
    ``extract_features``, and the resulting dictionary is extended in place
    with bookkeeping fields.

    Parameters
    ----------
    file_path : str
        Path of the audio file to load.
    class_label : optional
        Label stored under ``'class_label'``; the key is omitted when the
        value is ``None``.

    Returns
    -------
    dict
        The extracted features, followed by ``'file_name'`` (base name of
        ``file_path``) and, when a label was given, ``'class_label'``.
    """
    signal, sample_rate = librosa.load(file_path, sr=None)
    record = extract_features(signal, sample_rate)

    # Only the base name is kept, not the directory part of the path
    record.update(file_name=os.path.basename(file_path))

    # Unlabelled file: nothing more to attach
    if class_label is None:
        return record

    record['class_label'] = class_label
    return record

# Directory that every `path` entry of the metadata table is resolved against
audio_dir = 'trimmed_padded'

# Materialise the rows up front so the progress bar knows the total count
rows = list(data.itertuples(index=False))

# One lazy job per row; the bar advances as joblib pulls jobs from the generator
jobs = (
    delayed(process_file)(os.path.join(audio_dir, row.path), row.label)
    for row in tqdm(rows)
)

# n_jobs=-1: let joblib use all available CPUs
features_list = Parallel(n_jobs=-1)(jobs)

# Assemble one row per processed file and persist the table to disk
features_df = pd.DataFrame(features_list)
features_df.to_csv('audio_features.csv', index=False)


  0%|          | 0/39178 [00:00<?, ?it/s]

100%|██████████| 39178/39178 [10:12<00:00, 63.96it/s]
