In [3]:
import os
import random
import pandas as pd
import librosa
import opensmile

# Root directory that holds the recordings; it is searched recursively.
path_to_audios = 'data/data_final/Audios'

# Sampling configuration. A fixed seed makes "Restart Kernel -> Run All"
# select the same files every time.
N_RANDOM_AUDIOS = 10
RANDOM_SEED = 42

# Collect every .wav file below the root. os.walk yields entries in an
# OS-dependent order, so the list is sorted to keep the seeded sample stable.
audios = sorted(
    os.path.join(root, name)
    for root, _dirs, files in os.walk(path_to_audios)
    for name in files
    if name.endswith('.wav')
)

# random.sample raises ValueError when asked for more items than exist, so the
# sample size is capped at the number of files actually found.
if len(audios) < N_RANDOM_AUDIOS:
    print(
        f"Only {len(audios)} .wav file(s) found under {path_to_audios!r}; "
        "sampling all of them."
    )
random_audios = random.Random(RANDOM_SEED).sample(
    audios, min(N_RANDOM_AUDIOS, len(audios))
)

def read_audio(path, sr=44100):
    """Load one audio file with librosa.

    Args:
        path: Path of the audio file to load.
        sr: Sampling rate forwarded to ``librosa.load``. Defaults to 44100,
            the rate this notebook has always used, so existing one-argument
            calls behave as before.

    Returns:
        The ``(y, sr)`` pair exactly as returned by ``librosa.load``: the
        loaded samples and the sampling rate that goes with them.
    """
    y, sr = librosa.load(path, sr=sr)
    return y, sr

# Load every sampled file once; each entry is the (samples, sample_rate) pair
# returned by read_audio.
loaded_audios = [read_audio(audio_path) for audio_path in random_audios]

# The label is the first directory below the audio root
# (e.g. <root>/Tonsill/U/3/file.wav -> 'Tonsill'). Deriving it from the path
# relative to path_to_audios keeps it correct if the root moves to a different
# depth, and splitting on os.sep matches the separator os.path.join used when
# the paths were built.
labels = [
    os.path.relpath(audio_path, path_to_audios).split(os.sep)[0]
    for audio_path in random_audios
]

# Build the DataFrame in one step from plain lists instead of fanning out a
# pd.Series per row through .apply.
df = pd.DataFrame({
    'audiopath': random_audios,
    'audio_raw': [samples for samples, _rate in loaded_audios],
    'sr': [rate for _samples, rate in loaded_audios],
    'label': labels,
})

# Configure the openSMILE extractor: ComParE 2016 feature set at the
# functionals level (the shape check in a later cell reports 6373 values
# per recording).
smile_config = {
    'feature_set': opensmile.FeatureSet.ComParE_2016,
    'feature_level': opensmile.FeatureLevel.Functionals,
}
smile = opensmile.Smile(**smile_config)

def extract_features(audio, sr):
    """Run the module-level openSMILE extractor on one signal.

    Args:
        audio: Signal samples handed to ``smile.process_signal``.
        sr: Sampling rate of ``audio``.

    Returns:
        The values produced by the extractor, flattened to one dimension.
    """
    functionals = smile.process_signal(audio, sr)
    feature_matrix = functionals.to_numpy()
    return feature_matrix.flatten()

# Extract one openSMILE feature vector per recording. Zipping the two columns
# avoids the per-row Series that df.apply(axis=1) builds and also works when
# the DataFrame has no rows.
df['features'] = [
    extract_features(audio, sr)
    for audio, sr in zip(df['audio_raw'], df['sr'])
]

# Show the first rows; a bare last expression gets the notebook's rich display.
df.head()

# Optional export. NOTE: to_csv writes the array-valued columns (audio_raw,
# features) as NumPy's string repr, which abbreviates long arrays with '...'.
# Expand the features into separate columns or use a binary format if the
# values must round-trip.
# df.to_csv('audio_features.csv', index=False)


                                           audiopath  \
0  data/data_final/Audios/Tonsill/U/3/Tonsill_ses...   
1  data/data_final/Audios/Contr/A3/1/Contr_ses1_a...   
2  data/data_final/Audios/Sept/Brasero/2/Sept_ses...   
3  data/data_final/Audios/Sept/Brasero/1/Sept_ses...   
4  data/data_final/Audios/Sept/Dia/2/Sept_ses2_di...   

                                           audio_raw     sr    label  \
0  [-0.10549927, -0.10662842, -0.111968994, -0.11...  44100  Tonsill   
1  [-0.00045776367, -0.004211426, -0.0067749023, ...  44100    Contr   
2  [-0.0008239746, -0.00088500977, -0.0008239746,...  44100     Sept   
3  [0.0026855469, 0.003540039, 0.0024719238, 0.00...  44100     Sept   
4  [0.00024414062, 0.0013122559, 0.0012512207, 0....  44100     Sept   

                                            features  
0  [0.46012223, 0.0, 0.7375, 1.2959145, 1.405374,...  
1  [0.26694196, 0.03468208, 0.9653179, 0.6284402,...  
2  [3.3922262, 0.12658228, 0.4841772, 0.77032125,...  
3  [2.6667

In [5]:
# Length of the feature vector stored for the row labelled 0.
df.loc[0, 'features'].shape

(6373,)