# Extracting Audio Features using openSMILE

In this notebook, we will demonstrate how to extract audio features from a set of 10 random audio files using openSMILE. OpenSMILE is a popular open-source toolkit for extracting features from audio signals, which is widely used in speech processing and affective computing.

## Why openSMILE?
openSMILE provides a comprehensive set of audio features, including both low-level descriptors (LLDs) such as pitch, energy, and MFCCs, as well as high-level statistical functionals like means and standard deviations of these LLDs. This makes it a powerful tool for various audio analysis tasks.

## Features Extracted
We will use the `ComParE_2016` feature set at the `Functionals` level, which includes features such as:
- **Loudness**
- **MFCCs (Mel-frequency cepstral coefficients)**
- **Pitch and voicing related features**
- **Spectral features**


In [2]:

import os
import random
import pandas as pd
import librosa
import opensmile

# Path to audio files
path_to_audios = 'data/version_to_zenodo/audios'

# Collecting all .wav audio files
audios = []
for root, dirs, files in os.walk(path_to_audios):
    for name in files:
        if name.endswith('.wav'):
            audios.append(os.path.join(root, name))

# Select 10 random audio files
random_audios = random.sample(audios, 10)

# Function to read audio
def read_audio(path):
    y, sr = librosa.load(path, sr=44100)
    return y, sr

# Creating a DataFrame to hold audio data and features
df = pd.DataFrame(columns=['audiopath', 'audio_raw', 'sr', 'label'])
df['audiopath'] = random_audios

# Get audio data and sample rate
df[['audio_raw', 'sr']] = df['audiopath'].apply(lambda x: pd.Series(read_audio(x)))
df['label'] = df['audiopath'].apply(lambda x: x.split('/')[3])

# Initialize openSMILE feature extractor
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# Function to extract openSMILE features
def extract_features(audio, sr):
    result = smile.process_signal(audio, sr)
    return result.values.flatten()

# Extract features for each audio file and store in DataFrame
df['features'] = df.apply(lambda row: extract_features(row['audio_raw'], row['sr']), axis=1)

# Display the DataFrame with extracted features
df.head()

Unnamed: 0,audiopath,audio_raw,sr,label,features
0,data/version_to_zenodo/audios/PD_ACAMPADA_0117...,"[0.001373291, 0.0013427734, 0.0014343262, 0.00...",44100,PD_ACAMPADA_0117.wav,"[0.6541857, 0.33992094, 0.09486166, 0.11978859..."
1,data/version_to_zenodo/audios/HC_E2_0132.wav,"[0.110565186, 0.076049805, 0.048980713, 0.0186...",44100,HC_E2_0132.wav,"[0.44526064, 0.20382166, 0.9745223, 1.9924592,..."
2,data/version_to_zenodo/audios/PD_GANGA_0037.wav,"[0.00012207031, -0.00018310547, -6.1035156e-05...",44100,PD_GANGA_0037.wav,"[0.7941038, 0.37387386, 0.036036037, 0.1455623..."
3,data/version_to_zenodo/audios/HC_BARBAS_0076.wav,"[-0.0009460449, -0.0006713867, -0.0005493164, ...",44100,HC_BARBAS_0076.wav,"[1.7345581, 0.52839506, 0.64444447, 0.3214539,..."
4,data/version_to_zenodo/audios/PD_BURRO_0025.wav,"[0.00015258789, -3.0517578e-05, 0.00012207031,...",44100,PD_BURRO_0025.wav,"[0.53769004, 0.29515418, 0.0, 0.18032843, 0.26..."


In [5]:
import numpy as np
import math
import scipy

def cpp(x, fs, normOpt, dBScaleOpt): 
    """
    Computes cepstral peak prominence for a given signal 

    Parameters
    -----------
    x: ndarray
        The audio signal
    fs: integer
        The sampling frequency
    normOpt: string
        'line', 'mean' or 'nonorm' for selecting normalisation type
    dBScaleOpt: binary
        True or False for using decibel scale

    Returns
    -----------
    cpp: ndarray
        The CPP with time values 
    """
    # Settings
    frame_length = int(np.round(0.04*fs))
    frame_shift = int(np.round(0.01*fs))
    half_len = int(np.round(frame_length/2))
    x_len = len(x)
    frame_len = half_len*2 + 1
    NFFT = 2**(math.ceil(np.log(frame_len)/np.log(2)))
    quef = np.linspace(0, frame_len/1000, NFFT)  

    # Allowed quefrency range
    pitch_range=[60, 333.3]
    quef_lim = [int(np.round(fs/pitch_range[1])), int(np.round(fs/pitch_range[0]))]
    quef_seq = range(quef_lim[0]-1, quef_lim[1])

    # Time samples
    time_samples = np.array(range(frame_length+1, x_len-frame_length+1, frame_shift))
    N = len(time_samples)
    frame_start = time_samples-half_len
    frame_stop = time_samples+half_len

    # High-pass filtering
    HPfilt_b = [1 - 0.97]
    x = scipy.signal.lfilter( HPfilt_b, 1, x )

    # Frame matrix
    frameMat = np.zeros([NFFT, N])
    for n in range(0, N):
        frameMat[0: frame_len, n] = x[frame_start[n]-1:frame_stop[n]]

    # Hanning
    def hanning(N):
        x = np.array([i/(N+1) for i in range(1,int(np.ceil(N/2))+1)])
        w = 0.5-0.5*np.cos(2*np.pi*x)
        w_rev = w[::-1]
        return np.concatenate((w, w_rev[int((np.ceil(N%2))):]))
    win = hanning(frame_len)
    winmat = np.tile(win, (N, 1)).T
    frameMat = frameMat[0:frame_len, :]*winmat

    # Cepstrum
    SpecMat = np.abs(np.fft.fft(frameMat, axis=0))
    SpecdB = 20*np.log10(SpecMat+1e-6)
    if dBScaleOpt:
        ceps = 20*np.log10(np.abs(np.fft.fft(SpecdB, axis=0)))
    else:
        ceps = 2*np.log(np.abs(np.fft.fft(SpecdB, axis=0)))

    # Finding the peak
    ceps_lim = ceps[quef_seq, :]
    ceps_max = ceps_lim.max(axis=0)
    max_index = ceps_lim.argmax(axis=0)

    # Normalisation
    ceps_norm = np.zeros([N])
    if normOpt=='line':
        for n in range(0, N):
            try:
                p = np.polyfit(quef_seq, ceps_lim[:,n],1)
            except:
                p = [0, 0]
            ceps_norm[n] = np.polyval(p, quef_seq[max_index[n]])
    elif normOpt == 'mean':
        ceps_norm = np.mean(ceps_lim)

    cpp = ceps_max-ceps_norm

    return cpp, time_samples


# Path to audio files
path_to_audios = "data/version_to_zenodo/audios"

# Collecting all .wav audio files
audios = []
for root, dirs, files in os.walk(path_to_audios):
    for name in files:
        if name.endswith('.wav'):
            audios.append(os.path.join(root, name))
# Function to read audio
def read_audio(path):
    y, sr = librosa.load(path, sr=44100)
    return y, sr

# Creating a DataFrame to hold audio data and features
df = pd.DataFrame(columns=['audiopath', 'audio_raw', 'sr', 'label'])
df['audiopath'] = audios

# Get audio data and sample rate
df[['audio_raw', 'sr']] = df['audiopath'].apply(lambda x: pd.Series(read_audio(x)))
df['label'] = df['audiopath'].apply(lambda x: x.split('/')[3])

cpps = []
# For all audios, calculate the CPP
for i in range(len(df)):
    cpp_values, time_samples = cpp(df['audio_raw'][i], df['sr'][i], 'line', True)
    cpps.append(cpp_values)

# Add the CPP values to the DataFrame
df['cpp'] = cpps

In [65]:
# Remove audio raw and SR and save the DataFrame
df.drop(['audio_raw', 'sr'], axis=1, inplace=True)
df.to_csv('data/data_final/cpp_features.csv', index=False)


In [6]:
df

Unnamed: 0,audiopath,audio_raw,sr,label,cpp
0,data/version_to_zenodo/audios/PD_U1_0010.wav,"[0.0020751953, 0.0064697266, 0.0113220215, 0.0...",44100,PD_U1_0010.wav,"[23.027036296421826, 23.322980608487313, 23.47..."
1,data/version_to_zenodo/audios/PD_TOMAS_0043.wav,"[-0.00079345703, -0.0010681152, -0.0014038086,...",44100,PD_TOMAS_0043.wav,"[19.45812199002141, 12.820219162185822, 14.362..."
2,data/version_to_zenodo/audios/PD_E1_0037.wav,"[-0.00091552734, 0.0005493164, 0.002319336, 0....",44100,PD_E1_0037.wav,"[26.27815698970432, 25.644774524525445, 26.542..."
3,data/version_to_zenodo/audios/HC_SOMBRA_0130.wav,"[0.0009460449, 0.0014038086, 0.00064086914, 0....",44100,HC_SOMBRA_0130.wav,"[21.198842354479638, 21.102515694756327, 22.38..."
4,data/version_to_zenodo/audios/HC_U2_0140.wav,"[0.00048828125, 0.0024719238, 0.0039367676, 0....",44100,HC_U2_0140.wav,"[18.5074305428058, 18.717815655848163, 19.8830..."
...,...,...,...,...,...
2898,data/version_to_zenodo/audios/HC_PETACA_BLANCA...,"[0.00064086914, 0.00045776367, 0.0007324219, 0...",44100,HC_PETACA_BLANCA_0075.wav,"[16.89974285416281, 15.562563149916492, 15.735..."
2899,data/version_to_zenodo/audios/PD_PETACA_BLANCA...,"[-0.0009765625, -0.00088500977, -0.00091552734...",44100,PD_PETACA_BLANCA_0013.wav,"[17.746304779348733, 18.53178094221432, 17.094..."
2900,data/version_to_zenodo/audios/HC_CALLE_0034.wav,"[0.00021362305, -0.00018310547, -0.00015258789...",44100,HC_CALLE_0034.wav,"[14.689498440670562, 14.038039123209948, 15.62..."
2901,data/version_to_zenodo/audios/PD_I3_0019.wav,"[0.00045776367, 0.0014038086, 0.0018920898, 0....",44100,PD_I3_0019.wav,"[13.263694452468108, 17.010913615187818, 16.19..."


In [16]:
# Read old audiofeatures ses1, ses2 and ses3
df1 = pd.read_csv("data/version_to_zenodo/audio_features/audio_features.csv")

# Define an ID column that is the filename of "AudioPath"
df1['ID'] = df1['AudioPath'].apply(lambda x: x.split('/')[-1])

# Rename df["label"] to df["ID"]
df.rename(columns={"label": "ID"}, inplace=True)



In [21]:
# Merge df["cpp"] with df1 using ID
df2 = pd.merge(df1, df[["ID", "cpp"]], on="ID")

# Remove ID
df2.drop("ID", axis=1, inplace=True)



In [22]:
df2

Unnamed: 0,AudioPath,JITA,rJitter,RAP,rPPQ,rSPPQ,ShimmerDb,Shimmer,rAPQ,rSAPQ,FTRI,ATRI,FFTR,FATR,Nne,Hnr,CHNR,GNE,cpp
0,../data/audios/HC_A1_0034.wav,90.143459,1.389974,0.829234,0.867956,2.028272,0.306230,3.467983,2.583073,3.371838,4.090909,10.279720,0.031869,2.439872,-14.178498,-1.678300e+01,-1.478172e+01,-10.849439,"[20.685918084425715, 19.517920633888885, 22.63..."
1,../data/audios/HC_A1_0036.wav,45.845768,1.053877,0.648728,0.618244,0.710986,0.274391,3.138128,2.258142,3.131371,5.646607,10.022727,0.006287,0.815829,-18.391510,-1.338407e+01,-1.580306e+01,-11.950162,"[23.790506357480105, 23.237050450099233, 24.50..."
2,../data/audios/HC_A1_0045.wav,388.205755,3.534064,2.127018,1.955301,1.834815,0.902313,10.319354,6.424677,6.655715,9.111570,10.022727,0.020949,0.531205,-4.170562,-3.134168e+00,-2.363848e+00,-2.473592,"[16.72090753093498, 19.916394464637456, 19.644..."
3,../data/audios/HC_A1_0048.wav,38.528686,0.473524,0.284452,0.319551,0.516492,0.396074,4.519425,3.148559,5.212512,10.279720,10.279720,0.003805,0.762209,-15.074605,-1.091469e+01,-1.178872e+01,-16.150904,"[25.19213527734371, 20.43461294316871, 22.1890..."
4,../data/audios/HC_A1_0049.wav,35.880701,0.702138,0.433132,0.436095,0.727694,0.320643,3.663877,2.811715,5.060773,4.310850,10.022727,0.004411,0.401193,-20.660361,-3.232008e+01,-2.765997e+01,-29.763608,"[25.842983041306503, 25.872582022891464, 24.73..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1030,../data/audios/PD_U2_0111.wav,109.332173,1.477746,0.887074,0.941182,1.235602,0.535313,6.022998,3.895453,5.860139,10.279720,10.279720,0.063444,2.409262,-5.897461,-4.622049e+00,-4.125568e+00,-5.223017,"[20.97013841805441, 19.283365782086356, 17.929..."
1031,../data/audios/PD_U2_0113.wav,31.155713,0.589542,0.359416,0.360868,0.464276,0.307726,3.500630,2.330333,3.735762,10.279720,10.022727,0.007329,1.219577,-8.009848,-4.673027e+00,-4.268354e+00,-6.450324,"[24.119465209533374, 22.274671070771213, 21.78..."
1032,../data/audios/PD_U2_0115.wav,52.640104,0.905554,0.560517,0.548817,0.642155,0.180440,2.082157,1.523851,2.402717,10.022727,10.279720,0.006688,1.450736,-6.895147,-3.846118e+00,-4.663456e+00,-3.535668,"[26.2568797916593, 26.23868666131301, 24.93033..."
1033,../data/audios/PD_U2_0117.wav,48.523727,1.031747,0.627293,0.619180,0.990895,0.269947,3.022930,2.091094,2.726138,6.795069,10.022727,0.026779,2.255713,-4.343221,-5.883512e+00,-4.810677e+00,-5.726198,"[22.574497292086726, 22.382328196248658, 20.29..."


In [25]:
# Save new audio_Features.csv
df2.to_csv("data/version_to_zenodo/audio_features/audio_features.csv", index=False)