# Extracting Audio Features using openSMILE

In this notebook, we will demonstrate how to extract audio features from a set of 10 random audio files using openSMILE. OpenSMILE is a popular open-source toolkit for extracting features from audio signals, which is widely used in speech processing and affective computing.

## Why openSMILE?
openSMILE provides a comprehensive set of audio features, including both low-level descriptors (LLDs) such as pitch, energy, and MFCCs, as well as high-level statistical functionals like means and standard deviations of these LLDs. This makes it a powerful tool for various audio analysis tasks.

## Features Extracted
We will use the `ComParE_2016` feature set at the `Functionals` level, which includes features such as:
- **Loudness**
- **MFCCs (Mel-frequency cepstral coefficients)**
- **Pitch and voicing related features**
- **Spectral features**


In [1]:

import os
import random
import pandas as pd
import librosa
import opensmile

# Path to audio files
path_to_audios = 'data/data_final/Audios'

# Collecting all .wav audio files
audios = []
for root, dirs, files in os.walk(path_to_audios):
    for name in files:
        if name.endswith('.wav'):
            audios.append(os.path.join(root, name))

# Select 10 random audio files
random_audios = random.sample(audios, 10)

# Function to read audio
def read_audio(path):
    y, sr = librosa.load(path, sr=44100)
    return y, sr

# Creating a DataFrame to hold audio data and features
df = pd.DataFrame(columns=['audiopath', 'audio_raw', 'sr', 'label'])
df['audiopath'] = random_audios

# Get audio data and sample rate
df[['audio_raw', 'sr']] = df['audiopath'].apply(lambda x: pd.Series(read_audio(x)))
df['label'] = df['audiopath'].apply(lambda x: x.split('/')[3])

# Initialize openSMILE feature extractor
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# Function to extract openSMILE features
def extract_features(audio, sr):
    result = smile.process_signal(audio, sr)
    return result.values.flatten()

# Extract features for each audio file and store in DataFrame
df['features'] = df.apply(lambda row: extract_features(row['audio_raw'], row['sr']), axis=1)

# Display the DataFrame with extracted features
df.head()

Unnamed: 0,audiopath,audio_raw,sr,label,features
0,data/data_final/Audios/Sept/Raw/aeiou/1/Sept_s...,"[-0.0015869141, -0.0016174316, -0.0014038086, ...",44100,Sept,"[1.961677, 0.07965085, 0.37315875, 0.039519638..."
1,data/data_final/Audios/Tonsill/U/3/Tonsill_ses...,"[0.0010375977, 0.0064086914, 0.012207031, 0.02...",44100,Tonsill,"[0.09012765, 0.28735632, 0.0, 0.98391235, 0.99..."
2,data/data_final/Audios/Tonsill/Speech/1/Tonsil...,"[-0.0006713867, -0.00024414062, 9.1552734e-05,...",44100,Tonsill,"[3.3231668, 0.46542978, 0.8689665, 0.26075757,..."
3,data/data_final/Audios/Tonsill/I/1/Tonsill_ses...,"[0.06726074, 0.065460205, 0.06283569, 0.061431...",44100,Tonsill,"[0.21542567, 0.989011, 0.10989011, 0.92784506,..."
4,data/data_final/Audios/Sept/A/1/Sept_ses1_a_00...,"[-0.011810303, -0.010375977, -0.0066833496, -0...",44100,Sept,"[0.31108177, 0.078125, 0.8984375, 0.83871275, ..."


In [2]:
import numpy as np
import math
import scipy

def cpp(x, fs, normOpt, dBScaleOpt): 
    """
    Computes cepstral peak prominence for a given signal 

    Parameters
    -----------
    x: ndarray
        The audio signal
    fs: integer
        The sampling frequency
    normOpt: string
        'line', 'mean' or 'nonorm' for selecting normalisation type
    dBScaleOpt: binary
        True or False for using decibel scale

    Returns
    -----------
    cpp: ndarray
        The CPP with time values 
    """
    # Settings
    frame_length = int(np.round(0.04*fs))
    frame_shift = int(np.round(0.01*fs))
    half_len = int(np.round(frame_length/2))
    x_len = len(x)
    frame_len = half_len*2 + 1
    NFFT = 2**(math.ceil(np.log(frame_len)/np.log(2)))
    quef = np.linspace(0, frame_len/1000, NFFT)  
    
    # Allowed quefrency range
    pitch_range=[60, 333.3]
    quef_lim = [int(np.round(fs/pitch_range[1])), int(np.round(fs/pitch_range[0]))]
    quef_seq = range(quef_lim[0]-1, quef_lim[1])
    
    # Time samples
    time_samples = np.array(range(frame_length+1, x_len-frame_length+1, frame_shift))
    N = len(time_samples)
    frame_start = time_samples-half_len
    frame_stop = time_samples+half_len
    
    # High-pass filtering
    HPfilt_b = [1 - 0.97]
    x = scipy.signal.lfilter( HPfilt_b, 1, x )
    
    # Frame matrix
    frameMat = np.zeros([NFFT, N])
    for n in range(0, N):
        frameMat[0: frame_len, n] = x[frame_start[n]-1:frame_stop[n]]
        
    # Hanning
    def hanning(N):
        x = np.array([i/(N+1) for i in range(1,int(np.ceil(N/2))+1)])
        w = 0.5-0.5*np.cos(2*np.pi*x)
        w_rev = w[::-1]
        return np.concatenate((w, w_rev[int((np.ceil(N%2))):]))
    win = hanning(frame_len)
    winmat = np.tile(win, (N, 1)).T
    frameMat = frameMat[0:frame_len, :]*winmat
    
    # Cepstrum
    SpecMat = np.abs(np.fft.fft(frameMat, axis=0))
    SpecdB = 20*np.log10(SpecMat+1e-6)
    if dBScaleOpt:
        ceps = 20*np.log10(np.abs(np.fft.fft(SpecdB, axis=0)))
    else:
        ceps = 2*np.log(np.abs(np.fft.fft(SpecdB, axis=0)))

    # Finding the peak
    ceps_lim = ceps[quef_seq, :]
    ceps_max = ceps_lim.max(axis=0)
    max_index = ceps_lim.argmax(axis=0)
    
    # Normalisation
    ceps_norm = np.zeros([N])
    if normOpt=='line':
        for n in range(0, N):
            try:
                p = np.polyfit(quef_seq, ceps_lim[:,n],1)
            except:
                p = [0, 0]
            ceps_norm[n] = np.polyval(p, quef_seq[max_index[n]])
    elif normOpt == 'mean':
        ceps_norm = np.mean(ceps_lim)

    cpp = ceps_max-ceps_norm
    
    return cpp, time_samples


# Path to audio files
path_to_audios = 'data/data_final/Audios'

# Collecting all .wav audio files
audios = []
for root, dirs, files in os.walk(path_to_audios):
    for name in files:
        if name.endswith('.wav'):
            audios.append(os.path.join(root, name))
# Function to read audio
def read_audio(path):
    y, sr = librosa.load(path, sr=44100)
    return y, sr

# Creating a DataFrame to hold audio data and features
df = pd.DataFrame(columns=['audiopath', 'audio_raw', 'sr', 'label'])
df['audiopath'] = audios

# Get audio data and sample rate
df[['audio_raw', 'sr']] = df['audiopath'].apply(lambda x: pd.Series(read_audio(x)))
df['label'] = df['audiopath'].apply(lambda x: x.split('/')[3])

cpps = []
# For all audios, calculate the CPP
for i in range(len(df)):
    cpp_values, time_samples = cpp(df['audio_raw'][i], df['sr'][i], 'line', True)
    cpps.append(cpp_values)

# Add the CPP values to the DataFrame
df['cpp'] = cpps



  ceps = 20*np.log10(np.abs(np.fft.fft(SpecdB, axis=0)))


In [7]:
df

Unnamed: 0,audiopath,audio_raw,sr,label,cpp
0,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,"[-0.0046081543, -0.005493164, -0.0065612793, -...",44100,Sept,"[25.2553714618686, 21.42021469435928, 22.32146..."
1,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,"[-0.35198975, -0.33584595, -0.31640625, -0.300...",44100,Sept,"[27.090470823049756, 27.053574475114154, 29.63..."
2,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,"[0.0335083, 0.014190674, 0.005218506, 0.010742...",44100,Sept,"[24.62579234708584, 26.513420481440903, 25.515..."
3,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,"[0.0663147, 0.06072998, 0.052764893, 0.0511474...",44100,Sept,"[26.22460600403754, 25.920807000874106, 28.237..."
4,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,"[0.12411499, 0.12478638, 0.12557983, 0.1268615...",44100,Sept,"[20.160370461480476, 22.694232207101294, 23.14..."
...,...,...,...,...,...
5079,data/data_final/Audios/Fess/Brasero/2/FESS_ses...,"[-0.0008239746, -0.00045776367, -0.0005493164,...",44100,Fess,"[20.840937604195787, 28.86505347841323, 28.342..."
5080,data/data_final/Audios/Fess/Brasero/2/FESS_ses...,"[0.0045166016, 0.004760742, 0.004699707, 0.005...",44100,Fess,"[19.698031499855325, 20.29369598566047, 21.460..."
5081,data/data_final/Audios/Fess/Brasero/2/FESS_ses...,"[-0.0007324219, -0.00033569336, -0.00039672852...",44100,Fess,"[15.797125484006237, 16.65337346347186, 14.264..."
5082,data/data_final/Audios/Fess/Brasero/2/FESS_ses...,"[0.0008239746, 0.00076293945, -0.00061035156, ...",44100,Fess,"[20.111574720102652, 22.755877503907115, 24.08..."


In [8]:
# Remove audio raw and SR and save the DataFrame
df.drop(['audio_raw', 'sr'], axis=1, inplace=True)
df.to_csv('data/data_final/cpp_features.csv', index=False)
