# Extracting Audio Features using openSMILE

In this notebook, we will demonstrate how to extract audio features from a set of 10 random audio files using openSMILE. OpenSMILE is a popular open-source toolkit for extracting features from audio signals, which is widely used in speech processing and affective computing.

## Why openSMILE?
openSMILE provides a comprehensive set of audio features, including both low-level descriptors (LLDs) such as pitch, energy, and MFCCs, as well as high-level statistical functionals like means and standard deviations of these LLDs. This makes it a powerful tool for various audio analysis tasks.

## Features Extracted
We will use the `ComParE_2016` feature set at the `Functionals` level, which includes features such as:
- **Loudness**
- **MFCCs (Mel-frequency cepstral coefficients)**
- **Pitch and voicing related features**
- **Spectral features**


In [62]:

import os
import random
import pandas as pd
import librosa
import opensmile

# Path to audio files
path_to_audios = 'data/data_final/Audios'

# Collecting all .wav audio files
audios = []
for root, dirs, files in os.walk(path_to_audios):
    for name in files:
        if name.endswith('.wav'):
            audios.append(os.path.join(root, name))

# Select 10 random audio files
random_audios = random.sample(audios, 10)

# Function to read audio
def read_audio(path):
    y, sr = librosa.load(path, sr=44100)
    return y, sr

# Creating a DataFrame to hold audio data and features
df = pd.DataFrame(columns=['audiopath', 'audio_raw', 'sr', 'label'])
df['audiopath'] = random_audios

# Get audio data and sample rate
df[['audio_raw', 'sr']] = df['audiopath'].apply(lambda x: pd.Series(read_audio(x)))
df['label'] = df['audiopath'].apply(lambda x: x.split('/')[3])

# Initialize openSMILE feature extractor
smile = opensmile.Smile(
    feature_set=opensmile.FeatureSet.ComParE_2016,
    feature_level=opensmile.FeatureLevel.Functionals,
)

# Function to extract openSMILE features
def extract_features(audio, sr):
    result = smile.process_signal(audio, sr)
    return result.values.flatten()

# Extract features for each audio file and store in DataFrame
df['features'] = df.apply(lambda row: extract_features(row['audio_raw'], row['sr']), axis=1)

# Display the DataFrame with extracted features
df.head()

Unnamed: 0,audiopath,audio_raw,sr,label,features
0,data/data_final/Audios/Sept/A1/3/Sept_ses3_a1_...,"[-0.15716553, -0.13970947, -0.11477661, -0.074...",44100,Sept,"[1.9176102, 0.0, 0.9665428, 0.94963866, 1.0402..."
1,data/data_final/Audios/Fess/Mesa/3/FESS_ses3_m...,"[0.0044555664, 0.003692627, 0.0028686523, 0.00...",44100,Fess,"[1.6549885, 0.044025157, 0.8176101, 0.5119717,..."
2,data/data_final/Audios/Tonsill/Dia/2/Tonsill_s...,"[-0.0039367676, -0.004119873, -0.0056762695, -...",44100,Tonsill,"[0.6323694, 0.27480915, 0.16793893, 0.38490897..."
3,data/data_final/Audios/Contr/U/3/Contr_ses3_u_...,"[0.087768555, 0.08380127, 0.0809021, 0.0773620...",44100,Contr,"[0.19501221, 0.3898305, 0.91525424, 0.8437975,..."
4,data/data_final/Audios/Tonsill/Agua/1/Tonsill_...,"[-0.00030517578, 0.00012207031, 0.00021362305,...",44100,Tonsill,"[2.6028814, 0.15350877, 0.6622807, 0.48009247,..."


In [64]:
import numpy as np
import math
import scipy

def cpp(x, fs, normOpt, dBScaleOpt): 
    """
    Computes cepstral peak prominence for a given signal 

    Parameters
    -----------
    x: ndarray
        The audio signal
    fs: integer
        The sampling frequency
    normOpt: string
        'line', 'mean' or 'nonorm' for selecting normalisation type
    dBScaleOpt: binary
        True or False for using decibel scale

    Returns
    -----------
    cpp: ndarray
        The CPP with time values 
    """
    # Settings
    frame_length = int(np.round(0.04*fs))
    frame_shift = int(np.round(0.01*fs))
    half_len = int(np.round(frame_length/2))
    x_len = len(x)
    frame_len = half_len*2 + 1
    NFFT = 2**(math.ceil(np.log(frame_len)/np.log(2)))
    quef = np.linspace(0, frame_len/1000, NFFT)  
    
    # Allowed quefrency range
    pitch_range=[60, 333.3]
    quef_lim = [int(np.round(fs/pitch_range[1])), int(np.round(fs/pitch_range[0]))]
    quef_seq = range(quef_lim[0]-1, quef_lim[1])
    
    # Time samples
    time_samples = np.array(range(frame_length+1, x_len-frame_length+1, frame_shift))
    N = len(time_samples)
    frame_start = time_samples-half_len
    frame_stop = time_samples+half_len
    
    # High-pass filtering
    HPfilt_b = [1 - 0.97]
    x = scipy.signal.lfilter( HPfilt_b, 1, x )
    
    # Frame matrix
    frameMat = np.zeros([NFFT, N])
    for n in range(0, N):
        frameMat[0: frame_len, n] = x[frame_start[n]-1:frame_stop[n]]
        
    # Hanning
    def hanning(N):
        x = np.array([i/(N+1) for i in range(1,int(np.ceil(N/2))+1)])
        w = 0.5-0.5*np.cos(2*np.pi*x)
        w_rev = w[::-1]
        return np.concatenate((w, w_rev[int((np.ceil(N%2))):]))
    win = hanning(frame_len)
    winmat = np.tile(win, (N, 1)).T
    frameMat = frameMat[0:frame_len, :]*winmat
    
    # Cepstrum
    SpecMat = np.abs(np.fft.fft(frameMat, axis=0))
    SpecdB = 20*np.log10(SpecMat+1e-6)
    if dBScaleOpt:
        ceps = 20*np.log10(np.abs(np.fft.fft(SpecdB, axis=0)))
    else:
        ceps = 2*np.log(np.abs(np.fft.fft(SpecdB, axis=0)))

    # Finding the peak
    ceps_lim = ceps[quef_seq, :]
    ceps_max = ceps_lim.max(axis=0)
    max_index = ceps_lim.argmax(axis=0)
    
    # Normalisation
    ceps_norm = np.zeros([N])
    if normOpt=='line':
        for n in range(0, N):
            try:
                p = np.polyfit(quef_seq, ceps_lim[:,n],1)
            except:
                p = [0, 0]
            ceps_norm[n] = np.polyval(p, quef_seq[max_index[n]])
    elif normOpt == 'mean':
        ceps_norm = np.mean(ceps_lim)

    cpp = ceps_max-ceps_norm
    
    return cpp, time_samples


# Path to audio files
path_to_audios = 'data/data_final/Audios'

# Collecting all .wav audio files
audios = []
for root, dirs, files in os.walk(path_to_audios):
    for name in files:
        if name.endswith('.wav'):
            audios.append(os.path.join(root, name))
# Function to read audio
def read_audio(path):
    y, sr = librosa.load(path, sr=44100)
    return y, sr

# Creating a DataFrame to hold audio data and features
df = pd.DataFrame(columns=['audiopath', 'audio_raw', 'sr', 'label'])
df['audiopath'] = audios

# Get audio data and sample rate
df[['audio_raw', 'sr']] = df['audiopath'].apply(lambda x: pd.Series(read_audio(x)))
df['label'] = df['audiopath'].apply(lambda x: x.split('/')[3])

# cpps = []
# # For all audios, calculate the CPP
# for i in range(len(df)):
#     cpp_values, time_samples = cpp(df['audio_raw'][i], df['sr'][i], 'line', True)
#     cpps.append(cpp_values)

# Add the CPP values to the DataFrame
df['cpp'] = cpps



In [65]:
# Remove audio raw and SR and save the DataFrame
df.drop(['audio_raw', 'sr'], axis=1, inplace=True)
df.to_csv('data/data_final/cpp_features.csv', index=False)


In [68]:
df

Unnamed: 0,audiopath,label,cpp,GROUP,AUDIO_MATERIAL,SESSION,PATIENT_ID
0,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,Sept,"[25.2553714618686, 21.42021469435928, 22.32146...",Sept,a,ses3,91
1,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,Sept,"[27.090470823049756, 27.053574475114154, 29.63...",Sept,a,ses3,5
2,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,Sept,"[24.62579234708584, 26.513420481440903, 25.515...",Sept,a,ses3,90
3,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,Sept,"[26.22460600403754, 25.920807000874106, 28.237...",Sept,a,ses3,23
4,data/data_final/Audios/Sept/A/3/Sept_ses3_a_00...,Sept,"[20.160370461480476, 22.694232207101294, 23.14...",Sept,a,ses3,64
...,...,...,...,...,...,...,...
5079,data/data_final/Audios/Fess/Brasero/2/FESS_ses...,Fess,"[20.840937604195787, 28.86505347841323, 28.342...",FESS,brasero,ses2,118
5080,data/data_final/Audios/Fess/Brasero/2/FESS_ses...,Fess,"[19.698031499855325, 20.29369598566047, 21.460...",FESS,brasero,ses2,30
5081,data/data_final/Audios/Fess/Brasero/2/FESS_ses...,Fess,"[15.797125484006237, 16.65337346347186, 14.264...",FESS,brasero,ses2,86
5082,data/data_final/Audios/Fess/Brasero/2/FESS_ses...,Fess,"[20.111574720102652, 22.755877503907115, 24.08...",FESS,brasero,ses2,124


In [67]:
 # Remove nan values
df = df.dropna()

# Get group
df["audiopath"].iloc[0].split("/")[-1].split("_")[0]
# Get audio_material
df["audiopath"].iloc[0].split("/")[-1].split("_")[2]

# Get group and audo_material for all audios and sotre in new columns named "GROUP" and "AUDIO_MATERIAL"
for i in range(len(df)):
    group = df["audiopath"].iloc[i].split("/")[-1].split("_")[0]
    session = df["audiopath"].iloc[i].split("/")[-1].split("_")[1]
    patient_id = int(df["audiopath"].iloc[i].split("/")[-1].split("_")[-1].split(".")[0])
    try:
        audios_material = df["audiopath"].iloc[i].split("/")[-1].split("_")[2]
    except:
        audios_material = "unknown"
    df.loc[i, "GROUP"] = group
    df.loc[i, "AUDIO_MATERIAL"] = audios_material
    df.loc[i, "SESSION"] = session
    df.loc[i, "PATIENT_ID"] = patient_id

# Remove nan values
df = df.dropna()
df["PATIENT_ID"] = df["PATIENT_ID"].astype(int)


  df.loc[i, "GROUP"] = group
  df.loc[i, "AUDIO_MATERIAL"] = audios_material
  df.loc[i, "SESSION"] = session


In [76]:
# Read old audiofeatures ses1, ses2 and ses3
df1 = pd.read_csv("data/data_final/Audio_Features/audiofeatures_Ses1.csv")
df2 = pd.read_csv("data/data_final/Audio_Features/audiofeatures_Ses2.csv")
df3 = pd.read_csv("data/data_final/Audio_Features/audiofeatures_Ses3.csv")


# Remove from df all audio material that is not a1, a2 or a3
df = df[(df["AUDIO_MATERIAL"] == "a1") | (df["AUDIO_MATERIAL"] == "a2") | (df["AUDIO_MATERIAL"] == "a3")]

# From df1, substitue in SESSION "2wbs" to "ses1", "2was" to "ses2" and "3mas" to "ses3"
df1["SESSION"] = df1["SESSION"].replace("2wbs", "ses1")
df2["SESSION"] = df2["SESSION"].replace("2was", "ses2")
df3["SESSION"] = df3["SESSION"].replace("3mas", "ses3")

# rename id for PATIENT_ID
df1.rename(columns={"id": "PATIENT_ID"}, inplace=True)
df2.rename(columns={"id": "PATIENT_ID"}, inplace=True)
df3.rename(columns={"id": "PATIENT_ID"}, inplace=True)

# rENAME GROUP BY LABEL
df1.rename(columns={"GROUP": "label"}, inplace=True)
df2.rename(columns={"GROUP": "label"}, inplace=True)
df3.rename(columns={"GROUP": "label"}, inplace=True)

In [84]:
df1final = pd.merge(df1, df, on=["PATIENT_ID", "AUDIO_MATERIAL", "SESSION", "label"], how="inner")
df2final = pd.merge(df2, df, on=["PATIENT_ID", "AUDIO_MATERIAL", "SESSION", "label"], how="inner")
df3final = pd.merge(df3, df, on=["PATIENT_ID", "AUDIO_MATERIAL", "SESSION", "label"], how="inner")

# Name last column as "CPP"
df1final.rename(columns={"cpp": "CPP"}, inplace=True)
df2final.rename(columns={"cpp": "CPP"}, inplace=True)
df3final.rename(columns={"cpp": "CPP"}, inplace=True)

In [85]:
# This are the new audio_features  
df1final.to_csv("data/data_final/Audio_Features/audiofeatures_Ses1.csv", index=False)
df2final.to_csv("data/data_final/Audio_Features/audiofeatures_Ses2.csv", index=False)
df3final.to_csv("data/data_final/Audio_Features/audiofeatures_Ses3.csv", index=False)