### Import dependencies

In [2]:
import os
import warnings

import librosa # for audio analysis capabilities
import numpy as np
import pandas as pd

### Parse wav files

In [4]:
# Walk base_dir/<actor>/<file>.wav, reduce each recording to a fixed-length vector of
# time-averaged MFCCs, and collect the results in mfcc_df (one row per wav file).
base_dir = "audio_speech"
mfcc_features = []
file_names = []

# number of features to extract
# 13 is conventionally default (but 20-60 also widely used)
# source: https://www.sciencedirect.com/science/article/pii/S0003682X2400450X#:~:text=The%20default%20MFCC%20parameters%20include,of%2010%20ms%20between%20frames. (3.2. Feature extraction)
n_mfcc = 13

# os.listdir returns entries in arbitrary order, so both directory levels are sorted
# to keep the row order of mfcc_df (and of any file exported from it) reproducible.
for actor in sorted(os.listdir(base_dir)):
    actor_path = os.path.join(base_dir, actor)

    # ignore anything in base_dir that is not a directory
    if not os.path.isdir(actor_path):
        continue

    for file in sorted(os.listdir(actor_path)):
        # only wav recordings are processed
        if not file.endswith(".wav"):
            continue

        file_path = os.path.join(actor_path, file)

        # load audio
        # y: audio time series (1d array of amplitudes)
        # sr: sampling rate of y (number of samples per second, in Hz)
        # sr=None to preserve native sampling rate of the audio file
        y, sr = librosa.load(file_path, sr=None)

        # extract MFCC features
        # mfcc.shape = (n_mfcc, n_frames) -> n_frames varies with the length of the clip
        # i.e. 1 row = 1 feature, and 1 col = 1 timeframe
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)

        # average each feature across all timeframes to get a fixed length vector
        # this is needed for classical ML models since they require fixed length vectors
        # for CNNs/RNNs, this is not needed
        mfcc_mean = np.mean(mfcc, axis=1)

        # store results
        mfcc_features.append(mfcc_mean)
        file_names.append(file)

# create DataFrame: file_name first, followed by mfcc_1 .. mfcc_<n_mfcc>
mfcc_df = pd.DataFrame(mfcc_features, columns=[f"mfcc_{i+1}" for i in range(n_mfcc)])
mfcc_df.insert(0, "file_name", file_names)

In [8]:
# preview the first five rows of the extracted features
mfcc_df.head(n=5)

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,mfcc_11,mfcc_12,mfcc_13,file_name
0,-726.217224,68.54142,3.293398,12.2053,5.510278,13.66741,-2.983828,3.098029,-3.310813,-1.564384,-7.861652,-2.124282,2.849204,03-01-01-01-01-01-01.wav
1,-719.128296,70.201569,1.168397,13.122543,7.83695,14.41129,-4.11136,4.468973,-3.539367,-3.658607,-7.648504,-1.477077,3.031821,03-01-01-01-01-02-01.wav
2,-714.995728,69.689346,3.924564,11.92419,6.421723,11.011614,-2.878103,4.509558,-4.476109,-2.671549,-7.499283,-2.962266,1.873485,03-01-01-01-02-01-01.wav
3,-710.975281,67.56488,5.782241,13.230727,6.190845,12.628252,-1.67517,5.657494,-4.950634,-3.477545,-7.416558,-1.937004,2.271525,03-01-01-01-02-02-01.wav
4,-759.921753,75.783524,6.023604,14.557394,6.454188,14.631508,-3.004551,4.62097,-5.200016,-0.70743,-7.790287,-3.564949,2.18097,03-01-02-01-01-01-01.wav


### Import labelled RAVDESS data to join with vectorized data

In [10]:
# Read in the labelled data
labels_df = pd.read_csv("ravdess_labels.csv")

# The join below is an inner join, so any recording whose file_name has no row in
# labels_df disappears from full_df. Surface that instead of losing rows silently.
unlabelled_files = mfcc_df.loc[
    ~mfcc_df["file_name"].isin(labels_df["file_name"]), "file_name"
]
if not unlabelled_files.empty:
    warnings.warn(
        f"{len(unlabelled_files)} audio file(s) have no row in the labels file and "
        f"will be dropped by the join: {unlabelled_files.tolist()}"
    )

# Join with the vectorized data (key column has the same name on both sides)
full_df = pd.merge(mfcc_df, labels_df, on="file_name", how="inner")

full_df.head()

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,...,Vocal_channel,Emotion,Emotional_intensity,Statement,Repetition,Actor,Emotion_label,Intensity_label,Statement_label,Actor_gender
0,-726.217224,68.54142,3.293398,12.2053,5.510278,13.66741,-2.983828,3.098029,-3.310813,-1.564384,...,1,1,1,1,1,1,neutral,normal,Kids are talking by the door,male
1,-719.128296,70.201569,1.168397,13.122543,7.83695,14.41129,-4.11136,4.468973,-3.539367,-3.658607,...,1,1,1,1,2,1,neutral,normal,Kids are talking by the door,male
2,-714.995728,69.689346,3.924564,11.92419,6.421723,11.011614,-2.878103,4.509558,-4.476109,-2.671549,...,1,1,1,2,1,1,neutral,normal,Dogs are sitting by the door,male
3,-710.975281,67.56488,5.782241,13.230727,6.190845,12.628252,-1.67517,5.657494,-4.950634,-3.477545,...,1,1,1,2,2,1,neutral,normal,Dogs are sitting by the door,male
4,-759.921753,75.783524,6.023604,14.557394,6.454188,14.631508,-3.004551,4.62097,-5.200016,-0.70743,...,1,2,1,1,1,1,calm,normal,Kids are talking by the door,male


### Export file as csv

In [14]:
# write the joined features + labels to disk, leaving out the DataFrame index
output_csv = "ravdess_mfcc_features_averaged.csv"
full_df.to_csv(output_csv, index=False)