# Librosa Features Extraction

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import librosa
import librosa.display
import os
from tqdm import tqdm
import random
import pyloudnorm as pyln

In [2]:
df = pd.read_csv("train_dev_complete.csv")
df

Unnamed: 0,file,speaker,emotion,source,split
0,EMOITA/data/1613671614352.wav,emoita_321,anger,emoita,train
1,EMOITA/data/1613658275427.wav,emoita_303,anger,emoita,train
2,EMOITA/data/1613324357435.wav,emoita_314,anger,emoita,train
3,EMOITA/data/1614274086698.wav,emoita_109,anger,emoita,train
4,EMOITA/data/1612982146424.wav,emoita_179,anger,emoita,train
...,...,...,...,...,...
10673,TESS/data/OAF_cab_neutral.wav,tess_OAF,neutral,tess,train
10674,TESS/data/OAF_chain_disgust.wav,tess_OAF,disgust,tess,train
10675,TESS/data/OAF_jar_neutral.wav,tess_OAF,neutral,tess,train
10676,TESS/data/OAF_lot_angry.wav,tess_OAF,anger,tess,train


In [3]:
# ---- LUFS
def lufs_normalization(y, sr = 16000, target_lufs = -23.0):
    
    if np.max(np.abs(y)) < 1e5:
        return y
    try:
        meter = pyln.Meter(sr)
        current_lufs = meter.integrated_loudness(y)
        
        y_norm = pyln.normalize.loudness(y, current_lufs, target_lufs)
        return y_norm
    except Exception as e:
        print(f'{e}')
        return y

# ---- PEAK    
def peak_normalization(y, sr = 16000, target_level = 0.95):
    
    if np.max(np.abs(y)) < 1e5:
        return y
    try:
        peak = np.max(np.abs(y))
        y_norm = y / peak * target_level
        
        return y_norm
        
    except Exception as e:
        print(f'{e}')
        return y



In [5]:
# Function to compute mean and std for each feature, returning a dictionary of statistics of each feature
def compute_mean_and_std(feature_matrix, prefix):
    
    stats = {}
    mean = np.mean(feature_matrix, axis=1)
    std = np.std(feature_matrix, axis=1)
    
    for i, (m, s) in enumerate(zip(mean, std)):
        stats[f"{prefix}_mean_{i+1}"] = m
        stats[f"{prefix}_std_{i+1}"] = s
        
    return stats

#### Prosodic Features

In [6]:
# RMS Energy as proxy of loudness
def get_rms(y)-> dict:
    rms = librosa.feature.rms(y=y)
    return compute_mean_and_std(rms, "rms")

# Zero Crossing Rate as proxy of Arousal
def get_zcr(y)-> dict:
    zcr = librosa.feature.zero_crossing_rate(y=y)
    return compute_mean_and_std(zcr, "zcr")

#### Timbro vocale

In [7]:
# MFCC for spectral characteristics
def get_mfcc(y, rs = 16000, n_mfcc = 13)-> dict:
    mfcc = librosa.feature.mfcc(y=y, sr=rs, n_mfcc=n_mfcc)
    return compute_mean_and_std(mfcc, "mfcc")

#Delta MFCC for temporal dynamics
def get_delta_mfcc(y, rs = 16000, n_mfcc = 13)-> dict:
    mfcc = librosa.feature.mfcc(y=y, sr=rs, n_mfcc=n_mfcc)
    delta_mfcc = librosa.feature.delta(mfcc)
    return compute_mean_and_std(delta_mfcc, "delta_mfcc")

#Delta-Delta MFCC for acceleration characteristics
def get_delta2_mfcc(y, rs = 16000, n_mfcc = 13)-> dict:
    mfcc = librosa.feature.mfcc(y=y, sr=rs, n_mfcc=n_mfcc)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2)
    return compute_mean_and_std(delta2_mfcc, "delta2_mfcc")


In [8]:
# Spectral contrast for timbral texture
def get_spectral_contrast(y, rs = 16000)-> dict:
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=rs)
    return compute_mean_and_std(spectral_contrast, "spectral_contrast")

#### Tonal Features

In [9]:
# Chroma features for harmonic content
def get_chroma(y, rs = 16000)-> dict:
    chroma = librosa.feature.chroma_stft(y=y, sr=rs)
    return compute_mean_and_std(chroma, "chroma")

# Tonnetz for tonal characteristics
def get_tonnetz(y, rs = 16000)-> dict:
    tonnetz = librosa.feature.tonnetz(y=librosa.effects.harmonic(y), sr=rs)
    return compute_mean_and_std(tonnetz, "tonnetz")

In [None]:
# --- FEATURES EXTRACTOR
SR = 16000

def extract_features(row):
    
    file = row["file"]
    
    y, _ = librosa.load(file, sr=SR)
    
    if y is None or len(y) < 2048:
        return None
    
    full_vector = {}
    
    full_vector["file"] = file
    full_vector["label"] = row["emotion"]
    full_vector["split"] = row["split"]
    
    try:
        full_vector.update(get_rms(y))
        full_vector.update(get_zcr(y))
        full_vector.update(get_mfcc(y))
        full_vector.update(get_delta_mfcc(y))
        full_vector.update(get_delta2_mfcc(y))
        full_vector.update(get_spectral_contrast(y))
        full_vector.update(get_chroma(y))
        full_vector.update(get_tonnetz(y))
        return full_vector
    except Exception as e:
        print(f"Extraction failed for {file}: {e}")
        return None

In [11]:
if __name__ == "__main__":
    
    features_list = list()
    
    for i, row in tqdm(df.iterrows(), total = df.shape[0]):
        
        features_vector = extract_features(row)
        
        if features_vector is not None:
            features_list.append(features_vector)
            
    librosa_features_df = pd.DataFrame(features_list)
    

100%|██████████| 10678/10678 [21:00<00:00,  8.47it/s]


In [29]:
librosa_features_df

Unnamed: 0,file,label,split,rms_mean_1,rms_std_1,zcr_mean_1,zcr_std_1,mfcc_mean_1,mfcc_std_1,mfcc_mean_2,...,tonnetz_mean_2,tonnetz_std_2,tonnetz_mean_3,tonnetz_std_3,tonnetz_mean_4,tonnetz_std_4,tonnetz_mean_5,tonnetz_std_5,tonnetz_mean_6,tonnetz_std_6
0,EMOITA/data/1613671614352.wav,anger,train,0.024803,0.018837,0.094025,0.041726,-386.987579,109.523575,92.180084,...,-0.000301,0.162367,-0.027398,0.170889,-0.027288,0.276434,0.049214,0.057863,0.035949,0.069892
1,EMOITA/data/1613658275427.wav,anger,train,0.098870,0.089459,0.236770,0.135869,-222.974136,138.506271,36.672932,...,0.006156,0.097414,0.011117,0.142223,0.002411,0.168923,0.012504,0.056174,-0.006190,0.045487
2,EMOITA/data/1613324357435.wav,anger,train,0.061353,0.076801,0.186284,0.138848,-334.634979,163.107941,46.877239,...,-0.026362,0.115640,0.036014,0.208363,-0.084484,0.179091,-0.034872,0.057218,-0.022433,0.060650
3,EMOITA/data/1614274086698.wav,anger,train,0.052951,0.055620,0.110824,0.083089,-337.719818,111.294464,59.848953,...,0.037641,0.121842,-0.049022,0.195470,0.020855,0.160576,-0.014241,0.067783,-0.028701,0.060781
4,EMOITA/data/1612982146424.wav,anger,train,0.029244,0.029896,0.227802,0.121835,-348.929504,174.111023,55.746693,...,-0.000657,0.043193,0.012547,0.135329,0.029270,0.073095,0.026593,0.035770,0.049189,0.038273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10673,TESS/data/OAF_cab_neutral.wav,neutral,train,0.011656,0.006050,0.099544,0.111094,-473.686096,104.024330,64.592720,...,0.057049,0.242782,0.269023,0.240616,-0.191205,0.226487,0.110749,0.083580,-0.006572,0.107392
10674,TESS/data/OAF_chain_disgust.wav,disgust,train,0.011848,0.010605,0.154508,0.182570,-458.265991,133.222198,64.057930,...,-0.008685,0.104257,-0.057601,0.151587,0.003219,0.162539,0.012875,0.046664,0.006841,0.055044
10675,TESS/data/OAF_jar_neutral.wav,neutral,train,0.012574,0.005751,0.102967,0.107745,-495.947906,89.606804,82.702103,...,-0.118085,0.335404,0.128877,0.280124,-0.300117,0.251332,0.103346,0.093092,0.046550,0.116739
10676,TESS/data/OAF_lot_angry.wav,anger,train,0.038752,0.028055,0.110754,0.108246,-377.398712,78.184418,52.664715,...,0.071702,0.153994,-0.003404,0.254154,-0.233565,0.279082,-0.071062,0.094332,-0.015076,0.110328


In [None]:
output_dir = "TRAINING_ML"

if os.path.exists(output_dir):
    print("Folder already exists")
else:
    print("Folder created.")
    os.mkdir(output_dir)
    
output_path = os.path.join(output_dir, "LIBROSA_FEATURES.pkl")

if os.path.exists(output_path):
    print("file already exists")
else:
    print("File created.")
    librosa_features_df.to_pickle(output_path)

Folder already exists
File created.


# OpenSMILE Features Extraction

In [5]:
import opensmile

In [6]:
SR = 16000
smile = opensmile.Smile(
    feature_set = opensmile.FeatureSet.eGeMAPSv02,
    feature_level = opensmile.FeatureLevel.Functionals
)

def opensmile_features(file):
    
    try:
        y, _ = librosa.load(file, sr = SR)
        
        if len(y) < 2048:
            return None
        
        features = smile.process_signal(y,SR)
        features_vector = features.reset_index(drop=True).iloc[0].to_dict()
        
        return features_vector
    except Exception as e:
        print(f"{e}")
        return None
    
def opensmile_features_NORM(file):
    
    try:
        y_raw, _ = librosa.load(file, sr = SR)
        
        if len(y_raw) < 2048:
            return None
        
        y = peak_normalization(y_raw)
        
        features = smile.process_signal(y,SR)
        features_vector = features.reset_index(drop=True).iloc[0].to_dict()
        
        return features_vector
    except Exception as e:
        print(f"{e}")
        return None        

In [9]:
def extract_opensmile_features(row):
    
    file = row["file"]
    
    full_vector = {}
    
    full_vector["file"] = file
    full_vector["label"] = row["emotion"]
    full_vector["split"] = row["split"]
    
    try:
        full_vector.update(opensmile_features(file))
        
        return full_vector
    except Exception as e:
        print(f"Extraction failed for {file}: {e}")
        return None

In [None]:
if __name__ == "__main__":
    
    features_list = list()
    
    for i, row in tqdm(df.iterrows(), total = df.shape[0]):
        
        features_vector = extract_opensmile_features(row)
        
        if features_vector is not None:
            features_list.append(features_vector)
            
    opensmile_features_df = pd.DataFrame(features_list)
    

100%|██████████| 10678/10678 [08:53<00:00, 20.02it/s]


In [28]:
opensmile_features_df

Unnamed: 0,file,label,split,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,EMOITA/data/1613671614352.wav,anger,train,38.082653,0.070298,36.122330,39.153728,39.698322,3.575993,13.681193,...,0.043457,0.024475,0.013053,4.697987,2.097902,0.420000,0.143527,0.055000,0.035000,-29.997206
1,EMOITA/data/1613658275427.wav,anger,train,40.416210,0.120092,35.865879,40.851280,43.643463,7.777584,134.605118,...,0.082220,-0.000170,0.249301,4.437870,2.702703,0.203333,0.144760,0.144444,0.229594,-17.485447
2,EMOITA/data/1613324357435.wav,anger,train,36.352093,0.073894,32.938965,37.385406,38.716221,5.777256,36.450661,...,0.039326,0.018660,0.035453,3.888889,2.298851,0.247500,0.218561,0.220000,0.165731,-20.091305
3,EMOITA/data/1614274086698.wav,anger,train,39.521694,0.132925,33.641102,41.257381,44.157024,10.515923,95.706833,...,0.069682,0.005402,0.113051,4.635762,2.702703,0.230000,0.127574,0.186000,0.204900,-22.264414
4,EMOITA/data/1612982146424.wav,anger,train,34.608627,0.200003,30.287437,33.143135,35.956718,5.669281,2856.700928,...,0.036268,0.003279,0.147435,3.319502,1.271186,0.130000,0.141657,0.475000,0.397146,-27.532452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10673,TESS/data/OAF_cab_neutral.wav,neutral,train,33.587734,0.013397,33.257698,33.573410,33.801815,0.544117,11.546522,...,-0.036753,0.025578,0.020441,2.645503,1.092896,0.655000,0.335000,0.240000,0.030000,-37.568298
10674,TESS/data/OAF_chain_disgust.wav,disgust,train,32.935085,0.090005,30.599874,32.400887,35.419796,4.819921,55.039375,...,-0.025782,0.018447,0.033190,2.145923,1.754386,0.330000,0.384773,0.215000,0.127574,-35.905819
10675,TESS/data/OAF_jar_neutral.wav,neutral,train,33.239281,0.018863,32.946789,33.233498,33.606705,0.659916,21.448641,...,-0.043057,0.024206,0.018796,1.960784,1.010101,0.810000,0.370000,0.160000,0.040000,-37.107567
10676,TESS/data/OAF_lot_angry.wav,anger,train,39.684753,0.053821,38.062737,40.380672,41.368752,3.306015,37.567173,...,-0.023124,0.012767,0.059855,2.500000,1.290323,0.585000,0.285000,0.110000,0.028284,-26.319199


In [31]:
output_dir = "TRAINING_ML"

if os.path.exists(output_dir):
    print("Folder already exists")
else:
    print("Folder created.")
    os.mkdir(output_dir)
    
output_path = os.path.join(output_dir, "OPENSMILE_FEATURES.pkl")

if os.path.exists(output_path):
    print("file already exists")
else:
    print("File created.")
    opensmile_features_df.to_pickle(output_path)

Folder already exists
File created.


In [13]:
# ---- LUFS
def lufs_normalization(y, sr = 16000, target_lufs = -23.0):
    
    if np.max(np.abs(y)) < 1e5:
        return y
    try:
        meter = pyln.Meter(sr)
        current_lufs = meter.integrated_loudness(y)
        
        y_norm = pyln.normalize.loudness(y, current_lufs, target_lufs)
        return y_norm
    except Exception as e:
        print(f'{e}')
        return y

# ---- PEAK    
def peak_normalization(y, sr = 16000, target_level = 0.95):
    
    if np.max(np.abs(y)) < 1e5:
        return y
    try:
        peak = np.max(np.abs(y))
        y_norm = y / peak * target_level
        
        return y_norm
        
    except Exception as e:
        print(f'{e}')
        return y

# ---- OpenSMILE Normalized
def opensmile_features_NORM(file):
    
    try:
        y_raw, _ = librosa.load(file, sr = SR)
        
        if len(y_raw) < 2048:
            return None
        
        y = lufs_normalization(y_raw)
        
        features = smile.process_signal(y,SR)
        features_vector = features.reset_index(drop=True).iloc[0].to_dict()
        
        return features_vector
    except Exception as e:
        print(f"{e}")
        return None   

In [14]:
def extract_opensmile_features(row):
    
    file = row["file"]
    
    full_vector = {}
    
    full_vector["file"] = file
    full_vector["label"] = row["emotion"]
    full_vector["split"] = row["split"]
    
    try:
        full_vector.update(opensmile_features_NORM(file))
        
        return full_vector
    except Exception as e:
        print(f"Extraction failed for {file}: {e}")
        return None
if __name__ == "__main__":
    
    features_list = list()
    
    for i, row in tqdm(df.iterrows(), total = df.shape[0]):
        
        features_vector = extract_opensmile_features(row)
        
        if features_vector is not None:
            features_list.append(features_vector)
            
    opensmile_features_df = pd.DataFrame(features_list)
    

100%|██████████| 10678/10678 [08:23<00:00, 21.23it/s]


In [16]:
opensmile_features_df

Unnamed: 0,file,label,split,F0semitoneFrom27.5Hz_sma3nz_amean,F0semitoneFrom27.5Hz_sma3nz_stddevNorm,F0semitoneFrom27.5Hz_sma3nz_percentile20.0,F0semitoneFrom27.5Hz_sma3nz_percentile50.0,F0semitoneFrom27.5Hz_sma3nz_percentile80.0,F0semitoneFrom27.5Hz_sma3nz_pctlrange0-2,F0semitoneFrom27.5Hz_sma3nz_meanRisingSlope,...,slopeUV0-500_sma3nz_amean,slopeUV500-1500_sma3nz_amean,spectralFluxUV_sma3nz_amean,loudnessPeaksPerSec,VoicedSegmentsPerSec,MeanVoicedSegmentLengthSec,StddevVoicedSegmentLengthSec,MeanUnvoicedSegmentLength,StddevUnvoicedSegmentLength,equivalentSoundLevel_dBp
0,EMOITA/data/1613671614352.wav,anger,train,38.082653,0.070298,36.122330,39.153728,39.698322,3.575993,13.681193,...,0.043457,0.024475,0.013053,4.697987,2.097902,0.420000,0.143527,0.055000,0.035000,-29.997206
1,EMOITA/data/1613658275427.wav,anger,train,40.416210,0.120092,35.865879,40.851280,43.643463,7.777584,134.605118,...,0.082220,-0.000170,0.249301,4.437870,2.702703,0.203333,0.144760,0.144444,0.229594,-17.485447
2,EMOITA/data/1613324357435.wav,anger,train,36.352093,0.073894,32.938965,37.385406,38.716221,5.777256,36.450661,...,0.039326,0.018660,0.035453,3.888889,2.298851,0.247500,0.218561,0.220000,0.165731,-20.091305
3,EMOITA/data/1614274086698.wav,anger,train,39.521694,0.132925,33.641102,41.257381,44.157024,10.515923,95.706833,...,0.069682,0.005402,0.113051,4.635762,2.702703,0.230000,0.127574,0.186000,0.204900,-22.264414
4,EMOITA/data/1612982146424.wav,anger,train,34.608627,0.200003,30.287437,33.143135,35.956718,5.669281,2856.700928,...,0.036268,0.003279,0.147435,3.319502,1.271186,0.130000,0.141657,0.475000,0.397146,-27.532452
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10673,TESS/data/OAF_cab_neutral.wav,neutral,train,33.587734,0.013397,33.257698,33.573410,33.801815,0.544117,11.546522,...,-0.036753,0.025578,0.020441,2.645503,1.092896,0.655000,0.335000,0.240000,0.030000,-37.568298
10674,TESS/data/OAF_chain_disgust.wav,disgust,train,32.935085,0.090005,30.599874,32.400887,35.419796,4.819921,55.039375,...,-0.025782,0.018447,0.033190,2.145923,1.754386,0.330000,0.384773,0.215000,0.127574,-35.905819
10675,TESS/data/OAF_jar_neutral.wav,neutral,train,33.239281,0.018863,32.946789,33.233498,33.606705,0.659916,21.448641,...,-0.043057,0.024206,0.018796,1.960784,1.010101,0.810000,0.370000,0.160000,0.040000,-37.107567
10676,TESS/data/OAF_lot_angry.wav,anger,train,39.684753,0.053821,38.062737,40.380672,41.368752,3.306015,37.567173,...,-0.023124,0.012767,0.059855,2.500000,1.290323,0.585000,0.285000,0.110000,0.028284,-26.319199


In [17]:
output_dir = "TRAINING_ML"

if os.path.exists(output_dir):
    print("Folder already exists")
else:
    print("Folder created.")
    os.mkdir(output_dir)
    
output_path = os.path.join(output_dir, "OPENSMILE_FEATURES_NORM.pkl")

if os.path.exists(output_path):
    print("file already exists")
else:
    print("File created.")
    opensmile_features_df.to_pickle(output_path)

Folder already exists
file already exists
