In [1]:
import numpy as np
import librosa
import pandas as pd

In [7]:
#data augmentation
def noise(data):
    """Return a copy of ``data`` with additive Gaussian noise.

    The noise is standard-normal, scaled by ``0.035 * u * max(data)`` where
    ``u`` is one draw from ``np.random.uniform()``.  Both draws use NumPy's
    global random state, so call ``np.random.seed(...)`` beforehand if a
    reproducible result is needed.

    Parameters
    ----------
    data : np.ndarray
        Audio samples.  Any shape is accepted: the noise is drawn with the
        same shape as ``data``, so multi-channel arrays are handled as well
        as 1-D signals.

    Returns
    -------
    np.ndarray
        New array with the same shape as ``data``; the input is not modified.
        An empty input is returned as an (empty) copy.
    """
    # np.amax raises on zero-size arrays; there is nothing to perturb anyway.
    if data.size == 0:
        return data.copy()

    # NOTE: the scale is based on the maximum sample value (np.amax), not the
    # maximum absolute value.
    noise_amp = 0.035 * np.random.uniform() * np.amax(data)
    # Drawing with size=data.shape (rather than data.shape[0]) keeps the exact
    # same random stream for 1-D input while also supporting N-D input.
    return data + noise_amp * np.random.normal(size=data.shape)

def stretch(data, rate=0.8):
    """Time-stretch an audio signal with librosa.

    Parameters
    ----------
    data : np.ndarray
        Audio samples, passed straight to ``librosa.effects.time_stretch``.
    rate : float, default 0.8
        Stretch factor forwarded as the ``rate`` keyword.

    Returns
    -------
    Whatever ``librosa.effects.time_stretch`` returns for these arguments.
    """
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched

def shift(data):
    """Circularly roll ``data`` by a random number of samples.

    The offset is drawn uniformly from [-5, 5), multiplied by 1000 and
    truncated to an integer.  ``np.roll`` wraps samples around the array
    ends, so no samples are lost.  Uses NumPy's global random state.
    """
    offset = np.random.uniform(low=-5, high=5) * 1000
    return np.roll(data, int(offset))

def pitch(data, sampling_rate, pitch_factor=0.7, n_steps=2):
    """Pitch-shift an audio signal with librosa.

    Parameters
    ----------
    data : np.ndarray
        Audio samples, passed straight to ``librosa.effects.pitch_shift``.
    sampling_rate : int
        Sampling rate of ``data``, forwarded as ``sr``.
    pitch_factor : float, default 0.7
        Accepted only for backward compatibility with existing callers.
        It has never influenced the result and is still ignored.
    n_steps : float, default 2
        Number of steps to shift by, forwarded as ``n_steps``.  The default
        reproduces the value that used to be hard-coded in this function.

    Returns
    -------
    Whatever ``librosa.effects.pitch_shift`` returns for these arguments.
    """
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=n_steps)

In [25]:
import numpy as np
def extract_features(data, sample_rate):
    """Build one flat feature vector for an audio signal.

    Five librosa descriptors are computed, each is averaged over its
    transposed axis 0 (``np.mean(m.T, axis=0)``), and the averages are
    concatenated in this order:

    1. zero-crossing rate
    2. chroma (from the magnitude of the STFT)
    3. MFCC
    4. root-mean-square energy
    5. mel spectrogram

    Parameters
    ----------
    data : np.ndarray
        Audio samples.
    sample_rate : int
        Sampling rate, forwarded as ``sr`` where librosa asks for it.

    Returns
    -------
    np.ndarray
        1-D vector of the concatenated averaged descriptors.
    """
    # Per-frame descriptor matrices, collected in output order.
    frame_matrices = [librosa.feature.zero_crossing_rate(y=data)]

    stft_magnitude = np.abs(librosa.stft(data))
    frame_matrices.append(librosa.feature.chroma_stft(S=stft_magnitude, sr=sample_rate))

    frame_matrices.append(librosa.feature.mfcc(y=data, sr=sample_rate))
    frame_matrices.append(librosa.feature.rms(y=data))
    frame_matrices.append(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    averaged = [np.mean(matrix.T, axis=0) for matrix in frame_matrices]

    # The leading empty array keeps the result dtype identical to building the
    # vector up from np.array([]) one descriptor at a time.
    return np.hstack([np.array([]), *averaged])

def get_features(path):
    """Load one audio file and return features for it and two augmented copies.

    Parameters
    ----------
    path : str
        Audio file location, handed to ``librosa.load``.

    Returns
    -------
    np.ndarray
        Three stacked feature vectors (one per row), in this order:
        the unmodified signal, the signal with noise added, and the signal
        after time-stretching followed by pitch-shifting.
    """
    # Only a 2.5 s window starting 0.6 s into the file is loaded; the original
    # author's note says this is meant to skip the silent start/end of the clips.
    signal, rate = librosa.load(path, duration=2.5, offset=0.6)

    # Row 0: no augmentation.
    plain_features = extract_features(signal, rate)

    # Row 1: additive noise.
    noisy_features = extract_features(noise(signal), rate)

    # Row 2: stretch first, then pitch-shift the stretched signal.
    stretched_pitched = pitch(stretch(signal), rate)
    stretched_pitched_features = extract_features(stretched_pitched, rate)

    return np.vstack((plain_features, noisy_features, stretched_pitched_features))


In [27]:
import dill

# Write the extract_features function object to disk with dill.
with open("../artifacts/extract_features.pkl", "wb") as pickle_file:
    dill.dump(extract_features, pickle_file)


In [12]:
# Run the feature pipeline on a single recording.
# Note: `path` and `feature` are notebook globals; the dataset loop further
# down reuses both names, so their values here are overwritten once it runs.
path = "../data/raw/OréauFR_01/f/sessc/10a01Ca.wav"
feature = get_features(path)


In [11]:
# Load the file index and keep only columns whose name does not start with
# "Unnamed" (done in a single assignment so the cell is safe to re-run).
audio_path_df = pd.read_csv("../data/file_paths.csv").loc[
    :, lambda frame: ~frame.columns.str.contains('^Unnamed')
]
audio_path_df.head(1)

Unnamed: 0,Genre,Identifiant,Emotion,Fichier
0,F,37,N,../data/raw/OréauFR_02/f/sessn/37a05Na.wav


In [5]:
# X collects one feature vector per row returned by get_features;
# Y collects the matching emotion label for each of those rows.
X, Y = [], []
for path, emotion in zip(audio_path_df.Fichier, audio_path_df.Emotion):
    feature = get_features(path)
    # get_features returns several rows per file (the original signal plus
    # its augmented variants), so the label is repeated once per row.
    X.extend(feature)
    Y.extend([emotion] * len(feature))

In [6]:
# One row per feature vector, with the label appended as the last column,
# then saved as CSV without the index.
Features = pd.DataFrame(X).assign(target=Y)
Features.to_csv('../data/ref_data.csv', index=False)
Features.shape

(1302, 163)

In [13]:
def extract_feature_to_df(df):
    """Attach ``get_features`` output to every row of a file-index frame.

    For each input row, ``get_features(row['Fichier'])`` is called and its
    result is enumerated into ``feature_0``, ``feature_1``, ... columns next
    to the row's existing values.  Afterwards the ``Fichier`` column is
    dropped and ``Emotion`` is renamed to ``target_emotion``.

    NOTE(review): enumerating the result of ``get_features`` walks over its
    first axis.  When that result is 2-D (one row per signal variant), each
    ``feature_i`` cell therefore holds a whole feature vector rather than a
    single number.  Confirm that this layout is what downstream code expects.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``Fichier`` column with audio file paths.  An
        ``Emotion`` column, if present, is renamed in the output.

    Returns
    -------
    pd.DataFrame
        One output row per input row.  For an empty input an empty frame is
        returned with the same drop/rename applied and no feature columns
        (previously this case raised ``KeyError``).
    """
    # With no rows, the records list below would be empty and the resulting
    # column-less frame would make drop(columns=["Fichier"]) raise KeyError.
    if len(df) == 0:
        return (
            df.drop(columns=["Fichier"], errors="ignore")
              .rename(columns={"Emotion": "target_emotion"})
        )

    feature_list = []
    for _, row in df.iterrows():
        features = get_features(row['Fichier'])
        feature_row = row.to_dict()
        feature_row.update({f'feature_{i}': feat for i, feat in enumerate(features)})
        feature_list.append(feature_row)

    # Build the frame once from the list of records.
    df_features = pd.DataFrame(feature_list)

    df_features = df_features.drop(columns=["Fichier"])
    df_features = df_features.rename(columns={"Emotion": "target_emotion"})

    return df_features