## Data Preparation

### Feature Extraction
Data provided of audio cannot be understood by the models directly so we need to convert them into an understandable format for which feature extraction is used.

In this project we will be extracting these features:
- Zero Crossing Rate
- Chroma_stft
- MFCC
- RMS(root mean square) value
- MelSpectogram

to train our model.

In [49]:
# Importing required packages
import pandas as pd
import numpy as np

import sys

import librosa

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
dataset = pd.read_csv("./data/dataset.csv")
dataset.head()

Unnamed: 0,Emotions,Path
0,neutral,./data/Ravdess/audio_speech_actors_01-24/Actor...
1,neutral,./data/Ravdess/audio_speech_actors_01-24/Actor...
2,neutral,./data/Ravdess/audio_speech_actors_01-24/Actor...
3,neutral,./data/Ravdess/audio_speech_actors_01-24/Actor...
4,calm,./data/Ravdess/audio_speech_actors_01-24/Actor...


In [12]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12162 entries, 0 to 12161
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Emotions  12162 non-null  object
 1   Path      12162 non-null  object
dtypes: object(2)
memory usage: 190.2+ KB


In [3]:
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data, sampling_rate, pitch_factor)

In [4]:
def extract_features(data, sample_rate):
    result = np.array([])
    # Zero Crossing Rate
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result = np.hstack((result, zcr))
    
    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally

    # RMS(Root Mean Square) Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    
    return result

In [5]:
def get_features_from_audio_file(path):
    # duration and offset are used to use important parts of the audio contributing to feature extraction
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    
    # without augmentation
    res1 = extract_features(data, sample_rate)
    result = np.array(res1)
    
    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data, sample_rate)
    result = np.vstack((result, res2))
    
    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch, sample_rate)
    result = np.vstack((result, res3))
    
    return result

In [17]:
X, Y = [], []
for i, (path, emotion) in enumerate(zip(dataset.Path, dataset.Emotions)):
    if(i%50 == 0):
        print(f"Completed: {i/dataset.shape[0]}%")
        print(f"Remaining Files: {dataset.shape[0] - i}")
    feature = get_features_from_audio_file(path)
    for ele in feature:
        X.append(ele)
        # appending emotion 3 times as we have made 3 augmentation techniques on each audio file.
        Y.append(emotion)

Completed: 0.0%
Remaining Files: 12162
Completed: 0.0041111659266568%
Remaining Files: 12112
Completed: 0.0082223318533136%
Remaining Files: 12062
Completed: 0.0123334977799704%
Remaining Files: 12012
Completed: 0.0164446637066272%
Remaining Files: 11962
Completed: 0.020555829633284%
Remaining Files: 11912
Completed: 0.0246669955599408%
Remaining Files: 11862
Completed: 0.0287781614865976%
Remaining Files: 11812
Completed: 0.0328893274132544%
Remaining Files: 11762
Completed: 0.0370004933399112%
Remaining Files: 11712
Completed: 0.041111659266568%
Remaining Files: 11662
Completed: 0.0452228251932248%
Remaining Files: 11612
Completed: 0.0493339911198816%
Remaining Files: 11562
Completed: 0.0534451570465384%
Remaining Files: 11512
Completed: 0.0575563229731952%
Remaining Files: 11462
Completed: 0.061667488899852%
Remaining Files: 11412
Completed: 0.0657786548265088%
Remaining Files: 11362
Completed: 0.0698898207531656%
Remaining Files: 11312
Completed: 0.0740009866798224%
Remaining Files

In [19]:
features_dataset = pd.DataFrame(X)
features_dataset['labels'] = Y
features_dataset.to_csv('./data/features_dataset.csv', index=False)
features_dataset.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.300781,0.732068,0.752591,0.733378,0.731624,0.704962,0.662388,0.687283,0.735435,0.7562,...,4.319258e-06,3.298864e-06,2.153281e-06,2.286675e-06,5.131692e-06,8.065742e-06,5e-06,2.270857e-06,1.642365e-07,neutral
1,0.335006,0.783609,0.838017,0.829405,0.825558,0.824765,0.757258,0.694166,0.734845,0.763651,...,0.0001252698,0.0001222288,0.0001163826,0.0001182331,0.0001225794,0.0001164659,0.000122,0.000123696,0.0001197713,neutral
2,0.185847,0.622753,0.718277,0.747283,0.709554,0.688333,0.674101,0.661235,0.697782,0.739232,...,8.620835e-07,9.594722e-07,7.775631e-07,5.270748e-07,3.628429e-07,9.075997e-07,1e-06,5.035564e-07,2.570757e-08,neutral
3,0.271272,0.674888,0.723125,0.724594,0.681155,0.670361,0.674336,0.629854,0.680085,0.708237,...,6.998011e-06,7.050108e-06,6.670962e-06,6.999257e-06,1.21788e-05,9.449916e-06,8e-06,2.638513e-06,1.788902e-07,neutral
4,0.3133,0.775134,0.813024,0.795702,0.780789,0.78547,0.700858,0.654817,0.712211,0.741578,...,6.885124e-05,6.948706e-05,6.912925e-05,6.853575e-05,7.07397e-05,6.978575e-05,6.9e-05,6.273952e-05,6.12525e-05,neutral
