---
---

# Feature Engineering

---
---

_The assertions and methodologies outlined in this notebook are substantiated by referenced scientific studies detailed in the README file._

### Load libraries and Data

In [1]:
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import sys
sys.path.append("../src")
from extract_features import *
from visualize_audio import *
from data_manager import *
from preprocess_data import *
from load_config import *
from split_silence_transformer import SplitSilenceTransformer
from mfcc_transformer import MfccTransformer

In [2]:
constants = load_constants_from_yaml('../constants.yml')

SAMPLING_RATING = constants["SAMPLING_RATING"]
FRAME_LENGTH_ENERGY = constants["FRAME_LENGTH_ENERGY"]
THRESHOLD_PERCENTAGE = constants["THRESHOLD_PERCENTAGE"]
MIN_SILENCE_DURATION = constants["MIN_SILENCE_DURATION"]
HOP_LENGTH = constants["HOP_LENGTH"]
SEGMENT_DURATION = constants["SEGMENT_DURATION"]
SEGMENT_OVERLAP = constants["SEGMENT_OVERLAP"]
N_MFCC = constants["N_MFCC"]
CONSIDERED_ACCENTS = constants["CONSIDERED_ACCENTS"]

In [3]:
df = load_audio_files("../data/raw/recordings/", sr=SAMPLING_RATING)
df = filter_data_based_on_accents(df=df, considered_accents=CONSIDERED_ACCENTS)

Trim silence from audio

In [4]:
split_tranformer=SplitSilenceTransformer(
    variables=['audio', 'labels'],
    sampling_rating=SAMPLING_RATING,
    threshold_percentage=THRESHOLD_PERCENTAGE,
    min_silence_duration=MIN_SILENCE_DURATION,
    frame_length_energy=FRAME_LENGTH_ENERGY,
    hop_length=HOP_LENGTH
)
split_tranformer.fit(df)

In [5]:
df=split_tranformer.transform(df)

In [6]:
df.shape

(242, 2)

---

One of the most commonly used spectral feature representations is the Mel-frequency cepstral coefficients (MFCC). MFCC features are generally employed in automatic speech recognition (ASR) and accent recognition systems and are known to perform best in shallow models. Spectrograms, on the other hand, are more effective in deep models and are sometimes utilized in accent recognition. We will extract MFCCs using the Librosa library.

Extract MFCC features from trimmed audio data (not on segmented audio data because the function itself will split the audio data into segments).

In [7]:
mfcc_transformer=MfccTransformer(
    variables=["audio", "labels"],
    sampling_rating=SAMPLING_RATING, 
    n_mfcc=N_MFCC,
    duration=SEGMENT_DURATION,
    overlap=SEGMENT_OVERLAP
)

In [8]:

mfcc_transformer.fit(df)

In [9]:
print(df.shape)
df.head

(242, 2)


<bound method NDFrame.head of                                                  audio   labels
0    [-0.00081889424, -0.0012332641, -0.0010821958,...  english
1    [-3.3004353e-06, 2.3220142e-05, -5.8616065e-06...  english
2    [1.5094573e-05, -4.2987816e-07, 2.1244243e-06,...  english
3    [0.0027380765, 0.0043952055, 0.004028226, 0.00...  english
4    [0.00018491458, -9.82639e-05, 6.523135e-05, -7...  english
..                                                 ...      ...
237  [0.0066564586, 0.0095736515, 0.008673913, 0.00...  english
238  [-5.1107454e-06, 1.4583517e-05, 2.057516e-05, ...   arabic
239  [-0.00048380543, -0.00065112824, -5.683335e-05...  english
240  [-1.4131354e-05, 2.5187623e-05, -1.1105545e-05...  english
241  [-0.0004948875, -0.0009152264, -0.0009689817, ...  english

[242 rows x 2 columns]>

In [10]:
df=mfcc_transformer.transform(df)
print(df.shape)

(242, 15)


In [16]:
print(df.columns)

Index(['audio', 'labels', 'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5',
       'mfcc_6', 'mfcc_7', 'mfcc_8', 'mfcc_9', 'mfcc_10', 'mfcc_11', 'mfcc_12',
       'mfcc_13'],
      dtype='object')


In [18]:
df["mfcc_1"]

0      [-637.64417, -645.1791, -655.1502, -655.1509, ...
1      [-666.61017, -666.61017, -666.61017, -666.6101...
2      [-712.3949, -708.4589, -705.359, -706.3481, -7...
3      [-505.00397, -487.16403, -495.6905, -506.24228...
4      [-591.27, -585.8986, -624.67267, -646.28455, -...
                             ...                        
237    [-543.0315, -580.4876, -671.9694, -679.3845, -...
238    [-633.45435, -633.45435, -633.45435, -633.4543...
239    [-620.981, -596.23126, -580.2883, -570.8729, -...
240    [-648.9887, -648.9887, -648.9887, -648.9887, -...
241    [-627.3312, -609.37616, -612.8013, -615.68915,...
Name: mfcc_1, Length: 242, dtype: object

---

In [None]:
# we want to split the mfccs into segments with the corresponding labels

# Initialize lists to store segments and their corresponding labels
segmented_mfccs = []
segmented_labels = []

# Iterate over each audio
for i, mfcc in enumerate(mfccs):
    # Get the shape of MFCCs and label for the current audio
    mfcc_shape = mfcc.shape
    label = labels_trimmed[i]

    # Extract the number of segments and the number of frames per segment
    num_segments = mfcc.shape[1]  # Second dimension of the MFCC shape

    # Iterate over each segment in the current audio
    for j in range(num_segments):
        # Extract the MFCCs for the current segment
        mfcc_segment = mfccs[i][:, j]

        # Append the segment and its corresponding label to the lists
        segmented_mfccs.append(mfcc_segment)
        segmented_labels.append(label)

# Convert the lists to NumPy arrays
# segmented_mfccs = np.array(segmented_mfccs)
# segmented_labels = np.array(segmented_labels)

print(f"len(segmented_mfccs): {len(segmented_mfccs)}")
print(f"len(segmented_labels): {len(segmented_labels)}")
print(f"segmented_mfccs[0]: {len(segmented_mfccs[0])}")
print(f"segmented_labels[0]: {segmented_labels[0]}")

len(segmented_mfccs): 941351
len(segmented_labels): 941351
segmented_mfccs[0]: 13
segmented_labels[0]: english
