## Load libraries and Data

In [1]:
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
import sys
sys.path.append("../src")
from extract_features import *
from visualize_audio import *
from prepare_data import *


Constants used for loading, silence trimming, and MFCC extraction

In [2]:
# Sampling rate handed to the loader, the silence splitter and the MFCC
# extractor below (presumably in Hz — confirm against load_audio_files).
SAMPLING_RATING = 22050
# Not referenced in the cells visible here; 512 was an earlier value.
FRAME_LENGTH_ENERGY = 2048  # 512
THRESHOLD_PERCENTAGE = 0.01  # percentage of max energy (passed to split_audio_by_silence)
MIN_SILENCE_DURATION = 1  # in seconds (passed to split_audio_by_silence)

# n_fft=512 # the window size
# Not referenced in the cells visible here.
HOP_LENGTH = 512  # the number of samples between successive frames
SEGMENT_DURATION = 0.025  # in seconds (passed to compute_mfcc as `duration`)
SEGMENT_OVERLAP = 0.01  # in seconds (passed to compute_mfcc as `overlap`)
N_MFCC = 13  # passed to compute_mfcc as `n_mfcc`

Load all audio files with the corresponding labels (accents) 

In [3]:
# Read every recording under the raw-data folder together with its label
# (the accent, per the notebook text), then report what was found.
audio_data, raw_labels = load_audio_files(
    "../data/raw/recordings/",
    sr=SAMPLING_RATING,
)
unique_labels = set(raw_labels)

print(f'there are {len(unique_labels)} unique labels: {unique_labels}')
print(f'there are {len(audio_data)} audio files')

there are 3 unique labels: {'french', 'korean', 'english'}
there are 165 audio files


## Preprocessing

Convert labels to numerical format using label encoding

In [4]:
# Map each label string to an integer class id. The fitted encoder is kept
# in `label_encoder` so the ids can be mapped back to their names later.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(raw_labels)

In [5]:
# Display the string label of every loaded recording (shows the full list).
raw_labels

['english',
 'french',
 'french',
 'korean',
 'english',
 'korean',
 'french',
 'english',
 'english',
 'french',
 'english',
 'french',
 'french',
 'english',
 'english',
 'korean',
 'english',
 'english',
 'french',
 'korean',
 'french',
 'french',
 'english',
 'korean',
 'english',
 'french',
 'korean',
 'english',
 'french',
 'english',
 'english',
 'french',
 'french',
 'english',
 'french',
 'korean',
 'korean',
 'korean',
 'english',
 'korean',
 'french',
 'french',
 'french',
 'french',
 'french',
 'korean',
 'korean',
 'french',
 'korean',
 'english',
 'korean',
 'french',
 'french',
 'french',
 'korean',
 'korean',
 'korean',
 'english',
 'korean',
 'korean',
 'english',
 'korean',
 'english',
 'english',
 'french',
 'french',
 'korean',
 'french',
 'english',
 'korean',
 'korean',
 'english',
 'french',
 'korean',
 'korean',
 'korean',
 'english',
 'english',
 'french',
 'french',
 'french',
 'english',
 'french',
 'english',
 'korean',
 'french',
 'french',
 'english',
 'en

In [6]:
# Display the integer-encoded labels, in the same order as `raw_labels`.
labels

array([0, 1, 1, 2, 0, 2, 1, 0, 0, 1, 0, 1, 1, 0, 0, 2, 0, 0, 1, 2, 1, 1,
       0, 2, 0, 1, 2, 0, 1, 0, 0, 1, 1, 0, 1, 2, 2, 2, 0, 2, 1, 1, 1, 1,
       1, 2, 2, 1, 2, 0, 2, 1, 1, 1, 2, 2, 2, 0, 2, 2, 0, 2, 0, 0, 1, 1,
       2, 1, 0, 2, 2, 0, 1, 2, 2, 2, 0, 0, 1, 1, 1, 0, 1, 0, 2, 1, 1, 0,
       0, 1, 2, 0, 0, 0, 1, 2, 0, 2, 0, 0, 2, 0, 1, 2, 0, 0, 0, 2, 2, 2,
       2, 1, 2, 2, 2, 2, 0, 1, 2, 0, 0, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1,
       1, 1, 2, 0, 0, 2, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 2, 2, 1,
       1, 1, 2, 0, 2, 2, 1, 0, 2, 0, 0])

In [7]:
# Sanity check: container type and shape of the encoded labels.
print(f"{type(labels)}  :  {labels.shape}")

<class 'numpy.ndarray'>  :  (165,)


Trim the silent parts and store each remaining audible segment as an individual audio clip with its corresponding label

In [8]:
# Split every recording with split_audio_by_silence; each audible piece it
# returns becomes its own sample and inherits the label of its recording.
audio_data_trimmed = []
labels_trimmed = []
for idx, waveform in enumerate(audio_data):
    audible_segments = split_audio_by_silence(
        waveform,
        SAMPLING_RATING,
        threshold_percentage=THRESHOLD_PERCENTAGE,
        min_silence_duration=MIN_SILENCE_DURATION,
    )
    # One label entry per audible piece keeps the two lists aligned.
    audio_data_trimmed += audible_segments
    labels_trimmed += [labels[idx]] * len(audible_segments)

In [9]:
# Turn the per-clip label list into an (n_clips, 1) column array, the 2-D
# layout the one-hot encoder is fitted on further down.
labels_trimmed = np.array(labels_trimmed).reshape(len(labels_trimmed), 1)
print(labels_trimmed.shape)

(175, 1)


Segment audio data (this step is currently commented out; the MFCC extraction below performs its own segmentation)

In [10]:
# Segment audio data

# audio_data_segmented = []
# labels_segmented = []
# for i, audio in enumerate(audio_data_trimmed):
#     segments = segment_audio(
#         audio, SAMPLING_RATING, duration=SEGMENT_DURATION, overlap=SEGMENT_OVERLAP
#     )
#     audio_data_segmented.extend(segments)
#     labels_segmented.extend([labels_trimmed[i]] * len(segments))
#     # for j, segment in enumerate(segments):
#     #     np.save(f"../data/processed/segments/{labels_trimmed[i]}_{i}_{j}.npy", segment)

In [11]:
# print(f"there are {len(audio_data_segmented)} segments .")

### Extract Features

Extract MFCCs

In [12]:
# MFCCs are computed on the silence-trimmed clips rather than on pre-segmented
# audio: compute_mfcc receives the segment duration and overlap and, per the
# original note, splits each clip into segments itself.
mfccs = [
    compute_mfcc(
        clip,
        SAMPLING_RATING,
        n_mfcc=N_MFCC,
        duration=SEGMENT_DURATION,
        overlap=SEGMENT_OVERLAP,
    )
    for clip in audio_data_trimmed
]

In [13]:
# Number of MFCC matrices, plus the segment count (axis 1) of two of them
# to show that clips differ in length.
print(f"len(mfccs): {len(mfccs)}")
for idx in (0, 5):
    print(f"num_segments for mfccs[{idx}]: {mfccs[idx].shape[1]}")

len(mfccs): 175
num_segments for mfccs[0]: 2318
num_segments for mfccs[5]: 2259


Perform one-hot encoding on the labels of the trimmed clips

In [14]:
# One-hot encode the integer labels of the trimmed clips.
onehot_encoder = OneHotEncoder()
onehot_sparse = onehot_encoder.fit_transform(labels_trimmed)
# fit_transform returns a sparse matrix here; convert it to a dense array.
onehot_labels = onehot_sparse.toarray()
onehot_labels

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0., 1.],
       [0., 0., 1.],
       [1., 0., 0.],
       [0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 1., 0.],
       [0., 0., 1.],
       [0., 0

In [15]:
# Convert labels to numerical format using label encoding
#label_encoder = LabelEncoder()
#encoded_labels_tr = label_encoder.fit_transform(labels_trimmed)

# Perform one-hot encoding
#onehot_encoder = OneHotEncoder()
#encoded_labels_tr = encoded_labels_tr.reshape(len(encoded_labels_tr), 1)
#onehot_labels_tr = onehot_encoder.fit_transform(encoded_labels_tr).toarray() # toarray() is  to convert the sparse matrix to a dense array

### Split the MFCCs into segments with their corresponding labels

Initialize lists to store segments and their corresponding labels

In [16]:
# Accumulators filled by the loop in the next cell: one MFCC vector per
# segment and, at the same index, the label of the clip it came from.
segmented_mfccs = []
segmented_labels = []

Iterate over each audio

In [17]:
# Flatten the per-clip MFCC matrices into one list of per-segment feature
# vectors, pairing each vector with the label of the clip it came from.
#
# The accumulators are (re)initialised here so that re-running this cell
# rebuilds the lists instead of appending a second copy of every segment.
segmented_mfccs = []
segmented_labels = []

# Every MFCC matrix must have exactly one label row, otherwise the pairing
# below would silently drift.
assert len(mfccs) == len(labels_trimmed), "mfccs and labels_trimmed are misaligned"

for mfcc, label in zip(mfccs, labels_trimmed):
    # Axis 1 of an MFCC matrix indexes the segments, so iterating over the
    # transpose yields mfcc[:, j] for each segment j without a Python-level
    # inner loop.
    num_segments = mfcc.shape[1]
    segmented_mfccs.extend(mfcc.T)
    # Repeat the clip's label once per segment to keep both lists aligned.
    segmented_labels.extend([label] * num_segments)

In [18]:
# Convert the lists to NumPy arrays
# segmented_mfccs = np.array(segmented_mfccs)
# segmented_labels = np.array(segmented_labels)

# Report the list sizes plus the first feature vector's length and label.
print(
    f"len(segmented_mfccs): {len(segmented_mfccs)}",
    f"len(segmented_labels): {len(segmented_labels)}",
    f"segmented_mfccs[0]: {len(segmented_mfccs[0])}",
    f"segmented_labels[0]: {segmented_labels[0]}",
    sep="\n",
)

len(segmented_mfccs): 451528
len(segmented_labels): 451528
segmented_mfccs[0]: 13
segmented_labels[0]: [0]


In [19]:
# Stack the per-segment labels into an (n_segments, 1) column array, the
# 2-D layout the one-hot encoder is fitted on in the next step.
segmented_labels = np.array(segmented_labels).reshape(len(segmented_labels), 1)

Perform one-hot encoding of the labels for the split MFCCs

In [20]:
# One-hot encode the per-segment labels with a separate encoder instance.
onehot_encoder_segmentation = OneHotEncoder()
segmented_onehot_sparse = onehot_encoder_segmentation.fit_transform(segmented_labels)
# fit_transform returns a sparse matrix here; convert it to a dense array.
segmented_onehot_labels = segmented_onehot_sparse.toarray()

print(segmented_onehot_labels)

[[1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]
 ...
 [1. 0. 0.]
 [1. 0. 0.]
 [1. 0. 0.]]


## Storing data

In [29]:
import h5py
def save_arrays_to_hdf5(arrays, file_path):
    """Write each item of ``arrays`` to its own dataset in one HDF5 file.

    Args:
        arrays: Iterable of array-like objects; each one is stored separately,
            so the items do not need to share a shape.
        file_path: Destination file, opened with mode ``'w'``.

    The datasets are named ``array_0``, ``array_1``, ... in iteration order.
    """
    with h5py.File(file_path, 'w') as h5_file:
        for index, array in enumerate(arrays):
            h5_file.create_dataset(f'array_{index}', data=array)

In [30]:
# Relative output folder; the save cells below append file names to it by
# plain string concatenation, so the trailing slash is required.
processed_data_path = "../data/processed/"

Save the audio data after trimming the silent parts

In [31]:
# The trimmed clips go to HDF5 (one dataset per clip) instead of the earlier,
# now disabled, np.save call; their labels go to a single .npy file.
trimmed_audio_file = processed_data_path + 'audio_data_trimmed.h5'
trimmed_labels_file = processed_data_path + 'labels_trimmed.npy'

save_arrays_to_hdf5(audio_data_trimmed, trimmed_audio_file)
np.save(trimmed_labels_file, labels_trimmed)

Save the MFCCs and the one-hot encoded labels

In [32]:
# The MFCC matrices have different segment counts per clip, so they are stored
# as one HDF5 dataset each instead of through the earlier, disabled np.save.
mfccs_file = processed_data_path + 'mfccs.h5'
onehot_labels_file = processed_data_path + 'onehot_labels.npy'

save_arrays_to_hdf5(mfccs, mfccs_file)
np.save(onehot_labels_file, onehot_labels)

Save segmented MFCC features and the corresponding labels one-hot encoded

In [23]:
# Per-segment feature vectors and their one-hot labels, one .npy file each.
segmented_mfccs_file = processed_data_path + 'segmented_mfccs.npy'
segmented_onehot_file = processed_data_path + 'segmented_onehot_labels.npy'

np.save(segmented_mfccs_file, segmented_mfccs)
np.save(segmented_onehot_file, segmented_onehot_labels)