# Data Preprocessing

In [1]:
%%html
<style type='text/css'>
.CodeMirror{
font-family: JetBrains Mono;
}
</style>

In [2]:
import multiprocessing
import os
import random
import warnings

import librosa
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pylab

warnings.filterwarnings('ignore')

  "class": algorithms.Blowfish,


In [3]:
# Number of mel bands in every spectrogram (passed to librosa as n_mels).
N_MELS = 256
# FFT window length in samples (n_fft); the hop length used is half of this.
MEL_SPEC_FRAME_SIZE = 1024
# Sampling rate, in Hz, that every .wav file is loaded at (librosa.load's sr).
SAMPLING_RATE = 16_000


def get_speaker_paths(datapath: str) -> list[tuple[str, str]]:
    """Collect every speaker folder found under ``datapath``.

    The corpus is expected to be laid out as
    ``<datapath>/<accent>/<female|male>/<speaker>/``.

    Args:
        datapath: Root folder of the corpus.

    Returns:
        A list of ``(speaker_id, speaker_folder_path)`` tuples, ordered by
        accent folder, then gender (female first), then speaker id, so that
        repeated runs yield the same order. The same ``speaker_id`` can occur
        more than once if it exists under several accent/gender folders.
    """
    speaker_list = []

    # os.scandir / os.listdir return entries in arbitrary order; sort them so
    # that everything derived from this list is reproducible.
    accent_subfolders = sorted(f.path for f in os.scandir(datapath) if f.is_dir())

    for accent in accent_subfolders:
        for gender in ('female', 'male'):
            gender_dir = os.path.join(accent, gender)

            # An accent without recordings for one gender is skipped rather
            # than aborting the whole scan with FileNotFoundError.
            if not os.path.isdir(gender_dir):
                continue

            for speaker in sorted(os.listdir(gender_dir)):
                speaker_dir = os.path.join(gender_dir, speaker)

                # Ignore hidden entries and stray files: only folders can
                # hold a speaker's recordings.
                if speaker.startswith('.') or not os.path.isdir(speaker_dir):
                    continue

                speaker_list.append((speaker, speaker_dir))

    return speaker_list


def get_wav_files(datapath: str) -> list[str]:
    """Return the names of the entries in ``datapath`` that end with ``.wav``.

    Only bare names are returned (as reported by ``os.listdir``), not full
    paths; the match on the extension is case-sensitive.
    """
    wav_names = []
    for entry in os.listdir(datapath):
        if entry.endswith('.wav'):
            wav_names.append(entry)
    return wav_names


def plot_melspec(melspec, fs):
    """Display a mel spectrogram with labelled axes and a dB colour bar.

    Args:
        melspec: 2-D spectrogram array to draw. Presumably the dB-scaled
            output of ``mel_spectogram_chunks`` (the colour bar is labelled
            in dB) -- confirm against the caller.
        fs: Sampling rate, in Hz, of the audio the spectrogram came from.
    """
    fig, ax = plt.subplots(figsize=(20, 8))

    img = librosa.display.specshow(melspec,
                                   y_axis='mel',
                                   fmax=fs / 2,
                                   sr=fs,
                                   hop_length=MEL_SPEC_FRAME_SIZE // 2,
                                   x_axis='time',
                                   ax=ax)

    # specshow writes its own axis labels, so the custom ones have to be
    # applied after the call or they are overwritten.
    ax.set_xlabel('Time')
    ax.set_ylabel('Mel-Frequency')
    ax.set_title('Mel Spectrogram')

    fig.colorbar(img, ax=ax, format='%+2.0f dB')
    fig.tight_layout()
    plt.show()


def mel_spectogram_chunks(wavfile_path: str, chunk_seconds: float,
                          plot: bool = False) -> list[np.ndarray]:
    """Cut an audio file into fixed-length chunks and return one dB-scaled
    mel spectrogram per chunk.

    Args:
        wavfile_path: Path of the audio file to load.
        chunk_seconds: Chunk length in seconds (may be fractional). A falsy
            value (``0`` / ``None``) yields a single chunk covering the whole
            signal.
        plot: Currently has no effect; kept so that existing calls which pass
            it keep working.

    Returns:
        The spectrograms in time order, each of shape ``(N_MELS, n_frames)``.
        The last one covers whatever is left of the signal and may therefore
        be shorter than the others. An empty signal yields an empty list.

    Raises:
        ValueError: If ``chunk_seconds`` is negative or too short to cover a
            single sample.
    """
    # Load .wav file, resampled to the notebook-wide sampling rate
    sig, fs = librosa.load(wavfile_path, sr=SAMPLING_RATE)

    if len(sig) == 0:
        return []

    # Normalise between [-1,1]. A completely silent file is left untouched,
    # since dividing by a zero peak would fill the signal with NaN.
    peak = np.max(np.abs(sig))
    if peak > 0:
        sig /= peak

    # Chunk length in samples; it must be an int to be usable as a slice bound
    samples = int(round(fs * chunk_seconds)) if chunk_seconds else len(sig)
    if samples < 1:
        raise ValueError(
            f'chunk_seconds={chunk_seconds!r} does not cover a single sample at {fs} Hz')

    hop_length = MEL_SPEC_FRAME_SIZE // 2

    melspec_chunks = []
    for start in range(0, len(sig), samples):
        melspec = librosa.feature.melspectrogram(y=sig[start:start + samples],
                                                 sr=fs,
                                                 center=True,
                                                 n_fft=MEL_SPEC_FRAME_SIZE,
                                                 hop_length=hop_length,
                                                 n_mels=N_MELS)
        # dB relative to this chunk's own maximum
        melspec_chunks.append(librosa.power_to_db(melspec, ref=np.max))

    return melspec_chunks


In [4]:
# Root folder of the speech corpus, relative to this notebook. Built with
# os.path.join so the path also resolves on non-Windows systems.
corpus_path = os.path.join('..', 'corpus')

# (speaker_id, speaker_folder) pairs for every speaker found in the corpus
speaker_path = get_speaker_paths(corpus_path)

## Create 3-second mel-spectrogram images from every speaker's .wav recordings.

In [None]:
#For every speaker create a folder

folder = os.path.join('..','data')

# exist_ok=True keeps the cell safe to re-run, while genuine failures
# (e.g. missing permissions) are still reported instead of being swallowed.
os.makedirs(folder, exist_ok=True)

for speaker,_ in speaker_path:
    os.makedirs(os.path.join(folder,speaker), exist_ok=True)

In [None]:
pylab.axis('off') #Remove Axis
pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) #Remove white padding

# Running image counter per speaker. Keyed by speaker id, so a speaker id that
# occurs in several corpus folders keeps numbering into the same output folder.
count = {speaker:0 for speaker,_ in speaker_path}

#For every speaker
for speaker, path in speaker_path:

    #For every .wav file for that speaker
    for file in get_wav_files(path):

        #Get path to .wav file
        filepath = os.path.join(path,file)

        # Slice into 3 seconds chunks
        mel_chunks = mel_spectogram_chunks(wavfile_path=filepath, chunk_seconds=3)

        # Remove last item because it is not 3 seconds.
        # NOTE: this also drops the final chunk of a recording whose length is
        # an exact multiple of 3 seconds.
        mel_chunks = mel_chunks[:-1]

        #Save every 3 second chunk into an image in the respective speaker folder
        for chunk_no, mel in enumerate(mel_chunks, start=1):
            print(f'{speaker} - {file}: {str(chunk_no).zfill(3)}                                    ',end='\r')

            plt.clf() #Important. Without this time to save plot grows linearly.
            pylab.axis('off') #Remove Axis
            pylab.axes([0., 0., 1., 1.], frameon=False, xticks=[], yticks=[]) #Remove white padding

            count[speaker] += 1
            save_path = os.path.join(folder,speaker,f'{str(count[speaker]).zfill(3)}.jpg')

            librosa.display.specshow(mel)

            pylab.savefig(save_path, bbox_inches=None, pad_inches=0, dpi=15) #Save Image
            


## For every speaker, perform train/val/test split and save in respective folders.

In [None]:
#For every split create a folder, with one sub-folder per speaker inside it

folder = os.path.join('..','data')

for split in ['train','val','test']:
    # exist_ok=True keeps the cell safe to re-run, while genuine failures
    # (e.g. missing permissions) are still reported instead of being swallowed.
    os.makedirs(os.path.join(folder,split), exist_ok=True)

    for speaker,_ in speaker_path:
        os.makedirs(os.path.join(folder,split,speaker), exist_ok=True)

In [None]:
# Perform the split by moving every speaker's images from data/<speaker>/ into
# data/<split>/<speaker>/: the first 75% of the shuffled images go to train,
# the last 15% to test and whatever lies in between (about 10%) to val.

SPLITS = ('train', 'val', 'test')
TRAIN_FRACTION = 0.75
TEST_FRACTION = 0.15
SPLIT_SEED = 42  # fixed so that re-running the notebook reproduces the same split

rng = random.Random(SPLIT_SEED)

for speaker in sorted(os.listdir(folder)):
    speaker_dir = os.path.join(folder, speaker)

    # Skip the split folders themselves and any stray file in the data folder
    if speaker in SPLITS or not os.path.isdir(speaker_dir):
        continue

    # Shuffled list of all image names of this speaker. Sorted first because
    # os.listdir order is arbitrary and would defeat the fixed seed.
    idx = sorted(os.listdir(speaker_dir))
    rng.shuffle(idx)

    train_end = int(len(idx) * TRAIN_FRACTION)
    # Counted from the front on purpose: with a negative slice, a test share
    # of 0 images (small speakers) would turn idx[-0:] into the whole list.
    test_start = len(idx) - int(len(idx) * TEST_FRACTION)

    partitions = {
        'train': idx[:train_end],
        'val': idx[train_end:test_start],
        'test': idx[test_start:],
    }

    for split, files in partitions.items():
        for file in files:
            os.replace(os.path.join(speaker_dir, file),
                       os.path.join(folder, split, speaker, file))


# Finally remove all the (now empty) speaker folders. Best effort: a folder
# that is missing or still holds files is simply left alone.
for speaker,_ in speaker_path:
    try:
        os.rmdir(os.path.join(folder,speaker))
    except OSError:
        pass
   

In [43]:
from collections import Counter

# Speaker identifiers only, one per corpus folder found
all_speakers = [entry[0] for entry in speaker_path]

# Occurrences of each identifier across the corpus
my_dict = {name: occurrences for name, occurrences in Counter(all_speakers).items()}

# Identifiers that do not occur exactly once, i.e. are shared by several folders
[name for name in my_dict if my_dict[name] != 1]


['axm001']

In [15]:
# Number of speaker folders found in the corpus (path built portably)
len(get_speaker_paths(os.path.join('..', 'corpus')))

285