In [87]:
from pathlib import Path
import os
from glob import glob
import numpy as np
from pydub import AudioSegment
import librosa

In [88]:
def get_filepaths(data_dir='../data/raw'):
    """Return the ``.m4a`` files located directly inside ``data_dir``.

    Parameters
    ----------
    data_dir : str or Path, optional
        Directory to search (non-recursive). Defaults to ``'../data/raw'``,
        which is resolved against the current working directory.

    Returns
    -------
    list of Path
        Matching paths in sorted order. An empty list is returned when the
        directory has no ``.m4a`` files or does not exist.
    """
    # Path.glob yields entries in an arbitrary, filesystem-dependent order.
    # The train/test split in this file is positional, so sort the paths to
    # make the split identical from one run (or machine) to the next.
    return sorted(Path(data_dir).glob('*.m4a'))

In [89]:
def get_data_overview(filepaths):
    """Group the recording indices found in ``filepaths`` by language.

    Each file name is read as ``<language>_<index>.<ext>``: the text before
    the first underscore is the language, the field after it is parsed as an
    integer index.

    Parameters
    ----------
    filepaths : iterable of str or Path
        Paths of the recordings.

    Returns
    -------
    dict
        Maps each language to the list of its integer indices, in the order
        the files were encountered.

    Raises
    ------
    IndexError
        If a file name contains no underscore.
    ValueError
        If the field after the first underscore is not an integer.
    """
    data_overview = {}
    for filepath in filepaths:
        # Path.stem drops directory and extension without assuming that '/'
        # is the path separator.
        fields = Path(filepath).stem.split('_')
        data_overview.setdefault(fields[0], []).append(int(fields[1]))

    return data_overview

In [90]:
def get_audio(filepath):
    """Decode an audio file into a mono float32 sample array.

    Parameters
    ----------
    filepath : str or Path
        File to decode; passed to ``AudioSegment.from_file``.

    Returns
    -------
    audio_array : numpy.ndarray
        1-D float32 array with one value per frame. Sample values are cast
        as-is; no rescaling is applied.
    audio_fs : int
        Frame rate reported by the decoded segment.
    """
    audio_obj = AudioSegment.from_file(filepath)
    audio_array = np.array(audio_obj.get_array_of_samples()).astype(np.float32)

    # pydub returns multi-channel audio interleaved (L, R, L, R, ...). Used
    # as-is, that would be a scrambled signal with `channels` times too many
    # samples for the reported frame rate, so average the channels to mono.
    nchannels = audio_obj.channels
    if nchannels > 1:
        audio_array = audio_array.reshape(-1, nchannels).mean(axis=1)

    return audio_array, audio_obj.frame_rate

In [91]:
def get_xarrays(filepaths, x_is='spectrogram', n_mels=128):
    """Build one zero-padded feature array covering every file.

    Parameters
    ----------
    filepaths : iterable of str or Path
        Audio files, each decoded with ``get_audio``.
    x_is : {'raw', 'spectrogram'}, optional
        ``'raw'`` keeps the waveform as a single row per file;
        ``'spectrogram'`` computes a mel spectrogram converted to dB relative
        to that file's own maximum.
    n_mels : int, optional
        Number of mel bands; only used when ``x_is == 'spectrogram'``.

    Returns
    -------
    numpy.ndarray
        Float64 array of shape ``(n_files, rows, longest)`` where ``rows`` is
        1 for ``'raw'`` and ``n_mels`` for ``'spectrogram'``. Shorter items
        are right-padded with zeros along the last axis. With no files, an
        empty array of shape ``(0,)`` is returned.

    Raises
    ------
    ValueError
        If ``x_is`` is not one of the supported modes.
    """
    # Validate before any audio is decoded so a typo fails immediately.
    if x_is not in ('raw', 'spectrogram'):
        raise ValueError("x_is must be: 'raw', 'spectrogram'")

    xarrays = []
    for filepath in filepaths:
        audio_array, audio_fs = get_audio(filepath)
        if x_is == 'raw':
            xarrays.append(np.reshape(audio_array, (1, len(audio_array))))
        else:
            # `y` must be passed by keyword: recent librosa releases no
            # longer accept the audio as a positional argument.
            spectro = librosa.feature.melspectrogram(y=audio_array, sr=audio_fs, n_mels=n_mels)
            xarrays.append(librosa.power_to_db(spectro, ref=np.max))

    if not xarrays:
        return np.array(xarrays)

    # Pad: copy every item into a zero-filled array as wide as the longest.
    # NOTE(review): for spectrograms the values are dB relative to the peak,
    # so a pad value of 0 equals the loudest bin rather than silence --
    # confirm this is what downstream models expect.
    laudio = max(xarray.shape[1] for xarray in xarrays)
    padded = np.zeros((len(xarrays), xarrays[0].shape[0], laudio))
    for i, xarray in enumerate(xarrays):
        padded[i, :, :xarray.shape[1]] = xarray

    return padded

In [92]:
def get_yarrays(filepaths, y_is='language'):
    """Derive one label per file from its name.

    Each file name is read as ``<language>_<index>.<ext>``.

    Parameters
    ----------
    filepaths : iterable of str or Path
        Paths of the recordings.
    y_is : {'language', 'gender'}, optional
        ``'language'`` uses the text before the first underscore.
        ``'gender'`` yields ``'male'`` when the integer index after the first
        underscore is even and ``'female'`` when it is odd.

    Returns
    -------
    list of str
        One label per input path, in input order.

    Raises
    ------
    ValueError
        If ``y_is`` is not a supported label kind, or (for ``'gender'``) the
        index field is not an integer.
    """
    # An unknown label kind used to yield an empty list silently; fail loudly
    # instead so features and labels cannot end up with different lengths.
    if y_is not in ('language', 'gender'):
        raise ValueError("y_is must be: 'language', 'gender'")

    yarrays = []
    for filepath in filepaths:
        # Path.stem drops directory and extension without assuming that '/'
        # is the path separator.
        fields = Path(filepath).stem.split('_')
        if y_is == 'language':
            yarrays.append(fields[0])
        else:
            yarrays.append('male' if int(fields[1]) % 2 == 0 else 'female')

    return yarrays
     

In [93]:
def split_train_test(filepaths, arrays, split):
    """Split ``arrays`` into train and test parts, stratified by language.

    The language of a file is the text before the first underscore in its
    name. Within each language, the first ``int(split * n)`` files (in the
    order they appear in ``filepaths``) go to the train part and the rest to
    the test part.

    Parameters
    ----------
    filepaths : sequence of str or Path
        Paths of the recordings.
    arrays : sequence
        Items aligned with ``filepaths`` (features or labels).
    split : float
        Fraction of each language's files assigned to the train part.

    Returns
    -------
    tuple of numpy.ndarray
        ``(arrays_train, arrays_test)``; both keep the relative order of the
        input.
    """
    # Work with positions rather than rebuilding file names from parsed
    # integers: a name such as 'en_01' would otherwise be reconstructed as
    # 'en_1' and match neither part.
    positions_by_language = {}
    for position, filepath in enumerate(filepaths):
        language = Path(filepath).stem.split('_')[0]
        positions_by_language.setdefault(language, []).append(position)

    train_positions = set()
    for positions in positions_by_language.values():
        ntrain = int(split * len(positions))
        train_positions.update(positions[:ntrain])

    arrays_train = []
    arrays_test = []
    # zip stops at the shorter of the two inputs.
    for position, (_, array) in enumerate(zip(filepaths, arrays)):
        if position in train_positions:
            arrays_train.append(array)
        else:
            arrays_test.append(array)

    return (np.array(arrays_train), np.array(arrays_test))
        
            

In [94]:
def build_train_test(x_is='raw', y_is='language', split=0.8):
    """Load every recording and return matching train/test features and labels.

    Parameters
    ----------
    x_is : str, optional
        Feature kind, forwarded to ``get_xarrays``.
    y_is : str, optional
        Label kind, forwarded to ``get_yarrays``.
    split : float, optional
        Train fraction, forwarded to ``split_train_test``.

    Returns
    -------
    tuple
        ``((xtrain, ytrain), (xtest, ytest))``.
    """
    paths = get_filepaths()
    features = get_xarrays(paths, x_is=x_is)
    labels = get_yarrays(paths, y_is=y_is)

    # Features and labels are split with the same paths and fraction so the
    # two halves stay aligned item by item.
    features_train, features_test = split_train_test(paths, features, split)
    labels_train, labels_test = split_train_test(paths, labels, split)

    train_part = (features_train, labels_train)
    test_part = (features_test, labels_test)
    return train_part, test_part

In [95]:
# Feature and label kinds for this run; the same two values also name the
# saved archive further down.
x_is, y_is = 'spectrogram', 'language'
(xtrain, ytrain), (xtest, ytest) = build_train_test(x_is, y_is)

In [103]:
# Persist the four arrays in a single .npz archive named after the feature
# and label kinds.
savedir = Path('../data/preprocessed')
# np.savez does not create missing directories, so make sure the target
# exists before writing.
savedir.mkdir(parents=True, exist_ok=True)
savefile = os.path.join(savedir, f'{x_is}_{y_is}.npz')
np.savez(savefile, xtrain=xtrain, ytrain=ytrain, xtest=xtest, ytest=ytest)

In [108]:
# Sanity check: reopen the archive that was just written and show the shape
# of the stored training features.
npzfile = np.load(savefile)
print(np.shape(npzfile['xtrain']))

(8, 128, 3872)