In [3]:
from pathlib import Path
import os
import numpy as np
from pydub import AudioSegment
import librosa

In [4]:
def get_data_overview():
    data_dir = Path('../data')
    filenames = os.listdir(data_dir)
    
    data_overview = {}
    for file in filenames:
        if file.endswith('.m4a'):
            splits = file[:-4].split('_')
            if not splits[0] in data_overview:
                data_overview[splits[0]] = []
            data_overview[splits[0]].append(int(splits[1]))
            
    return data_overview

In [5]:
def get_audio(filepath):
    audio_obj = AudioSegment.from_file(filepath)
    audio_array = np.array(audio_obj.get_array_of_samples()).astype(np.float32)
    audio_fs = audio_obj.frame_rate    
    
    return audio_array, audio_fs

In [42]:
def get_raw(filenames):
    data_dir = Path('../data')
    
    raws = []
    laudio = 0
    for file in filenames:
        file += '.m4a'
        filepath = os.path.join(data_dir, file)
        audio_array, audio_fs = get_audio(filepath)
        laudio = max(len(audio_array), laudio)
        audio_array = np.reshape(audio_array, (1,len(audio_array)))
        raws.append(audio_array)
    
    # pad
    for i in range(len(raws)):
        nappend = laudio - np.shape(raws[i])[1]
        raws[i] = np.append(raws[i], np.zeros((1,nappend)), axis=1)
        
    return np.array(raws)

In [45]:
def get_spectrogram(filenames, n_mels=128):
    data_dir = Path('../data')
    
    spectograms = []
    laudio = 0
    for file in filenames:
        file += '.m4a'
        filepath = os.path.join(data_dir, file)
        audio_array, audio_fs = get_audio(filepath)
        spectro = librosa.feature.melspectrogram(audio_array, sr=audio_fs, n_mels=n_mels)
        log_spectro = librosa.power_to_db(spectro, ref=np.max)
        laudio = max(np.shape(log_spectro)[1], laudio)
        spectograms.append(log_spectro)
    
    # pad
    for i in range(len(spectograms)):
        nappend = laudio - np.shape(spectograms[i])[1]
        spectograms[i] = np.append(spectograms[i], np.zeros((n_mels,nappend)), axis=1)
        
    return np.array(spectograms)

In [8]:
def get_language(filenames):
    y = []
    for file in filenames:
        splits = file.split('_')
        y.append(splits[0])
        
    return y

In [9]:
def get_gender(filenames):
    """0 is male, 1 is gender"""
    y = []
    for file in filenames:
        splits = file.split('_')
        if int(splits[1])%2 == 0:
            y.append('male')
        else:
            y.append('female')
    
    return y

In [10]:
def load_train_test(x_is='raw', y_is='language', split=0.8):
    overview = get_data_overview()
    
    files_train = []
    files_test = []
    for language, items in overview.items():
        ntrain = int(split*len(items))
        items_train = items[:ntrain]
        items_test = items[ntrain:]
        files_train.extend([f"{language}_{i}" for i in items_train])
        files_test.extend([f"{language}_{i}" for i in items_test])
        
    if x_is == 'raw':
        xtrain = get_raw(files_train)
        xtest = get_raw(files_test)
    elif x_is == 'spectrogram':
        xtrain = get_spectrogram(files_train)
        xtest = get_spectrogram(files_test)
    else:
        raise ValueError("x_is must be: 'raw', 'spectrogram'")    

    if y_is == 'language':
        ytrain = get_language(files_train)
        ytest = get_language(files_test)
    elif y_is == 'gender':
        ytrain = get_gender(files_train)
        ytest = get_gender(files_test)
    else:
        raise ValueError("y_is must be: 'language', 'gender'")
    
    return (xtrain, ytrain), (xtest, ytest)

In [46]:
(xtrain, ytrain), (xtest, ytest) = load_train_test(x_is='spectrogram')
print(np.shape(xtrain))
# split after getting arrays, otherwise xtrain and xtest have different lengths

3058
3058
3198
3226
3226
3226
3226
3460
3460
3460
3460
3460
3460
3460
3460
3460
3460
3460
3872
3872
3872
3872
3872
3872
3872
3872
3872
3872
3872
3872
3872
3872
3180
3404
3404
3404
3404
3404
3404
3404
(32, 128, 3872)
(32, 128, 3872)
