In [36]:
from pathlib import Path
import os
import numpy as np
from pydub import AudioSegment
import librosa

In [37]:
def get_data_overview():
    data_dir = Path('../data')
    filenames = os.listdir(data_dir)
    
    data_overview = {}
    for file in filenames:
        if file.endswith('.m4a'):
            splits = file[:-4].split('_')
            if not splits[0] in data_overview:
                data_overview[splits[0]] = []
            data_overview[splits[0]].append(int(splits[1]))
            
    return data_overview

In [38]:
def get_audio(filepath):
    audio_obj = AudioSegment.from_file(filepath)
    audio_array = np.array(audio_obj.get_array_of_samples()).astype(np.float32)
    audio_fs = audio_obj.frame_rate    
    
    return audio_array, audio_fs

In [39]:
def get_raw(filenames):
    data_dir = Path('../data')
    
    x = []
    for file in filenames:
        file += '.m4a'
        filepath = os.path.join(data_dir, file)
        audio_array, audio_fs = get_audio(filepath)
        #! TODO: need to pad with zeros
        x.append(audio_array)
        
    return np.array(x)

In [46]:
def get_spectrogram(filenames):
    data_dir = Path('../data')
    
    x = []
    for file in filenames:
        file += '.m4a'
        filepath = os.path.join(data_dir, file)
        audio_array, audio_fs = get_audio(filepath)
        spectro = librosa.feature.melspectrogram(audio_array, sr=audio_fs, n_mels=128)
        log_spectro = librosa.power_to_db(spectro, ref=np.max)
        #! TODO: need to pad with zeros
        x.append(log_spectro)
        
    return np.array(x)

In [41]:
def get_language(filenames):
    y = []
    for file in filenames:
        splits = file.split('_')
        y.append(splits[0])
        
    return y

In [42]:
def get_gender(filenames):
    """0 is male, 1 is gender"""
    y = []
    for file in filenames:
        splits = file.split('_')
        if int(splits[1])%2 == 0:
            y.append('male')
        else:
            y.append('female')
    
    return y

In [43]:
def load_train_test(x_is='raw', y_is='language', split=0.8):
    overview = get_data_overview()
    
    files_train = []
    files_test = []
    for language, items in overview.items():
        ntrain = int(split*len(items))
        items_train = items[:ntrain]
        items_test = items[ntrain:]
        files_train.extend([f"{language}_{i}" for i in items_train])
        files_test.extend([f"{language}_{i}" for i in items_test])
        
    if x_is == 'raw':
        xtrain = get_raw(files_train)
        xtest = get_raw(files_test)
    elif x_is == 'spectrogram':
        xtrain = get_spectrogram(files_train)
        xtest = get_spectrogram(files_test)
    else:
        raise ValueError("x_is must be: 'raw', 'spectrogram'")    

    if y_is == 'language':
        ytrain = get_language(files_train)
        ytest = get_language(files_test)
    elif y_is == 'gender':
        ytrain = get_gender(files_train)
        ytest = get_gender(files_test)
    else:
        raise ValueError("y_is must be: 'language', 'gender'")
        
    print(np.shape(xtrain))
    
    return (xtrain, ytrain), (xtest, ytest)

In [47]:
load_train_test(x_is='spectrogram')

(128, 3058)
(128, 2954)
(128, 3198)
(128, 3226)
(128, 2852)
(128, 3170)
(128, 3160)
(128, 3460)
(128, 3076)
(128, 2944)
(128, 3180)
(128, 3188)
(128, 2880)
(128, 3076)
(128, 3216)
(128, 3122)
(128, 3394)
(128, 2522)
(128, 3872)
(128, 2786)
(128, 2448)
(128, 3038)
(128, 3488)
(128, 2420)
(128, 2944)
(128, 2372)
(128, 2794)
(128, 3010)
(128, 2916)
(128, 3198)
(128, 2570)
(128, 2832)


ValueError: could not broadcast input array from shape (128,3058) into shape (128)