In [None]:
import torchaudio

import numpy as np
from pathlib import Path

from helpers import extract_metadata 

In [None]:
##### CONFIG #####

# Location of the LJSpeech wav files.
FOLDER = Path('data/LJSpeech-1.1/wavs')

# Presumably the expected number of wav files in FOLDER; not referenced by the
# cells visible in this notebook -- TODO confirm it is still needed.
WAV_FILES = 13100

# Fixed lengths every sample is padded to:
#   MAX_CHARACTERS -> length of an encoded transcript
#   MAX_SPEECH     -> number of frames of a mel spectrogram
MAX_CHARACTERS = 331
MAX_SPEECH = 1114

# Text encoding: a character is encoded as its position in CHARSET.
# NOTE: the charset is deliberately large, it keeps every original character.
# NOTE: do not remove the leading space in CHARSET (""" ABC..."""): the space
# must stay at index 0, presumably because encoded texts are padded with 0.
CHARSET = """ !"$&'(),-.0123456789:;?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyz|£àâèéêü’“”"""

In [None]:
# Show the charset itself, followed by the number of symbols it contains.
charset_size = len(CHARSET)
print(CHARSET)
print(f'Size|Len of charset: {charset_size} characters')

In [None]:
df_metadata = extract_metadata()

# Build the character -> index table once instead of scanning CHARSET with
# CHARSET.index() for every single character. setdefault() keeps the first
# occurrence of a symbol, which is exactly what CHARSET.index() returns.
char_to_index = {}
for position, symbol in enumerate(CHARSET):
    char_to_index.setdefault(symbol, position)

# TODO: Better name than full_charset
# One fixed-length integer vector (MAX_CHARACTERS entries) per transcript.
full_charset = []
for text in df_metadata['text']:
    unknown_chars = sorted(set(text).difference(char_to_index))
    if unknown_chars:
        raise ValueError(
            f'Characters {unknown_chars!r} in {text!r} are not part of CHARSET'
        )
    if len(text) > MAX_CHARACTERS:
        raise ValueError(
            f'Text has {len(text)} characters, more than '
            f'MAX_CHARACTERS={MAX_CHARACTERS}: {text!r}'
        )

    # dtype=int keeps an empty transcript from becoming a float array.
    text_encoded = np.array([char_to_index[char] for char in text], dtype=int)
    # Right-pad with zeros up to MAX_CHARACTERS.
    # NOTE: 0 is also the code of the space character (CHARSET[0]), so the
    # padding is indistinguishable from trailing spaces.
    text_encoded_and_padded = np.pad(text_encoded, (0, MAX_CHARACTERS - len(text_encoded)))
    full_charset.append(text_encoded_and_padded)

df_metadata['charset'] = full_charset
# df_metadata.head()

In [None]:
# Needs a lot of memory when everything runs in one process: every wav file
# contributes one (n_mels, MAX_SPEECH) float64 array to mel_specgram_all.
#
# The files are sorted because the order returned by glob() depends on the
# file system, while the spectrograms are later attached to df_metadata by
# position. NOTE(review): this assumes extract_metadata() returns its rows in
# the same (sorted by file name) order -- TODO confirm.
files = sorted(Path(FOLDER).glob('*.wav'))

n_mels = 50
# Smallest value passed to the logarithm. Without it the zero padding (and any
# exactly-zero mel bin) becomes -inf and numpy emits a divide-by-zero warning.
log_floor = 1e-10

# One MelSpectrogram transform per sample rate, so the transform is not
# rebuilt for every single file.
mel_transforms = {}
mel_specgram_all = []
for file in files:
    waveform, sample_rate = torchaudio.load(file)
    transform = mel_transforms.get(sample_rate)
    if transform is None:
        transform = torchaudio.transforms.MelSpectrogram(sample_rate, n_mels=n_mels)
        mel_transforms[sample_rate] = transform

    mel_specgram = transform(waveform)
    # Flatten to (n_mels, frames).
    # NOTE(review): assumes single-channel audio; with several channels this
    # reshape would mix them together -- TODO confirm.
    mel_specgram = mel_specgram.numpy().reshape(n_mels, -1)

    n_frames = mel_specgram.shape[1]
    if n_frames > MAX_SPEECH:
        raise ValueError(
            f'{file} has {n_frames} frames, more than MAX_SPEECH={MAX_SPEECH}'
        )

    # Right-pad along the frame axis so every spectrogram has MAX_SPEECH frames.
    zeros = np.zeros((n_mels, MAX_SPEECH - n_frames), dtype=float)
    mel_specgram_padded = np.concatenate((mel_specgram, zeros), axis=1)
    # Stored in the log domain; values are floored at log_floor first.
    mel_specgram_all.append(np.log(np.maximum(mel_specgram_padded, log_floor)))

In [None]:
# Add spectrogram to the dataframe
# The list is assigned by position: its i-th entry ends up in the i-th row, so
# mel_specgram_all must have exactly one entry per row of df_metadata.
# NOTE(review): this relies on the wav files having been processed in the same
# order as the metadata rows -- TODO confirm.
df_metadata['spectrogram'] = mel_specgram_all
# Display the first rows as a quick sanity check of the new column.
df_metadata.head()

In [None]:
# Persist the dataframe, spectrograms included, as a pickle file in the
# parent directory of the wav folder.
# NB: the spectrogram column holds the log of the mel spectrogram.
pickle_path = FOLDER.parent / 'LJSpeech-1.1.pkl'
df_metadata.to_pickle(pickle_path)