The right way to generate mel-spectrogram #3

v-nhandt21 · 2022-06-10T12:42:07Z

I found your repo from this issue: jik876/hifi-gan#63

I am still confused about the mismatch between repos in mel-spectrogram generation. I collected several methods from different TTS repos, and there are some differences between them, such as:

STFT from torch vs librosa
Log mel with base e vs base 10
Difference in padding
Use center or not


def get_mel_librosa1(wave):
    """Compute a mel spectrogram with a single librosa call.

    The waveform is divided by ``max_wav_value`` and cast to float32, then
    passed to ``librosa.feature.melspectrogram`` together with the
    module-level analysis settings.  No log compression is applied here.

    Args:
        wave: Raw audio samples; must support ``/`` and ``.astype``
            (presumably a NumPy array -- confirm against the caller).

    Returns:
        Whatever ``librosa.feature.melspectrogram`` returns for these inputs.
    """
    samples = (wave / max_wav_value).astype('float32')
    # center / pad_mode / power are deliberately left at librosa's defaults
    # (the original author noted them as center=True, pad_mode='constant',
    # power=2.0 -- verify against the installed librosa version).
    return librosa.feature.melspectrogram(
        y=samples,
        sr=sampling_rate,
        n_mels=num_mels,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )

def get_mel_librosa2(wave):
    """Compute a dB-scaled mel spectrogram via an explicit librosa STFT.

    Steps: scale the waveform by ``max_wav_value`` and cast to float32, take
    the STFT, keep its magnitude, project it to the mel scale by passing it
    as ``S`` to ``librosa.feature.melspectrogram``, and finally convert to
    decibels with ``librosa.amplitude_to_db`` using ``ref=np.min``.

    Args:
        wave: Raw audio samples; must support ``/`` and ``.astype``
            (presumably a NumPy array -- confirm against the caller).

    Returns:
        The array produced by ``librosa.amplitude_to_db``.
    """
    samples = (wave / max_wav_value).astype('float32')

    # The same analysis settings are passed to both librosa calls below.
    stft_settings = dict(
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
    )

    magnitude, _phase = librosa.magphase(librosa.stft(samples, **stft_settings))
    mel_magnitude = librosa.feature.melspectrogram(
        S=magnitude, sr=sampling_rate, n_mels=num_mels, **stft_settings
    )
    return librosa.amplitude_to_db(mel_magnitude, ref=np.min)

def get_mel_parallelwavegan(wave):
    """Compute a log10 mel spectrogram from a centered, reflect-padded STFT.

    The waveform is scaled by ``max_wav_value`` and cast to float32.  Its
    STFT magnitude is multiplied with the transposed ``melbasis``, floored
    at ``eps`` so the logarithm stays finite, and passed through ``log10``.

    Args:
        wave: Raw audio samples; must support ``/`` and ``.astype``
            (presumably a NumPy array -- confirm against the caller).

    Returns:
        The transposed log10 mel array (mel axis first, assuming
        ``melbasis`` is laid out as (n_mels, n_bins) -- TODO confirm).
    """
    samples = (wave / max_wav_value).astype('float32')

    spectrum = librosa.stft(
        samples,
        n_fft=fft_size,
        hop_length=hop_size,
        win_length=win_length,
        window=window_librosa,
        center=True,
        pad_mode="reflect",
    )

    # Amplitude spectrogram laid out as (#frames, #bins).
    magnitude_frames = np.abs(spectrum).T
    mel_frames = np.dot(magnitude_frames, melbasis.T)
    floored = np.maximum(eps, mel_frames)
    return np.log10(floored).T

def get_mel_tacotron2(wave):
    """Compute a mel spectrogram with Tacotron 2's ``TacotronSTFT``.

    The waveform is converted to a float tensor, divided by
    ``max_wav_value``, given a leading batch dimension and passed to
    ``TacotronSTFT.mel_spectrogram``.

    Args:
        wave: Raw audio samples accepted by ``torch.FloatTensor``.

    Returns:
        NumPy array holding the mel spectrogram with the batch dimension
        squeezed away.
    """
    # (1, num_samples): mel_spectrogram is called on a batched tensor.
    # The former torch.autograd.Variable(..., requires_grad=False) wrapper
    # was dropped: it is deprecated and returns the tensor unchanged.
    audio_norm = (torch.FloatTensor(wave) / max_wav_value).unsqueeze(0)

    # Pass the configured win_length as the third argument (the window
    # length in NVIDIA's Tacotron 2 implementation) instead of repeating
    # fft_size, so this extractor uses the same analysis window as the
    # other extractors.  The result is unchanged when
    # win_length == fft_size.
    _stft = TacotronSTFT(
        fft_size, hop_size, win_length, num_mels, sampling_rate, fmin, fmax
    )

    melspec = torch.squeeze(_stft.mel_spectrogram(audio_norm), 0)
    return melspec.cpu().detach().numpy()

def get_mel_hifigan_origin(y):
    """Compute a natural-log mel spectrogram with HiFi-GAN's original padding.

    The waveform is scaled by ``max_wav_value``, reflect-padded by
    ``(fft_size - hop_size) / 2`` samples on each side, and analysed with a
    non-centered ``torch.stft``.  The magnitude is projected through
    ``melbasis``, clamped at 1e-5 and log-compressed.

    Args:
        y: Raw audio samples (expected to be a 1-D NumPy array; it must
            support division by ``max_wav_value``).

    Returns:
        NumPy array with the batch dimension removed.
    """
    y = y / max_wav_value
    # Convert the array directly and add the batch axis afterwards;
    # torch.FloatTensor([ndarray]) goes through a slow list-of-arrays path.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    pad = int((fft_size - hop_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    # return_complex=False is deprecated; request the complex result and
    # view it as (..., 2) real/imag pairs, which is the same layout.
    spec = torch.view_as_real(
        torch.stft(
            y,
            fft_size,
            hop_length=hop_size,
            win_length=win_length,
            window=window_torch,
            center=False,
            pad_mode='reflect',
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )
    # Magnitude; the small constant keeps sqrt away from exactly zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Floor before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]

def get_mel_hifigan_center(y):
    """Compute a natural-log mel spectrogram with ``torch.stft(center=True)``.

    Same pipeline as the HiFi-GAN extractor but without manual padding: the
    STFT itself centers the frames using reflect padding.  The magnitude is
    projected through ``melbasis``, clamped at 1e-5 and log-compressed.

    Args:
        y: Raw audio samples (expected to be a 1-D NumPy array; it must
            support division by ``max_wav_value``).

    Returns:
        NumPy array with the batch dimension removed.
    """
    y = y / max_wav_value
    # Convert the array directly and add the batch axis afterwards;
    # torch.FloatTensor([ndarray]) goes through a slow list-of-arrays path.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    # return_complex=False is deprecated; request the complex result and
    # view it as (..., 2) real/imag pairs, which is the same layout.
    spec = torch.view_as_real(
        torch.stft(
            y,
            fft_size,
            hop_length=hop_size,
            win_length=win_length,
            window=window_torch,
            center=True,
            pad_mode='reflect',
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )
    # Magnitude; the small constant keeps sqrt away from exactly zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Floor before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))
    return spec.cpu().detach().numpy()[0]

def get_mel_hifigan_change_pad(y):
    """Compute a natural-log mel spectrogram with ``fft_size / 2`` padding.

    Variant of the HiFi-GAN extractor discussed in
    https://github.com/jik876/hifi-gan/issues/63: the waveform is
    reflect-padded by ``fft_size / 2`` samples on each side before a
    non-centered ``torch.stft``.  The magnitude is projected through
    ``melbasis``, clamped at 1e-5 and log-compressed.

    Args:
        y: Raw audio samples (expected to be a 1-D NumPy array; it must
            support division by ``max_wav_value``).

    Returns:
        NumPy array with the batch dimension removed.
    """
    y = y / max_wav_value
    # Convert the array directly and add the batch axis afterwards;
    # torch.FloatTensor([ndarray]) goes through a slow list-of-arrays path.
    y = torch.as_tensor(y, dtype=torch.float32).unsqueeze(0).to(device)

    pad = int((fft_size) / 2)
    y = torch.nn.functional.pad(y.unsqueeze(1), (pad, pad), mode='reflect').squeeze(1)

    # return_complex=False is deprecated; request the complex result and
    # view it as (..., 2) real/imag pairs, which is the same layout.
    spec = torch.view_as_real(
        torch.stft(
            y,
            fft_size,
            hop_length=hop_size,
            win_length=win_length,
            window=window_torch,
            center=False,
            pad_mode='reflect',
            normalized=False,
            onesided=True,
            return_complex=True,
        )
    )
    # Magnitude; the small constant keeps sqrt away from exactly zero.
    spec = torch.sqrt(spec.pow(2).sum(-1) + (1e-9))

    mel_basis = torch.from_numpy(melbasis).float().to(device)
    spec = torch.matmul(mel_basis, spec)
    # Floor before the log so silent bins do not produce -inf.
    spec = torch.log(torch.clamp(spec, min=1e-5))

    return spec.cpu().detach().numpy()[0]

 # Run every extractor on the same waveform, in a fixed order, so the
 # resulting spectrograms can be compared side by side.
 mel0, mel1, mel2, mel3, mel4, mel5, mel6 = (
     extract(wave)
     for extract in (
         get_mel_librosa1,
         get_mel_librosa2,
         get_mel_parallelwavegan,
         get_mel_tacotron2,
         get_mel_hifigan_origin,
         get_mel_hifigan_center,
         get_mel_hifigan_change_pad,
     )
 )

(80, 487)
(80, 487)
(80, 487)
(80, 487)
(80, 486)
(80, 487)
(80, 487)

Only the original method from the HiFi-GAN repo gives a different shape: get_mel_hifigan_origin

Do you have any comments on this? When I compare the element values, no two of these methods match exactly.

One more question: is there any benchmark for these vocoders?

The text was updated successfully, but these errors were encountered:

v-nhandt21 · 2022-06-10T12:43:17Z

Which one should I use when I want to fine-tune HiFi-GAN on Tacotron outputs?

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

The right way to generate mel-spectrogram #3

The right way to generate mel-spectrogram #3

v-nhandt21 commented Jun 10, 2022

v-nhandt21 commented Jun 10, 2022

The right way to generate mel-spectrogram #3

The right way to generate mel-spectrogram #3

Comments

v-nhandt21 commented Jun 10, 2022

v-nhandt21 commented Jun 10, 2022