In [1]:
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio

import plotly.graph_objects as go
import plotly.express as px
import matplotlib.pyplot as plt

import librosa
import librosa.display
from fastpitch.data_function import TTSDataset, load_wav_to_torch


hifi_ds = TTSDataset('/root/datasets/',
                    ['filelists/Cori_Samuel_audio_pitch_text_train.txt'],
                    text_cleaners=['english_cleaners_v2'],
                    n_mel_channels=80,
                    symbol_set='english_basic',
                    p_arpabet=0.0,
                    n_speakers=1,
                    load_mel_from_disk=False,
                    load_pitch_from_disk=False,
                    pitch_mean=None,
                    pitch_std=None,
                    max_wav_value=32768.0,
                    sampling_rate=22050,
                    filter_length=1024,
                    hop_length=256,
                    win_length=1024,
                    mel_fmin=0,
                    mel_fmax=8000,
                    betabinomial_online_dir=None,
                    pitch_online_dir=None,
                    pitch_online_method='pyin')

lj_ds = TTSDataset('/root/datasets/LJSpeech-1.1',
                   ['filelists/ljs_audio_pitch_text_train_v3.txt'],
                   text_cleaners=['english_cleaners_v2'],
                   n_mel_channels=80,
                   symbol_set='english_basic',
                   p_arpabet=0.0,
                   n_speakers=1,
                   load_mel_from_disk=False,
                   load_pitch_from_disk=False,
                   pitch_mean=None,
                   pitch_std=None,
                   max_wav_value=32768.0,
                   sampling_rate=22050,
                   filter_length=1024,
                   hop_length=256,
                   win_length=1024,
                   mel_fmin=0,
                   mel_fmax=8000,
                   betabinomial_online_dir=None,
                   pitch_online_dir=None,
                   pitch_online_method='pyin')



In [9]:
index = 300

audiopath, *extra, text, speaker = hifi_ds.audiopaths_and_text[index]
# audiopath = '/home/aleksey/datasets/exeter_01_heath_0012.wav'
audiopath = '/home/aleksey/datasets/LJ001-0007.wav'
print(text)

audio, sampling_rate = load_wav_to_torch(audiopath)
if sampling_rate != lj_ds.stft.sampling_rate:
    raise ValueError("{} SR doesn't match target {} SR".format(
        sampling_rate, lj_ds.stft.sampling_rate))
audio_norm = audio / lj_ds.max_wav_value
audio_norm = audio_norm.unsqueeze(0)
audio_norm = torch.autograd.Variable(audio_norm, requires_grad=False)
melspec = lj_ds.stft.mel_spectrogram(audio_norm)
melspec = lj_ds.stft.spectral_de_normalize(melspec)
melspec = torch.squeeze(melspec, 0)
mel_len = melspec.size(-1)

snd, sr = librosa.load(audiopath)
pitch_mel, voiced_flag, voiced_probs = librosa.pyin(
    snd, fmin=librosa.note_to_hz('C2'),
    fmax=librosa.note_to_hz('C7'), frame_length=1024)
assert np.abs(mel_len - pitch_mel.shape[0]) <= 1.0

pitch_mel = np.where(np.isnan(pitch_mel), 0.0, pitch_mel)
mel_fq = librosa.mel_frequencies(n_mels=80, fmin=librosa.note_to_hz('C2'), fmax=librosa.note_to_hz('C7'), htk=True)
pitch_mel = [np.abs(mel_fq - p).argmin() if p != 0.0 else None for p in pitch_mel]

fig = px.imshow(melspec)
fig.add_trace(go.Scatter(y=pitch_mel, line={'width': 2, 'color': 'green'}))
fig.show()

i may as well exhaust it without paying any regard to the chronological order of my reminiscences


In [10]:
from pathlib import Path
import re
import torch
from hifigan.models import Generator


def load_hifigan(ckpt_path: Path) -> Generator:
    ckpt_data = torch.load(ckpt_path)
    ckpt_config = ckpt_data.get('config')
    model_config = ckpt_config
    model = Generator(model_config)
    sd = ckpt_data['generator']
    sd = {re.sub('^module\.', '', k): v for k, v in sd.items()}
    status = model.load_state_dict(sd, strict=False)
    model.remove_weight_norm()
    model.eval()
    return model

# hifigan_path = Path('~') / 'weights' / 'hifigan' / 'hifigan_gen_checkpoint_10000_ft.pt'
# hifigan_model = load_hifigan(hifigan_path.expanduser())

# audio = hifigan_model(melspec.unsqueeze(0).float()).float().squeeze(1)
melspec_ = (melspec * lj_ds.max_wav_value).numpy().astype(np.float32)
audio = librosa.feature.inverse.mel_to_audio(melspec_, sr=22050, n_fft=1024,
                   hop_length=256,
                   win_length=1024,)
audio = torch.from_numpy(audio.astype(np.float32) / lj_ds.max_wav_value).unsqueeze(0)
print(audio.max())

torchaudio.save('test.wav', audio, 22050)

tensor(5.1020e-06)
