In [1]:
import os

import numpy as np

import joblib
import soundfile as sf
import torch
import json
import sys
import librosa

In [2]:
sys.path.append('../../egs')
from gslm.speech2unit.pretrained.hubert_feature_reader import (
    HubertFeatureReader,
)

from gslm.unit2speech.tts_data import (
    TacotronInputDataset,
)
from gslm.unit2speech.utils import (
    load_quantized_audio_from_file,
    load_tacotron,
    load_waveglow,
    synthesize_audio,
)

In [40]:
sys.path.append("/net/papilio/storage2/yhaoyuan/LAbyLM")
import hifigan
from hifigan.env import AttrDict
from hifigan.models import Generator

# Prefer the GPU when one is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
from gslm.speech2unit.HuBERT_100_unit import get_units

2023-02-15 15:27:37 | INFO | fairseq.tasks.hubert_pretraining | current directory is /net/papilio/storage2/yhaoyuan/transformer_I2S/dataprep/I2U
2023-02-15 15:27:37 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-02-15 15:27:37 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'a

In [41]:
# HiFi-GAN vocoder
# The pretrained WaveGlow has problems: its output audio is [nan, nan, ...],
# so HiFi-GAN is used instead.
# The HiFi-GAN input is a float mel tensor shaped [80, len(audio) / hop_len == T].

# checkpoint_file = "/net/papilio/storage2/yhaoyuan/LAbyLM/hifigan/LJ_FT_T2_V3/generator_v3"
checkpoint_file = "/net/papilio/storage2/yhaoyuan/LAbyLM/hifigan/FOOD_V1_24K_Speaker3/generator_v1_24k"

# Fail early, before any model is built, if the checkpoint is missing.
assert os.path.isfile(checkpoint_file), f"HiFi-GAN checkpoint not found: {checkpoint_file}"

# The generator's config.json is expected to sit next to the checkpoint file.
config_file = os.path.join(os.path.dirname(checkpoint_file), 'config.json')
with open(config_file) as f:
    data = f.read()

# `h` is a plain module-level name here; a `global` statement is a no-op at
# the top level of a cell, so none is needed.
json_config = json.loads(data)
h = AttrDict(json_config)

# Build the generator on the selected device and load its trained weights.
generator = Generator(h).to(device)
checkpoint_dict = torch.load(checkpoint_file, map_location=device)
generator.load_state_dict(checkpoint_dict['generator'])

# Switch to inference mode and strip weight normalisation before synthesis.
generator.eval()
generator.remove_weight_norm()

Removing weight norm...


In [42]:
# Unit-to-speech (Tacotron) settings used by the loading cell below.

# Upper bound on decoder steps handed to load_tacotron.
max_decoder_steps = 2000

# Checkpoint of the unit-to-speech Tacotron model.
tts_model_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/u2S/HuBERT_KM100_tts_checkpoint_best.pt"

# Fallback code dictionary, used when the path stored in the checkpoint's
# hparams does not exist on this machine.
code_dict_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/u2S/HuBERT_KM100_code_dict"

In [43]:
# Load the unit-to-speech Tacotron model together with the sample rate and
# hyper-parameters that load_tacotron returns alongside it.
tacotron_model, sample_rate, hparams = load_tacotron(
    max_decoder_steps=max_decoder_steps,
    tacotron_model_path=tts_model_path,
)

# WaveGlow is intentionally not loaded in this notebook:
# waveglow, denoiser = load_waveglow(waveglow_path=waveglow_path)

# The code-dictionary path stored in hparams may not exist on this machine;
# if so, point it at the local copy instead.
if not os.path.exists(hparams.code_dict): hparams.code_dict = code_dict_path

# Dataset helper built from the (possibly patched) hyper-parameters.
tts_dataset = TacotronInputDataset(hparams)

In [66]:
# Display the code-dictionary path currently set on hparams.
hparams.code_dict

'/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/u2S/HuBERT_KM100_code_dict'

In [44]:
def synthesize_mel(model, inp, lab=None, strength=0.0):
    """Run model inference on a single input sequence and return its mel output.

    Args:
        model: Object exposing ``inference(inp, lab, ret_has_eos=True)`` that
            returns a 5-tuple; the 2nd item is used as the mel output and the
            5th as the end-of-sequence flag.
        inp: Input tensor whose first dimension (batch) must be exactly 1.
        lab: Optional label value. When given, it is passed to the model as a
            one-element long tensor; when ``None`` it is passed through as-is.
        strength: Unused; kept so existing callers that pass it keep working.

    Returns:
        Tuple ``(mel, has_eos)`` taken from the result of ``model.inference``.

    Raises:
        AssertionError: If ``inp`` does not have a batch size of 1.
    """
    assert inp.size(0) == 1, "synthesize_mel expects a batch of exactly one sequence"

    # Run on whichever device holds the model's weights instead of
    # hard-coding CUDA, so the function also works on CPU-only machines.
    try:
        target_device = next(model.parameters()).device
    except (AttributeError, StopIteration, TypeError):
        # The model exposes no parameters to inspect: use CUDA when present.
        target_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    inp = inp.to(target_device)
    if lab is not None:
        lab = torch.empty(1, dtype=torch.long, device=target_device).fill_(lab)

    # Inference only: no gradients are needed.
    with torch.no_grad():
        _, mel, _, _, has_eos = model.inference(inp, lab, ret_has_eos=True)
    return mel, has_eos

In [5]:
# # Load HuBERT
# hubert_model = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/S2u/hubert_base_ls960.pt"
# hubert_cluster_100 = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/S2u/HuBERT_100_km.bin"
# feature_reader = HubertFeatureReader(checkpoint_path=hubert_model, layer=6)

# # Load Cluster
# kmeans_model = joblib.load(open(hubert_cluster_100, "rb"))
# kmeans_model.verbose = False

In [6]:
# def run_length_encoding(seq):
#     pred = []
#     prev = -1
#     for i in seq:
#         if i != prev:
#             pred.append(i)
#             prev = i
#         else:
#             continue
#     return np.array(pred)

In [57]:
# Path of the input audio file whose units are extracted with get_units.
wav = "/net/papilio/storage2/yhaoyuan/LAbyLM/audios_24k_trimmed/5/147.wav"

In [58]:
# Convert the audio file into a sequence of integer unit IDs.
# NOTE(review): the name suggests repeated units are already collapsed
# (run-length encoded) inside get_units — confirm against its implementation.
pred_RLE = get_units(wav)

In [63]:
# Display the extracted unit sequence.
pred_RLE

array([71, 12, 40, 63, 40, 93, 14, 24, 13, 68, 44, 18, 89, 78, 52, 25, 11,
       32, 88, 64, 44, 18, 27, 47, 87,  9, 74,  2, 27, 47,  5, 79, 25, 37,
       11, 64, 44, 80, 26, 73, 16, 66, 27, 63, 47, 52, 25, 24, 17, 68, 16,
       18, 66, 27, 47, 87, 91, 43, 74,  2, 63, 78, 52, 25, 17, 42, 80, 18,
       31, 10, 83, 84, 63, 22, 63, 22, 63, 22, 63, 72, 63, 56, 72, 20],
      dtype=int32)

In [60]:
import IPython.display as ipd

# Scale factor used to convert float audio to 16-bit PCM.
MAX_WAV_VALUE = 32768.0
# Playback rate in Hz handed to the audio widget.
# NOTE(review): the vocoder checkpoint name contains "24k" — confirm that
# 22050 is the intended playback rate for this generator.
PLAYBACK_RATE = 22050

# Units -> space-separated string -> model input tensor.
quantized_units_str = " ".join(map(str, pred_RLE))
tts_input = tts_dataset.get_tensor(quantized_units_str)

# Units -> mel; unsqueeze(0) adds the batch dimension of 1 that
# synthesize_mel asserts on.
mel, has_eos = synthesize_mel(
    tacotron_model,
    tts_input.unsqueeze(0),
)

# Mel -> waveform with the HiFi-GAN generator.
with torch.no_grad():
    # Drop singleton dimensions and make sure the mel is a float tensor on
    # the same device as the generator.
    x = mel.squeeze().float().to(device)
    y_g_hat = generator(x)
    audio = y_g_hat.squeeze()
    # Clamp before the int16 cast: a full-scale sample of +1.0 would scale to
    # 32768, which does not fit in int16 and would wrap around.
    audio = torch.clamp(audio * MAX_WAV_VALUE, -MAX_WAV_VALUE, MAX_WAV_VALUE - 1)
    audio = audio.cpu().numpy().astype('int16')

display(ipd.Audio(audio, rate=PLAYBACK_RATE))
# Whether the model reported an end-of-sequence during inference.
print(has_eos)

True


In [64]:
# Display the space-separated unit string that was passed to tts_dataset.get_tensor.
quantized_units_str


'71 12 40 63 40 93 14 24 13 68 44 18 89 78 52 25 11 32 88 64 44 18 27 47 87 9 74 2 27 47 5 79 25 37 11 64 44 80 26 73 16 66 27 63 47 52 25 24 17 68 16 18 66 27 47 87 91 43 74 2 63 78 52 25 17 42 80 18 31 10 83 84 63 22 63 22 63 22 63 72 63 56 72 20'