In [2]:
import argparse
import logging
import os

import numpy as np

import joblib
import soundfile as sf
import torch
import json
import yaml

In [3]:
import sys
sys.path.append("../")

In [4]:
from gslm.speech2unit.clustering.utils import (
    get_audio_files,
)
from gslm.speech2unit.pretrained.utils import (
    get_features,
)

from gslm.unit2speech.tts_data import (
    TacotronInputDataset,
)
from gslm.unit2speech.utils import (
    load_quantized_audio_from_file,
    load_tacotron,
    load_waveglow,
    synthesize_audio,
)

In [63]:
sys.path.append("/net/papilio/storage2/yhaoyuan/LAbyLM")
import hifigan
from hifigan.env import AttrDict
from hifigan.models import Generator


In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [80]:
# HifiGAN
# pretrained waveglow has problems: the output audio is [nan, nan, ...]
# So we use HifiGAN.
# The input of higigan is longtensor shaped [80, len(audio)/hop_len == T]

# checkpoint_file = "/net/papilio/storage2/yhaoyuan/LAbyLM/hifigan/LJ_FT_T2_V3/generator_v3"
checkpoint_file = "/net/papilio/storage2/yhaoyuan/LAbyLM/hifigan/FOOD_V1_24K_Speaker3/generator_v1_24k"
config_file = os.path.join(os.path.split(checkpoint_file)[0], 'config.json')
with open(config_file) as f:
        data = f.read()

global h
json_config = json.loads(data)
h = AttrDict(json_config)
generator = Generator(h).to(device)
assert os.path.isfile(checkpoint_file)
checkpoint_dict = torch.load(checkpoint_file, map_location=device)
generator.load_state_dict(checkpoint_dict['generator'])
generator.eval()
generator.remove_weight_norm()

Removing weight norm...


In [3]:
feature_type = "hubert"
checkpoint_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/S2u/hubert_base_ls960.pt"
layer = 6
manifest_path = "manifest.txt"
kmeans_model_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/S2u/HuBERT_100_km.bin"

In [4]:

features_batch = get_features(
    feature_type=feature_type,
    checkpoint_path=checkpoint_path,
    layer=layer,
    manifest_path=manifest_path,
    sample_pct=1.0,
    flatten=False,
    channel_id=None,
)


2023-02-13 18:23:41 | INFO | fairseq.tasks.hubert_pretraining | current directory is /net/papilio/storage2/yhaoyuan/transformer_I2S/egs/gslm
2023-02-13 18:23:41 | INFO | fairseq.tasks.hubert_pretraining | HubertPretrainingTask Config {'_name': 'hubert_pretraining', 'data': '/checkpoint/wnhsu/data/librispeech/960h/iter/250K_50hz_km100_mp0_65_v2', 'fine_tuning': False, 'labels': ['layer6.km500'], 'label_dir': None, 'label_rate': 50.0, 'sample_rate': 16000, 'normalize': False, 'enable_padding': False, 'max_keep_size': None, 'max_sample_size': 250000, 'min_sample_size': 32000, 'single_target': False, 'random_crop': True, 'pad_audio': False}
2023-02-13 18:23:41 | INFO | fairseq.models.hubert.hubert | HubertModel Config: {'_name': 'hubert', 'label_rate': 50.0, 'extractor_mode': default, 'encoder_layers': 12, 'encoder_embed_dim': 768, 'encoder_ffn_embed_dim': 3072, 'encoder_attention_heads': 12, 'activation_fn': gelu, 'layer_type': transformer, 'dropout': 0.1, 'attention_dropout': 0.1, 'activ

In [5]:
kmeans_model = joblib.load(open(kmeans_model_path, "rb"))
kmeans_model.verbose = False


https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [6]:
predictions = []

In [7]:
for i, feats in enumerate(features_batch):
    pred = kmeans_model.predict(feats)
    pred_str = " ".join(str(p) for p in pred)
    predictions.append(pred)
    

In [8]:
def RLE(seq):
    pred = []
    prev = -1
    for i in seq:
        if i != prev:
            pred.append(i)
            prev = i
        else:
            continue
    return pred

In [16]:
predictions_RLE = []
for prediction in predictions:
    predictions_RLE.append(RLE(prediction))


In [103]:
tts_model_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/u2S/HuBERT_KM100_tts_checkpoint_best.pt"
max_decoder_steps = 2000
waveglow_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/u2S/waveglow_256channels_new.pt"
code_dict_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/u2S/HuBERT_KM100_code_dict"

In [23]:
tacotron_model, sample_rate, hparams = load_tacotron(
    tacotron_model_path=tts_model_path,
    max_decoder_steps=max_decoder_steps,
)

# waveglow, denoiser = load_waveglow(waveglow_path=waveglow_path)

if not os.path.exists(hparams.code_dict):
    hparams.code_dict = code_dict_path
tts_dataset = TacotronInputDataset(hparams)

In [30]:
def synthesize_mel(model, inp, lab=None, strength=0.0):
    assert inp.size(0) == 1
    inp = inp.cuda()
    if lab is not None:
        lab = torch.LongTensor(1).cuda().fill_(lab)

    with torch.no_grad():
        _, mel, _, ali, has_eos = model.inference(inp, lab, ret_has_eos=True)
    return mel, has_eos

In [35]:
mels_RLE = []
for quantized_units in predictions_RLE:
    quantized_units_str = " ".join(map(str, quantized_units))
    tts_input = tts_dataset.get_tensor(quantized_units_str)
    mel, has_eos = synthesize_mel(
        tacotron_model,
        tts_input.unsqueeze(0),
    )
    mels_RLE.append(mel)

mels = []
for quantized_units in predictions:
    quantized_units_str = " ".join(map(str, quantized_units))
    tts_input = tts_dataset.get_tensor(quantized_units_str)
    mel, has_eos = synthesize_mel(
        tacotron_model,
        tts_input.unsqueeze(0),
    )
    mels.append(mel)

In [84]:
for mel in mels_RLE:
    with torch.no_grad():
        x = mel.squeeze().float()
        # x = torch.FloatTensor(x).to(device)
        y_g_hat = generator(x)
        audio = y_g_hat.squeeze()
        audio = audio * 32768.0
        audio = audio.cpu().numpy().astype('int16')
        import IPython.display as ipd
        display(ipd.Audio(audio, rate=22050))

In [85]:
from collections import namedtuple
import os
import ast
import numpy as np

from fairseq import checkpoint_utils, options, tasks, utils

import tqdm

In [108]:
uLM_checkpoint_path = "/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/uLM/hubert100_lm/checkpoint_best.pt"

In [111]:
checkpoint = torch.load(uLM_checkpoint_path)

In [124]:
checkpoint["model"]["decoder.layers.0.self_attn.k_proj.weight"].shape

torch.Size([1024, 1024])

In [120]:
for k,v in checkpoint["model"].items():
    print(k)

decoder.version
decoder.embed_tokens.weight
decoder.embed_positions._float_tensor
decoder.layers.0.self_attn.k_proj.weight
decoder.layers.0.self_attn.k_proj.bias
decoder.layers.0.self_attn.v_proj.weight
decoder.layers.0.self_attn.v_proj.bias
decoder.layers.0.self_attn.q_proj.weight
decoder.layers.0.self_attn.q_proj.bias
decoder.layers.0.self_attn.out_proj.weight
decoder.layers.0.self_attn.out_proj.bias
decoder.layers.0.self_attn_layer_norm.weight
decoder.layers.0.self_attn_layer_norm.bias
decoder.layers.0.fc1.weight
decoder.layers.0.fc1.bias
decoder.layers.0.fc2.weight
decoder.layers.0.fc2.bias
decoder.layers.0.final_layer_norm.weight
decoder.layers.0.final_layer_norm.bias
decoder.layers.1.self_attn.k_proj.weight
decoder.layers.1.self_attn.k_proj.bias
decoder.layers.1.self_attn.v_proj.weight
decoder.layers.1.self_attn.v_proj.bias
decoder.layers.1.self_attn.q_proj.weight
decoder.layers.1.self_attn.q_proj.bias
decoder.layers.1.self_attn.out_proj.weight
decoder.layers.1.self_attn.out_proj

In [110]:
models, _model_args = checkpoint_utils.load_model_ensemble(
    uLM_checkpoint_path,
    arg_overrides=None,
    task="language_modeling",
    suffix="",
)

IsADirectoryError: [Errno 21] Is a directory: '/'

In [121]:
from fairseq.models.transformer_lm import TransformerLanguageModel

In [304]:
custom_lm = TransformerLanguageModel.from_pretrained('/net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/uLM/hubert100_lm/', 'checkpoint_best.pt')
pretrain_state_dict = custom_lm.models.state_dict()

2023-02-14 21:00:15 | INFO | fairseq.file_utils | loading archive file /net/papilio/storage2/yhaoyuan/transformer_I2S/gslm_models/uLM/hubert100_lm/
2023-02-14 21:00:16 | INFO | fairseq.tasks.language_modeling | dictionary: 104 types
2023-02-14 21:00:17 | INFO | fairseq.models.fairseq_model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 500, 'log_format': 'json', 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 1337, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memory_efficient_fp16': False, 'fp16_no_flatten_grads': False, 'fp16_init_scale': 128, 'fp16_scale_window': None, 'fp16_scale_tolerance': 0.0, 'min_loss_scale': 0.0001, 'threshold_loss_scale': None, 'user_dir': None, 'empty_cache_freq': 0, 'all_gather_list_size': 16384, 'model_parallel_size': 1, 'quantization_config_path': None, 'profile': False, 'reset_logging': True}, 'common_eval': {'_name': None, 'path': Non

In [261]:
# with open("pretrained_HuBERT_uLM_state_dict.txt", "w") as f:
#     #print("%-50s\t%-20s\t" % ("Key Name", "Value Shape"))
#     f.writelines("%-50s\t%-20s\t\n" % ("Key Name", "Value Shape"))
#     for k, v in custom_lm.models.state_dict().items():
#         f.writelines("%-50s\t%-20s\t\n" % (k, v.shape))
#         #print("%-50s\t%-20s\t" % (k, v.shape))

In [369]:
self_LM_dict = torch.load("/net/papilio/storage2/yhaoyuan/transformer_I2S/saved_model/LM/LibriSpeech/23-02-13_21:14:32_sentence/top5acc_BEST_checkpoint_coco_1_cap_per_img_1_min_word_freq.pth.tar")
# self_LM_dict = torch.load("/net/papilio/storage2/yhaoyuan/transformer_I2S/saved_model/I2U/VC_5_captions/1024*16*12_0.1_32/checkpoint_coco_5_cap_per_img_1_min_word_freq_gpu.pth.tar")
self_state_dict = self_LM_dict["model_state_dict"]

# self_LM_dict = torch.load("/net/papilio/storage2/yhaoyuan/LAbyLM_Komatsu/model/U2U/transformer_vit2/transformer_vit2.pt")
# self_state_dict = self_LM_dict

In [353]:
# with open("I2U_state_dict.txt", "w") as f:
#     #print("%-50s\t%-20s\t" % ("Key Name", "Value Shape"))
#     f.writelines("%-50s\t%-20s\t\n" % ("Key Name", "Value Shape"))
#     for k, v in self_LM_dict["model_state_dict"].items():
#         f.writelines("%-50s\t%-20s\t\n" % (k, v.shape))
#         #print("%-50s\t%-20s\t" % (k, v.shape))

In [354]:
# q, k, v = torch.arange(3072*1024).chunk(3)
# q = q.view(1024, 1024)
# k = k.view(1024, 1024)
# v = v.view(1024, 1024)
# w = torch.cat([q, k, v], dim = 0)
# q_1, k_1, v_1 = w.split([1024, 1024, 1024])

In [355]:
# q == q_1

In [370]:
q_weight = pretrain_state_dict["0.decoder.layers.0.self_attn.q_proj.weight"]
k_weight = pretrain_state_dict["0.decoder.layers.0.self_attn.k_proj.weight"]
v_weight = pretrain_state_dict["0.decoder.layers.0.self_attn.v_proj.weight"]
q_bias = pretrain_state_dict["0.decoder.layers.0.self_attn.q_proj.bias"]
k_bias = pretrain_state_dict["0.decoder.layers.0.self_attn.k_proj.bias"]
v_bias = pretrain_state_dict["0.decoder.layers.0.self_attn.v_proj.bias"]

In [371]:
w = self_state_dict["LM_decoder.layers.0.self_attn.in_proj_weight"]
# b = self_state_dict["decoder.layers.0.self_attn.out_proj.bias"]
b = self_state_dict["LM_decoder.layers.0.self_attn.in_proj_bias"]
b = self_state_dict["LM_decoder.layers.0.linear1.bias"]

In [372]:
for key in self_state_dict.keys():
    if key[-4:] == "bias":
        print(key, self_state_dict[key].all() == 0)

LM_decoder.layers.0.self_attn.in_proj_bias tensor(True, device='cuda:0')
LM_decoder.layers.0.self_attn.out_proj.bias tensor(True, device='cuda:0')
LM_decoder.layers.0.linear1.bias tensor(False, device='cuda:0')
LM_decoder.layers.0.linear2.bias tensor(False, device='cuda:0')
LM_decoder.layers.0.norm1.bias tensor(True, device='cuda:0')
LM_decoder.layers.0.norm2.bias tensor(True, device='cuda:0')
LM_decoder.layers.1.self_attn.in_proj_bias tensor(True, device='cuda:0')
LM_decoder.layers.1.self_attn.out_proj.bias tensor(True, device='cuda:0')
LM_decoder.layers.1.linear1.bias tensor(False, device='cuda:0')
LM_decoder.layers.1.linear2.bias tensor(False, device='cuda:0')
LM_decoder.layers.1.norm1.bias tensor(True, device='cuda:0')
LM_decoder.layers.1.norm2.bias tensor(True, device='cuda:0')
LM_decoder.layers.2.self_attn.in_proj_bias tensor(True, device='cuda:0')
LM_decoder.layers.2.self_attn.out_proj.bias tensor(True, device='cuda:0')
LM_decoder.layers.2.linear1.bias tensor(False, device='cuda

In [373]:
print(torch.__version__)

1.13.1+cu117


In [374]:
math.sqrt(1024)

32.0