# Style-Bert-VITS2ライブラリの使用例

`pip install style-bert-vits2` でインストールしたライブラリを使った、Jupyter Notebook での使用例です。Google Colab 等でも動きます。

In [1]:
# Load the BERT model and tokenizer (no manual local download is required).

from style_bert_vits2.nlp import bert_models
from style_bert_vits2.constants import Languages


# The model and the tokenizer are loaded from the same repository id.
JP_BERT_REPO = "ku-nlp/deberta-v2-large-japanese-char-wwm"

bert_models.load_model(Languages.JP, JP_BERT_REPO)
bert_models.load_tokenizer(Languages.JP, JP_BERT_REPO)
# Equivalent calls for English and Chinese, left disabled:
# bert_models.load_model(Languages.EN, "microsoft/deberta-v3-large")
# bert_models.load_tokenizer(Languages.EN, "microsoft/deberta-v3-large")
# bert_models.load_model(Languages.ZH, "hfl/chinese-roberta-wwm-ext-large")
# bert_models.load_tokenizer(Languages.ZH, "hfl/chinese-roberta-wwm-ext-large")

  from .autonotebook import tqdm as notebook_tqdm


[32m07-19 19:44:44[0m |[1m  INFO  [0m| bert_models.py:92 | Loaded the JP BERT model from ku-nlp/deberta-v2-large-japanese-char-wwm
[32m07-19 19:44:45[0m |[1m  INFO  [0m| bert_models.py:154 | Loaded the JP BERT tokenizer from ku-nlp/deberta-v2-large-japanese-char-wwm


BertJapaneseTokenizer(name_or_path='ku-nlp/deberta-v2-large-japanese-char-wwm', vocab_size=22012, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	4: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [2]:
# Download one of the default models from Hugging Face to try speech synthesis with it.
# The files are placed under the model_assets directory.

from pathlib import Path
from huggingface_hub import hf_hub_download


model_file = "jvnv-F1-jp/jvnv-F1-jp_e160_s14000.safetensors"
config_file = "jvnv-F1-jp/config.json"
style_file = "jvnv-F1-jp/style_vectors.npy"

# Weights, config and style vectors are fetched one after another; each
# repo-relative path is printed before its download starts.
for file in (model_file, config_file, style_file):
    print(file)
    hf_hub_download(
        repo_id="litagin/style_bert_vits2_jvnv",
        filename=file,
        local_dir="model_assets",
    )

jvnv-F1-jp/jvnv-F1-jp_e160_s14000.safetensors


jvnv-F1-jp/config.json
jvnv-F1-jp/style_vectors.npy


In [3]:
# Speech-synthesis test using the model files downloaded above.

from style_bert_vits2.tts_model import TTSModel

# Directory the download cell wrote into; the *_file names are relative to it.
assets_root = Path("model_assets")

model = TTSModel(
    model_path=assets_root.joinpath(model_file),
    config_path=assets_root.joinpath(config_file),
    style_vec_path=assets_root.joinpath(style_file),
    device="cpu",
)

In [4]:
from IPython.display import Audio, display

# Synthesize a short phrase; the result is unpacked as (sample rate, audio data).
sr, audio = model.infer(text="絵本読んで")
# Show an inline audio player for the synthesized audio.
display(Audio(audio, rate=sr))

[32m07-19 19:44:47[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
絵本読んで
[32m07-19 19:44:47[0m |[1m  INFO  [0m| infer.py:24 | Using JP-Extra model




[32m07-19 19:44:47[0m |[1m  INFO  [0m| safetensors.py:50 | Loaded 'model_assets/jvnv-F1-jp/jvnv-F1-jp_e160_s14000.safetensors' (iteration 160)
[32m07-19 19:44:48[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully


In [5]:
import torch

# Print the qualified name of every nn.Embedding found under model.net_g.
for name, module in model.net_g.named_modules():
    if not isinstance(module, torch.nn.Embedding):
        continue
    print(name)

enc_p.emb
enc_p.tone_emb
enc_p.language_emb
emb_g


In [6]:
from style_bert_vits2.nlp.symbols import SYMBOLS, JP_SYMBOLS

# Show SYMBOLS and then JP_SYMBOLS, each on its own line.
print(SYMBOLS, JP_SYMBOLS, sep="\n")

# Weights keyed by symbol string.
# NOTE(review): presumably meant to be turned into the per-position weights that
# HookManager expects — confirm before passing this dict directly.
difficulty = {"r": 1.0, "g": 0.5, "t": 0.3}

['_', 'AA', 'E', 'EE', 'En', 'N', 'OO', 'V', 'a', 'a:', 'aa', 'ae', 'ah', 'ai', 'an', 'ang', 'ao', 'aw', 'ay', 'b', 'by', 'c', 'ch', 'd', 'dh', 'dy', 'e', 'e:', 'eh', 'ei', 'en', 'eng', 'er', 'ey', 'f', 'g', 'gy', 'h', 'hh', 'hy', 'i', 'i0', 'i:', 'ia', 'ian', 'iang', 'iao', 'ie', 'ih', 'in', 'ing', 'iong', 'ir', 'iu', 'iy', 'j', 'jh', 'k', 'ky', 'l', 'm', 'my', 'n', 'ng', 'ny', 'o', 'o:', 'ong', 'ou', 'ow', 'oy', 'p', 'py', 'q', 'r', 'ry', 's', 'sh', 't', 'th', 'ts', 'ty', 'u', 'u:', 'ua', 'uai', 'uan', 'uang', 'uh', 'ui', 'un', 'uo', 'uw', 'v', 'van', 've', 'vn', 'w', 'x', 'y', 'z', 'zh', 'zy', '!', '?', '…', ',', '.', "'", '-', 'SP', 'UNK']
['N', 'a', 'a:', 'b', 'by', 'ch', 'd', 'dy', 'e', 'e:', 'f', 'g', 'gy', 'h', 'hy', 'i', 'i:', 'j', 'k', 'ky', 'm', 'my', 'n', 'ny', 'o', 'o:', 'p', 'py', 'q', 'r', 'ry', 's', 'sh', 't', 'ts', 'ty', 'u', 'u:', 'w', 'y', 'z', 'zy']


In [8]:

class HookManager:
    """Registers a set of PyTorch forward hooks on ``model.net_g`` and keeps their handles.

    The hooks do three things:

    * ``modify_embedding`` blends positions of an embedding output along dim 1.
    * ``read_*`` hooks remember the output of the module they are attached to.
    * ``write_*`` hooks replace a module's output with a previously remembered one.

    Every handle returned by ``register_forward_hook`` is stored in ``self.handles``
    so that ``remove_hooks`` can detach all of them again.

    Args:
        difficulty: Blend weights used by ``modify_embedding``. It must be something
            ``Tensor.new_tensor`` accepts (for example a list of floats) and, after a
            trailing axis is added, must broadcast against the sliced output.
    """

    def __init__(self, difficulty):
        self.handles = []
        self.difficulty = difficulty
        # Outputs captured by the read_* hooks; None until a hooked module has run.
        self.duration = None
        self.stochastic_duration = None

    def read_duration(self, module, input, output):
        """Forward hook: remember ``output`` in ``self.duration``; the output is left as is."""
        self.duration = output

    def read_stochastic_duration(self, module, input, output):
        """Forward hook: remember ``output`` in ``self.stochastic_duration``."""
        self.stochastic_duration = output

    def write_duration(self, module, input, output):
        """Forward hook: return the remembered ``self.duration`` if there is one.

        Returning ``None`` (nothing remembered yet) keeps the module's own output.
        """
        if self.duration is not None:
            return self.duration

    def write_stochastic_duration(self, module, input, output):
        """Forward hook: return the remembered ``self.stochastic_duration`` if there is one."""
        if self.stochastic_duration is not None:
            return self.stochastic_duration

    def modify_embedding(self, module, input, output):
        """Forward hook: blend odd positions along dim 1 with the position two steps later.

        For positions 1, 3, 5, ... (stopping two before the end) the new value is
        ``current * (1 - w) + value_two_steps_later * w`` where ``w`` comes from
        ``self.difficulty``. ``output`` is updated in place and returned.
        """
        y = output
        # One weight per blended position; the trailing axis broadcasts over the last dim.
        difficulty = y.new_tensor(self.difficulty).unsqueeze(-1)
        # The right-hand side is fully evaluated before the assignment, so every
        # position is blended with the *unmodified* value two steps later.
        y[:, 1:-2:2] = y[:, 1:-2:2] * (1 - difficulty) + y[:, 3::2] * difficulty
        return y

    def register_hooks(self, model):
        """Attach the hooks to the modules of ``model.net_g`` and record the handles.

        ``modify_embedding`` goes on every ``torch.nn.Embedding``; the read/write
        hooks go on modules selected by their qualified name. When both apply to
        the same module, ``modify_embedding`` is registered first.

        NOTE(review): ``modify_embedding`` slices dim 1 of the output, so it assumes
        every embedding output has a sequence axis there — confirm for all
        embeddings in ``net_g``.
        NOTE(review): of the four names below only ``enc_p.emb`` shows up in the
        embedding listing printed earlier in this notebook — confirm the other
        three exist in ``net_g``; names that never occur are silently skipped.
        """
        named_hooks = {
            "enc_p.emb": self.read_duration,
            "enc_p.stochastic_emb": self.read_stochastic_duration,
            "dec_p.emb": self.write_duration,
            "dec_p.stochastic_emb": self.write_stochastic_duration,
        }
        for name, module in model.net_g.named_modules():
            if isinstance(module, torch.nn.Embedding):
                self.handles.append(module.register_forward_hook(self.modify_embedding))
            named_hook = named_hooks.get(name)
            if named_hook is not None:
                self.handles.append(module.register_forward_hook(named_hook))

    def remove_hooks(self):
        """Detach every hook registered through this manager and forget the handles.

        Without this the hooks stay on the model for the lifetime of the kernel,
        and calling ``register_hooks`` again stacks further copies on top.
        """
        while self.handles:
            self.handles.pop().remove()

def hook(module, input, output):
    """Forward hook that prints the symbols fed into the hooked module.

    ``input`` is expected to hold exactly one positional argument: a tensor of
    indices into ``SYMBOLS``. Only the first row of that tensor is decoded. The
    hook returns ``None``, so the module's output is left untouched.
    """
    (symbol_ids,) = input
    first_row = symbol_ids[0].tolist()
    print([SYMBOLS[symbol_id] for symbol_id in first_row])


# Every execution of this line adds one more hook to enc_p.emb; call
# handle.remove() to detach the one registered here.
handle = model.net_g.enc_p.emb.register_forward_hook(hook)

In [9]:
# Second synthesis run; the forward hook registered on enc_p.emb prints the
# symbols it receives while this call executes.
sr, audio = model.infer(text="まんま、まんま")
# Show an inline audio player for the synthesized audio.
display(Audio(audio, rate=sr))

[32m07-19 19:45:35[0m |[1m  INFO  [0m| tts_model.py:259 | Start generating audio data from text:
まんま、まんま
['_', '_', '_', 'm', '_', 'a', '_', 'N', '_', 'm', '_', 'a', '_', ',', '_', 'm', '_', 'a', '_', 'N', '_', 'm', '_', 'a', '_', '_', '_']
[32m07-19 19:45:35[0m |[1m  INFO  [0m| tts_model.py:324 | Audio data generated successfully


In [15]:
# Check how many elements the [3::2] slice (as used in
# HookManager.modify_embedding) keeps for an 11-element tensor:
# indices 3, 5, 7 and 9.
x = torch.zeros(11)
x[3::2].shape

torch.Size([4])