In [1]:
import sys

# Extend the import search path so the modules imported in the next cell can be found.
# Both entries are relative paths, so they resolve against the kernel's current working directory.
# NOTE(review): presumably `models`, `text`, `utils`, `data_utils` and `commons` live under this directory — confirm.
vosk_path = r'../../vosk-tts/training/vits2'
sys.path.append(vosk_path)
# NOTE(review): presumably the repository root that provides `ptq` and `scripts` — confirm.
sys.path.append('../../')

In [2]:
import models
import text
import utils
import data_utils
import json
import commons
import torch
import numpy as np
from scipy.io.wavfile import write

In [3]:
# Read the model/training configuration into a plain dict.
# The path is relative to the kernel's current working directory.
with open(r'../../pretrained/config.json', 'r') as f:
    config = json.load(f)

In [4]:
# Device name as a plain string; a later cell rebinds `device` to torch.device("cpu").
device = 'cpu'

In [5]:
config

{'train': {'log_interval': 200,
  'eval_interval': 1000,
  'seed': 1234,
  'epochs': 20000,
  'learning_rate': 0.0002,
  'betas': [0.8, 0.99],
  'eps': 1e-09,
  'batch_size': 24,
  'fp16_run': False,
  'lr_decay': 0.999875,
  'segment_size': 8192,
  'init_lr_ratio': 1,
  'warmup_epochs': 0,
  'c_mel': 45,
  'c_kl': 1.0,
  'fft_sizes': [384, 683, 171],
  'hop_sizes': [30, 60, 10],
  'win_lengths': [150, 300, 60],
  'window': 'hann_window'},
 'data': {'use_mel_posterior_encoder': True,
  'training_files': 'db/metadata-phones-ids.csv.train',
  'validation_files': 'db/metadata-phones-ids.csv.dev',
  'text_cleaners': [''],
  'max_wav_value': 32768.0,
  'sampling_rate': 22050,
  'filter_length': 1024,
  'hop_length': 256,
  'win_length': 1024,
  'n_mel_channels': 80,
  'mel_fmin': 0.0,
  'mel_fmax': None,
  'add_blank': True,
  'n_speakers': 5,
  'cleaned_text': False,
  'g2p_text': False,
  'aligned_text': True},
 'model': {'use_mel_posterior_encoder': True,
  'use_transformer_flows': True,

In [6]:
%pip show torch

Name: torch
Version: 2.8.0
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3-Clause
Location: /Users/egorkolesnikv/Documents/ai_talent/edl/phone-tts/.venv/lib/python3.9/site-packages
Requires: sympy, jinja2, fsspec, typing-extensions, filelock, networkx
Required-by: torchaudio
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Build the float generator from the loaded config and keep it on CPU.
from text.symbols import symbols
net_g = models.SynthesizerTrn(
    len(symbols),  # size of the symbol table
    # Hard-coded 80 equals config['data']['n_mel_channels'] in the config shown above.
    # NOTE(review): presumably the posterior-encoder input channel count — confirm.
    80,
    # Training segment length expressed in frames (samples // hop length).
    config['train']['segment_size'] // config['data']['hop_length'],
    n_speakers=config['data']['n_speakers'],
    mas_noise_scale_initial=0.01,
    noise_scale_delta=2e-6,
    **config['model']).cpu()

256 2
Multi-band iSTFT VITS2


  WeightNorm.apply(module, name, dim)


In [8]:
# Load the pretrained generator checkpoint into `net_g`.
# NOTE(review): the third argument is None — presumably the optimizer slot; confirm against utils.load_checkpoint.
# The call's return value is echoed as the cell output (a tuple starting with the model).
utils.load_checkpoint(r"../../pretrained/G_1000.pth",
                    net_g,
                    None)

INFO:root:Loaded checkpoint '../../pretrained/G_1000.pth' (iteration 1000)


(SynthesizerTrn(
   (enc_p): TextEncoder(
     (emb): Embedding(62, 192)
     (encoder): Encoder(
       (drop): Dropout(p=0.1, inplace=False)
       (attn_layers): ModuleList(
         (0-5): 6 x MultiHeadAttention(
           (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
           (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
           (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
           (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
           (drop): Dropout(p=0.1, inplace=False)
         )
       )
       (norm_layers_1): ModuleList(
         (0-5): 6 x LayerNorm()
       )
       (ffn_layers): ModuleList(
         (0-5): 6 x FFN(
           (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
           (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
           (drop): Dropout(p=0.1, inplace=False)
         )
       )
       (norm_layers_2): ModuleList(
         (0-5): 6 x LayerNorm()
       )
       (spk_emb_lin

In [9]:
net_g.eval()

SynthesizerTrn(
  (enc_p): TextEncoder(
    (emb): Embedding(62, 192)
    (encoder): Encoder(
      (drop): Dropout(p=0.1, inplace=False)
      (attn_layers): ModuleList(
        (0-5): 6 x MultiHeadAttention(
          (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_1): ModuleList(
        (0-5): 6 x LayerNorm()
      )
      (ffn_layers): ModuleList(
        (0-5): 6 x FFN(
          (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
          (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_2): ModuleList(
        (0-5): 6 x LayerNorm()
      )
      (spk_emb_linear): Linear(in_features=256

In [10]:
txt = 'Я текст сгене+рированный мод+е+лью, загруженной из чекп+оинта. Если меня озвучили - отпра—вивший его+++ достоин уважения.'
out = 'congrats'

In [11]:
def get_text(txt, config):
    """Turn a raw text string into a ``torch.LongTensor`` of symbol ids.

    The ids come from ``text.text_to_sequence_g2p``. When
    ``config['data']['add_blank']`` is truthy, ``commons.intersperse`` is
    applied with the value ``0`` before the tensor is built.

    The resulting tensor is printed (debug aid) and then returned.
    """
    symbol_ids = text.text_to_sequence_g2p(txt)
    symbol_ids = (
        commons.intersperse(symbol_ids, 0)
        if config['data']['add_blank']
        else symbol_ids
    )
    id_tensor = torch.LongTensor(symbol_ids)
    print(id_tensor)
    return id_tensor

In [12]:
get_text(txt, config)

tensor([ 1,  0, 32,  0, 14,  0,  3,  0, 52,  0, 23,  0, 33,  0, 47,  0, 51,  0,
         3,  0, 47,  0, 27,  0, 22,  0, 40,  0, 22,  0, 46,  0, 30,  0, 45,  0,
        41,  0, 55,  0, 14,  0, 39,  0, 39,  0, 57,  0, 32,  0,  3,  0, 37,  0,
        41,  0, 21,  0, 23,  0, 36,  0, 32,  0, 53,  0,  8,  0,  3,  0, 59,  0,
        14,  0, 26,  0, 45,  0, 54,  0, 60,  0, 22,  0, 39,  0, 39,  0, 41,  0,
        32,  0,  3,  0, 31,  0, 59,  0,  3,  0, 19,  0, 22,  0, 33,  0, 43,  0,
        42,  0, 30,  0, 39,  0, 51,  0, 14,  0, 10,  0,  3,  0, 32,  0, 23,  0,
        47,  0, 36,  0, 30,  0,  3,  0, 38,  0, 22,  0, 40,  0, 15,  0,  3,  0,
        41,  0, 59,  0, 55,  0, 54,  0, 19,  0, 30,  0, 36,  0, 30,  0,  3,  0,
         9,  0,  3,  0, 41,  0, 51,  0, 43,  0, 45,  0, 14,  0, 56,  0, 30,  0,
        55,  0, 49,  0, 30,  0, 32,  0,  3,  0, 32,  0, 22,  0, 26,  0, 41,  0,
         3,  0, 20,  0, 41,  0, 47,  0, 51,  0, 42,  0, 30,  0, 39,  0,  3,  0,
        53,  0, 55,  0, 14,  0, 60,  0, 

tensor([ 1,  0, 32,  0, 14,  0,  3,  0, 52,  0, 23,  0, 33,  0, 47,  0, 51,  0,
         3,  0, 47,  0, 27,  0, 22,  0, 40,  0, 22,  0, 46,  0, 30,  0, 45,  0,
        41,  0, 55,  0, 14,  0, 39,  0, 39,  0, 57,  0, 32,  0,  3,  0, 37,  0,
        41,  0, 21,  0, 23,  0, 36,  0, 32,  0, 53,  0,  8,  0,  3,  0, 59,  0,
        14,  0, 26,  0, 45,  0, 54,  0, 60,  0, 22,  0, 39,  0, 39,  0, 41,  0,
        32,  0,  3,  0, 31,  0, 59,  0,  3,  0, 19,  0, 22,  0, 33,  0, 43,  0,
        42,  0, 30,  0, 39,  0, 51,  0, 14,  0, 10,  0,  3,  0, 32,  0, 23,  0,
        47,  0, 36,  0, 30,  0,  3,  0, 38,  0, 22,  0, 40,  0, 15,  0,  3,  0,
        41,  0, 59,  0, 55,  0, 54,  0, 19,  0, 30,  0, 36,  0, 30,  0,  3,  0,
         9,  0,  3,  0, 41,  0, 51,  0, 43,  0, 45,  0, 14,  0, 56,  0, 30,  0,
        55,  0, 49,  0, 30,  0, 32,  0,  3,  0, 32,  0, 22,  0, 26,  0, 41,  0,
         3,  0, 20,  0, 41,  0, 47,  0, 51,  0, 42,  0, 30,  0, 39,  0,  3,  0,
        53,  0, 55,  0, 14,  0, 60,  0, 

In [13]:
def vcss(out, inputstr, i):  # single
    """Synthesize ``inputstr`` with the global ``net_g`` and save it as a 16-bit WAV.

    Args:
        out: Output file stem; the file is written to ``outputs/{out}.wav``
            (do not include the ``.wav`` extension).
        inputstr: Text to synthesize; converted to ids with ``get_text``.
        i: Speaker id, wrapped into a one-element LongTensor.

    Side effects:
        Moves ``net_g`` to CPU and switches it to eval mode, creates the
        ``outputs`` directory if it is missing, writes the WAV file and
        prints a confirmation line.
    """
    import os  # local import: this cell runs before the notebook-level `import os`

    device = torch.device("cpu")  # IMPORTANT: the quantized model runs on CPU
    net_g.to(device)
    net_g.eval()

    stn_tst = get_text(inputstr, config)

    speed = 1.0
    output_dir = r'outputs'
    # scipy's `write` does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    sid = torch.LongTensor([i]).to(device)

    with torch.no_grad():
        x_tst = stn_tst.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)

        # Only the first returned element (the waveform tensor) is used.
        o = net_g.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1 / speed,
        )[0]

        audio = o[0, 0].cpu().numpy() * 32768.0  # vol scale

    # Clip before the cast: float samples outside the int16 range would
    # otherwise wrap around and produce loud clicks.
    pcm = np.clip(audio, -32768.0, 32767.0).astype(np.int16)
    write(rf'{output_dir}/{out}.wav', config['data']['sampling_rate'], pcm)
    print(rf'{out}.wav Generated!')


In [14]:
vcss(out, txt, 1)

tensor([ 1,  0, 32,  0, 14,  0,  3,  0, 52,  0, 23,  0, 33,  0, 47,  0, 51,  0,
         3,  0, 47,  0, 27,  0, 22,  0, 40,  0, 22,  0, 46,  0, 30,  0, 45,  0,
        41,  0, 55,  0, 14,  0, 39,  0, 39,  0, 57,  0, 32,  0,  3,  0, 37,  0,
        41,  0, 21,  0, 23,  0, 36,  0, 32,  0, 53,  0,  8,  0,  3,  0, 59,  0,
        14,  0, 26,  0, 45,  0, 54,  0, 60,  0, 22,  0, 39,  0, 39,  0, 41,  0,
        32,  0,  3,  0, 31,  0, 59,  0,  3,  0, 19,  0, 22,  0, 33,  0, 43,  0,
        42,  0, 30,  0, 39,  0, 51,  0, 14,  0, 10,  0,  3,  0, 32,  0, 23,  0,
        47,  0, 36,  0, 30,  0,  3,  0, 38,  0, 22,  0, 40,  0, 15,  0,  3,  0,
        41,  0, 59,  0, 55,  0, 54,  0, 19,  0, 30,  0, 36,  0, 30,  0,  3,  0,
         9,  0,  3,  0, 41,  0, 51,  0, 43,  0, 45,  0, 14,  0, 56,  0, 30,  0,
        55,  0, 49,  0, 30,  0, 32,  0,  3,  0, 32,  0, 22,  0, 26,  0, 41,  0,
         3,  0, 20,  0, 41,  0, 47,  0, 51,  0, 42,  0, 30,  0, 39,  0,  3,  0,
        53,  0, 55,  0, 14,  0, 60,  0, 

Post-training quantization

За квантизацию отвечает модуль ptq.py: в нём лежит специальная обёртка.


Ещё нужно загрузить данные для калибровки.

In [15]:
import os

In [16]:
import os

MARKS_PATH = f"{os.getenv('HOME')}/natasha_dataset/marks.txt" 

def load_texts_from_marks(marks_path):
    """Read the text column from a ``<id>|<text>`` marks file.

    Each line is stripped of surrounding whitespace (including the trailing
    newline) and split on the first ``|``; everything after it is kept as
    the text, so the text itself may contain ``|``.

    Args:
        marks_path: Path to a UTF-8 encoded marks file.

    Returns:
        List of text strings in file order. Blank lines and lines without a
        ``|`` separator are skipped.
    """
    texts = []
    with open(marks_path, "r", encoding="utf-8") as f:
        for raw_line in f:
            # The stripped value must be the one that is tested and split;
            # the raw line always contains at least "\n" and is never empty.
            line = raw_line.strip()
            if not line:
                continue
            _, separator, transcript = line.partition("|")
            if not separator:
                continue  # malformed line: no "|" present
            texts.append(transcript)
    return texts


In [17]:
texts_for_calib = load_texts_from_marks(MARKS_PATH)
print(f"Loaded {len(texts_for_calib)} texts for calibration")
print(texts_for_calib[0])

Loaded 1000 texts for calibration
С трев+ожным ч+увством бер+усь я з+а пер+о.



Dataset для подачи текста: берём оригинальный загрузчик и убираем из него загрузку аудио — сейчас нужен только текст.

In [18]:
from torch.utils.data import Dataset

import logging

# Dedicated logger for calibration samples, emitting everything from DEBUG up.
logger = logging.getLogger("calib_texts")
logger.setLevel(logging.DEBUG)

# Attach a handler only once so that re-running this cell does not duplicate log lines.
if not logger.handlers:
    handler = logging.StreamHandler()  # no stream given, so this writes to sys.stderr (not stdout)
    fmt = logging.Formatter("[%(levelname)s] %(name)s: %(message)s")
    handler.setFormatter(fmt)
    logger.addHandler(handler)

# from hparams import hps  # гиперпараметры, которыми ты создавал модель
class TextOnlyCalibrationDataset(Dataset):
    """
    Text-only dataset built on top of ``data_utils.TextAudioSpeakerLoader``.

    It reuses the wrapped loader's ``audiopaths_sid_text`` entries and its
    ``get_text`` / ``get_sid`` methods, but this class itself never requests
    audio or spectrograms from the loader.
    """

    def __init__(self, filelist_path: str, hparams, logger=None, log_every: int = 1):
        """Create the wrapped loader and remember the logging settings.

        Args:
            filelist_path: File list passed straight to ``TextAudioSpeakerLoader``.
            hparams: Data hyper-parameters passed straight to ``TextAudioSpeakerLoader``.
            logger: Optional logger; when ``None`` nothing is logged.
            log_every: A sample is logged when ``idx % log_every == 0``.
        """
        # The base loader owns file parsing and the text settings.
        self.base = data_utils.TextAudioSpeakerLoader(filelist_path, hparams)
        self.logger = logger
        self.log_every = log_every

    def __len__(self):
        """Number of entries held by the wrapped loader."""
        return len(self.base.audiopaths_sid_text)

    def __getitem__(self, idx):
        """Return ``(text_tensor, sid_tensor)`` for entry ``idx``."""
        wav_path, speaker, raw_text, cleaned = self.base.audiopaths_sid_text[idx]
        # Same conversion helpers as the wrapped loader uses.
        item = (
            self.base.get_text(raw_text, cleaned),
            self.base.get_sid(speaker),
        )
        self._log_sample(idx, speaker, wav_path, raw_text)
        return item

    def _log_sample(self, idx, sid, audiopath, text):
        """Emit one DEBUG line for the sample if logging applies to ``idx``."""
        if self.logger is None or idx % self.log_every != 0:
            return
        # Keep the log readable: cap the text at 120 characters.
        short_text = text[:117] + "..." if len(text) > 120 else text
        self.logger.debug(
            f"Calib sample idx={idx} sid={sid} wav={audiopath} text={short_text}"
        )



In [19]:
import torch
class TextSpeakerCollate:
    """Collate ``(text_tensor, sid_tensor)`` pairs into one zero-padded batch."""

    def __call__(self, batch):
        """Pad the texts to the longest one and flatten the speaker ids.

        Args:
            batch: List of ``(text_tensor, sid_tensor)`` pairs.

        Returns:
            ``(text_padded, text_lengths, sids)``: a ``[B, max_len]`` long
            tensor padded with zeros on the right, the original lengths as a
            LongTensor, and the speaker ids flattened to shape ``[B]``.
        """
        token_seqs, speaker_ids = zip(*batch)

        text_lengths = torch.LongTensor([seq.size(0) for seq in token_seqs])
        longest = int(text_lengths.max().item())

        text_padded = torch.zeros(len(token_seqs), longest, dtype=torch.long)
        # Each row is a view into `text_padded`, so the slice assignment fills the batch.
        for row, seq in zip(text_padded, token_seqs):
            row[:seq.size(0)] = seq

        sids = torch.stack(speaker_ids).to(torch.long).reshape(-1)  # [B]

        return text_padded, text_lengths, sids


In [20]:
from torch.utils.data import DataLoader
from scripts.build_audiopaths_sid_texts import FILELIST_PATH

# Load hyper-parameters for the calibration dataset.
# NOTE(review): this reads $HOME/pretrained/config.json, while the earlier cell reads
# '../../pretrained/config.json' — confirm both point at the same file. If HOME is unset,
# os.getenv returns None and the path becomes "None/pretrained/config.json".
hparams = utils.get_hparams_from_file(f"{os.getenv('HOME')}/pretrained/config.json")
# Experiment: override the config in memory, switching aligned_text off and g2p_text on.

hparams.data['aligned_text'] = False
hparams.data['g2p_text'] = True
# No logger is passed, so the dataset's per-sample debug logging stays disabled.
calib_dataset = TextOnlyCalibrationDataset(FILELIST_PATH, hparams.data)
calib_collate = TextSpeakerCollate()

# NOTE(review): shuffle=True and no seed is set in the visible cells, so the batches used
# for calibration presumably differ from run to run.
calib_loader = DataLoader(
    calib_dataset,
    batch_size=8,     # tune to the available memory
    shuffle=True,
    num_workers=0,
    collate_fn=calib_collate,
)


Функция для калибровки

In [21]:

def calibration_fn(model, loader=None, max_batches=30):
    """Run inference over calibration batches so quantization observers see real activations.

    Args:
        model: Object exposing ``eval()`` and ``infer(...)``.
        loader: Iterable of ``(x, x_lengths, sid)`` batches. Defaults to the
            notebook-level ``calib_loader``.
        max_batches: Upper bound on the number of batches consumed, to keep
            calibration short. ``None`` consumes the whole loader.

    The outputs of ``infer`` are discarded; only the forward passes matter.
    """
    if loader is None:
        loader = calib_loader  # notebook global; resolved at call time

    model.eval()
    with torch.inference_mode():
        for batch_idx, (x, x_lengths, sid) in enumerate(loader):
            if max_batches is not None and batch_idx >= max_batches:
                break

            x = x.to("cpu")
            x_lengths = x_lengths.to("cpu")
            sid = sid.to("cpu")

            # NOTE(review): noise_scale_w is 1.0 here, while `vcss` synthesizes with 0.8 —
            # confirm the calibration settings are meant to differ from inference.
            model.infer(
                x=x,
                x_lengths=x_lengths,
                sid=sid,              # multi-speaker case
                noise_scale=0.667,
                length_scale=1.0,
                noise_scale_w=1.0,
                max_len=None,
            )


In [22]:
# def calibration_fn(model):
#     model.eval()
#     with torch.inference_mode():
#         for i, (x, x_lengths, sid) in enumerate(calib_loader):
#             if i >= 30:
#                 break
#             x = x.to("cpu")
#             x_lengths = x_lengths.to("cpu")
#             sid = sid.to("cpu")
#             # просто прогоняем infer — observers соберут статы
#             model.infer(x, x_lengths, sid=sid, noise_scale=0.667,
#                         length_scale=1.0, noise_scale_w=0.8, max_len=None)



In [23]:
# from ptq import quantize_ptq
# #net_g.dec.remove_weight_norm()
# net_g.to("cpu")
# net_g.eval()

# modules_to_quantize = [
#     # "dec",  # временно убрать
#     "enc_p.encoder",   # self.enc_p.encoder — чистый attention/FFN-энкодер
#     #"dp",              # DurationPredictor или SDP, проще по структуре
#     #"flow",            # ResidualCouplingTransformersBlock, но тут тоже могут быть подводные камни
# ]


# quantize_ptq(
#     net_g,
#     module_names=modules_to_quantize,
#     calibration_fn=calibration_fn,
#     backend="fbgemm",
# )

# torch.save({"model": net_g.state_dict()}, "G_natasha_quantized_dec.pth")


# Пробуем квантануть только свёртки

In [24]:
from ptq import quantize_ptq_convs_only

# Quantization is done on CPU with the model in eval mode.
net_g.to("cpu")
net_g.eval()

# IMPORTANT: strip weight_norm first if it is still present in the model.
# mb_istft_vits / ms_istft_vits style decoders provide a remove_weight_norm() method.
# For example:
try:
    net_g.dec.remove_weight_norm()
except AttributeError:
    pass  # fine if weight_norm was already removed by hand in the code

# Quantize only the convolutions inside the decoder.
# NOTE(review): the return value is unused and net_g.state_dict() is saved right after,
# so this presumably modifies net_g in place — later `vcss` calls would then use the
# quantized model. Confirm against ptq.py.
quantize_ptq_convs_only(
    net_g,
    calibration_fn=calibration_fn,
    module_roots=["dec"],  # or None to walk the whole model
    backend="fbgemm",
)

# Persist the resulting state dict under the "model" key.
torch.save({"model": net_g.state_dict()}, "G_natasha_quantized_conv_only.pth")


Removing weight norm...


KeyboardInterrupt: 

# тестим на звучание (шумно, но синтез идёт)

In [None]:
# `vcss` appends ".wav" itself, so pass the bare stem (otherwise the file is named "congrats_q.wav.wav").
vcss("congrats_q", txt, 1)

tensor([ 1,  0, 32,  0, 14,  0,  3,  0, 52,  0, 23,  0, 33,  0, 47,  0, 51,  0,
         3,  0, 47,  0, 27,  0, 22,  0, 40,  0, 22,  0, 46,  0, 30,  0, 45,  0,
        41,  0, 55,  0, 14,  0, 39,  0, 39,  0, 57,  0, 32,  0,  3,  0, 37,  0,
        41,  0, 21,  0, 23,  0, 36,  0, 32,  0, 53,  0,  8,  0,  3,  0, 59,  0,
        14,  0, 26,  0, 45,  0, 54,  0, 60,  0, 22,  0, 39,  0, 39,  0, 41,  0,
        32,  0,  3,  0, 31,  0, 59,  0,  3,  0, 19,  0, 22,  0, 33,  0, 43,  0,
        42,  0, 30,  0, 39,  0, 51,  0, 14,  0, 10,  0,  3,  0, 32,  0, 23,  0,
        47,  0, 36,  0, 30,  0,  3,  0, 38,  0, 22,  0, 40,  0, 15,  0,  3,  0,
        41,  0, 59,  0, 55,  0, 54,  0, 19,  0, 30,  0, 36,  0, 30,  0,  3,  0,
         9,  0,  3,  0, 41,  0, 51,  0, 43,  0, 45,  0, 14,  0, 56,  0, 30,  0,
        55,  0, 49,  0, 30,  0, 32,  0,  3,  0, 32,  0, 22,  0, 26,  0, 41,  0,
         3,  0, 20,  0, 41,  0, 47,  0, 51,  0, 42,  0, 30,  0, 39,  0,  3,  0,
        53,  0, 55,  0, 14,  0, 60,  0, 

In [None]:
#torch.save(net_g, "G_natasha_quantized_dec_full.pt")


In [None]:
# import torch

# net_g = torch.load("G_natasha_quantized_dec_full.pt", map_location="cpu", weights_only=False)
# net_g.eval()
# можно сразу вызывать net_g.infer(...)


# пробуем квант загрузить

In [None]:
# from torch.nn.utils import remove_weight_norm, WeightNorm

# def fix_weight_norm_after_load(model: torch.nn.Module):
#     # 1. Сначала пытаемся КОРРЕКТНО снять weight_norm там, где он есть.
#     #    Это создаст обычный .weight из weight_g / weight_v и уберёт хуки.
#     for module in model.modules():
#         try:
#             remove_weight_norm(module)
#         except (ValueError, AttributeError):
#             # ValueError — если на модуле не стоял weight_norm,
#             # AttributeError — на квантованных / "нестандартных" модулях
#             pass

#     # 2. На всякий случай вычищаем оставшиеся WeightNorm-хуки
#     #    (обычно они останутся как раз на квантованных Conv1d/ConvTranspose1d,
#     #     где remove_weight_norm уже не сработает).
#     for module in model.modules():
#         if not hasattr(module, "_forward_pre_hooks"):
#             continue
#         for hook_id, hook in list(module._forward_pre_hooks.items()):
#             if isinstance(hook, WeightNorm):
#                 del module._forward_pre_hooks[hook_id]


In [17]:
import torch
from ptq import prepare_model_for_ptq_convs_only, convert_model_from_ptq
from models import SynthesizerTrn
import utils  # если из него берёшь hparams
from torch.nn.utils.weight_norm import WeightNorm

# Rebinds the notebook-level `device` (previously the string 'cpu') to a torch.device.
device = torch.device("cpu")

# 1. Build a regular float model with the same constructor arguments as `net_g`.
net_g_q = SynthesizerTrn(
    len(symbols),
    80,
    config['train']['segment_size'] // config['data']['hop_length'],
    n_speakers=config['data']['n_speakers'],
    mas_noise_scale_initial=0.01,
    noise_scale_delta=2e-6,
    **config['model'],
).to(device)
# Strip weight_norm from the decoder when the method exists; otherwise carry on.
try:
    net_g_q.dec.remove_weight_norm()
except AttributeError:
    pass 

# 2. Rebuild the same quantized module structure that was used at quantization time,
#    so the saved state dict keys line up with this model.
prepare_model_for_ptq_convs_only(
    net_g_q,
    module_roots=["dec"],
    backend="fbgemm",
)
convert_model_from_ptq(net_g_q)

# 3. Load the quantized weights. strict=False does not raise on key mismatches,
#    so the missing/unexpected lists are printed for manual inspection.
checkpoint = torch.load("G_natasha_quantized_conv_only.pth", map_location=device)
missing, unexpected = net_g_q.load_state_dict(checkpoint["model"], strict=False)
print("MISSING:", missing)
print("UNEXPECTED:", unexpected)

# weight_norm repair step (currently disabled)
#fix_weight_norm_after_load(net_g_q)
# Last expression of the cell: switches to eval mode and echoes the model repr as cell output.
net_g_q.eval()

# try:
#     net_g_q.dec.remove_weight_norm()
# except AttributeError:
#     pass 


256 2
Multi-band iSTFT VITS2
Removing weight norm...
MISSING: []
UNEXPECTED: []




SynthesizerTrn(
  (enc_p): TextEncoder(
    (emb): Embedding(62, 192)
    (encoder): Encoder(
      (drop): Dropout(p=0.1, inplace=False)
      (attn_layers): ModuleList(
        (0-5): 6 x MultiHeadAttention(
          (conv_q): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_k): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_v): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (conv_o): Conv1d(192, 192, kernel_size=(1,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_1): ModuleList(
        (0-5): 6 x LayerNorm()
      )
      (ffn_layers): ModuleList(
        (0-5): 6 x FFN(
          (conv_1): Conv1d(192, 768, kernel_size=(3,), stride=(1,))
          (conv_2): Conv1d(768, 192, kernel_size=(3,), stride=(1,))
          (drop): Dropout(p=0.1, inplace=False)
        )
      )
      (norm_layers_2): ModuleList(
        (0-5): 6 x LayerNorm()
      )
      (spk_emb_linear): Linear(in_features=256

In [18]:
# Reload the quantized state dict into `net_g_q` and report key mismatches.
# NOTE(review): this repeats the load already performed at the end of the previous cell.
checkpoint = torch.load("G_natasha_quantized_conv_only.pth", map_location=device)

# strict=False does not raise on mismatches; inspect the printed lists instead.
missing, unexpected = net_g_q.load_state_dict(checkpoint["model"], strict=False)
print("MISSING:", missing)
print("UNEXPECTED:", unexpected)


MISSING: []
UNEXPECTED: []


In [22]:
def vcss_q(out, inputstr, i):  # single
    """Synthesize ``inputstr`` with the global quantized model ``net_g_q`` and save a 16-bit WAV.

    Args:
        out: Output file stem; the file is written to ``outputs/{out}.wav``
            (do not include the ``.wav`` extension).
        inputstr: Text to synthesize; converted to ids with ``get_text``.
        i: Speaker id, wrapped into a one-element LongTensor.

    Side effects:
        Uses the notebook-level ``device``, creates the ``outputs`` directory
        if it is missing, prints the raw scaled samples with their maximum
        (debug aid), writes the WAV file and prints a confirmation line.
    """
    import os  # local import keeps the function usable regardless of cell order

    stn_tst = get_text(inputstr, config)

    speed = 1.0
    output_dir = "outputs"
    # scipy's `write` does not create missing directories.
    os.makedirs(output_dir, exist_ok=True)
    sid = torch.LongTensor([i]).to(device)
    with torch.no_grad():
        x_tst = stn_tst.to(device).unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
        # Only the first returned element (the waveform tensor) is used.
        waveform = net_g_q.infer(
            x_tst,
            x_tst_lengths,
            sid=sid,
            noise_scale=.667,
            noise_scale_w=0.8,
            length_scale=1 / speed,
        )[0][0, 0]
        audio = waveform.detach().cpu().numpy() * 32768.0  # vol scale
        print(audio, np.max(audio))
    # Clip before the cast: float samples outside the int16 range would
    # otherwise wrap around and produce loud clicks.
    pcm = np.clip(audio, -32768.0, 32767.0).astype(np.int16)
    write(rf'{output_dir}/{out}.wav', config['data']['sampling_rate'], pcm)
    print(rf'{out}.wav Generated!')

In [23]:
vcss_q("congrats_q_loaded", txt, 1)

tensor([ 1,  0, 32,  0, 14,  0,  3,  0, 52,  0, 23,  0, 33,  0, 47,  0, 51,  0,
         3,  0, 47,  0, 27,  0, 22,  0, 40,  0, 22,  0, 46,  0, 30,  0, 45,  0,
        41,  0, 55,  0, 14,  0, 39,  0, 39,  0, 57,  0, 32,  0,  3,  0, 37,  0,
        41,  0, 21,  0, 23,  0, 36,  0, 32,  0, 53,  0,  8,  0,  3,  0, 59,  0,
        14,  0, 26,  0, 45,  0, 54,  0, 60,  0, 22,  0, 39,  0, 39,  0, 41,  0,
        32,  0,  3,  0, 31,  0, 59,  0,  3,  0, 19,  0, 22,  0, 33,  0, 43,  0,
        42,  0, 30,  0, 39,  0, 51,  0, 14,  0, 10,  0,  3,  0, 32,  0, 23,  0,
        47,  0, 36,  0, 30,  0,  3,  0, 38,  0, 22,  0, 40,  0, 15,  0,  3,  0,
        41,  0, 59,  0, 55,  0, 54,  0, 19,  0, 30,  0, 36,  0, 30,  0,  3,  0,
         9,  0,  3,  0, 41,  0, 51,  0, 43,  0, 45,  0, 14,  0, 56,  0, 30,  0,
        55,  0, 49,  0, 30,  0, 32,  0,  3,  0, 32,  0, 22,  0, 26,  0, 41,  0,
         3,  0, 20,  0, 41,  0, 47,  0, 51,  0, 42,  0, 30,  0, 39,  0,  3,  0,
        53,  0, 55,  0, 14,  0, 60,  0, 

In [32]:
# from text.symbols import symbols
# net_g_quant = models.SynthesizerTrn(
#     len(symbols),
#     80,
#     config['train']['segment_size'] // config['data']['hop_length'],
#     n_speakers=config['data']['n_speakers'],
#     mas_noise_scale_initial=0.01,
#     noise_scale_delta=2e-6,
#     **config['model']).cpu()

In [33]:
# utils.load_checkpoint(r"/home/michael/Documents/ITMO/EDLM/phone-tts/pretrained/G_1000.pth",
#                     net_g_quant,
#                     None)

In [34]:
#net_g_quant.eval()

Загрузка квантованных весов

In [35]:
# # Load quantized encoder weights
# enc_int8_path = "/home/michael/Documents/ITMO/EDLM/phone-tts/G_natasha_quantized_dec.pth"
# sd = torch.load(str(enc_int8_path), map_location="cpu")
# net_g_quant.load_state_dict(sd['model'], strict=True)


In [36]:
# txt = 'Я текст сгене+рированный м+о+делью, загруженной из чекп+оинта. Если меня озвучили - отпра—вивший его+++ достоин уважения.'
# out = 'congrats_quant'

In [37]:
# def get_text(txt, config):
#     text_norm = text.text_to_sequence_g2p(txt)
#     if config['data']['add_blank']:
#         text_norm = commons.intersperse(text_norm, 0)
#     text_norm = torch.LongTensor(text_norm)
#     print(text_norm)
#     return text_norm

In [38]:
#get_text(txt, config)

In [35]:
from torch.profiler import profile, record_function, ProfilerActivity

def _run_vcss_n(n: int = 1):
    """Call ``vcss`` ``n`` times so the profiler aggregates several runs.

    Every run writes to the same ``congrats_prof`` output name, which is
    distinct from the names used by the earlier synthesis cells.
    """
    for _run in range(n):
        vcss("congrats_prof", txt, 1)


# CPU-only profile of the `vcss` path; input shapes are recorded per operator.
# Five runs are aggregated under the "vcss_inference" label.
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof, \
        record_function("vcss_inference"):
    _run_vcss_n(n=5)



tensor([ 1,  0, 32,  0, 14,  0,  3,  0, 52,  0, 23,  0, 33,  0, 47,  0, 51,  0,
         3,  0, 47,  0, 27,  0, 22,  0, 40,  0, 22,  0, 46,  0, 30,  0, 45,  0,
        41,  0, 55,  0, 14,  0, 39,  0, 39,  0, 57,  0, 32,  0,  3,  0, 37,  0,
        41,  0, 21,  0, 23,  0, 36,  0, 32,  0, 53,  0,  8,  0,  3,  0, 59,  0,
        14,  0, 26,  0, 45,  0, 54,  0, 60,  0, 22,  0, 39,  0, 39,  0, 41,  0,
        32,  0,  3,  0, 31,  0, 59,  0,  3,  0, 19,  0, 22,  0, 33,  0, 43,  0,
        42,  0, 30,  0, 39,  0, 51,  0, 14,  0, 10,  0,  3,  0, 32,  0, 23,  0,
        47,  0, 36,  0, 30,  0,  3,  0, 38,  0, 22,  0, 40,  0, 15,  0,  3,  0,
        41,  0, 59,  0, 55,  0, 54,  0, 19,  0, 30,  0, 36,  0, 30,  0,  3,  0,
         9,  0,  3,  0, 41,  0, 51,  0, 43,  0, 45,  0, 14,  0, 56,  0, 30,  0,
        55,  0, 49,  0, 30,  0, 32,  0,  3,  0, 32,  0, 22,  0, 26,  0, 41,  0,
         3,  0, 20,  0, 41,  0, 47,  0, 51,  0, 42,  0, 30,  0, 39,  0,  3,  0,
        53,  0, 55,  0, 14,  0, 60,  0, 

In [36]:

# Show top operators by self CPU time
# Print the 15 rows with the largest self CPU time from the most recent `prof`.
# NOTE(review): group_by_stack_n is 10 here and 5 in the later quantized-run table; the
# profiler above is created without with_stack=True, so stack grouping presumably has no effect — confirm.
print(prof.key_averages(group_by_stack_n=10).table(
    sort_by="self_cpu_time_total", row_limit=15
))

-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                       Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                 aten::_slow_conv2d_forward        35.94%     637.598ms        38.16%     677.038ms     115.045us          5885  
                  aten::slow_conv_dilated2d        23.87%     423.510ms        28.73%     509.724ms      49.488us         10300  
                             vcss_inference         7.60%     134.842ms       100.00%        1.774s        1.774s             1  
                                aten::copy_         4.01%      71.083ms         4.01%      71.083ms       2.429us         29270  
                aten::slow_conv_transpose2d         3.30%      58.526ms         3.57%     

In [37]:
from torch.profiler import profile, record_function, ProfilerActivity

def _run_vcss_n(n: int = 1):
    """Call ``vcss_q`` (the quantized model path) ``n`` times for profiling.

    This redefines the helper of the same name from the earlier profiling
    cell. Every run writes to the same ``congrats_prof_q`` output name.
    """
    for _run in range(n):
        vcss_q("congrats_prof_q", txt, 1)


# CPU-only profile of the `vcss_q` path; input shapes are recorded per operator.
# The label is still "vcss_inference", and `prof` from the earlier cell is overwritten.
with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof, \
        record_function("vcss_inference"):
    _run_vcss_n(n=5)



tensor([ 1,  0, 32,  0, 14,  0,  3,  0, 52,  0, 23,  0, 33,  0, 47,  0, 51,  0,
         3,  0, 47,  0, 27,  0, 22,  0, 40,  0, 22,  0, 46,  0, 30,  0, 45,  0,
        41,  0, 55,  0, 14,  0, 39,  0, 39,  0, 57,  0, 32,  0,  3,  0, 37,  0,
        41,  0, 21,  0, 23,  0, 36,  0, 32,  0, 53,  0,  8,  0,  3,  0, 59,  0,
        14,  0, 26,  0, 45,  0, 54,  0, 60,  0, 22,  0, 39,  0, 39,  0, 41,  0,
        32,  0,  3,  0, 31,  0, 59,  0,  3,  0, 19,  0, 22,  0, 33,  0, 43,  0,
        42,  0, 30,  0, 39,  0, 51,  0, 14,  0, 10,  0,  3,  0, 32,  0, 23,  0,
        47,  0, 36,  0, 30,  0,  3,  0, 38,  0, 22,  0, 40,  0, 15,  0,  3,  0,
        41,  0, 59,  0, 55,  0, 54,  0, 19,  0, 30,  0, 36,  0, 30,  0,  3,  0,
         9,  0,  3,  0, 41,  0, 51,  0, 43,  0, 45,  0, 14,  0, 56,  0, 30,  0,
        55,  0, 49,  0, 30,  0, 32,  0,  3,  0, 32,  0, 22,  0, 26,  0, 41,  0,
         3,  0, 20,  0, 41,  0, 47,  0, 51,  0, 42,  0, 30,  0, 39,  0,  3,  0,
        53,  0, 55,  0, 14,  0, 60,  0, 

In [38]:
# Show top operators by self CPU time
# Print the 15 rows with the largest self CPU time from the most recent `prof`
# (the quantized `vcss_q` run when the cells are executed in order).
print(prof.key_averages(group_by_stack_n=5).table(
    sort_by="self_cpu_time_total", row_limit=15
))


-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                               Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg    # of Calls  
-----------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  
                  quantized::conv1d        38.61%     680.790ms        39.48%     696.101ms       3.664ms           190  
         aten::_slow_conv2d_forward        11.90%     209.785ms        13.34%     235.144ms      40.859us          5755  
          aten::slow_conv_dilated2d        10.80%     190.487ms        13.45%     237.082ms      23.153us         10240  
        quantized::conv_transpose1d         9.14%     161.214ms         9.17%     161.752ms      16.175ms            10  
                     vcss_inference         4.27%      75.282ms       100.00%        1.763s        1.763s             1  
                        