<a href="https://colab.research.google.com/github/0ktim/0ktim/blob/main/NERDYNAV_MMS_TTS_Inference_Colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Running MMS-TTS inference in Colab


Original: https://github.com/facebookresearch/fairseq/tree/main/examples/mms

=============================================================================

Modified by Nerdynav (https://www.youtube.com/@nerdynav) for ease of use.

=============================================================================




## 1. Preliminaries
This section installs the Python packages required by the other sections. Run it first.

In [1]:
#@title Automatic Setup
# Clone the VITS reference implementation and make it the working directory,
# so that later cells can import its modules (commons, utils, models, ...).
# NOTE(review): re-running this cell in the same session will hit an existing
# 'vits' checkout (git clone fails) and an existing directory for %mkdir.
%pwd
!git clone https://github.com/jaywalnut310/vits.git
!python --version
%cd vits/

# Python dependencies; some are pinned to specific versions.
!pip install Cython==0.29.21
!pip install librosa==0.8.0
!pip install phonemizer==2.2.1
!pip install scipy
!pip install numpy
!pip install torch
!pip install torchvision
!pip install matplotlib
!pip install Unidecode==1.1.1

# Build the monotonic_align Cython extension in place.
# NOTE(review): the nested monotonic_align/ directory is presumably needed as
# the output location of the in-place build — confirm against the vits repo.
%cd monotonic_align/
%mkdir monotonic_align
!python3 setup.py build_ext --inplace
# Return to the vits/ root; the final %pwd echoes the working directory.
%cd ../
%pwd

Cloning into 'vits'...
remote: Enumerating objects: 81, done.[K
remote: Counting objects: 100% (55/55), done.[K
remote: Compressing objects: 100% (34/34), done.[K
remote: Total 81 (delta 21), reused 21 (delta 21), pack-reused 26[K
Receiving objects: 100% (81/81), 3.33 MiB | 7.26 MiB/s, done.
Resolving deltas: 100% (22/22), done.
Python 3.10.12
/content/vits
Collecting Cython==0.29.21
  Downloading Cython-0.29.21-py2.py3-none-any.whl (974 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m974.2/974.2 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Cython
  Attempting uninstall: Cython
    Found existing installation: Cython 3.0.8
    Uninstalling Cython-3.0.8:
      Successfully uninstalled Cython-3.0.8
Successfully installed Cython-0.29.21
Collecting librosa==0.8.0
  Downloading librosa-0.8.0.tar.gz (183 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m183.9/183.9 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m


'/content/vits'

## 2. Choose a language and convert text to audio
Find the ISO code for your target language [here](https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html). You can find more details about the languages we currently support for TTS in this [table](https://dl.fbaipublicfiles.com/mms/misc/language_coverage_mms.html).

In [2]:
#@title Select Language { run: "auto" }
# Language code of the model to use (Colab form field). It is interpolated
# into the checkpoint download URL and passed to the text preprocessing.
LANG = "bul" #@param {type:"string"}
import os
import subprocess
import locale
# Replace locale.getpreferredencoding so that it always reports "UTF-8".
# NOTE(review): presumably a workaround for Colab sessions whose locale is not
# UTF-8 (which breaks shell magics) — confirm it is still required.
locale.getpreferredencoding = lambda: "UTF-8"

def download(lang, tgt_dir="./"):
  """Download and unpack the MMS-TTS checkpoint archive for one language.

  Args:
    lang: language code used in the archive name (``{lang}.tar.gz``).
    tgt_dir: directory that receives both the archive and the extracted
      ``{lang}/`` folder. Created if it does not exist.

  Returns:
    Path of the extracted checkpoint directory (``tgt_dir/lang``).

  Raises:
    subprocess.CalledProcessError: if ``wget`` or ``tar`` exits non-zero.
  """
  lang_fn, lang_dir = os.path.join(tgt_dir, lang + '.tar.gz'), os.path.join(tgt_dir, lang)
  # An empty tgt_dir means "current directory" for os.path.join; mirror that here.
  extract_dir = tgt_dir or "."
  os.makedirs(extract_dir, exist_ok=True)
  print(f"Download model for language: {lang}")
  # Argument lists (no shell) keep a form-supplied `lang` from being parsed as
  # shell syntax, and running the steps separately means a failed download is
  # reported instead of being masked by tar's exit status.
  subprocess.check_output(
      ["wget", f"https://dl.fbaipublicfiles.com/mms/tts/{lang}.tar.gz", "-O", lang_fn])
  # -C extracts next to the archive, so the returned lang_dir is valid for any
  # tgt_dir and not only for the current working directory.
  subprocess.check_output(["tar", "zxvf", lang_fn, "-C", extract_dir])
  print(f"Model checkpoints in {lang_dir}: {os.listdir(lang_dir)}")
  return lang_dir

# Fetch the checkpoint for the selected language; ckpt_dir is the directory
# returned by download() and is used below to locate the model files.
ckpt_dir = download(LANG)

from IPython.display import Audio
import os
import re
import glob
import json
import tempfile
import math
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader
import numpy as np
import commons
import utils
import argparse
import subprocess
from data_utils import TextAudioLoader, TextAudioCollate, TextAudioSpeakerLoader, TextAudioSpeakerCollate
from models import SynthesizerTrn
from scipy.io.wavfile import write

def preprocess_char(text, lang=None):
    """
    Special treatment of characters in certain languages.

    Args:
        text: input string.
        lang: language code; only 'ron' currently triggers a rewrite.

    Returns:
        The text with language-specific character substitutions applied.
        For 'ron', t-with-comma-below is replaced by t-with-cedilla.
    """
    if lang == 'ron':
        text = text.replace("ț", "ţ")
        # The capital form must be mapped too: callers lowercase the text
        # afterwards, which would otherwise turn U+021A into the very
        # comma-below letter that the rule above is meant to eliminate.
        text = text.replace("\u021a", "\u0162")
    return text

class TextMapper(object):
    """Character-level mapping between text and integer symbol IDs.

    The vocabulary file holds one symbol per line; a symbol's ID is its
    zero-based line index.
    """

    def __init__(self, vocab_file):
        """Load the vocabulary.

        Args:
            vocab_file: path to a UTF-8 text file with one symbol per line.

        Raises:
            ValueError: if no line of the file is a single space.
        """
        # Context manager so the file handle is closed deterministically.
        with open(vocab_file, encoding="utf-8") as vocab:
            self.symbols = [line.replace("\n", "") for line in vocab]
        self.SPACE_ID = self.symbols.index(" ")
        self._symbol_to_id = {s: i for i, s in enumerate(self.symbols)}
        self._id_to_symbol = {i: s for i, s in enumerate(self.symbols)}

    def text_to_sequence(self, text, cleaner_names):
        '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
        Args:
        text: string to convert to a sequence
        cleaner_names: names of the cleaner functions to run the text through
            (accepted for interface compatibility; not used by this method)
        Returns:
        List of integers corresponding to the symbols in the text
        Raises:
        KeyError if the stripped text contains a symbol missing from the
        vocabulary; run filter_oov first to avoid this.
        '''
        return [self._symbol_to_id[symbol] for symbol in text.strip()]

    def uromanize(self, text, uroman_pl):
        """Romanize `text` by piping it through the uroman Perl script.

        Args:
            text: string to romanize.
            uroman_pl: path to ``uroman.pl``.

        Returns:
            The first line of the script's output with whitespace runs
            collapsed to single spaces and the ends stripped. An empty
            string is returned when the script produces no output.

        Raises:
            subprocess.CalledProcessError: if perl exits with a non-zero status.
        """
        iso = "xxx"
        # The text is exchanged over pipes with an explicit UTF-8 encoding, so
        # the result does not depend on the session locale, and the argument
        # list avoids shell parsing of the script path.
        proc = subprocess.run(
            ["perl", uroman_pl, "-l", iso],
            input=text,
            stdout=subprocess.PIPE,
            encoding="utf-8",
            check=True,
        )
        first_line = proc.stdout.split("\n", 1)[0]
        return re.sub(r"\s+", " ", first_line).strip()

    def get_text(self, text, hps):
        """Encode `text` as a LongTensor of symbol IDs.

        When ``hps.data.add_blank`` is set, ID 0 is interspersed between the
        symbol IDs via ``commons.intersperse``.
        """
        text_norm = self.text_to_sequence(text, hps.data.text_cleaners)
        if hps.data.add_blank:
            text_norm = commons.intersperse(text_norm, 0)
        text_norm = torch.LongTensor(text_norm)
        return text_norm

    def filter_oov(self, text):
        """Return `text` with every character not in the vocabulary removed."""
        known = self._symbol_to_id
        return "".join(ch for ch in text if ch in known)

def preprocess_text(txt, text_mapper, hps, uroman_dir=None, lang=None):
    """Normalize raw input text into the character set the model was trained on.

    Steps: language-specific character fixes, optional romanization with
    uroman (when ``hps.data.training_files`` ends in ``.uroman``),
    lowercasing, and removal of characters absent from the vocabulary.

    Args:
        txt: raw input text.
        text_mapper: object providing ``uromanize`` and ``filter_oov``.
        hps: hyper-parameters; only ``hps.data.training_files`` is read here.
        uroman_dir: path to an existing uroman checkout. When None and
            romanization is needed, uroman is cloned into a temporary
            directory for the duration of the call.
        lang: language code forwarded to ``preprocess_char``.

    Returns:
        The normalized text.

    Raises:
        subprocess.CalledProcessError: if cloning uroman fails.
    """
    txt = preprocess_char(txt, lang=lang)
    is_uroman = hps.data.training_files.split('.')[-1] == 'uroman'
    if is_uroman:
        with tempfile.TemporaryDirectory() as tmp_dir:
            if uroman_dir is None:
                # Clone over HTTPS: the SSH form (git@github.com:...) needs a
                # registered SSH key, which a fresh Colab VM does not have.
                cmd = ["git", "clone", "https://github.com/isi-nlp/uroman.git", tmp_dir]
                print(" ".join(cmd))
                subprocess.check_output(cmd)
                uroman_dir = tmp_dir
            uroman_pl = os.path.join(uroman_dir, "bin", "uroman.pl")
            print("uromanize")
            # Must run inside the `with`: the temporary clone is deleted on exit.
            txt = text_mapper.uromanize(txt, uroman_pl)
            print(f"uroman text: {txt}")
    txt = txt.lower()
    txt = text_mapper.filter_oov(txt)
    return txt

# Run on the GPU when torch can see one, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Run inference with {device}")

# Files expected inside the extracted checkpoint directory.
vocab_file = f"{ckpt_dir}/vocab.txt"
config_file = f"{ckpt_dir}/config.json"
assert os.path.isfile(config_file), f"{config_file} doesn't exist"

# Hyper-parameters and the character vocabulary define the model's shape.
hps = utils.get_hparams_from_file(config_file)
text_mapper = TextMapper(vocab_file)

# Build the synthesizer from the vocabulary size and the config values,
# move it to the selected device and switch it to evaluation mode.
net_g = SynthesizerTrn(
    len(text_mapper.symbols),
    hps.data.filter_length // 2 + 1,
    hps.train.segment_size // hps.data.hop_length,
    **hps.model,
)
net_g.to(device)
_ = net_g.eval()

# Load the generator weights; the result is bound to `_` so that the cell
# does not display it.
g_pth = f"{ckpt_dir}/G_100000.pth"
print(f"load {g_pth}")
_ = utils.load_checkpoint(g_pth, net_g, None)

Download model for language: bul
Model checkpoints in ./bul: ['config.json', 'vocab.txt', 'G_100000.pth']
Run inference with cuda




load ./bul/G_100000.pth


In [5]:
#@title Generate audio from text { run: "auto" }
Text = "\u041E\u0442 \u0443\u043F\u0440\u0430\u0432\u043B\u044F\u0432\u0430\u0449\u0430\u0442\u0430 \u043F\u0430\u0440\u0442\u0438\u044F \u043D\u0430\u0440\u0435\u043A\u043E\u0445\u0430 \u0442\u0435\u0437\u0438 \u043F\u0430\u0441\u0430\u0436\u0438 \"\u043D\u0435\u043E\u0441\u043D\u043E\u0432\u0430\u0442\u0435\u043B\u043D\u0438, \u043D\u0435\u0442\u043E\u0447\u043D\u0438 \u0438 \u043D\u0435\u0443\u043C\u0435\u0441\u0442\u043D\u0438\". \u0420\u0430\u0437\u0441\u043B\u0435\u0434\u0432\u0430\u043D\u0435\u0442\u043E \u0437\u0430\u0449\u043E \u0411\u0430\u0439\u0434\u044A\u043D \u0435 \u0437\u0430\u0434\u044A\u0440\u0436\u0430\u043B \u0432 \u0434\u043E\u043C\u0430 \u0441\u0438 \u0441\u0435\u043A\u0440\u0435\u0442\u043D\u0438 \u0434\u043E\u043A\u0443\u043C\u0435\u043D\u0442\u0438 \u0431\u0435\u0448\u0435 \u0432\u043E\u0434\u0435\u043D\u043E \u043E\u0442 \u0440\u0435\u043F\u0443\u0431\u043B\u0438\u043A\u0430\u043D\u0435\u0446, \u043D\u0430\u0437\u043D\u0430\u0447\u0435\u043D \u043E\u0442 \u0422\u0440\u044A\u043C\u043F. \u041F\u0440\u0438\u0442\u0435\u0441\u043D\u0435\u043D\u0438\u044F\u0442\u0430 \u0437\u0430 \u0432\u044A\u0437\u0440\u0430\u0441\u0442\u0442\u0430 \u043D\u0430 \u043F\u0440\u0435\u0437\u0438\u0434\u0435\u043D\u0442\u0430 \u0441\u0435 \u0441\u043F\u043E\u0434\u0435\u043B\u044F\u0442 \u043E\u0442 \u0448\u0438\u0440\u043E\u043A \u043A\u0440\u044A\u0433 \u0432 \u043E\u0431\u0449\u0435\u0441\u0442\u0432\u043E\u0442\u043E. 
\u041F\u0440\u043E\u0443\u0447\u0432\u0430\u043D\u0435, \u043F\u043E\u0440\u044A\u0447\u0430\u043D\u043E \u043E\u0442 \u0415\u043D \u0411\u0438 \u0421\u0438 \u0442\u0430\u0437\u0438 \u0441\u0435\u0434\u043C\u0438\u0446\u0430 \u043F\u043E\u0441\u043E\u0447\u0438, \u0447\u0435 \u043D\u0430\u0434 75% \u043E\u0442 \u0430\u043C\u0435\u0440\u0438\u043A\u0430\u043D\u0446\u0438\u0442\u0435 \u0441\u043C\u044F\u0442\u0430\u0442 \u0442\u043E\u0432\u0430 \u0437\u0430 \u0441\u0435\u0440\u0438\u043E\u0437\u0435\u043D \u043F\u0440\u043E\u0431\u043B\u0435\u043C, \u0438\u043D\u0444\u043E\u0440\u043C\u0438\u0440\u0430 NOVA." #@param {type:"string"}
# Normalize the form text and encode it as a tensor of symbol IDs.
txt = preprocess_text(Text, text_mapper, hps, lang=LANG)
stn_tst = text_mapper.get_text(txt, hps)

# Synthesize a single utterance; gradients are not needed for inference.
with torch.no_grad():
    x_tst = stn_tst.unsqueeze(0).to(device)
    x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
    hyp = net_g.infer(
        x_tst,
        x_tst_lengths,
        noise_scale=0.667,
        noise_scale_w=0.8,
        length_scale=1.0,
    )[0][0, 0].cpu().float().numpy()

print("Generated audio")
# Kept as the last expression so that the notebook renders the audio player.
Audio(hyp, rate=hps.data.sampling_rate)

bul
Generated audio
