# Mozilla TTS: Real-Time Speech Synthesis on CPU

We use Tacotron2 and MultiBand-MelGAN models trained on the LJSpeech dataset.

Tacotron2 is trained using [Double Decoder Consistency](https://erogol.com/solving-attention-problems-of-tts-models-with-double-decoder-consistency/) (DDC) for only 130K steps (3 days) on a single GPU.

MultiBand-MelGAN is trained for 1.45M steps with real spectrograms.

Note that the performance of both models can be improved with more training.

### Download Models

In [1]:
#!gdown --id 1dntzjWFg7ufWaTaFy80nRz-Tu02xWZos -O tts_model.pth.tar
#!gdown --id 18CQ6G6tBEOfvCHlPqP8EBI4xWbrr9dBc -O config.json

In [2]:
#!gdown --id 1Ty5DZdOc0F7OTGj9oJThYbL5iVu_2G0K -O vocoder_model.pth.tar
#!gdown --id 1Rd0R_nRCrbjEdpOwq6XwZAktvugiBvmu -O config_vocoder.json
#!gdown --id 11oY3Tv0kQtxK_JPgxrfesa99maVXHNxU -O scale_stats.npy

### Setup Libraries

In [3]:
#! sudo apt-get install espeak ffmpeg -y

In [4]:
#!git clone https://github.com/mozilla/TTS

In [5]:
#%cd TTS
#!git checkout b1935c97
#!pip install -r requirements.txt
#!python setup.py install
#!pip install inflect pydub
#%cd ..

### Load Models

In [6]:
import gc
import copy
import os
import torch
import time
import IPython
import numpy as np
import scipy.io.wavfile
import math
from playsound import playsound

from TTS.utils.generic_utils import setup_model
from TTS.utils.io import load_config
from TTS.utils.text.symbols import symbols, phonemes
from TTS.utils.audio import AudioProcessor
from TTS.utils.synthesis import synthesis
from TTS.vocoder.utils.generic_utils import setup_generator

In [7]:
import resource
#TTS Class
class TTSModel:
    """Text-to-speech pipeline: a spectrogram-predicting TTS model plus a vocoder.

    Both networks are loaded once in ``__init__`` and reused by every call to
    ``tts`` / ``simpletts``.

    Attributes set by ``__init__``:
        use_cuda, use_gl: the flags passed to the constructor.
        tts_config: the TTS configuration returned by ``load_config``.
        ap: ``AudioProcessor`` built from ``tts_config.audio``.
        speaker_id: always ``None`` (no speaker is selected).
        model: the TTS network, in eval mode.
        cp: the raw TTS checkpoint returned by ``torch.load``.
        vocoder_model: the vocoder generator, in eval mode.
        sample_rate: sample rate taken from ``ap``; used when saving/playing audio.
    """

    def __init__(self, TTS_MODEL, TTS_CONFIG, VOCODER_MODEL, VOCODER_CONFIG, use_cuda, use_gl):
        """Load the TTS model and the vocoder and put both in eval mode.

        Args:
            TTS_MODEL: path of the TTS checkpoint (read with ``torch.load``).
            TTS_CONFIG: path of the TTS config (read with ``load_config``).
            VOCODER_MODEL: path of the vocoder checkpoint.
            VOCODER_CONFIG: path of the vocoder config.
            use_cuda: when true, both models are moved to the GPU.
            use_gl: when true, ``tts`` skips the vocoder and uses the waveform
                returned by ``synthesis`` as is.
        """
        self.use_cuda = use_cuda
        self.use_gl = use_gl
        # configs
        self.tts_config = load_config(TTS_CONFIG)
        vocoder_config = load_config(VOCODER_CONFIG)
        # audio processor of the TTS model
        self.ap = AudioProcessor(**self.tts_config.audio)

        # --- TTS model ---
        # No speaker table is used: the model is built for zero speakers and
        # no speaker id is ever passed to synthesis().
        self.speaker_id = None
        num_speakers = 0
        num_chars = len(phonemes) if self.tts_config.use_phonemes else len(symbols)
        self.model = setup_model(num_chars, num_speakers, self.tts_config)
        # Checkpoints are always mapped to CPU first; .cuda() below moves the
        # weights when requested.
        # NOTE(review): the whole checkpoint stays referenced through self.cp
        # for the lifetime of the instance; only 'model' and 'r' are read here.
        self.cp = torch.load(TTS_MODEL, map_location=torch.device('cpu'))
        self.model.load_state_dict(self.cp['model'])
        if self.use_cuda:
            self.model.cuda()
        self.model.eval()
        # restore the decoder step size ('r') stored in the checkpoint, if any
        if 'r' in self.cp:
            self.model.decoder.set_r(self.cp['r'])

        # --- vocoder ---
        self.vocoder_model = setup_generator(vocoder_config)
        self.vocoder_model.load_state_dict(torch.load(VOCODER_MODEL, map_location="cpu")["model"])
        self.vocoder_model.remove_weight_norm()
        self.vocoder_model.inference_padding = 0
        # The resulting processor is not used afterwards; the call is kept so
        # loading behaves (and logs) exactly as before.
        AudioProcessor(**vocoder_config['audio'])
        if use_cuda:
            self.vocoder_model.cuda()
        self.vocoder_model.eval()

        # sample rate used when writing or playing the generated audio
        self.sample_rate = self.ap.sample_rate
        gc.collect()

    def tts(self, text, interactive=False):
        """Synthesize ``text`` and return the intermediate outputs and the audio.

        The models loaded in ``__init__`` are used directly. They are in eval
        mode and only read here, so per-call copies of both networks would
        only multiply peak memory and run time.

        Args:
            text: the sentence to speak.
            interactive: when true, also display an IPython audio player for
                the result.

        Returns:
            tuple: ``(alignment, mel_postnet_spec, stop_tokens, waveform)``
            where ``waveform`` is a numpy array.
        """
        start = time.time()
        # Only the post-net spectrogram feeds the vocoder; the raw mel
        # spectrogram and the encoded inputs are not needed.
        waveform, alignment, _, mel_postnet_spec, stop_tokens, _ = synthesis(
            self.model, text, self.tts_config, self.use_cuda, self.ap,
            self.speaker_id, style_wav=None, truncated=False,
            enable_eos_bos_chars=self.tts_config.enable_eos_bos_chars)
        if not self.use_gl:
            waveform = self.vocoder_model.inference(torch.FloatTensor(mel_postnet_spec.T).unsqueeze(0))
            waveform = waveform.flatten()
        # NOTE(review): with use_gl=True the waveform comes straight from
        # synthesis() and must be a tensor for the calls below -- confirm.
        if self.use_cuda:
            waveform = waveform.cpu()
        waveform = waveform.numpy()
        # ru_maxrss is the peak resident set size of this process; dividing by
        # 1024 assumes it is reported in kilobytes (as on Linux).
        usage = resource.getrusage(resource.RUSAGE_SELF).ru_maxrss
        print(" > Run-time: {}".format(time.time() - start))
        print(" > Memory Used: {} MB".format(math.floor(usage/1024)))
        if interactive:
            IPython.display.display(IPython.display.Audio(waveform, rate=self.sample_rate))
        gc.collect()
        return alignment, mel_postnet_spec, stop_tokens, waveform

    def simpletts(self, text):
        """Synthesize ``text`` and return only the waveform (numpy array)."""
        _, _, _, wav = self.tts(text)
        gc.collect()
        return wav

## See it in action!

In [8]:
def tryit(sample):
    """Load the models on CPU, speak ``sample`` and show an inline audio player."""
    # CPU only, neural vocoder enabled
    engine = TTSModel(
        "tts_model.pth.tar",
        "config.json",
        "vocoder_model.pth.tar",
        "config_vocoder.json",
        use_cuda=False,
        use_gl=False,
    )
    # interactive=True makes tts() display the audio; its outputs are discarded
    engine.tts(sample, interactive=True)
#tryit("Bill got in the habit of asking himself “Is that thought true?” and if he wasn’t absolutely certain it was, he just let it go.")

## Process files and output to wav

In [9]:
from pydub import AudioSegment
from functools import reduce
import re
from shutil import copyfile

In [10]:
def preprocess(info):
    """Split raw text into sentences that can be synthesized one at a time.

    Newlines are treated as spaces and the text is cut at every ``'. '``.
    Each piece is stripped of surrounding whitespace; empty pieces are
    dropped. A full stop is appended only when the piece does not already end
    with ``.``, ``!`` or ``?``, so the last sentence of a text never ends up
    with doubled punctuation.

    Args:
        info: the text to split.

    Returns:
        list of str: the sentences, in their original order.
    """
    flattened = info.replace('\n', ' ')
    sentences = []
    for piece in flattened.split('. '):
        piece = piece.strip()
        if not piece:
            continue
        if not piece.endswith(('.', '!', '?')):
            piece += '.'
        sentences.append(piece)
    return sentences

In [11]:
def writetofile(sample_rate, name, wav):
    """Save the samples in ``wav`` to the file ``name`` as WAV audio.

    Args:
        sample_rate: sample rate written to the file header.
        name: destination path.
        wav: the audio samples (numpy array).
    """
    scipy.io.wavfile.write(filename=name, rate=sample_rate, data=wav)
def writesent(ttsmodel, name, sent):
    """Synthesize ``sent`` with ``ttsmodel`` and save the audio to ``name``.

    Args:
        ttsmodel: object providing ``sample_rate`` and ``simpletts(text)``.
        name: destination WAV path.
        sent: the sentence to speak.
    """
    scipy.io.wavfile.write(
        filename=name,
        rate=ttsmodel.sample_rate,
        data=ttsmodel.simpletts(sent),
    )
    # reclaim the synthesized buffer right away
    gc.collect()

### Option 1: Separate speaking and accumulating

In [12]:
def speaksents(ttsmodel, sents):
    """Synthesize every sentence into its own numbered WAV file under ``tmp/``.

    Sentence number ``i`` (starting at 0) is written to ``tmp/thing<i>.wav``.
    The ``tmp`` directory is created when it does not exist yet, so the first
    write cannot fail on a missing directory.

    Args:
        ttsmodel: object providing ``sample_rate`` and ``simpletts(text)``.
        sents: the sentences to speak, in order.
    """
    os.makedirs('tmp', exist_ok=True)
    for i, sent in enumerate(sents):
        print(i)
        wav = ttsmodel.simpletts(sent)
        scipy.io.wavfile.write('tmp/thing'+str(i)+'.wav', ttsmodel.sample_rate, wav)
        # drop the buffer before synthesizing the next sentence
        del wav
        gc.collect()
def collectnonsense(num, out):
    """Merge ``tmp/thing0.wav`` .. ``tmp/thing<num-1>.wav`` into ``out``.

    The first file is copied to ``out`` and removed. Every following file is
    appended by re-reading ``out``, concatenating, and exporting ``out``
    again; each input file is deleted once it has been merged.

    Args:
        num: number of sentence files to merge.
        out: path of the combined WAV file.
    """
    print(num)
    first = "tmp/thing0.wav"
    copyfile(first, out)
    os.remove(first)
    for idx in range(1, num):
        piece = "tmp/thing{}.wav".format(idx)
        merged = AudioSegment.from_wav(out) + AudioSegment.from_wav(piece)
        merged.export(out, format="wav")
        # release the combined audio before the next round
        del merged
        os.remove(piece)
        gc.collect()
def speaktofile(paragraphs, out):
    """Option 1 driver: speak every sentence to ``tmp/``, then merge into ``out``.

    Args:
        paragraphs: the raw text to read aloud.
        out: path of the final WAV file.
    """
    print('loading model')
    # CPU only, neural vocoder enabled
    engine = TTSModel(
        "tts_model.pth.tar",
        "config.json",
        "vocoder_model.pth.tar",
        "config_vocoder.json",
        use_cuda=False,
        use_gl=False,
    )
    print('processing')
    sentences = preprocess(paragraphs)
    print('speaking')
    speaksents(engine, sentences)
    # the models are no longer needed while the files are merged
    del engine
    print('collecting')
    collectnonsense(len(sentences), out)
    print("Done!")

### Option 2: Speak and accumulate in the same step

In [13]:
def accwavs(out, tmp):
    """Append the audio in ``tmp`` to the WAV file ``out``, rewriting ``out``.

    Args:
        out: path of the accumulated WAV file (read, then overwritten).
        tmp: path of the WAV file to append.
    """
    gc.collect()
    combined = AudioSegment.from_wav(out) + AudioSegment.from_wav(tmp)
    combined.export(out, format="wav")
def sillyspeak(ttsmodel, sents, out):
    """Speak ``sents`` one by one, appending each to ``out`` as soon as it is ready.

    The first sentence is written straight to ``out``. Every later sentence is
    synthesized to ``tmp/tmp.wav``, appended to ``out`` and the temporary file
    is removed. The model is used as is: ``simpletts`` does not modify it, so
    no per-sentence copy is taken.

    Args:
        ttsmodel: object providing ``sample_rate`` and ``simpletts(text)``.
        sents: the sentences to speak, in order. When empty, nothing is
            written and the function returns immediately.
        out: path of the combined WAV file.
    """
    if not sents:
        print(" > No sentences to speak; nothing written to {}".format(out))
        return
    tmp = "tmp/tmp.wav"
    # make sure the scratch directory exists before the first temporary write
    os.makedirs(os.path.dirname(tmp), exist_ok=True)
    writetofile(ttsmodel.sample_rate, out, ttsmodel.simpletts(sents[0]))
    for count, sent in enumerate(sents[1:], start=1):
        print(sent)
        writesent(ttsmodel, tmp, sent)
        gc.collect()
        accwavs(out, tmp)
        os.remove(tmp)
        # number of sentences appended so far
        print(count)
def testspeaktofile(paragraphs, out):
    """Option 2 driver: speak and accumulate sentence by sentence into ``out``.

    Args:
        paragraphs: the raw text to read aloud.
        out: path of the final WAV file.
    """
    print('loading model')
    # CPU only, neural vocoder enabled
    engine = TTSModel(
        "tts_model.pth.tar",
        "config.json",
        "vocoder_model.pth.tar",
        "config_vocoder.json",
        use_cuda=False,
        use_gl=False,
    )
    print('processing')
    sentences = preprocess(paragraphs)
    print('test speaking')
    sillyspeak(engine, sentences, out)
    del engine
    print("Done!")

## Run it!

In [14]:
# Read the whole text up front. The context manager closes the file even when
# reading fails.
with open("ACyborgManifesto", "r") as source:
    words = source.read()
    # quick test on the first five lines only:
    #words = ' '.join(source.readlines()[:5])
#print(preprocess(words))
# option 1
#speaktofile(words,'manifesto.wav')
# option 2
testspeaktofile(words,'manifesto.wav')

loading model
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:1.5
 | > preemphasis:0.0
 | > griffin_lim_iters:60
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain:1.0
 | > stft_pad_mode:reflect
 | > max_norm:4.0
 | > clip_norm:True
 | > do_trim_silence:True
 | > trim_db:60
 | > do_sound_norm:False
 | > stats_path:./scale_stats.npy
 | > hop_length:256
 | > win_length:1024
 > Using model: Tacotron2
 > Generator Model: multiband_melgan_generator
 > Setting up Audio Processor...
 | > sample_rate:22050
 | > num_mels:80
 | > min_level_db:-100
 | > frame_shift_ms:None
 | > frame_length_ms:None
 | > ref_level_db:0
 | > fft_size:1024
 | > power:None
 | > preemphasis:0.0
 | > griffin_lim_iters:None
 | > signal_norm:True
 | > symmetric_norm:True
 | > mel_fmin:50.0
 | > mel_fmax:7600.0
 | > spec_gain

OSError: [Errno 12] Cannot allocate memory