From 79b2cfb9b96af8ad041307f0c63f6d6deb62e070 Mon Sep 17 00:00:00 2001 From: suzakuwcx Date: Fri, 29 Mar 2024 10:10:51 +0800 Subject: [PATCH] melo/api.py: add a 'tts_iter' iterator to greatly improve the response speed In the 'tts_to_file' function, the preprocessing step splits a long sentence into an array of texts. The model then runs inference on each piece, and the results are combined into the final audio array. But if the input is very long, waiting for the entire process to finish takes a lot of time, which is not a great idea. Most of the time, inference is much faster than playback, so use an iterator to get each audio piece as soon as it is ready. --- melo/api.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/melo/api.py b/melo/api.py index 236ea8f17..2c38823e8 100644 --- a/melo/api.py +++ b/melo/api.py @@ -80,10 +80,10 @@ def split_sentences_into_pieces(text, language, quiet=False): print(" > ===========================") return texts - def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_scale=0.6, noise_scale_w=0.8, speed=1.0, pbar=None, format=None, position=None, quiet=False,): + def tts_iter(self, text, speaker_id, sdp_ratio=0.2, noise_scale=0.6, noise_scale_w=0.8, speed=1.0, pbar=None, position=None, quiet=False,): language = self.language texts = self.split_sentences_into_pieces(text, language, quiet) - audio_list = [] + if pbar: tx = pbar(texts) else: @@ -121,10 +121,24 @@ def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_s length_scale=1. 
/ speed, )[0][0, 0].data.cpu().float().numpy() del x_tst, tones, lang_ids, bert, ja_bert, x_tst_lengths, speakers - # - audio_list.append(audio) + + + + audio_segments = [] + audio_segments += audio.reshape(-1).tolist() + audio_segments += [0] * int((self.hps.data.sampling_rate * 0.05) / speed) + audio_segments = np.array(audio_segments).astype(np.float32) + + yield audio_segments + torch.cuda.empty_cache() - audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed) + + def tts_to_file(self, text, speaker_id, output_path=None, sdp_ratio=0.2, noise_scale=0.6, noise_scale_w=0.8, speed=1.0, pbar=None, format=None, position=None, quiet=False,): + audio_list = [] + for audio in self.tts_iter(text, speaker_id, sdp_ratio, noise_scale, noise_scale_w, speed, pbar, position, quiet): + audio_list.append(audio) + + audio = np.concatenate(audio_list) if output_path is None: return audio