Skip to content

How to scale TTS output duration to a given length

szhaomsft edited this page Oct 22, 2020 · 1 revision

In some scenarios, you might want to control the duration of the TTS output. For example, when you add voice dubbing with TTS, the audio needs to stay in sync with the video. Here is one way to do it.

The basic idea is to first generate the TTS audio at the default rate and measure its duration. You then calculate the ratio of that duration to the expected duration and use it to adjust the speaking rate of a second synthesis pass with the SSML prosody rate attribute.

   /// <summary>
   /// Synthesizes <paramref name="text"/> so that the resulting audio lasts approximately
   /// <paramref name="duration"/> seconds.
   /// </summary>
   /// <remarks>
   /// Works in two passes. The first pass synthesizes at the default rate into "temp.wav" and
   /// measures its length. The second pass wraps the text in an SSML prosody element whose rate
   /// is the relative change needed to hit the target, and writes to "temp.wav-{duration}.wav".
   /// If the first pass does not complete, the failure is logged and no second pass is attempted.
   /// </remarks>
   /// <param name="voice">Voice name placed in the SSML voice element.</param>
   /// <param name="locale">Locale used for the synthesis language and the xml:lang attributes.</param>
   /// <param name="text">Content inserted verbatim into the SSML body (it is not XML-escaped here).</param>
   /// <param name="duration">Target duration in seconds; must be a finite number greater than zero.</param>
   /// <exception cref="ArgumentOutOfRangeException">
   /// <paramref name="duration"/> is not a finite positive number.
   /// </exception>
   public static async Task SythensizeTextByDuration(string voice, string locale, string text, double duration)
   {
       // "!(duration > 0)" also rejects NaN, which would otherwise slip through a "<= 0" test.
       if (!(duration > 0) || double.IsInfinity(duration))
       {
           throw new ArgumentOutOfRangeException(nameof(duration), duration, "Target duration must be a finite number of seconds greater than zero.");
       }

       var config = SpeechConfig.FromSubscription(CogSvcKey.SpeechKey, CogSvcKey.SpeechRegion);
       string file = "temp.wav";

       config.SpeechSynthesisLanguage = locale;
       config.SpeechSynthesisVoiceName = voice;

       // Pass 1: default rate, only used to measure how long the text naturally takes.
       bool measured = await SynthesizeSsmlToFileAsync(config, BuildSsml(voice, locale, text), file);
       if (!measured)
       {
           // Without a fresh temp.wav the measurement below would be wrong (stale file) or throw.
           return;
       }

       double actualSeconds;
       using (var reader = new WaveFileReader(file))
       {
           actualSeconds = reader.TotalTime.TotalSeconds;
       }

       // Relative rate change: audio twice as long as the target needs +100%,
       // audio half as long as the target needs -50%.
       double rateAdjustPercentage = (actualSeconds / duration - 1.0) * 100;

       // Invariant culture keeps '.' as the decimal separator regardless of the machine locale;
       // the three-section format gives an explicit sign for positive, negative and zero values.
       string rate = rateAdjustPercentage.ToString("+0.##;-0.##;+0", System.Globalization.CultureInfo.InvariantCulture);

       // Pass 2: same text with the computed prosody rate.
       // NOTE(review): the service may limit how far the rate can be changed - confirm against the SSML docs.
       string adjustedSsml = BuildSsml(voice, locale, $"<prosody rate='{rate}%'>{text}</prosody>");
       await SynthesizeSsmlToFileAsync(config, adjustedSsml, file + $"-{duration}.wav");
   }

   /// <summary>Wraps <paramref name="body"/> in the speak/voice SSML envelope used by both passes.</summary>
   private static string BuildSsml(string voice, string locale, string body)
   {
       return $"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='{locale}'><voice xml:lang='{locale}' xml:gender='Female' name='{voice}'>{body}</voice></speak>";
   }

   /// <summary>
   /// Synthesizes <paramref name="ssml"/> into the WAV file at <paramref name="path"/>, logging
   /// cancellation details to the console.
   /// </summary>
   /// <returns><c>true</c> when the result reason is SynthesizingAudioCompleted; otherwise <c>false</c>.</returns>
   private static async Task<bool> SynthesizeSsmlToFileAsync(SpeechConfig config, string ssml, string path)
   {
       using (var fileOutput = AudioConfig.FromWavFileOutput(path))
       using (var fileSynthesizer = new SpeechSynthesizer(config, fileOutput))
       using (var result = await fileSynthesizer.SpeakSsmlAsync(ssml))
       {
           if (result.Reason == ResultReason.SynthesizingAudioCompleted)
           {
               return true;
           }

           if (result.Reason == ResultReason.Canceled)
           {
               var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
               Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");

               if (cancellation.Reason == CancellationReason.Error)
               {
                   Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                   Console.WriteLine($"CANCELED: ErrorDetails=[{cancellation.ErrorDetails}]");
                   Console.WriteLine($"CANCELED: Did you update the subscription info?");
               }
           }

           return false;
       }
   }