How to scale TTS output duration to a given length

In some scenarios, you might want to control the duration of the TTS output. For example, when you add voice dubbing with TTS, the audio needs to stay in sync with the video. Here is one way to do it.

The basic idea is to first generate the TTS audio at the default rate and measure its duration. You can then calculate the ratio between the actual and the expected duration, and use it to adjust the speaking rate of the TTS output with the SSML prosody rate attribute.

   /// <summary>
   /// Synthesizes <paramref name="text"/> twice: once at the default speaking rate to
   /// measure how long the audio is, and once more with an SSML prosody rate chosen so
   /// that the result lasts approximately <paramref name="duration"/> seconds.
   /// The baseline audio is written to "temp.wav" and the rate-adjusted audio to
   /// "temp.wav-{duration}.wav" in the current working directory.
   /// </summary>
   /// <param name="voice">Voice name placed in the SSML voice element and the speech config.</param>
   /// <param name="locale">Locale used for xml:lang and the speech config synthesis language.</param>
   /// <param name="text">
   /// Content placed inside the SSML voice element. It is inserted verbatim (not XML-escaped),
   /// so it must already be valid SSML content.
   /// </param>
   /// <param name="duration">Desired length of the final audio, in seconds. Must be positive and finite.</param>
   /// <exception cref="ArgumentOutOfRangeException">
   /// <paramref name="duration"/> is zero, negative, NaN or infinite.
   /// </exception>
   public static async Task SythensizeTextByDuration(string voice, string locale, string text, double duration)
    {
        // A non-positive or non-finite target would turn the ratio below into
        // Infinity/NaN or a meaningless rate, so reject it up front.
        if (double.IsNaN(duration) || double.IsInfinity(duration) || duration <= 0)
        {
            throw new ArgumentOutOfRangeException(nameof(duration), duration, "Target duration must be a positive, finite number of seconds.");
        }

        var config = SpeechConfig.FromSubscription(CogSvcKey.SpeechKey, CogSvcKey.SpeechRegion);
        string file = "temp.wav";

        config.SpeechSynthesisLanguage = locale;
        config.SpeechSynthesisVoiceName = voice;

        // Pass 1: synthesize at the default rate to find out how long the text naturally takes.
        string ssml = $"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='{locale}'><voice xml:lang='{locale}' xml:gender='Female' name='{voice}'>{text}</voice></speak>";

        if (!await SynthesizeSsmlToWavFileAsync(config, ssml, file))
        {
            // Without a fresh baseline file there is nothing valid to measure;
            // the failure has already been reported on the console.
            return;
        }

        // Measure the baseline; the using block releases the file handle even if reading fails.
        TimeSpan span;
        using (var reader = new WaveFileReader(file))
        {
            span = reader.TotalTime;
        }

        // Relative rate change in percent: positive speeds the speech up (baseline longer
        // than the target), negative slows it down.
        // NOTE(review): the service may clamp or reject extreme prosody rates - confirm the
        // supported range in the SSML documentation.
        double ratioAdjustPercentage = (span.TotalSeconds / duration - 1.0) * 100;

        // Format with the invariant culture and a fixed pattern so the attribute never
        // contains a decimal comma or exponent notation, which would not be a valid percentage.
        string rate = ratioAdjustPercentage.ToString("0.##", System.Globalization.CultureInfo.InvariantCulture);

        // Pass 2: same text, wrapped in a prosody element carrying the computed rate.
        ssml = $"<speak version='1.0' xmlns='http://www.w3.org/2001/10/synthesis' xmlns:mstts='https://www.w3.org/2001/mstts' xml:lang='{locale}'><voice xml:lang='{locale}' xml:gender='Female' name='{voice}'><prosody rate='{rate}%'>{text}</prosody></voice></speak>";

        await SynthesizeSsmlToWavFileAsync(config, ssml, file + $"-{duration}.wav");
    }

    /// <summary>
    /// Synthesizes one SSML document into a WAV file and reports cancellation details on the console.
    /// </summary>
    /// <param name="config">Speech configuration used to create the synthesizer.</param>
    /// <param name="ssml">Complete SSML document to speak.</param>
    /// <param name="outputPath">Path of the WAV file to write.</param>
    /// <returns>True when the result reason is SynthesizingAudioCompleted; otherwise false.</returns>
    private static async Task<bool> SynthesizeSsmlToWavFileAsync(SpeechConfig config, string ssml, string outputPath)
    {
        // The using blocks dispose the synthesizer and the audio output before returning,
        // so the caller can open the file right afterwards.
        using (var fileOutput = AudioConfig.FromWavFileOutput(outputPath))
        using (var fileSynthesizer = new SpeechSynthesizer(config, fileOutput))
        using (var result = await fileSynthesizer.SpeakSsmlAsync(ssml))
        {
            if (result.Reason == ResultReason.SynthesizingAudioCompleted)
            {
                return true;
            }

            if (result.Reason == ResultReason.Canceled)
            {
                var cancellation = SpeechSynthesisCancellationDetails.FromResult(result);
                Console.WriteLine($"CANCELED: Reason={cancellation.Reason}");

                if (cancellation.Reason == CancellationReason.Error)
                {
                    Console.WriteLine($"CANCELED: ErrorCode={cancellation.ErrorCode}");
                    Console.WriteLine($"CANCELED: ErrorDetails=[{cancellation.ErrorDetails}]");
                    Console.WriteLine($"CANCELED: Did you update the subscription info?");
                }
            }

            return false;
        }
    }

Azure TTS: Empower every person and every organization on the planet to have a delightful digital voice!
Azure Custom Voice: Build your one-of-a-kind Custom Voice and close to human Neural TTS in cloud and edge!

Azure Speech Document

Create Custom Neural Voice

Speech SDK

Azure Speech Containers

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

How to scale TTS output duration to a given length

Clone this wiki locally