In [1]:
import pandas as pd 
import numpy as np 
import plotly.express as px 
pd.options.plotting.backend = 'plotly' 
pd.set_option('display.max_columns', None)

# Microsoft TTS5

In [None]:
#https://thepythoncode.com/article/convert-text-to-speech-in-python

In [2]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from datasets import load_dataset
import torch
import random
import string
import soundfile as sf

device = "cuda" if torch.cuda.is_available() else "cpu"

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# load the processor
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
# load the model
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
# load the vocoder, that is the voice encoder
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan").to(device)
# we load this dataset to get the speaker embeddings
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

In [4]:
# speaker ids from the embeddings dataset
speakers = {
    'awb': 0,     # Scottish male
    'bdl': 1138,  # US male
    'clb': 2271,  # US female
    'jmk': 3403,  # Canadian male
    'ksp': 4535,  # Indian male
    'rms': 5667,  # US male
    'slt': 6799   # US female
}

In [10]:
def save_text_to_speech(text, speaker=None):
    # preprocess text
    inputs = processor(text=text, return_tensors="pt").to(device)
    if speaker is not None:
        # load xvector containing speaker's voice characteristics from a dataset
        speaker_embeddings = torch.tensor(embeddings_dataset[speaker]["xvector"]).unsqueeze(0).to(device)
    else:
        # random vector, meaning a random voice
        speaker_embeddings = torch.randn((1, 512)).to(device)
    # generate speech with the models
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    if speaker is not None:
        # if we have a speaker, we use the speaker's ID in the filename
        output_filename = f"{speaker}-{'-'.join(text.split()[:6])}.mp3"
    else:
        # if we don't have a speaker, we use a random string in the filename
        random_str = ''.join(random.sample(string.ascii_letters+string.digits, k=5))
        output_filename = f"{random_str}-{'-'.join(text.split()[:6])}.mp3"
    # save the generated speech to a file with 16KHz sampling rate
    sf.write(output_filename, speech.cpu().numpy(), samplerate=16000)
    # return the filename for reference
    return output_filename

In [9]:
# generate speech with a US female voice
save_text_to_speech("Iron Man is a superhero appearing in American comic books published by Marvel Comics.", speaker=speakers["slt"])

'6799-Iron-Man-is-a-superhero-appearing.mp3'

In [11]:
text=  """Iron Man is a superhero appearing in American comic books published by Marvel Comics."""

In [16]:
from IPython.display import Audio
for speaker_name, speaker in speakers.items():
    output_filename = save_text_to_speech(text, speaker)
    print(f"Saved {output_filename}")
    display(Audio(output_filename, rate=16000))
# random speaker
output_filename = save_text_to_speech(text)
print(f"Saved {output_filename}")

Saved 0-Iron-Man-is-a-superhero-appearing.mp3


Saved 1138-Iron-Man-is-a-superhero-appearing.mp3


Saved 2271-Iron-Man-is-a-superhero-appearing.mp3


Saved 3403-Iron-Man-is-a-superhero-appearing.mp3


Saved 4535-Iron-Man-is-a-superhero-appearing.mp3


Saved 5667-Iron-Man-is-a-superhero-appearing.mp3


Saved 6799-Iron-Man-is-a-superhero-appearing.mp3


Saved uJoq4-Iron-Man-is-a-superhero-appearing.mp3


In [13]:
for speaker_name, speaker in speakers.items():
    output_filename = save_text_to_speech(text, speaker)
    print(f"Saved {output_filename}")
    # Assuming the save_text_to_speech function returns a numpy array of audio data
    # Display the audio for each speaker
    display(Audio(output_filename, rate=SAMPLE_RATE))


Saved 0-Iron-Man-is-a-superhero-appearing.mp3


NameError: name 'Audio' is not defined