In [1]:
import re

def split_sentences_with_punctuation(text):
    # Define regex pattern to match sentence-ending punctuation
    pattern = r'([.!?]+)'

    # Use regex to split text into sentences based on the pattern
    sentences = re.split(pattern, text)

    # Combine sentences with their respective punctuation marks
    combined_sentences = []
    i = 0
    while i < len(sentences) - 1:
        sentence = sentences[i].strip()
        punctuation = sentences[i + 1] if i + 1 < len(sentences) else ''
        combined_sentence = sentence + punctuation
        combined_sentences.append(combined_sentence)
        i += 2

    return combined_sentences

In [2]:
# Set to True in order to use the pyttsx3 model for the voice assistant, fast inference but sounds like a robot
pytts = True

In [2]:
import os
import torch
import torchaudio
from TTS.api import TTS
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

print("Loading model...")
config = XttsConfig()
config.load_json("xtts/config.json")
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_dir='xtts/')

print("Computing speaker latents...")
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=["reference_audio/IAmLegend_ep6 - 017.wav"])

Loading model...
Computing speaker latents...


## Inference on the entire text

CPU times: user 15 µs, sys: 14 µs, total: 29 µs
Wall time: 3.1 µs

In [None]:
from IPython.display import Audio

print("Inference...")
out = model.inference(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.7, 
)

# Instead of saving to file, directly play the audio
display(Audio(data=out["wav"], rate=24000, autoplay=True))
%time

## Splitting text to chunks per the documentation

Quickly get the audio but broken up between play and pause

In [None]:
from IPython.display import display, Audio
import time

print("Inference...")
t0 = time.time()
chunks = model.inference_stream(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding
)

print(f"Time to first chunk: {time.time() - t0}")

# Iterate over chunks and play each one
for i, chunk in enumerate(chunks):
    print(f"Playing chunk {i} of audio length {chunk.shape[-1]}")
    display(Audio(data=chunk.squeeze().cpu().numpy(), rate=24000, autoplay=True))

## Attempt at overlapping the audio to remove the pause between segments

This doesnt work because it still takes too long for segments to load between so now i have overlap and a pause

In [49]:
def overlap_and_add(prev_chunk, new_chunk, overlap_length):
    """Overlap and add function to combine two audio chunks with overlap."""
    if overlap_length <= 0:
        return new_chunk
    else:
        # Calculate overlap region
        overlap = prev_chunk[-overlap_length:]
        new_part = new_chunk[:overlap_length]
        # Compute weighted sum for smooth transition
        overlap_weight = torch.linspace(0.25, 0.0, overlap_length, device=overlap.device)
        combined_part = (overlap * overlap_weight) + (new_part * (1 - overlap_weight))
        # Concatenate remaining parts of the new chunk
        combined_chunk = torch.cat([prev_chunk[:-overlap_length], combined_part, new_chunk[overlap_length:]], dim=0)
        return combined_chunk

In [None]:
from IPython.display import display, Audio
import time

print("Inference...")
t0 = time.time()
chunks = model.inference_stream(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding
)

print(f"Time to first chunk: {time.time() - t0}")

# Initialize the previous chunk to None
prev_chunk = None

# Set the desired overlap length (in samples)
overlap_length = 1  # Adjust this value based on your preference

# Iterate over chunks and play each one with overlap-and-add
for i, chunk in enumerate(chunks):
    # if i > 0:
        print(f"Playing chunk {i} of audio length {chunk.shape[-1]}")
            
        if prev_chunk is not None:
                # Apply overlap-and-add to smooth transitions
            combined_chunk = overlap_and_add(prev_chunk, chunk, overlap_length)
        else:
            combined_chunk = chunk
            
            # Display and play the combined chunk
        display(Audio(data=combined_chunk.squeeze().cpu().numpy(), rate=24000, autoplay=True))
            
            # Update prev_chunk to current chunk for the next iteration
        prev_chunk = chunk

### Tried lots and lots of times to adjust how the overlap works with no luck

chalkig up the overlap idea to a fail

In [None]:
from IPython.display import display, Audio
import time

print("Inference...")
t0 = time.time()
chunks = model.inference_stream(
    "It took me quite a long time to develop a voice and now that I have it I am not going to be silent.",
    "en",
    gpt_cond_latent,
    speaker_embedding
)

print(f"Time to first chunk: {time.time() - t0}")

# Initialize the previous chunk to None
prev_chunk = None

# Set the desired overlap length (in samples)
overlap_length = 1  # Adjust this value based on your preference

# Iterate over chunks and play each one with overlap-and-add
for i, chunk in enumerate(chunks):
    if i > 0:  # Skip playing chunk 0
        print(f"Playing chunk {i} of audio length {chunk.shape[-1]}")
        
        if prev_chunk is not None:
            # Apply overlap-and-add to smooth transitions
            combined_chunk = overlap_and_add(prev_chunk, chunk, overlap_length)
        else:
            combined_chunk = chunk
        
        # Display and play the combined chunk
        display(Audio(data=combined_chunk.squeeze().cpu().numpy(), rate=24000, autoplay=True))
        
        # Update prev_chunk to current chunk for the next iteration
        prev_chunk = chunk


## Had an idea on breaking large text into chunks so that the model could pronounce each chunk then put them back together (since there is a 250 character minimum) 

Doesnt really help with the problem i'm trying to solve but is a cool feature if i want to make an app that can build audio books from any chunk of text

In [94]:
import time
import torch
import torchaudio
import sounddevice as sd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Download NLTK sentence tokenizer data
nltk.download('punkt')

print("Inference...")
t0 = time.time()

# Input text to be synthesized
input_text = "It took me quite a long time to develop a voice. Now that I have it, I am not going to be silent."

# Tokenize input text into sentences
sentences = sent_tokenize(input_text)

# Initialize an empty list to collect sentence audio waveforms
sentence_audio_waveforms = []

# Iterate over each sentence and synthesize audio
for i, sentence in enumerate(sentences):
    print(f"Synthesizing sentence {i + 1}: {sentence}")

    # Synthesize audio for the current sentence
    chunks = model.inference_stream(sentence, "en", gpt_cond_latent, speaker_embedding)

    # Collect all audio chunks for the current sentence
    sentence_audio_chunks = []
    for chunk in chunks:
        audio_numpy = chunk.squeeze().numpy()
        sentence_audio_chunks.append(audio_numpy)

    # Concatenate all audio chunks into a single waveform for the sentence
    sentence_audio_waveform = np.concatenate(sentence_audio_chunks)

    # Append the sentence audio waveform to the list of sentence waveforms
    sentence_audio_waveforms.append(sentence_audio_waveform)

# Concatenate all sentence audio waveforms into a single continuous waveform
audio_waveform = np.concatenate(sentence_audio_waveforms)

# Set the sample rate for playback
sample_rate = 24000

# Play the entire concatenated audio waveform in real-time
sd.play(audio_waveform, sample_rate)

# Wait for the playback to finish before exiting
sd.wait()

# Calculate total time taken for inference and synthesis
%time

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/darylroberts/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Inference...
Synthesizing sentence 1: It took me quite a long time to develop a voice.
Synthesizing sentence 2: Now that I have it, I am not going to be silent.
CPU times: user 3 µs, sys: 2 µs, total: 5 µs
Wall time: 125 µs


## Mix of splitting on sentence and splitting audio chunks

I thought maybe the pause between audio chunks could sound better/make more sense if it was between sentences

It didnt work, the pause between sentences is still pretty large so its awkwardly pausing between sentences for too long

In [97]:
text = """By using this approach, each sentence in the output will include its corresponding sentence-ending punctuation. The loop in split_sentences_with_punctuation() combines each sentence with its captured punctuation mark to create the final list of combined sentences.

Feel free to adjust the regex pattern or modify the processing logic based on your specific requirements and text data. This approach provides more control over how punctuation marks are handled when splitting text into sentences using regex.

If you have any further questions or need additional clarification, please let me know! I'm here to help."""

# Split text into sentences with preserved punctuation using regex
sentences = split_sentences_with_punctuation(text)

# Display the split sentences with punctuation
for idx, sentence in enumerate(sentences):
    print(f"Sentence {idx + 1}: {sentence}")

Sentence 1: By using this approach, each sentence in the output will include its corresponding sentence-ending punctuation.
Sentence 2: The loop in split_sentences_with_punctuation() combines each sentence with its captured punctuation mark to create the final list of combined sentences.
Sentence 3: Feel free to adjust the regex pattern or modify the processing logic based on your specific requirements and text data.
Sentence 4: This approach provides more control over how punctuation marks are handled when splitting text into sentences using regex.
Sentence 5: If you have any further questions or need additional clarification, please let me know!
Sentence 6: I'm here to help.


In [None]:
from IPython.display import Audio
for sentence in sentences:
    print("Inference...")
    out = model.inference(
        sentence,
        "en",
        gpt_cond_latent,
        speaker_embedding,
        temperature=0.3,
    )
    
    # Instead of saving to file, directly play the audio
    display(Audio(data=out["wav"], rate=24000, autoplay=True))

##NOTE THE "I'm here to help" sentence plays before the previous sentence is over

## Tried seeing if shorter sentences in the text block would make it faster to load between sentences so there was no pause

this didnt work

In [99]:
text2 = """Sir, I observed a dog today.
Playful eyes, wagging tail.
Curious sniffs, joyful barks.
Exploring the world around.
Leaping through open fields.
Chasing butterflies, endless joy.
Gentle nuzzles, loving heart.
Loyal companion, true friend.
Always by your side, unwavering.
Moments shared, memories cherished.
In every wag, pure happiness.
A dog's life, full of wonder.
"""

sentences2 = split_sentences_with_punctuation(text2)

for idx, sentence in enumerate(sentences2):
    print(f"Sentence {idx + 1}: {sentence}")

Sentence 1: Sir, I observed a dog today.
Sentence 2: Playful eyes, wagging tail.
Sentence 3: Curious sniffs, joyful barks.
Sentence 4: Exploring the world around.
Sentence 5: Leaping through open fields.
Sentence 6: Chasing butterflies, endless joy.
Sentence 7: Gentle nuzzles, loving heart.
Sentence 8: Loyal companion, true friend.
Sentence 9: Always by your side, unwavering.
Sentence 10: Moments shared, memories cherished.
Sentence 11: In every wag, pure happiness.
Sentence 12: A dog's life, full of wonder.


In [None]:
from IPython.display import Audio
for sentence in sentences2:
    print("Inference...")
    out = model.inference(
        sentence,
        "en",
        gpt_cond_latent,
        speaker_embedding,
        temperature=0.3,
    )
    
    # Instead of saving to file, directly play the audio
    display(Audio(data=out["wav"], rate=24000, autoplay=True))

## A Much faster text to speech option but its too roboty....

#### Note: I changed the rate of speech from 200 so it would talk a tad slower and experimentented with alot of different voices from the block below and found that 'Tessa' was the best option...

In [5]:
import pyttsx3
engine = pyttsx3.init()
engine.setProperty('rate',170)
engine.setProperty('voice', 'com.apple.voice.compact.en-ZA.Tessa')
engine.say('It took me quite a long time to develop a voice. Now that I have it, I am not going to be silent.')
engine.runAndWait()

In [9]:
voices = engine.getProperty('voices')

# Print information about each voice
for i,voice in enumerate(voices):
    print(f"Voice {i}:")
    print(" - Name:", voice.name)
    print(" - ID:", voice.id)
    print(" - Languages:", voice.languages)
    print(" - Gender:", voice.gender)
    print(" - Age:", voice.age)
    # print(" - Vendor:", voice.vendor)
    print()

Voice 0:
 - Name: Albert
 - ID: com.apple.speech.synthesis.voice.Albert
 - Languages: ['en_US']
 - Gender: VoiceGenderNeuter
 - Age: None

Voice 1:
 - Name: Alice
 - ID: com.apple.voice.compact.it-IT.Alice
 - Languages: ['it_IT']
 - Gender: VoiceGenderFemale
 - Age: None

Voice 2:
 - Name: Alva
 - ID: com.apple.voice.compact.sv-SE.Alva
 - Languages: ['sv_SE']
 - Gender: VoiceGenderFemale
 - Age: None

Voice 3:
 - Name: Amélie
 - ID: com.apple.voice.compact.fr-CA.Amelie
 - Languages: ['fr_CA']
 - Gender: VoiceGenderFemale
 - Age: None

Voice 4:
 - Name: Amira
 - ID: com.apple.voice.compact.ms-MY.Amira
 - Languages: ['ms_MY']
 - Gender: VoiceGenderFemale
 - Age: None

Voice 5:
 - Name: Anna
 - ID: com.apple.voice.compact.de-DE.Anna
 - Languages: ['de_DE']
 - Gender: VoiceGenderFemale
 - Age: None

Voice 6:
 - Name: Bad News
 - ID: com.apple.speech.synthesis.voice.BadNews
 - Languages: ['en_US']
 - Gender: VoiceGenderNeuter
 - Age: None

Voice 7:
 - Name: Bahh
 - ID: com.apple.speech.synt

## Trying out the audio book thing with a large chunk of text

#### Story made by ChatGPT 

Prompt: Can i get a short story that involves space travel, pirates, magical abilities, and crazy blackhole imps?

In [4]:
input_text = """In the vast reaches of the galaxy, aboard the starship Solaris, Captain Alyssa Blackwood and her crew sailed through the cosmic sea in pursuit of adventure and fortune. Their ship, a sleek vessel equipped with cutting-edge technology, was known across the star systems as both formidable and elusive. But this journey would take them beyond the ordinary plunder of space pirates.

Captain Blackwood's crew was a motley assembly of beings from different corners of the universe. There was K'Var, the hulking reptilian engineer with a knack for fixing anything; Rhiannon, the spirited navigator whose elven heritage granted her an affinity for the cosmic currents; and Zed, the tech-savvy robotic companion with a mysterious past.

One fateful day, while charting a course through the outer rim, the Solaris was ambushed by a notorious band of space pirates led by the cunning Captain Bloodfang. Blaster fire erupted across the void as the Solaris engaged in a desperate battle. Amidst the chaos, strange phenomena began to unfold.

As if summoned by the clash of vessels, a rift in spacetime appeared—a swirling, mesmerizing blackhole. From its depths emerged shadowy figures, mischievous imps wielding unpredictable magical powers. These were the legendary Blackhole Imps, notorious for their chaotic nature and insatiable curiosity.

In the heat of battle, Captain Blackwood's crew found themselves contending not only with pirates but also with these enigmatic beings. Rhiannon's elven magic clashed with the imp's whimsical spells, causing unpredictable anomalies to erupt throughout the ship. Meanwhile, Zed analyzed the blackhole's strange energies, attempting to decipher its secrets.

Despite the odds, the crew managed to repel the pirates, but Captain Bloodfang escaped, leaving the Solaris damaged and adrift near the blackhole. With their ship on the verge of collapse, Captain Blackwood made a bold decision—she would harness the power of the blackhole to repair their vessel and pursue their nemesis.

Guided by Rhiannon's mystical abilities and Zed's calculations, the crew ventured into the heart of the cosmic whirlpool, encountering more Blackhole Imps along the way. Each encounter brought new challenges and revelations, as the crew discovered the true nature of the imps' magic—a reflection of the chaotic forces of the universe itself.

In the climax of their journey, amidst the swirling energies of the blackhole, Captain Blackwood confronted Captain Bloodfang in a final showdown. The imps, intrigued by the clash of wills, unleashed their magic in a dazzling spectacle of light and shadow.

In the end, the Solaris emerged victorious, the pirates defeated and the crew forever changed by their encounter with the cosmic unknown. As they sailed away from the blackhole, the crew carried with them not just the spoils of their adventure but also a newfound respect for the mysteries of the universe—and perhaps a few mischievous Blackhole Imps hitching a ride among the stars."""

In [5]:
import time
import torch
import torchaudio
import sounddevice as sd
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

# Download NLTK sentence tokenizer data
nltk.download('punkt')

print("Inference...")
t0 = time.time()

# Tokenize input text into sentences
sentences = sent_tokenize(input_text)

# Initialize an empty list to collect sentence audio waveforms
sentence_audio_waveforms = []

# Iterate over each sentence and synthesize audio
for i, sentence in enumerate(sentences):
    print(f"Synthesizing sentence {i + 1}: {sentence}")

    # Synthesize audio for the current sentence
    chunks = model.inference_stream(sentence, "en", gpt_cond_latent, speaker_embedding)

    # Collect all audio chunks for the current sentence
    sentence_audio_chunks = []
    for chunk in chunks:
        audio_numpy = chunk.squeeze().numpy()
        sentence_audio_chunks.append(audio_numpy)

    # Concatenate all audio chunks into a single waveform for the sentence
    sentence_audio_waveform = np.concatenate(sentence_audio_chunks)

    # Append the sentence audio waveform to the list of sentence waveforms
    sentence_audio_waveforms.append(sentence_audio_waveform)

# Concatenate all sentence audio waveforms into a single continuous waveform
audio_waveform = np.concatenate(sentence_audio_waveforms)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/darylroberts/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Inference...
Synthesizing sentence 1: In the vast reaches of the galaxy, aboard the starship Solaris, Captain Alyssa Blackwood and her crew sailed through the cosmic sea in pursuit of adventure and fortune.
Synthesizing sentence 2: Their ship, a sleek vessel equipped with cutting-edge technology, was known across the star systems as both formidable and elusive.
Synthesizing sentence 3: But this journey would take them beyond the ordinary plunder of space pirates.
Synthesizing sentence 4: Captain Blackwood's crew was a motley assembly of beings from different corners of the universe.
Synthesizing sentence 5: There was K'Var, the hulking reptilian engineer with a knack for fixing anything; Rhiannon, the spirited navigator whose elven heritage granted her an affinity for the cosmic currents; and Zed, the tech-savvy robotic companion with a mysterious past.
Synthesizing sentence 6: One fateful day, while charting a course through the outer rim, the Solaris was ambushed by a notorious band 

In [None]:
display(Audio(data=audio_waveform, rate=24000, autoplay=False))