# SmolTTS

In [1]:
from smoltts_mlx import SmolTTS

# Checkpoint directory handed to SmolTTS; the path is relative to the working directory.
CHECKPOINT_DIR = "../inits/emilia/v1"
model = SmolTTS(checkpoint_dir=CHECKPOINT_DIR)

  from .autonotebook import tqdm as notebook_tqdm


## Testing speaker conditioning

In [3]:
import numpy as np
import soundfile as sf
from scipy.signal import resample
from IPython.display import Audio

# Read the reference clip as float32 samples along with its native sample rate.
data, sample_rate = sf.read("tests/sky.wav", dtype="float32")
new_sample_rate = 24_000

# Scale the sample count by the ratio of the target rate to the source rate.
num_samples = int(len(data) * new_sample_rate / sample_rate)

# Resample using FFT-based method
resampled_data = resample(data, num_samples)

print(f"Original: {sample_rate} Hz, Resampled: {new_sample_rate} Hz, Shape: {resampled_data.shape}")
# Play back at the rate the data was resampled to, so changing `new_sample_rate`
# cannot leave the player on a stale hard-coded value.
Audio(resampled_data, rate=new_sample_rate)

Original: 44100 Hz, Resampled: 24000 Hz, Shape: (305994,)


In [4]:
import mlx.core as mx

# Round-trip the resampled clip through the codec: encode, then decode.
# Two leading axes are added before encoding — presumably (batch, channel);
# TODO confirm against the codec's expected input layout.
# The intermediate is called `encoded` so the builtin `input()` stays usable
# for the rest of the kernel session.
encoded = model.codec.encode(mx.array(resampled_data[None, None, :]))
output = model.codec.decode(encoded)
Audio(output.flatten(), rate=model.sampling_rate)

In [12]:
# Text paired with the codec-decoded reference audio when building the speaker prompt.
prompt = "When I heard the release demo, I was shocked, angered, and in disbelief that Mr. Altman would pursue a voice so eerily similar to mine that my closest friends and news outlets could not tell the difference."
reference_sample = {"text": prompt, "audio": output}
conditioning_prompt = model.create_speaker([reference_sample])

# Generate a new sentence using that speaker prompt.
target_text = "This man has the second biggest set of balls on this planet, only second to the person who set the safety rope up in the first place."
pcm2 = model(target_text, speaker=conditioning_prompt)
Audio(pcm2, rate=model.sampling_rate)

507.435083ms prompt processing: 512 tokens (1008.996060 tokens/s)


88it [00:01, 78.78it/s]


Generated in 1.12s (77.75 tokens/s, 12.86ms/token), 6.22x realtime


In [None]:
from smoltts_mlx.lm.generate import generate_blocking, GenerationSettings

# Encode a user text turn followed by an assistant turn that is given no text,
# join them along axis 1, and add a leading axis before handing the prompt to
# the language model.
text1 = mx.concat([
    model.prompt_encoder.encode_text_turn("user", "Hello, this is a test"),
    model.prompt_encoder.encode_text_turn("assistant")
], axis=1)[mx.newaxis, :, :]
out = generate_blocking(model.lm, text1, GenerationSettings(), audio_only=True)
# TODO: build the follow-up prompt `text2`. `mx.concat` requires the list of
# arrays to join, so `text2` is left undefined until the second turn is written.



## Self-conditioning

In [14]:
# Generate a clip with neither a voice nor a speaker prompt supplied; the result
# is reused afterwards as the reference sample for cloning.
prompt_text = "Cool how the youtube algorithm actually gives you hidden gems like this out of nowhere that were just buried in the mass library of videos."
no_conditioning = {"voice": None, "speaker": None}
test_pcm = model(prompt_text, **no_conditioning)
Audio(test_pcm, rate=model.sampling_rate)

62.220812ms prompt processing: 151 tokens (2426.840723 tokens/s)


94it [00:01, 73.03it/s]


Generated in 1.29s (72.19 tokens/s, 13.85ms/token), 5.78x realtime


In [20]:
# Build a speaker prompt from the model's own unconditioned clip; two leading
# axes are added to the PCM before it is passed as the reference audio.
self_reference = {"text": prompt_text, "audio": test_pcm[mx.newaxis, mx.newaxis, :]}
clone_prompt = model.create_speaker([self_reference])
# The result gets its own name so that `output` from the codec round-trip cell,
# which the speaker-conditioning cell reads, is not overwritten when this runs.
cloned_pcm = model(" I look back now and insta anxiety and PTSD at your video. Dang, I was willing to do anything to provide a meal for the family!", speaker=clone_prompt)
Audio(cloned_pcm, rate=model.sampling_rate)

425.414085ms prompt processing: 373 tokens (876.792783 tokens/s)


88it [00:01, 74.13it/s]


Generated in 1.19s (73.22 tokens/s, 13.66ms/token), 5.86x realtime
