# SmolTTS

In [None]:
from smoltts_mlx import SmolTTS

# Instantiate SmolTTS from the local Emilia v1 checkpoint directory.
checkpoint_path = "../inits/emilia/v1"
model = SmolTTS(checkpoint_dir=checkpoint_path)

## Testing speaker conditioning

In [None]:
import numpy as np
import soundfile as sf
from scipy.signal import resample
from IPython.display import Audio

# Read the reference clip as float32 samples together with its native rate.
data, sample_rate = sf.read("tests/sky.wav", dtype="float32")
new_sample_rate = 24_000

# soundfile returns a (frames, channels) array for multi-channel files, but
# the cells below index the clip as a 1-D signal, so down-mix to mono first.
# Mono input is already 1-D and passes through untouched.
if data.ndim > 1:
    data = data.mean(axis=1)

# Compute new number of samples
num_samples = int(len(data) * new_sample_rate / sample_rate)

# Resample using FFT-based method
resampled_data = resample(data, num_samples)

print(f"Original: {sample_rate} Hz, Resampled: {new_sample_rate} Hz, Shape: {resampled_data.shape}")
# Play back at the rate the clip was actually resampled to, rather than a
# second hard-coded copy of the same number.
Audio(resampled_data, rate=new_sample_rate)

In [None]:
import mlx.core as mx

# Round-trip the resampled clip through the model's codec (encode, then
# decode) and play the reconstruction.
# The two leading axes added below are presumably (batch, channel) — TODO confirm.
# The intermediate is named `encoded` rather than `input` so the builtin
# `input()` is not shadowed for the rest of the notebook session.
encoded = model.codec.encode(mx.array(resampled_data[None, None, :]))
output = model.codec.decode(encoded)
Audio(output.flatten(), rate=model.sampling_rate)

In [None]:
# Transcript paired with the resampled reference clip.
prompt = "When I heard the release demo, I was shocked, angered, and in disbelief that Mr. Altman would pursue a voice so eerily similar to mine that my closest friends and news outlets could not tell the difference."

# Single text/audio sample handed to `create_speaker`; the clip gets two
# leading axes (presumably batch and channel — TODO confirm).
reference_sample = {
    "text": prompt,
    "audio": resampled_data[mx.newaxis, mx.newaxis, :],
}
conditioning_prompt = model.create_speaker([reference_sample])

# Synthesize a new sentence using the speaker built above and play it.
target_text = "Tourists flock to it like dung beetles drawn to a shimmering heap of industrial slag, mistaking rusted girders for romance. "
pcm2 = model(target_text, speaker=conditioning_prompt)
Audio(pcm2, rate=model.sampling_rate)


In [None]:
from smoltts_mlx.lm.generate import generate_blocking, GenerationSettings

# Assemble the prompt by hand: a user text turn followed by an assistant turn
# with no text, joined along axis 1, then given a leading axis
# (presumably batch — TODO confirm).
prompt_turns = [
    model.prompt_encoder.encode_text_turn("user", "Hello, this is a test"),
    model.prompt_encoder.encode_text_turn("assistant"),
]
text1 = mx.concat(prompt_turns, axis=1)[mx.newaxis, :, :]
out = generate_blocking(model.lm, text1, GenerationSettings(), audio_only=True)

# TODO: build `text2` (the follow-up prompt). The original stub called
# `mx.concat()` with no arrays, which raises and aborts the cell, so it is
# left as a note until the turns to concatenate are decided.



# Self-conditioning

In [None]:
# Synthesize a sentence with no voice and no speaker supplied; the result is
# reused as the reference clip in the next cell.
prompt_text = "My name is Donald Trump, and I approve this message."
test_pcm = model(
    prompt_text,
    voice=None,
    speaker=None,
)
Audio(test_pcm, rate=model.sampling_rate)

In [None]:
# Build a speaker from the model's own output above (text plus generated
# audio), then synthesize a new sentence with that speaker.
self_sample = {
    "text": prompt_text,
    "audio": test_pcm[mx.newaxis, mx.newaxis, :],
}
clone_prompt = model.create_speaker([self_sample])

followup_text = " I look back now and insta anxiety and PTSD at your video."
output = model(followup_text, speaker=clone_prompt)
Audio(output, rate=model.sampling_rate)