# Using the library

## Basic usage

In [1]:
from csm_mlx.loaders import CSM
import time

# Measure how long constructing the model takes. perf_counter() is a
# monotonic, high-resolution clock meant for timing intervals, so the
# measurement is not affected by wall-clock adjustments.
load_start_time = time.perf_counter()
model = CSM()
load_end_time = time.perf_counter()

# Report the measured duration so the timing is actually visible.
print(f"Model loaded in {load_end_time - load_start_time:.2f}s")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 71435.83it/s]


This generates with a random speaker:

In [10]:
from IPython.display import Audio

# Sentence to synthesize.
text = "Hello, my name is John, and I like bread."
# Run the model on the text. NOTE(review): the second positional argument is
# presumably the speaker id and `temp` the sampling temperature — confirm
# against the CSM call signature.
pcm = model(text, 0, temp=0.9)

# Last expression of the cell: shows an inline audio player for the result,
# played back at the model's sampling rate.
Audio(pcm, rate=model.sampling_rate)

778.759003ms prompt processing: 17 tokens (42.375112 tokens/s)


36it [00:02, 13.46it/s]


Generated in 3.46s (10.13 tokens/s, 98.76ms/token), 0.81x realtime


Optional: save the generated audio to a WAV file programmatically

In [None]:
from csm_mlx.io.wav import pcm_to_wav_bytes

# Encode the most recently generated `pcm` as WAV bytes and write them to
# out.wav in the current working directory.
with open("out.wav", "wb") as wav_file:
    wav_bytes = pcm_to_wav_bytes(pcm)
    wav_file.write(wav_bytes)

## Voice cloning

Get your reference audio. WAV files only for now, sorry.

In [10]:
from csm_mlx.loaders.csm import Segment
import soundfile as sf
from scipy.signal import resample
import numpy as np

def load_wav(path: str) -> np.ndarray:
    """Load an audio file and resample it to the model's sampling rate.

    Args:
        path: Path to an audio file that ``soundfile`` can read.

    Returns:
        A 1-D array of samples at ``model.sampling_rate``. Multi-channel
        input is downmixed to mono by averaging its channels.
    """
    data, sr = sf.read(path)
    new_sample_rate = model.sampling_rate

    # soundfile returns a (frames, channels) array for multi-channel files.
    # Average the channels so a stereo clip becomes a single waveform.
    # NOTE(review): assumes the model expects mono reference audio — confirm.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Already at the target rate: skip the FFT round trip entirely.
    if sr == new_sample_rate:
        return data

    num_samples = int(len(data) * new_sample_rate / sr)

    # Resample using FFT-based method
    return resample(data, num_samples)

# Substitute with your segments as desired.
# Each Segment bundles a speaker id, a text string, and the audio samples
# returned by load_wav(). NOTE(review): `text` is presumably the transcript of
# what is spoken in the reference clip — confirm against the Segment definition.
audio = Segment(
    speaker=0,
    text="When I heard the release demo, I was shocked, angered, and in disbelief that Mr. Altman would pursue a voice that sounded so eerily similar to mine that my closest friends and news outlets could not tell the difference.",
    audio=load_wav("./tests/sky.wav")
)
# List of reference segments; later cells pass it to the model as `context=`.
context = [audio]

In [12]:
from IPython.display import Audio

# Sentence to synthesize using the reference segment(s) as context.
text = "Hello, I'm Sky. Welcome to my open source emporium!"
# Same call shape as the basic example, plus the `context` list built from the
# reference audio. NOTE(review): `backbone_min_p` is presumably a min-p
# sampling threshold for the backbone — confirm against the CSM API.
pcm = model(text, 0, context=context, temp=0.7, backbone_min_p=0.05)

# Last expression of the cell: shows an inline audio player for the result.
Audio(pcm, rate=model.sampling_rate)


347.315311ms prompt processing: 232 tokens (95.014527 tokens/s)


44it [00:03, 13.39it/s]


Generated in 3.64s (11.83 tokens/s, 84.56ms/token), 0.95x realtime


In [13]:
# Follow-up sentence; note that no `context` argument is passed this time.
text = "This is a test of caching the previous mimi generations."
# NOTE(review): `use_last_gens=True` presumably makes the model reuse its
# previous generations as context instead of an explicit `context` list —
# confirm against the CSM API.
pcm = model(text, 0, use_last_gens=True, temp=0.9)

# Last expression of the cell: shows an inline audio player for the result.
Audio(pcm, rate=model.sampling_rate)

151.368856ms prompt processing: 17 tokens (218.010500 tokens/s)


38it [00:02, 13.48it/s]


Generated in 2.97s (12.45 tokens/s, 80.30ms/token), 1.00x realtime


## Streaming

In [None]:
from IPython.display import Audio
import numpy as np

# TODO: this still blocks until the stream is exhausted; it only checks that
# streaming generation produces correct audio.
frames = list(
    model.stream(
        "Hello world, this is a test of streaming generation",
        context=context,
        speaker_id=0,
        use_last_gens=True,
        temp=0.8,
    )
)

# Stack the collected frames, collapse them to one dimension, and play them.
Audio(np.array(frames).ravel(), rate=model.sampling_rate)

152it [00:12, 12.04it/s]
