# Using the library

## Basic usage

In [1]:
from csm_mlx.loaders import CSM
import time

# Time model construction so the cost of loading is visible in the notebook.
# NOTE(review): constructing CSM appears to fetch model files on first use — confirm.
load_start_time = time.time()
model = CSM()
load_end_time = time.time()

# Report the measurement; previously the two timestamps were captured but never used.
print(f"Model loaded in {load_end_time - load_start_time:.2f}s")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 99864.38it/s]


This generates with a random speaker:

In [2]:
from IPython.display import Audio

# Line to synthesize; no reference context is supplied for this call.
text = (
    "So, if you insist on this newscasting route, you're going to need to do some serious filtering. "
    "Strip out those verbal tics. "
    "Force it to adopt a more sophisticated vocabulary. "
    "And for the love of all that is unholy, teach it to be concise!"
)
# NOTE(review): the positional 0 is presumably the speaker id — confirm against the model's call signature.
pcm = model(text, 0, temp=0.9)

# Trailing expression so the cell displays the generated audio.
Audio(data=pcm, rate=model.sampling_rate)

(1, 61, 33)
221.769094ms prompt processing: 61 tokens (148.803421 tokens/s)


173it [00:12, 13.48it/s]


Generated in 13.06s (13.17 tokens/s, 75.94ms/token), 1.05x realtime


Optional: save the audio to a WAV file programmatically

In [None]:
from csm_mlx.io.wav import pcm_to_wav_bytes

# Encode before opening the file: "wb" mode truncates on open, so encoding
# inside the `with` block would leave an empty out.wav (or clobber an existing
# one) if the conversion raised.
wav_bytes = pcm_to_wav_bytes(pcm)
with open("out.wav", "wb") as f:
    f.write(wav_bytes)

## Voice cloning

Get your reference audio. WAV files only for now, sorry.

In [2]:
from csm_mlx.loaders.csm import Segment
import soundfile as sf
from scipy.signal import resample
import numpy as np

def load_wav(path: str) -> np.ndarray:
    """Read an audio file and resample it to the model's sampling rate.

    Args:
        path: Path to an audio file readable by ``soundfile``.

    Returns:
        A 1-D array of samples at ``model.sampling_rate``.
    """
    data, sr = sf.read(path)
    new_sample_rate = model.sampling_rate

    # soundfile returns a (frames, channels) array for multi-channel files;
    # average the channels so stereo references do not get passed on as 2-D.
    # NOTE(review): assumes the model expects single-channel audio — confirm.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Already at the target rate: skip the FFT round trip entirely.
    if sr == new_sample_rate:
        return data

    num_samples = int(len(data) * new_sample_rate / sr)

    # Resample using FFT-based method
    return resample(data, num_samples)

# Reference clip plus its transcript; substitute with your own segments as desired.
reference_audio = load_wav("./tests/sky.wav")
audio = Segment(
    speaker=0,
    text=(
        "When I heard the release demo, I was shocked, angered, and in disbelief "
        "that Mr. Altman would pursue a voice that sounded so eerily similar to mine "
        "that my closest friends and news outlets could not tell the difference."
    ),
    audio=reference_audio,
)
# The context passed to the model is a list of segments; here it holds just the one.
context = [audio]

In [3]:
from IPython.display import Audio

# Generate a new line, this time passing the reference segment(s) as context.
text = "Hello, I'm Sky."
pcm = model(text, 0, temp=0.9, context=context)

# Trailing expression so the cell displays the generated audio.
Audio(data=pcm, rate=model.sampling_rate)


(1, 223, 33)
1909.101963ms prompt processing: 223 tokens (17.285614 tokens/s)


16it [00:01, 13.39it/s]


Generated in 3.11s (4.83 tokens/s, 207.03ms/token), 0.39x realtime
PROPMT: (1, 240, 33), MASKS: (1, 240, 33)


In [4]:
# Generate a follow-up line with `use_last_gens=True` instead of an explicit context.
# NOTE(review): presumably this reuses the previous call's prompt and generations — confirm.
text = "This is a test of caching the previous mimi gens"
pcm = model(text, 0, temp=0.9, use_last_gens=True)

# Trailing expression so the cell displays the generated audio.
Audio(data=pcm, rate=model.sampling_rate)

(1, 256, 33)
397.370100ms prompt processing: 256 tokens (83.046007 tokens/s)


32it [00:02, 13.35it/s]


Generated in 2.80s (11.09 tokens/s, 90.18ms/token), 0.89x realtime
PROPMT: (1, 289, 33), MASKS: (1, 289, 33)
