# Using the library

## Basic usage

In [1]:
from csm_mlx.loaders import CSM
import time

# Time the model load with a monotonic, high-resolution clock: time.time() follows
# the wall clock and can jump if the system time is adjusted mid-measurement.
load_start_time = time.perf_counter()
# `model` is reused by every cell below. The recorded output shows a
# "Fetching 7 files" progress bar, so loading may download or verify model files.
model = CSM()
load_end_time = time.perf_counter()

# Surface the measurement; otherwise the two timestamps are never used.
print(f"Model loaded in {load_end_time - load_start_time:.2f}s")

  from .autonotebook import tqdm as notebook_tqdm
Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 107546.26it/s]


This generates with a random speaker:

In [2]:
from IPython.display import Audio

# Synthesize speech for `text` with the model loaded in the first cell.
# The second positional argument (0) is presumably the speaker id and `temp`
# the sampling temperature -- TODO confirm against CSM.__call__.
text = "Hello, my name is John, and I like bread."
pcm = model(text, 0, temp=0.9)

# Last expression of the cell: renders an inline audio player for `pcm`,
# played back at the model's sampling rate.
Audio(pcm, rate=model.sampling_rate)

297.755957ms prompt processing: 17 tokens (110.829017 tokens/s)


58it [00:04, 12.95it/s]


Generated in 4.78s (11.92 tokens/s, 83.86ms/token), 0.95x realtime


Optional: save the generated audio to a WAV file programmatically

In [None]:
from csm_mlx.io.wav import pcm_to_wav_bytes

# Encode the most recently generated `pcm` as WAV and write it to out.wav
# in the current working directory.
with open("out.wav", mode="wb") as wav_file:
    wav_payload = pcm_to_wav_bytes(pcm)
    wav_file.write(wav_payload)

## Voice cloning

Get your reference audio. WAV files only for now, sorry.

In [3]:
from csm_mlx.loaders.csm import Segment
import soundfile as sf
from scipy.signal import resample
import numpy as np

def load_wav(path: str, sampling_rate: "int | None" = None) -> np.ndarray:
    """Read an audio file and resample it to the target sampling rate.

    Args:
        path: Location of the audio file, passed straight to ``soundfile.read``.
        sampling_rate: Target rate in Hz. When omitted, falls back to
            ``model.sampling_rate`` of the notebook-level ``model``, so existing
            ``load_wav(path)`` calls behave as before.

    Returns:
        The samples at the target rate. Resampling runs along the first axis.
        NOTE(review): multi-channel files keep their channel axis; presumably
        the model expects mono -- confirm before using stereo references.

    Raises:
        ValueError: If the file contains no samples.
    """
    data, sr = sf.read(path)
    target_sr = model.sampling_rate if sampling_rate is None else sampling_rate

    if len(data) == 0:
        # Fail with a clear message instead of an opaque error from the FFT.
        raise ValueError(f"{path!r} contains no audio samples")

    # Already at the target rate: skip the needless FFT round trip.
    if sr == target_sr:
        return data

    num_samples = int(len(data) * target_sr / sr)

    # Resample using FFT-based method
    return resample(data, num_samples)

# Swap in your own reference text and audio clip here as desired.
reference_text = "When I heard the release demo, I was shocked, angered, and in disbelief that Mr. Altman would pursue a voice that sounded so eerily similar to mine that my closest friends and news outlets could not tell the difference."
reference_audio = load_wav("./tests/sky.wav")

# A single reference segment; `context` is the list handed to the model later.
audio = Segment(speaker=0, text=reference_text, audio=reference_audio)
context = [audio]

In [None]:
from IPython.display import Audio

text = "Hello, I'm Sky. Welcome to my open source emporium!"
# Pass the reference segment list as `context` so the output follows that voice
# (this is the "Voice cloning" section). `backbone_min_p` is presumably a min-p
# sampling cutoff for the backbone -- TODO confirm against CSM.__call__.
pcm = model(text, 0, context=context, temp=0.7, backbone_min_p=0.05)

# Last expression of the cell: renders an inline audio player for `pcm`.
Audio(pcm, rate=model.sampling_rate)


363.270044ms prompt processing: 232 tokens (90.841512 tokens/s)


46it [00:03, 13.29it/s]


Generated in 3.83s (11.76 tokens/s, 85.04ms/token), 0.94x realtime


In [5]:
text = "This is a test of caching the previous mimi gens"
# No explicit `context` here. `use_last_gens=True` presumably makes the model
# reuse its previous generation(s) as context -- TODO confirm against the CSM API.
pcm = model(text, 0, use_last_gens=True, temp=0.9)

# Last expression of the cell: renders an inline audio player for `pcm`.
Audio(pcm, rate=model.sampling_rate)

160.643101ms prompt processing: 16 tokens (205.424322 tokens/s)


34it [00:02, 13.28it/s]


Generated in 2.72s (12.12 tokens/s, 82.48ms/token), 0.97x realtime
