# Using the library

## Basic usage

In [None]:
from csm_mlx.loaders import CSM
import time

# Time the model load with a monotonic clock: perf_counter() is intended for
# measuring elapsed intervals, whereas time.time() can jump if the system
# clock is adjusted mid-measurement.
load_start_time = time.perf_counter()
# NOTE(review): depth=16 is passed straight to CSM; its exact meaning is not
# visible from this notebook — confirm against the loader.
model = CSM(depth=16)
load_end_time = time.perf_counter()

# Surface the measurement so the timing above is actually reported.
print(f"Model loaded in {load_end_time - load_start_time:.2f}s")

Fetching 7 files: 100%|██████████| 7/7 [00:00<00:00, 110376.42it/s]


This generates with a random speaker:

In [18]:
from IPython.display import Audio

# Prompt to synthesize.
text = "Hello, my name is John, and I like bread."
# NOTE(review): the positional 0 is presumably the speaker id (the streaming
# example passes speaker_id=0) — confirm against the model's call signature.
# temp / backbone_min_p are presumably sampling settings — confirm.
pcm = model(text, 0, temp=1, backbone_min_p=0.05)

# Last expression of the cell, so the notebook renders it as an audio player.
Audio(pcm, rate=model.sampling_rate)

304.677010ms prompt processing: 17 tokens (55.796793 tokens/s)


61it [00:02, 22.46it/s]


Generated in 3.02s (19.85 tokens/s, 50.37ms/token), 1.59x realtime


Optional: save the audio to a WAV file programmatically

In [None]:
from csm_mlx.io.wav import pcm_to_wav_bytes

# Convert `pcm` with pcm_to_wav_bytes and write the result to out.wav
# (relative to the current working directory).
with open("out.wav", mode="wb") as wav_file:
    wav_bytes = pcm_to_wav_bytes(pcm)
    wav_file.write(wav_bytes)

## Voice cloning

Get your reference audio. WAV files only for now, sorry.

In [24]:
from csm_mlx.loaders.csm import Segment
import soundfile as sf
from scipy.signal import resample
import numpy as np

def load_wav(path: str) -> np.ndarray:
    """Load an audio file and resample it to the model's sampling rate.

    Args:
        path: Path to an audio file readable by ``soundfile``.

    Returns:
        A 1-D array of samples at ``model.sampling_rate``. Multi-channel
        input is averaged down to a single channel first.
    """
    data, sr = sf.read(path)
    new_sample_rate = model.sampling_rate

    # soundfile returns a (frames, channels) array for multi-channel files.
    # NOTE(review): assumes the model expects a mono (num_samples,) waveform,
    # so channels are averaged — confirm against Segment's audio contract.
    if data.ndim > 1:
        data = data.mean(axis=1)

    # Already at the target rate: skip the needless FFT round trip.
    if sr == new_sample_rate:
        return data

    num_samples = int(len(data) * new_sample_rate / sr)

    # Resample using FFT-based method
    return resample(data, num_samples)

# Reference audio and the text paired with it; substitute your own
# segment(s) here as desired.
reference_text = "When I heard the release demo, I was shocked, angered, and in disbelief that Mr. Altman would pursue a voice that sounded so eerily similar to mine that my closest friends and news outlets could not tell the difference."
reference_audio = load_wav("./tests/sky.wav")

audio = Segment(speaker=0, text=reference_text, audio=reference_audio)

# List of segments handed to the model as `context` in the cells below.
context = [audio]

In [25]:
from IPython.display import Audio

# Prompt to synthesize using the reference segment(s) in `context`.
text = "besides the stream and the decoder compile, did you do anything else to get the speed up on your 3090? No way that should have a 3x higher rtf than an a100"
# `context` is the list of Segment objects built in the voice-cloning cell.
# NOTE(review): the positional 0 presumably selects the speaker id, matching
# speaker=0 on the reference Segment — confirm against the model's signature.
pcm = model(text, 0, context=context, temp=1, backbone_min_p=0.1)

# Last expression of the cell, so the notebook renders it as an audio player.
Audio(pcm, rate=model.sampling_rate)


1515.042067ms prompt processing: 258 tokens (21.781573 tokens/s)


130it [00:09, 13.11it/s]


Generated in 11.43s (11.28 tokens/s, 88.63ms/token), 0.90x realtime


In [1]:
# Depends on `model` and `Audio` from earlier cells, so run the notebook top
# to bottom. The NameError recorded as this cell's output comes from running
# it first on a fresh kernel (execution count 1), before `model` was created.
text = "This is a test of caching the previous mimi generations."
# NOTE(review): use_last_gens=True presumably reuses the previous generation
# as context instead of passing `context` explicitly — confirm.
pcm = model(text, 0, use_last_gens=True, temp=0.9)

# Last expression of the cell, so the notebook renders it as an audio player.
Audio(pcm, rate=model.sampling_rate)

NameError: name 'model' is not defined

## Streaming

In [None]:
from IPython.display import Audio
import numpy as np

# TODO: every frame is collected before playback starts, so this cell still
# blocks until generation finishes; it only checks streaming for correctness.
stream_frames = model.stream(
    "Hello world, this is a test of streaming generation",
    context=context,
    speaker_id=0,
    use_last_gens=True,
    temp=0.8,
)
frames = list(stream_frames)

Audio(np.array(frames).flatten(), rate=model.sampling_rate)