<a href="https://colab.research.google.com/github/weedge/doraemon-nb/blob/main/moshi.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# init

In [None]:
# Optional alternative: install moshi in editable mode straight from the GitHub repository.
#!pip install -e "git+https://git@github.com/kyutai-labs/moshi#egg=moshi&subdirectory=moshi"


In [None]:
!pip install moshi

In [None]:
!pip install -q gradio

In [None]:
!pip install -q torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121

# run moshi server

In [7]:
# Launch the moshi server module with a Gradio tunnel (the recorded run mapped
# localhost:8998 to a *.gradio.live URL). The command keeps running until it is
# interrupted, so stop the cell manually before executing the cells below.
!python -m moshi.server --gradio-tunnel

Killing tunnel localhost:8998 <> https://d2bbacd8cd5d2e9c1b.gradio.live
^C


# torch + moshi lib

In [1]:
from huggingface_hub import hf_hub_download
import torch

from moshi.models import loaders, LMGen

# Pick the device once. With a GPU this behaves exactly like the previous hard-coded
# CUDA calls; without one the cell falls back to the CPU instead of crashing
# (expect CPU generation to be far slower -- not exercised in the recorded run).
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Download the Mimi codec weights and build the codec on the CPU first.
mimi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MIMI_NAME)
mimi = loaders.get_mimi(mimi_weight, device='cpu')
mimi.set_num_codebooks(8)  # up to 32 for mimi, but limited to 8 for moshi.

wav = torch.randn(1, 1, 24000 * 10)  # should be [B, C=1, T]
with torch.no_grad():
    # One-shot round trip through the codec.
    codes = mimi.encode(wav)  # [B, K = 8, T]
    decoded = mimi.decode(codes)

    # Supports streaming too.
    frame_size = int(mimi.sample_rate / mimi.frame_rate)
    all_codes = []
    with mimi.streaming(batch_size=1):
        # Only walk over complete frames: a trailing partial frame would not be
        # encoded (see the warning below) and would trip the shape assertion.
        for offset in range(0, wav.shape[-1] - frame_size + 1, frame_size):
            frame = wav[:, :, offset: offset + frame_size]
            codes = mimi.encode(frame)
            assert codes.shape[-1] == 1, codes.shape
            all_codes.append(codes)

## WARNING: When streaming, make sure to always feed a total amount of audio that is a multiple
#           of the frame size (1920), otherwise the last frame will not be complete, and thus
#           will not be encoded. For simplicity, we recommend feeding in audio always in multiple
#           of the frame size, so that you always know how many time steps you get back in `codes`.

# Move the codec next to the language model and load Moshi on the selected device.
mimi.to(device)
moshi_weight = hf_hub_download(loaders.DEFAULT_REPO, loaders.MOSHI_NAME)
moshi = loaders.get_moshi_lm(moshi_weight, device=device)
lm_gen = LMGen(moshi, temp=0.8, temp_text=0.7)  # this handles sampling params etc.
out_wav_chunks = []
# Now we will stream over both Moshi I/O, and decode on the fly with Mimi.
with torch.no_grad(), lm_gen.streaming(1), mimi.streaming(1):
    for idx, code in enumerate(all_codes):
        tokens_out = lm_gen.step(code.to(device))
        # tokens_out is [B, 1 + 8, 1]: tokens_out[:, 0] is the text token and
        # tokens_out[:, 1:] are the 8 audio codebooks handed to the Mimi decoder.
        # A step can return None (the recorded run decoded 124 chunks from 125
        # input frames), in which case there is nothing to decode yet.
        if tokens_out is not None:
            wav_chunk = mimi.decode(tokens_out[:, 1:])
            out_wav_chunks.append(wav_chunk)
        print(idx, end='\r')
# Stitch the decoded chunks together along the time axis.
out_wav = torch.cat(out_wav_chunks, dim=-1)



In [2]:
import torch, torchaudio

# Show the shape of the generated waveform before any reshaping.
print(out_wav.shape)

# Drop the leading size-1 axis so that a 2D tensor is handed to torchaudio.save.
d2_out_wav = out_wav.squeeze(0)
print(d2_out_wav.shape)

# Guard: refuse to write anything that is not two-dimensional.
if d2_out_wav.dim() != 2:
    raise ValueError("out_wav must be a 2D tensor")

# Guard: the sample rate has to be a positive number.
rate_is_number = isinstance(mimi.sample_rate, (int, float))
if not rate_is_number or mimi.sample_rate <= 0:
    raise ValueError("Invalid sample rate")

# Write the waveform from host memory to disk at the codec's sample rate.
torchaudio.save("output.wav", d2_out_wav.cpu(), mimi.sample_rate)


torch.Size([1, 1, 238080])
torch.Size([1, 238080])
torch.Size([1, 238080]) <class 'torch.Tensor'>


In [3]:
# Import the player under an alias: this notebook also uses `datasets.Audio`, and
# binding two different classes to the bare name `Audio` makes cells depend on the
# order in which they were last executed.
from IPython.display import Audio as IPythonAudio

# Embed an audio player for the file saved by the previous cell.
IPythonAudio("output.wav", rate=mimi.sample_rate)


# transformers mimi

In [1]:
!pip install -q -U datasets[audio]

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/116.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/179.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/134.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m15.8 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver doe

In [2]:
#!pip install -q -U transformers

!pip install git+https://github.com/huggingface/transformers.git@main



Collecting git+https://github.com/huggingface/transformers.git@main
  Cloning https://github.com/huggingface/transformers.git (to revision main) to /tmp/pip-req-build-z0p0zzm5
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-z0p0zzm5
  Resolved https://github.com/huggingface/transformers.git to commit 3ea3ab62d80d91f9bdd16bd3cacd8133fb0d4566
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting tokenizers<0.21,>=0.20 (from transformers==4.47.0.dev0)
  Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading tokenizers-0.20.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m30.1 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels

In [1]:
!pip show transformers

Name: transformers
Version: 4.47.0.dev0
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: peft, sentence-transformers


In [17]:
from datasets import load_dataset, Audio
from transformers import MimiModel, AutoFeatureExtractor

# Hub identifier shared by the model and its feature extractor.
MIMI_CHECKPOINT = "kyutai/mimi"

# Demonstration dataset: validation split of the "clean" configuration.
librispeech_dummy = load_dataset(
    "hf-internal-testing/librispeech_asr_dummy",
    "clean",
    split="validation",
)

# Load the pretrained model, then the feature extractor used to pre-process the
# audio, printing each one right after it is created for inspection.
model = MimiModel.from_pretrained(MIMI_CHECKPOINT)
print(model)
feature_extractor = AutoFeatureExtractor.from_pretrained(MIMI_CHECKPOINT)
print(feature_extractor)

MimiModel(
  (encoder): MimiEncoder(
    (layers): ModuleList(
      (0): MimiConv1d(
        (conv): Conv1d(1, 64, kernel_size=(7,), stride=(1,))
      )
      (1): MimiResnetBlock(
        (block): ModuleList(
          (0): ELU(alpha=1.0)
          (1): MimiConv1d(
            (conv): Conv1d(64, 32, kernel_size=(3,), stride=(1,))
          )
          (2): ELU(alpha=1.0)
          (3): MimiConv1d(
            (conv): Conv1d(32, 64, kernel_size=(1,), stride=(1,))
          )
        )
        (shortcut): Identity()
      )
      (2): ELU(alpha=1.0)
      (3): MimiConv1d(
        (conv): Conv1d(64, 128, kernel_size=(8,), stride=(4,))
      )
      (4): MimiResnetBlock(
        (block): ModuleList(
          (0): ELU(alpha=1.0)
          (1): MimiConv1d(
            (conv): Conv1d(128, 64, kernel_size=(3,), stride=(1,))
          )
          (2): ELU(alpha=1.0)
          (3): MimiConv1d(
            (conv): Conv1d(64, 128, kernel_size=(1,), stride=(1,))
          )
        )
        (s

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


In [20]:
# Import everything this cell uses so it does not rely on another cell having been
# run first. The datasets feature is aliased because the bare name `Audio` is also
# bound to IPython's player class elsewhere in this notebook.
import torch
import torchaudio
from datasets import Audio as DatasetsAudio

# cast the audio data to the correct sampling rate for the model
librispeech_dummy = librispeech_dummy.cast_column("audio", DatasetsAudio(sampling_rate=feature_extractor.sampling_rate))
audio_sample = librispeech_dummy[0]["audio"]["array"]
print(type(audio_sample),audio_sample.shape)

# Wrap the NumPy samples in a tensor and add a leading axis so a 2D tensor is saved.
tensor_audio=torch.from_numpy(audio_sample)
tensor_audio_2d=tensor_audio.unsqueeze(0)
print(type(tensor_audio_2d),tensor_audio_2d.shape)

# Keep a copy of the resampled input on disk for listening.
torchaudio.save("audio_sample.wav", tensor_audio_2d, feature_extractor.sampling_rate)


<class 'numpy.ndarray'> (140520,)
<class 'torch.Tensor'> torch.Size([1, 140520])


In [21]:
# Import the player under an alias so the name `Audio` keeps referring to
# `datasets.Audio`; rebinding it here would break a re-run of the cell that calls
# `cast_column("audio", Audio(sampling_rate=...))`.
from IPython.display import Audio as IPythonAudio

# Embed an audio player for the saved input sample.
IPythonAudio("audio_sample.wav", rate=feature_extractor.sampling_rate)


In [24]:
# pre-process the inputs
inputs = feature_extractor(raw_audio=audio_sample, sampling_rate=feature_extractor.sampling_rate, return_tensors="pt")
print(inputs,inputs["input_values"].shape)

# explicitly encode then decode the audio inputs
encoder_outputs = model.encode(inputs["input_values"])
print("explicitly encode the audio inputs",type(encoder_outputs),encoder_outputs,encoder_outputs.audio_codes.shape)
audio_values_1 = model.decode(encoder_outputs.audio_codes)[0]
print("explicitly decode the audio inputs",type(audio_values_1),audio_values_1.shape)

# or the equivalent with a forward pass
audio_values_2 = model(inputs["input_values"]).audio_values
print("model forward pass",type(audio_values_2),audio_values_2.shape)

{'padding_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]], dtype=torch.int32), 'input_values': tensor([[[0.0023, 0.0025, 0.0019,  ..., 0.0006, 0.0010, 0.0008]]])} torch.Size([1, 1, 140520])
explicitly encode the audio inputs <class 'transformers.models.mimi.modeling_mimi.MimiEncoderOutput'> MimiEncoderOutput(audio_codes=tensor([[[1049, 1946,  861,  ...,  312,  753,  708],
         [ 887, 1056, 1211,  ..., 1625, 1940,   67],
         [1335, 1742, 1742,  ..., 1212,  265,  269],
         ...,
         [  77,  436, 1864,  ...,   97,  195,  754],
         [ 427, 1564, 1196,  ..., 1704,  159,  159],
         [1780, 1082, 1470,  ..., 1752,  324, 1380]]]), encoder_past_key_values=None) torch.Size([1, 32, 74])
explicitly decode the audio inputs <class 'torch.Tensor'> torch.Size([1, 1, 142080])
model forward pass <class 'torch.Tensor'> torch.Size([1, 1, 140520])


In [14]:
import torch, torchaudio

# Choose which reconstruction to write out (forward pass by default).
audio_values = audio_values_2
#audio_values = audio_values_1

print(audio_values.shape)
# Squeeze audio_values to 2D before saving
d2_out_wav = audio_values.squeeze(0)
print(d2_out_wav.shape)

if d2_out_wav.dim() != 2:
    raise ValueError("out_wav must be a 2D tensor")

if not isinstance(feature_extractor.sampling_rate, (int, float)) or feature_extractor.sampling_rate <= 0:
    raise ValueError("Invalid sample rate")

# The model output may still be attached to the autograd graph (it is produced by a
# plain forward pass); detach it first, since saving paths that go through NumPy
# reject tensors that require grad.
torchaudio.save("output.wav", d2_out_wav.detach().to("cpu"), feature_extractor.sampling_rate)


torch.Size([1, 1, 140520])
torch.Size([1, 140520])


In [15]:
# Import the player under an alias so the name `Audio` is not rebound away from
# `datasets.Audio`, which other cells of this notebook call by that bare name.
from IPython.display import Audio as IPythonAudio

# Embed an audio player for the reconstructed waveform saved by the previous cell.
IPythonAudio("output.wav", rate=feature_extractor.sampling_rate)
