In [None]:
from transformers import MimiModel, AutoFeatureExtractor

# Everything in this notebook runs on this device; tensors passed to the model
# must be moved here first.
device = "cpu"

# Pull the "kyutai/mimi" feature extractor and model, placing the model on
# `device` as part of the same expression.
feature_extractor = AutoFeatureExtractor.from_pretrained("kyutai/mimi")
model = MimiModel.from_pretrained("kyutai/mimi").to(device)

In [None]:
import torchaudio
import torchaudio.transforms as T

def load_and_process_wav(file_path, target_sample_rate=24000):
    """
    Load an audio file, downmix it to mono, and resample it to the target rate.

    Parameters:
        file_path (str): Path to the WAV file.
        target_sample_rate (int): Sample rate, in Hz, of the returned audio.
            Defaults to 24000.

    Returns:
        torch.Tensor: Processed audio tensor. Multi-channel input is averaged
        down to a single channel (first dimension of size 1), and the audio is
        resampled only when the file's rate differs from ``target_sample_rate``.
    """
    # Load the audio file
    waveform, sample_rate = torchaudio.load(file_path)

    # Convert to mono if not already. The tensor method is used instead of
    # torch.mean(...) so this function does not rely on `torch` having been
    # imported by a different cell.
    if waveform.size(0) > 1:
        waveform = waveform.mean(dim=0, keepdim=True)

    # Resample only when the source rate differs from the requested one
    if sample_rate != target_sample_rate:
        resampler = T.Resample(orig_freq=sample_rate, new_freq=target_sample_rate)
        waveform = resampler(waveform)

    return waveform

In [None]:
import os
import subprocess

def run_llama_generate(
    text="Can you generate five simple sentences for my child to practice speaking",
    temp=0.1,
    checkpoint_path="../dual-ar/checkpoints/smoltts_scratch/",
    working_dir="../../fish-speech.rs"  # Replace with your desired working directory
):
    """
    Run the `llama_generate` cargo binary from `working_dir`.

    The command is passed to the OS as an argument list (no shell), so quotes,
    `$`, backticks, or spaces in `text` / `checkpoint_path` reach the binary
    verbatim instead of being interpreted or split by a shell.

    Parameters:
        text (str): Prompt forwarded as `--text`.
        temp (float): Value forwarded as `--temp`.
        checkpoint_path (str): Path forwarded as `--checkpoint`. A relative
            path is resolved by the child process against `working_dir`.
        working_dir (str): Directory the command is run in. Only the child
            process uses it; the notebook's own working directory is untouched.

    Returns:
        int: Exit code of the cargo process (0 on success).

    Raises:
        FileNotFoundError: If `working_dir` does not exist or `cargo` cannot
            be found on PATH.
    """
    cmd = [
        "cargo", "run", "--release", "--features", "cuda",
        "--bin", "llama_generate", "--",
        "--text", str(text),
        "--checkpoint", str(checkpoint_path),
        "--temp", str(temp),
    ]

    # cwd= scopes the directory change to the child process, so there is no
    # global os.chdir to undo afterwards. check=False: a non-zero exit is
    # reported through the return value rather than raised.
    completed = subprocess.run(cmd, cwd=working_dir, check=False)
    return completed.returncode

# Example usage:
# run_llama_generate(
#     text="Write a short story about a cat",
#     temp=0.2,
#     working_dir="/path/to/your/project"
# )

In [None]:
import numpy as np
import torch
from IPython.display import Audio, display

# run_llama_generate(
#     text="Here's how Bob talks, here's what language is, now speak like Bob saying this new thing",
#     temp=0.05
# )
# Load and process the data
# Load the saved codes and move them to the model's device as int64.
test_arr = np.load("../../out.npy")
# NOTE(review): `[:, :200]` slices the SECOND axis. A later cell indexes
# `test_input` with three indices, so if the layout is
# (batch, codebooks, frames) this does not cap the number of frames — confirm
# which axis was meant to be truncated.
test_input = torch.from_numpy(test_arr[:, :200]).to(device).to(torch.long)
print(test_input.shape)

# Decode the codes back to audio. This is inference only, so autograd is
# disabled to avoid building a graph that would just be thrown away.
with torch.no_grad():
    out_pcm = model.decode(test_input)

# Take the first item's audio and hand it to NumPy for playback.
audio_data = out_pcm.audio_values[0].cpu().numpy()

# Play back at 24 kHz, the same rate load_and_process_wav resamples input audio to.
display(Audio(audio_data, rate=24000, autoplay=False))

In [None]:
# Show every value along the last axis at position [0, 0] of the decoder input
# (presumably the first codebook of the first batch item — confirm layout).
test_input[0, 0, :]

In [None]:
import numpy as np

# Encode a reference voice clip and save the first 8 rows of its codes.
pcm = load_and_process_wav("../../fish-speech.rs/voices/nova.wav")

# Move the audio to `device` — the same device the model was placed on —
# instead of a hard-coded "cuda": with the model on the CPU, a CUDA input
# fails (and fails outright on machines without a GPU). unsqueeze(0) adds a
# leading batch dimension. Autograd is disabled because this is inference only.
with torch.no_grad():
    codes = model.encode(pcm.to(device).unsqueeze(0))

# Drop the batch dimension, keep the first 8 rows, and write them to disk.
np.save("nova.npy", codes["audio_codes"].squeeze(0)[:8, :].cpu().numpy())

In [None]:
# Sanity-check the shape of the same slice that was written to nova.npy.
codes["audio_codes"].squeeze(0)[:8,:].shape