<a href="https://colab.research.google.com/github/Balta8/AI-Telecom-Assistant/blob/main/Chatterbox_Egyptian_Demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!curl -LsSf https://astral.sh/uv/install.sh | sh

downloading uv 0.10.4 x86_64-unknown-linux-gnu
no checksums to verify
installing to /usr/local/bin
  uv
  uvx
everything's installed!


In [3]:
# Install the runtime dependencies into the system interpreter (--system) with uv.
# NOTE(review): s3tokenizer and safetensors are not version-pinned, unlike the
# other packages - consider pinning them for reproducibility.
!uv pip install resampy==0.4.3 librosa==0.10.0 s3tokenizer transformers==4.46.3 diffusers==0.29.0 omegaconf==2.3.0 resemble-perth==1.0.1 silero-vad==5.1.2 conformer==0.3.2 safetensors numpy==1.26.0 --system

[2mUsing Python 3.12.12 environment at: /usr[0m
[2mAudited [1m11 packages[0m [2min 90ms[0m[0m


In [4]:
!git clone https://github.com/Oddadmix/chatterbox-multilingual-finetuning.git

fatal: destination path 'chatterbox-multilingual-finetuning' already exists and is not an empty directory.


In [5]:
# Switch into the cloned repo so later cells can import `src.chatterbox`.
# NOTE(review): the path is relative, so re-running this cell while already
# inside the repo will likely fail - restart the runtime before a full re-run.
%cd chatterbox-multilingual-finetuning

/content/chatterbox-multilingual-finetuning


In [6]:
import random
import numpy as np
import torch
from pathlib import Path
import os

# numpy and transformers are already pinned by the `uv pip install` cell above, which
# avoids the "numpy.dtype size changed" import error. Do not uninstall/reinstall them
# here: swapping packages inside a running session causes issues.
# The old workaround is kept for reference only:
# !pip uninstall -y numpy transformers
# !pip install numpy==1.26.0 transformers==4.46.3

from huggingface_hub import snapshot_download

from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES


In [7]:
# Prefer the GPU when one is visible to PyTorch; otherwise fall back to CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"🚀 Running on device: {DEVICE}")

REPO_ID = "oddadmix/chatterbox-egyptian-v0"

# Download only the checkpoint artifacts listed in `allow_patterns`;
# `snapshot_download` returns the local directory holding them.
# NOTE(review): revision="main" follows the branch head - pin a commit hash if
# the demo must stay reproducible.
ckpt_dir = Path(
    snapshot_download(
        repo_id=REPO_ID,
        repo_type="model",
        revision="main",
        allow_patterns=[
            "ve.pt",
            "t3_mtl23ls_v2.safetensors",
            "s3gen.pt",
            "grapheme_mtl_merged_expanded_v1.json",
            "conds.pt",
            "Cangjie5_TC.json",
        ],
    )
)

# --- Global Model Initialization ---
# The trailing "/" is kept exactly as the original call passed it.
MODEL = ChatterboxMultilingualTTS.from_checkpoint(str(ckpt_dir) + "/", DEVICE)

# Move the model only when it exposes `.to` and reports a different device.
if hasattr(MODEL, "to") and str(getattr(MODEL, "device", "")) != DEVICE:
    MODEL.to(DEVICE)

# Report success unconditionally. Previously this line was nested under the
# `if`, so it was skipped whenever the model was already on the target device.
print(f"Model loaded successfully. Internal device: {getattr(MODEL, 'device', 'N/A')}")


🚀 Running on device: cuda


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

t3_mtl23ls_v2.safetensors:   0%|          | 0.00/2.14G [00:00<?, ?B/s]

ve.pt:   0%|          | 0.00/5.70M [00:00<?, ?B/s]

conds.pt:   0%|          | 0.00/107k [00:00<?, ?B/s]

Fetching 6 files:   0%|          | 0/6 [00:00<?, ?it/s]

grapheme_mtl_merged_expanded_v1.json: 0.00B [00:00, ?B/s]

Cangjie5_TC.json: 0.00B [00:00, ?B/s]

conds.pt:   0%|          | 0.00/107k [00:00<?, ?B/s]

ve.pt:   0%|          | 0.00/5.70M [00:00<?, ?B/s]

s3gen.pt:   0%|          | 0.00/1.06G [00:00<?, ?B/s]

t3_mtl23ls_v2.safetensors:   0%|          | 0.00/2.14G [00:00<?, ?B/s]

  deprecate("LoRACompatibleLinear", "1.0.0", deprecation_message)


Cangjie5_TC.json: 0.00B [00:00, ?B/s]



loaded PerthNet (Implicit) at step 250,000


In [8]:
import requests
from pathlib import Path

# Reference voice clip; `generate_tts_audio` falls back to "./ar_prompts2.flac"
# when the caller supplies no audio prompt, so it must exist in the working dir.
url = "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ar_f/ar_prompts2.flac"
save_path = Path("ar_prompts2.flac")

# Use the response as a context manager so the streamed connection is always
# released, and pass a (connect, read) timeout so a stalled server cannot hang
# the notebook indefinitely.
with requests.get(url, stream=True, timeout=(10, 60)) as response:
    response.raise_for_status()
    with open(save_path, "wb") as f:
        for chunk in response.iter_content(chunk_size=8192):
            if chunk:  # skip empty keep-alive chunks
                f.write(chunk)

print(f"Saved to {save_path.resolve()}")

Saved to /content/chatterbox-multilingual-finetuning/ar_prompts2.flac


In [9]:
def generate_tts_audio(
    text_input: str,
    audio_prompt_path_input: str = None,
    exaggeration_input: float = 0.5,
    temperature_input: float = 0.8,
    seed_num_input: int = 0,
    cfgw_input: float = 0.5,
) -> tuple[int, np.ndarray]:
    """
    Generate speech audio from text using Chatterbox Multilingual model.
    - If a reference audio is provided, the model will try to match the speaker/style.
    - If not provided, it uses the model's default voice.
    Note: For Arabic here, the demo text + examples are Egyptian Arabic (Masri).
    """

    language_id = "ar"
    # current_model = get_or_load_model()
    # if current_model is None:
    #     raise RuntimeError("TTS model is not loaded.")

    # if seed_num_input and int(seed_num_input) != 0:
    #     set_seed(int(seed_num_input))

    text_input = (text_input or "").strip()

    print(f"Generating audio for language='{language_id}', text='{text_input[:60]}...'")

    # Keep same behavior: use uploaded/mic ref if provided, else default audio for language.
    chosen_prompt = audio_prompt_path_input or "./ar_prompts2.flac"

    generate_kwargs = {
        "exaggeration": float(exaggeration_input),
        "temperature": float(temperature_input),
        "cfg_weight": float(cfgw_input),
    }
    if chosen_prompt:
        generate_kwargs["audio_prompt_path"] = chosen_prompt
        print(f"Using audio prompt: {chosen_prompt}")
    else:
        print("No audio prompt provided; using default voice.")

    wav = MODEL.generate(
        text_input,  # max chars
        language_id=language_id,
        **generate_kwargs,
    )

    print("Audio generation complete.")
    return (MODEL.sr, wav.squeeze(0).numpy())


In [31]:
# Synthesize the demo sentence with the default reference prompt and sampling
# settings; `sr` is the sample rate and `audio` the generated waveform.
sr, audio = generate_tts_audio(text_input="حضرتك حابب واحد عرض بوكس اللي بيكون فيه ساندوتش كفتة فراخ وساندوتش شاورما؟")


Generating audio for language='ar', text='حضرتك حابب واحد عرض بوكس اللي بيكون فيه ساندوتش كفتة فراخ وس...'
Using audio prompt: ./ar_prompts2.flac


Sampling:  13%|█▎        | 133/1000 [00:04<00:31, 27.30it/s]


Audio generation complete.


In [32]:
from IPython.display import Audio, display

# Build an inline player for the generated waveform at the sample rate returned
# by `generate_tts_audio`, then render it in the cell output.
player = Audio(data=audio, rate=sr)
display(player)

In [33]:
import soundfile as sf

# Persist the generated waveform next to the notebook's working directory.
# NOTE(review): MP3 output depends on the installed libsndfile supporting that
# format - confirm on the target runtime.
sf.write(file="my_audio.mp3", data=audio, samplerate=sr)

In [34]:
# Colab-only helper module; this cell is expected to run inside Google Colab.
from google.colab import files
# Trigger a browser download of the file written by the previous cell.
files.download("my_audio.mp3")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>