## Install Dependencies

1. `pip install qwen-tts`
2. `pip uninstall -y torch torchaudio torchvision`
3. `conda install pytorch torchaudio pytorch-cuda=11.8 -c pytorch -c nvidia -y`


In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "1"
os.environ["PYTORCH_SDP_ENABLED"] = "0"
os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "0"
os.environ["TORCH_USE_CUDA_DSA"] = "0"

In [None]:
import torch

torch.backends.cuda.enable_flash_sdp(False)
torch.backends.cuda.enable_mem_efficient_sdp(False)
torch.backends.cuda.enable_math_sdp(True)


print("Torch CUDA runtime:", torch.version.cuda)
print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0))
print("Capability:", torch.cuda.get_device_capability(0))

Torch CUDA runtime: 12.1
CUDA available: True
GPU: NVIDIA GeForce GTX 1660 Ti
Capability: (7, 5)


### Install SOX

#### Step 1Ô∏è‚É£ Download SoX (official source)

Go here (official, no tricks):

üëâ [http://sox.sourceforge.net/](http://sox.sourceforge.net/)

Click **‚ÄúDownload SoX for Windows‚Äù**
Get the **Windows binary installer** (usually `sox-14.4.x-win32.exe` or similar).

---

#### Step 2Ô∏è‚É£ Install it (IMPORTANT options)

During installation:

- ‚úÖ **Check ‚ÄúAdd SoX to PATH‚Äù**
- Leave everything else default
- Finish install

This PATH checkbox is the most important part.

---

#### Step 3Ô∏è‚É£ Verify SoX works (outside Python)

Open a **new terminal** (important), then run:

```bat
sox --version
```

Expected output (example):

```
sox: SoX v14.4.2
```

If you see that ‚Üí SoX is installed correctly.


In [3]:
import torch
from qwen_tts import Qwen3TTSModel
import gc

# -----------------------------
# Environment info
# -----------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

print("===== ENVIRONMENT =====")
print("Device selected :", device)
print("Torch version   :", torch.__version__)
print("CUDA available  :", torch.cuda.is_available())

if device == "cuda":
    print("CUDA version    :", torch.version.cuda)
    print("GPU name        :", torch.cuda.get_device_name(0))
    print(
        "GPU memory (GB) :",
        round(torch.cuda.get_device_properties(0).total_memory / 1024**3, 2),
    )
print("=======================\n")


# -----------------------------
# Model loader with full prints
# -----------------------------
def load_model(kind: str):
    print(f"\n===== LOADING MODEL ({kind.upper()}) =====")

    if device == "cuda":
        torch.cuda.empty_cache()
        torch.cuda.reset_peak_memory_stats()
        print("CUDA cache cleared")

    if kind == "base":
        model_path = "./models/Qwen3-TTS-12Hz-1.7B-Base"
    elif kind == "custom":
        model_path = "./models/Qwen3-TTS-12Hz-1.7B-CustomVoice"
    else:
        raise ValueError("kind must be 'base' or 'custom'")

    print("Model path      :", model_path)
    print("Torch dtype     :", dtype)

    model = Qwen3TTSModel.from_pretrained(
        model_path,
        dtype=dtype,
        device_map="auto",
    )

    # -----------------------------
    # Supported languages
    # -----------------------------
    try:
        langs = model.get_supported_languages()
        print("Supported languages:")
        for lang in langs:
            print(" -", lang)
    except Exception as e:
        print("Could not query supported languages:", e)

    # -----------------------------
    # Supported speakers (CustomVoice only)
    # -----------------------------
    try:
        speakers = model.get_supported_speakers()
        if speakers:
            print("Supported speakers:")
            for spk in speakers:
                print(" -", spk)
        else:
            print("Supported speakers: None (Base model)")
    except Exception:
        print("Supported speakers: Not available for this model")

    # -----------------------------
    # VRAM usage
    # -----------------------------
    if device == "cuda":
        allocated = torch.cuda.memory_allocated(0) / 1024**3
        reserved = torch.cuda.memory_reserved(0) / 1024**3
        peak = torch.cuda.max_memory_allocated(0) / 1024**3

        print(f"VRAM allocated  : {allocated:.2f} GB")
        print(f"VRAM reserved   : {reserved:.2f} GB")
        print(f"VRAM peak       : {peak:.2f} GB")

    print("Model loaded ‚úî")
    print("=================================\n")
    return model


# -----------------------------
# Model unload helper
# -----------------------------
def unload_model(model=None, name: str = ""):
    print(f"\n===== UNLOADING MODEL {name.upper()} =====")

    if model is not None:
        try:
            del model
            print("Model object deleted")
        except Exception as e:
            print("Model delete warning:", e)

    gc.collect()
    print("Python GC collected")

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        print("CUDA cache cleared")
        print(
            "VRAM after cleanup:",
            f"{torch.cuda.memory_allocated(0) / 1024**3:.2f} GB allocated",
        )

    print("====================================\n")


********
********
 
===== ENVIRONMENT =====
Device selected : cuda
Torch version   : 2.2.2+cu121
CUDA available  : True
CUDA version    : 12.1
GPU name        : Quadro P5000
GPU memory (GB) : 15.88



In [5]:
model = load_model("base")


===== LOADING MODEL (BASE) =====
CUDA cache cleared
Model path      : ./models/Qwen3-TTS-12Hz-1.7B-Base
Torch dtype     : torch.float16
Supported languages:
 - auto
 - chinese
 - english
 - french
 - german
 - italian
 - japanese
 - korean
 - portuguese
 - russian
 - spanish
Supported speakers: None (Base model)
VRAM allocated  : 1.37 GB
VRAM reserved   : 1.44 GB
VRAM peak       : 1.37 GB
Model loaded ‚úî



In [6]:
unload_model(model, name="base")
model = None


===== UNLOADING MODEL BASE =====
Model object deleted
Python GC collected
CUDA cache cleared
VRAM after cleanup: 1.37 GB allocated



In [5]:
# Load CustomVoice
model = load_model("custom")
speakers = model.get_supported_speakers()


===== LOADING MODEL (CUSTOM) =====
CUDA cache cleared
Model path      : ./models/Qwen3-TTS-12Hz-1.7B-CustomVoice
Torch dtype     : torch.float16
Supported languages:
 - auto
 - chinese
 - english
 - french
 - german
 - italian
 - japanese
 - korean
 - portuguese
 - russian
 - spanish
Supported speakers:
 - aiden
 - dylan
 - eric
 - ono_anna
 - ryan
 - serena
 - sohee
 - uncle_fu
 - vivian
VRAM allocated  : 2.81 GB
VRAM reserved   : 2.88 GB
VRAM peak       : 2.81 GB
Model loaded ‚úî



In [7]:
import torch
import soundfile as sf
from pathlib import Path


def run_generate_custom_voice(
    model,
    text,
    speaker,
    output_path,
    language="auto",
    instruct=None,
    generation_kwargs=None,
):
    """
    Explicit runner for Qwen3 CustomVoice TTS.

    This function does NOT inject defaults or modify user intent.
    All generation behavior is controlled by the caller.

    Parameters
    ----------
    model : Qwen3TTSModel
        Loaded CustomVoice model.
    text : str or list[str]
        Text(s) to synthesize.
    speaker : str or list[str]
        Speaker name(s), must match model.get_supported_speakers().
    output_path : str or Path
        Output WAV file (single) or directory (batch).
    language : str or list[str], default="auto"
        Language(s). Use "auto" for multilingual sentences.
    instruct : str or list[str] or None
        Optional style / emotion instruction(s).
    generation_kwargs : dict or None
        Extra kwargs passed directly to model.generate_custom_voice().
        (Nothing is added automatically.)
    """

    print("\n===== RUN generate_custom_voice =====")

    # -----------------------------
    # Determine mode
    # -----------------------------
    is_batch = isinstance(text, list)

    def ensure_list(value):
        return value if isinstance(value, list) else [value]

    text_list = ensure_list(text)
    speaker_list = ensure_list(speaker)
    language_list = ensure_list(language)
    instruct_list = ensure_list(instruct) if instruct is not None else None

    sample_count = len(text_list)

    # -----------------------------
    # Length validation
    # -----------------------------
    def validate_length(values, name):
        if values is not None and len(values) != sample_count:
            raise ValueError(
                f"{name} count mismatch: got {len(values)}, expected {sample_count}"
            )

    validate_length(speaker_list, "Speaker")
    validate_length(language_list, "Language")
    validate_length(instruct_list, "Instruct")

    # -----------------------------
    # Validate speaker & language
    # -----------------------------
    supported_speakers = set(model.get_supported_speakers())
    supported_languages = set(model.get_supported_languages())

    for spk in speaker_list:
        if spk not in supported_speakers:
            raise ValueError(
                f"Unsupported speaker '{spk}'. "
                f"Supported speakers: {sorted(supported_speakers)}"
            )

    for lang in language_list:
        if lang not in supported_languages:
            raise ValueError(
                f"Unsupported language '{lang}'. "
                f"Supported languages: {sorted(supported_languages)}"
            )

    # -----------------------------
    # Generation kwargs (explicit only)
    # -----------------------------
    if generation_kwargs is None:
        generation_kwargs = {}

    # -----------------------------
    # Output handling
    # -----------------------------
    output_path = Path(output_path)

    if is_batch:
        output_path.mkdir(parents=True, exist_ok=True)
    else:
        output_path.parent.mkdir(parents=True, exist_ok=True)

    # -----------------------------
    # User-facing summary
    # -----------------------------
    print("Batch mode        :", is_batch)
    print("Number of samples :", sample_count)
    print("Speaker(s)        :", speaker_list if is_batch else speaker_list[0])
    print("Language(s)       :", language_list if is_batch else language_list[0])
    print("Instruct          :", instruct_list if instruct_list else "None")
    print("Output path       :", output_path.resolve())
    print("Generation kwargs :", generation_kwargs)

    # -----------------------------
    # Generate audio
    # -----------------------------

    with torch.inference_mode():
        wavs, sample_rate = model.generate_custom_voice(
            text=text_list if is_batch else text_list[0],
            speaker=speaker_list if is_batch else speaker_list[0],
            language=language_list if is_batch else language_list[0],
            instruct=instruct_list if instruct_list is not None else None,
            **generation_kwargs,
        )

    # -----------------------------
    # Save results
    # -----------------------------
    if is_batch:
        for index, waveform in enumerate(wavs):
            output_file = output_path / f"custom_voice_{index}.wav"
            sf.write(output_file, waveform, sample_rate)
            print("Saved:", output_file)
    else:
        sf.write(output_path, wavs[0], sample_rate)
        print("Saved:", output_path)

    print("====================================\n")

In [8]:
text = (
    "Hello everyone, today I want to show you something very interesting. "
    "Ëøô‰∏™Ê®°ÂûãÁúüÁöÑÂæàÂéâÂÆ≥Ôºåboleh cakap English, Bahasa Melayu, "
    "dan ‰∏≠Êñá together without any awkward pause. "
    "Kalau di Sarawak, orang akan cakap macam ni: "
    "'Aok, sik ada masalah bah, kitak rilek jak.' "
    "ÊÑèÊÄùÊòØËØ¥Ôºåeverything is okay, no need to worry. "
    "You see ah, this kind of multilingual speech synthesis "
    "memang power, boleh campur bahasa ikut situasi. "
    "‰∏çÊòØÂàªÊÑèÁöÑÈÇ£ÁßçÔºåËÄåÊòØËá™ÁÑ∂ÊµÅÁïÖ„ÄÇ"
)


run_generate_custom_voice(
    model=model,
    text=text,
    speaker="ryan",
    language="auto",
    output_path="outputs/multilang_ryan.wav",
)


===== RUN generate_custom_voice =====
Batch mode        : False
Number of samples : 1
Speaker(s)        : ryan
Language(s)       : auto
Instruct          : None
Output path       : /workspace/outputs/multilang_ryan.wav
Generation kwargs : {}


RuntimeError: CUDA error: CUBLAS_STATUS_INVALID_VALUE when calling `cublasGemmEx( handle, opa, opb, m, n, k, alpha_ptr, a, CUDA_R_16F, lda, b, CUDA_R_16F, ldb, beta_ptr, c, std::is_same_v<C_Dtype, float> ? CUDA_R_32F : CUDA_R_16F, ldc, compute_type, CUBLAS_GEMM_DEFAULT_TENSOR_OP)`

In [7]:
# Unload CustomVoice
unload_model(model, name="custom")
model = None


===== UNLOADING MODEL CUSTOM =====
Model object deleted
Python GC collected



### Example


In [1]:
import time
import gc
import torch
import soundfile as sf
from qwen_tts import Qwen3TTSModel


def select_runtime():
    if not torch.cuda.is_available():
        return {
            "device": "cpu",
            "dtype": torch.float32,
            "attn_impl": "eager",
        }

    major, minor = torch.cuda.get_device_capability(0)

    # Ampere+ (sm >= 80)
    if major >= 8:
        return {
            "device": "cuda",
            "dtype": torch.float16,  # bf16 optional if you prefer
            "attn_impl": "flash_attention_2",
        }

    # Turing / Pascal
    return {
        "device": "cuda",
        "dtype": torch.float16,
        "attn_impl": "eager",
    }


runtime = select_runtime()

print("===== RUNTIME CONFIG =====")
print("Device      :", runtime["device"])
print("DType       :", runtime["dtype"])
print("Attention   :", runtime["attn_impl"])
if torch.cuda.is_available():
    print("GPU         :", torch.cuda.get_device_name(0))
    print("Capability  :", torch.cuda.get_device_capability(0))
print("==========================\n")

MODEL_PATH = "./models/Qwen3-TTS-12Hz-1.7B-VoiceDesign"

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()

tts = Qwen3TTSModel.from_pretrained(
    MODEL_PATH,
    device_map="cuda:0",
    dtype=runtime["dtype"],
    attn_implementation=runtime["attn_impl"],
)

# tts.model.speech_tokenizer.model.decoder.pre_transformer.has_sliding_layers  = False
# for layer in tts.model.speech_tokenizer.model.decoder.pre_transformer.layers:
#     layer.attention_type = "full_attention"


********
********
 
===== RUNTIME CONFIG =====
Device      : cuda
DType       : torch.float16
Attention   : eager
GPU         : NVIDIA GeForce GTX 1660 Ti
Capability  : (7, 5)



In [None]:
# -------- Single --------
if torch.cuda.is_available():
    torch.cuda.synchronize()
t0 = time.time()

text = (
    "Âì•Âì•Ôºå‰Ω†ÂõûÊù•Âï¶ÔºüI‚Äôve been waiting for you so long alreadyÔºå"
    "sampai‰∫∫ÂÆ∂ÈÉΩÂºÄÂßãÊÉ≥‰Ω†‰∫Ü„ÄÇ"
    "Really, you know? ÊØè‰∏ÄÂàÜÈíüÈÉΩËßâÂæóÁâπÂà´ÊÖ¢Ôºå"
    "macam masa tak mahu jalan„ÄÇ\n\n"
    "When I heard your footsteps just nowÔºå"
    "ÊàëÂøÉÈáå‰∏Ä‰∏ãÂ≠êÂ∞±‰∫ÆËµ∑Êù•‰∫ÜÔºåterus rasaÂÆâÂøÉ„ÄÇ"
    "You‚Äôre finally here, kan? Jangan pergi lagi lahÔºå"
    "stay with me for a while„ÄÇ\n\n"
    "Âì•Âì•ÔºåÊä±Êàë‰∏Ä‰∏ãÂ•Ω‰∏çÂ•ΩÔºüJust a little bitÔºåkejap saja„ÄÇ"
    "Êàë promise ‰∏çÂêµ‰Ω†ÔºåÂè™ÊÉ≥Èù†ÁùÄ‰Ω†Ôºå"
    "Âê¨‰Ω†ÂëºÂê∏ÁöÑÂ£∞Èü≥„ÄÇ\n\n"
    "Everything feels okay now„ÄÇ"
    "Êúâ‰Ω†Âú®Ôºå‰ªÄ‰πàÈÉΩ‰∏çÊÄï‰∫Ü„ÄÇ"
)


wavs, sr = tts.generate_voice_design(
    text=text,
    language="auto",
    instruct=(
        "Soft, intimate and slightly playful young female voice. "
        "Tone is affectionate and clingy, with gentle pitch rises. "
        "Chinese lines should sound natural and warm, "
        "English calm and emotional, "
        "Malay casual and light, as if speaking softly to someone very close."
    ),
    max_new_tokens=2048,
)

if torch.cuda.is_available():
    torch.cuda.synchronize()
t1 = time.time()
print(f"[VoiceDesign Single] time: {t1 - t0:.3f}s")

sf.write("qwen3_tts_test_voice_design_single.wav", wavs[0], sr)

Setting `pad_token_id` to `eos_token_id`:2150 for open-end generation.
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
# -------- Batch --------
texts = [
    # --- Sample 1: Chinese + English + Malay (clingy / cute) ---
    (
        "Âì•Âì•Ôºå‰Ω†Áªà‰∫éÂõûÊù•Âï¶„ÄÇI waited for you for so long alreadyÔºå"
        "sampai‰∫∫ÂÆ∂ÈÉΩÊúâÁÇπÂßîÂ±à‰∫Ü„ÄÇ\n\n"
        "When you‚Äôre not aroundÔºå"
        "Êó∂Èó¥Ëµ∞ÂæóÁâπÂà´ÊÖ¢Ôºåmacam duniaÂÅú‰∏ãÊù•‰∏ÄÊ†∑„ÄÇ\n\n"
        "Áé∞Âú®‰Ω†Âú®ËøôÈáå‰∫ÜÔºå"
        "ÊàëÂøÉÂ∞±ÂÆâ‰∫Ü„ÄÇÂì•Âì•ÔºåÊä±‰∏Ä‰∏ãÔºåÂ•Ω‰∏çÂ•ΩÔºü"
    ),
    # --- Sample 2: English + Malay (soft emotional) ---
    (
        "I didn‚Äôt think you would come back so soon.\n"
        "But when I saw youÔºå"
        "terus rasa lega„ÄÇ\n\n"
        "Just stay for a bitÔºåokayÔºü"
        "Jangan pergi lagi„ÄÇ"
    ),
    # --- Sample 3: Chinese-dominant with light English ---
    (
        "‰ªäÂ§©ÁúüÁöÑÊúâÁÇπÁ¥Ø‰∫Ü„ÄÇ\n"
        "But hearing your voiceÔºå"
        "Á™ÅÁÑ∂Â∞±ËßâÂæóÊ≤°ÈÇ£‰πàÈöæÂèó‰∫Ü„ÄÇ\n\n"
        "Êúâ‰Ω†Âú®ÔºåÂ∞±Â§ü‰∫Ü„ÄÇ"
    ),
]
languages = ["auto", "auto", "auto"]
instructs = [
    # Sample 1 instruction
    (
        "Very cute, clingy, youthful female voice. "
        "Higher pitch with playful intonation. "
        "Chinese sounds soft and sweet, "
        "English emotional and gentle, "
        "Malay casual and affectionate."
    ),
    # Sample 2 instruction
    (
        "Soft, calm female voice. "
        "Emotionally warm and reassuring. "
        "Natural pacing, no exaggeration."
    ),
    # Sample 3 instruction
    (
        "Gentle, comforting female voice. "
        "Slightly tired but affectionate tone. "
        "Warm and intimate delivery."
    ),
]

if torch.cuda.is_available():
    torch.cuda.synchronize()
t0 = time.time()

wavs, sr = tts.generate_voice_design(
    text=texts,
    language=languages,
    instruct=instructs,
    max_new_tokens=2048,
)

if torch.cuda.is_available():
    torch.cuda.synchronize()
t1 = time.time()
print(f"[VoiceDesign Batch] time: {t1 - t0:.3f}s")

for i, w in enumerate(wavs):
    sf.write(f"qwen3_tts_test_voice_design_batch_{i}.wav", w, sr)

In [None]:
# Cleanup
del tts
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()