In [None]:
import os, numpy as np, torch
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
import time

# --- Apple Silicon-safe env (no CUDA/vLLM/flash-attn) ---
# Drop these CUDA / vLLM variables from this process's environment if they are set.
for _stale_var in ("CUDA_VISIBLE_DEVICES", "VLLM_USE_V1", "VLLM_WORKER_MULTIPROC_METHOD"):
    os.environ.pop(_stale_var, None)

# Checkpoint and generation switches used by the rest of this cell.
MODEL_PATH = "Qwen/Qwen3-Omni-30B-A3B-Instruct"  # Instruct checkpoint; used below for audio -> text
USE_AUDIO_IN_VIDEO = False  # inputs are audio-only
RETURN_AUDIO = False  # only text output is wanted

# Use the MPS backend when PyTorch reports it as available, otherwise the CPU.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

print(f"time:{time.time()} processor")
processor = Qwen3OmniMoeProcessor.from_pretrained(MODEL_PATH)

print(f"time:{time.time()} loading model")
# NOTE: flash_attention_2 is not supported on MPS, so no attention
# implementation is requested here.
model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
    MODEL_PATH,
    # "auto" leaves the dtype choice to the library / checkpoint.
    dtype="auto",
    # The empty-string key maps the whole model onto the single chosen device.
    device_map={"": device},
)

def transcribe_audio(
    audio_path_or_url: str,
    prompt: str = "Describe the style, rhythm, dynamics, and expressed emotions of this piece of music. Identify the instruments used and suggest possible scenarios from which this music might originate.",
) -> str:
    """Ask the module-level model a text question about one audio clip.

    Builds a single-turn chat message containing the audio and ``prompt``,
    runs it through the module-level ``processor`` and ``model``, and decodes
    only the newly generated tokens. Progress timestamps are printed before
    each stage.

    Note that, despite the function name, the default prompt asks for a
    description of a piece of music rather than a verbatim transcription.

    Args:
        audio_path_or_url: Value placed in the ``"audio"`` field of the chat
            message and resolved by ``process_mm_info`` (presumably a local
            path or URL -- confirm against ``qwen_omni_utils``).
        prompt: Instruction text sent alongside the audio.

    Returns:
        The decoded model reply for the first (only) batch item, with
        leading/trailing whitespace stripped.
    """
    # Build messages: audio + task text
    print(f'time:{time.time()} build messages')
    messages = [{
        "role": "user",
        "content": [
            {"type": "audio", "audio": audio_path_or_url},
            {"type": "text",  "text": prompt},
        ],
    }]

    print(f'time:{time.time()} text apply chat template')
    text = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
    print(f'time:{time.time()} process_mm_info')
    audios, images, videos = process_mm_info(messages, use_audio_in_video=USE_AUDIO_IN_VIDEO)

    print(f'time:{time.time()} inputs')
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=USE_AUDIO_IN_VIDEO
    )

    # Move to device & dtype.
    # Only floating-point tensors are cast to the model dtype. Integer tensors
    # such as input_ids / attention_mask must stay integer: token ids are used
    # as embedding indices, and casting them to a half-precision float dtype
    # breaks the lookup and can round large ids to the wrong value.
    target_dtype = getattr(model, "dtype", None)
    moved = {}
    for key, value in inputs.items():
        if hasattr(value, "to"):
            value = value.to(device)
            if (
                target_dtype is not None
                and torch.is_tensor(value)
                and value.is_floating_point()
            ):
                value = value.to(target_dtype)
        moved[key] = value
    inputs = moved

    print(f'time:{time.time()} generate')
    # Generate text (no audio out); the second element of the returned pair is
    # the audio output, which is not requested and therefore discarded.
    text_ids, _ = model.generate(
        **inputs,
        thinker_return_dict_in_generate=True,
        thinker_max_new_tokens=2048,
        thinker_do_sample=False,
        return_audio=False,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
    )

    print(f'time:{time.time()} batch_decode')
    # Slice off the prompt tokens so only the newly generated part is decoded.
    prompt_len = inputs["input_ids"].shape[1]
    out = processor.batch_decode(
        text_ids.sequences[:, prompt_len:],
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )[0]
    print(f'time:{time.time()} out')
    return out.strip()

: 