In [1]:
!pip install sounddevice -q


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import grpc
from tritonclient.grpc import service_pb2, service_pb2_grpc

# Connection settings shared by the raw-gRPC cells below.
host = "localhost:8001"      # address passed to grpc.insecure_channel
model_name = "orpheus"       # model queried by the metadata request
model_version = "1"          # version is passed as a string

In [3]:
# Unencrypted gRPC channel to the server at `host`, plus the generated
# inference-service stub used for the raw protobuf calls below.
channel = grpc.insecure_channel(target=host)
grpc_stub = service_pb2_grpc.GRPCInferenceServiceStub(channel=channel)

In [4]:
# Liveness probe: send ServerLive and print the reply. Any failure is printed
# instead of raised so the notebook keeps running.
try:
    request = service_pb2.ServerLiveRequest()
    response = grpc_stub.ServerLive(request)
    print(f"server {response}")
except Exception as err:
    print(err)

server live: true



In [5]:
# Fetch the metadata (declared inputs / outputs) of the configured model
# version and print it.
request = service_pb2.ModelMetadataRequest(name=model_name, version=model_version)
response = grpc_stub.ModelMetadata(request)
print(f"model metadata:\n{response}")

model metadata:
name: "orpheus"
versions: "1"
platform: "python"
inputs {
  name: "text"
  datatype: "BYTES"
  shape: -1
}
inputs {
  name: "speaker_id"
  datatype: "BYTES"
  shape: -1
}
outputs {
  name: "audio"
  datatype: "FP32"
  shape: -1
}
outputs {
  name: "sampling_rate"
  datatype: "FP32"
  shape: 1
}



In [6]:
"""
Verbose streaming client for a decoupled Triton TTS model (`orpheus`).

• Sends one request with BYTES inputs:  text  +  speaker_id
• Receives many responses, each carrying:
      audio          : FP32 1‑D tensor
      sampling_rate  : FP32 scalar (may appear only once)

Each chunk is converted to 16‑bit PCM and appended to output.wav while
progress messages stream to the console.
"""
import queue
import wave
from functools import partial
from time import perf_counter

import numpy as np
import tritonclient.grpc as grpcclient
from tritonclient.utils import InferenceServerException

# ─────────────────────────────────────────────────────────────────────────────
# CONFIG
# ─────────────────────────────────────────────────────────────────────────────
# Model that receives the streaming inference request.
MODEL_NAME = "orpheus"
# Text sent as the "text" input.
TEXT = "Man, social media has completely changed how we interact..."
# Value sent as the "speaker_id" input.
SPEAKER_ID = "tara"
# Request id attached to the inference request and echoed in the log line.
REQUEST_ID = "tts‑4"
# Path of the WAV file the received audio is written to.
OUT_WAV = "output.wav"

# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────
class _UserData:
    """Queue where the stream callback deposits every result / error."""
    def __init__(self):
        self.completed_reqs = queue.Queue()

def _callback(user_data, result, error):
    if error:
        user_data.completed_reqs.put(error)
    else:
        user_data.completed_reqs.put(result)

def _fp32_to_pcm16(float_chunk: np.ndarray) -> bytes:
    """
    Convert float32 ‑1…1 → int16 PCM.
    The input array is read‑only; make a clipped *copy* before scaling.
    """
    tmp = np.clip(float_chunk, -1.0, 1.0)           # copy – writable
    return (tmp * 32767.0).astype(np.int16).tobytes()

def _is_final_response(resp) -> bool:
    params = getattr(resp, "parameters", None)
    if not params:
        return False
    flag = params.get("triton_final_response")
    return flag.bool_param

# ─────────────────────────────────────────────────────────────────────────────
# BUILD REQUEST
# ─────────────────────────────────────────────────────────────────────────────
# Each input is a one-element BYTES tensor built from a NumPy object array
# holding the UTF-8 encoded string.
# The codec name is spelled with a plain ASCII hyphen ("utf-8"); the previous
# spelling used a non-breaking hyphen (U+2011) and only resolved because
# Python normalises punctuation in codec names.
inp_text = grpcclient.InferInput("text", [1], "BYTES")
inp_text.set_data_from_numpy(np.array([TEXT.encode("utf-8")], dtype=object))

inp_spk  = grpcclient.InferInput("speaker_id", [1], "BYTES")
inp_spk.set_data_from_numpy(np.array([SPEAKER_ID.encode("utf-8")], dtype=object))

inputs  = [inp_text, inp_spk]

# Outputs requested from the model for every streamed response.
outputs = [
    grpcclient.InferRequestedOutput("audio"),
    grpcclient.InferRequestedOutput("sampling_rate"),
]

# ─────────────────────────────────────────────────────────────────────────────
# OPEN STREAM → SEND → CONSUME
# ─────────────────────────────────────────────────────────────────────────────
# Streams one TTS request and appends every received audio chunk to OUT_WAV.
#
# Flow: open the gRPC stream → send the request → drain the callback queue
# until a response carries the final-response flag → close the WAV file and
# print a summary.
user_data   = _UserData()
start_clock = perf_counter()

# Initialised before the stream is opened so the cleanup and summary code
# below can always refer to them, even when the stream fails early.
wav_handle       = None
sampling_rate    = None
chunk_counter    = 0
pcm_frame_total  = 0

try:
    with grpcclient.InferenceServerClient("localhost:8001") as triton:
        print("⏳ Opening gRPC bidirectional stream …")
        triton.start_stream(callback=partial(_callback, user_data))

        print(f"🚀 Sending request «{REQUEST_ID}» ({len(TEXT)} B text)")
        triton.async_stream_infer(
            model_name = MODEL_NAME,
            inputs     = inputs,
            outputs    = outputs,
            request_id = REQUEST_ID,
            # Also deliver a final response that carries only the completion
            # flag and no tensors; without it such a response is not
            # forwarded to the client and the loop below could wait forever.
            enable_empty_final_response = True,
        )

        while True:
            msg = user_data.completed_reqs.get()              # blocks

            # — handle server‑side error ————————————————————————————————
            if isinstance(msg, InferenceServerException):
                raise msg

            # — read tensors ————————————————————————————————————————————
            audio_np = msg.as_numpy("audio")
            if audio_np is not None:
                sr_arr = msg.as_numpy("sampling_rate")
                if sampling_rate is None and sr_arr is not None and sr_arr.size:
                    sampling_rate = int(sr_arr.flat[0])

                # — convert chunk to integer PCM ————————————————————————
                # The `wave` module writes integer PCM only, so floating
                # point audio is converted to 16‑bit PCM first; integer audio
                # is written unchanged.
                if np.issubdtype(audio_np.dtype, np.floating):
                    pcm_bytes = _fp32_to_pcm16(audio_np)
                    sampwidth = 2
                else:
                    pcm_bytes = audio_np.tobytes()
                    sampwidth = audio_np.dtype.itemsize

                # — open WAV lazily once we know the sampling rate ——————————
                if wav_handle is None and sampling_rate is not None:
                    wav_handle = wave.open(OUT_WAV, "wb")
                    wav_handle.setnchannels(1)
                    wav_handle.setsampwidth(sampwidth)
                    wav_handle.setframerate(sampling_rate)
                    print(f"📼 Opened {OUT_WAV}  @  {sampling_rate} Hz — {audio_np.dtype}")

                # — write chunk —————————————————————————————————————————
                # Chunks received before the sampling rate is known are not
                # written (the file cannot be opened yet).
                if wav_handle is not None:
                    wav_handle.writeframes(pcm_bytes)
                    pcm_frame_total += audio_np.size      # mono: 1 sample = 1 frame

                elapsed = perf_counter() - start_clock
                print(
                    f"🔊 chunk {chunk_counter:03d} | {audio_np.size} samples"
                    f" | {elapsed:5.2f}s since start"
                )
                chunk_counter += 1

            # — detect final response ————————————————————————————————
            # Checked for every message, including ones without audio, so a
            # flag‑only final response still terminates the loop.
            if _is_final_response(msg.get_response()):
                print("📬  final‑response flag received")
                break
finally:
    # Always finalise the WAV file, even if the stream raised.
    if wav_handle is not None:
        wav_handle.close()

# ─────────────────────────────────────────────────────────────────────────────
# DONE
# ─────────────────────────────────────────────────────────────────────────────
if wav_handle is not None:
    duration_sec = pcm_frame_total / sampling_rate
    print(f"✅ Finished – wrote {duration_sec:0.2f} s audio to {OUT_WAV}")
else:
    print("⚠️  No audio received!")

⏳ Opening gRPC bidirectional stream …
🚀 Sending request «tts‑4» (59 B text)
📼 Opened output.wav  @  24000 Hz — int16
🔊 chunk 000 | 2048 samples |  1.32s since start
🔊 chunk 001 | 2048 samples |  1.54s since start
🔊 chunk 002 | 2048 samples |  1.75s since start
🔊 chunk 003 | 2048 samples |  1.96s since start
🔊 chunk 004 | 2048 samples |  2.18s since start
🔊 chunk 005 | 2048 samples |  2.39s since start
🔊 chunk 006 | 2048 samples |  2.61s since start
🔊 chunk 007 | 2048 samples |  2.82s since start
🔊 chunk 008 | 2048 samples |  3.03s since start
🔊 chunk 009 | 2048 samples |  3.24s since start
🔊 chunk 010 | 2048 samples |  3.46s since start
🔊 chunk 011 | 2048 samples |  3.67s since start
🔊 chunk 012 | 2048 samples |  3.88s since start
🔊 chunk 013 | 2048 samples |  4.10s since start
🔊 chunk 014 | 2048 samples |  4.31s since start
🔊 chunk 015 | 2048 samples |  4.52s since start
🔊 chunk 016 | 2048 samples |  4.74s since start
🔊 chunk 017 | 2048 samples |  4.95s since start
🔊 chunk 018 | 2048 