# CosyVoice WebSocket on VSCode Colab Kernel (Explicit Setup)

This notebook intentionally avoids wrapper scripts and runs every step explicitly.

What it does:
1. Resolve repo path (clone if needed).
2. Create an isolated venv (no global Colab package pollution).
3. Clone CosyVoice and install a filtered requirements set compatible with modern Python kernels.
4. Start the WebSocket server (`/tts`) with `TTS_BACKEND=cosyvoice`.
5. Run a smoke test and an optional TTFT benchmark.

Run cells top-to-bottom.


In [None]:
import os
import pathlib
import shlex
import textwrap

# Editable config. Every setting can be overridden by an environment variable
# of the same name; the literals below are only fallbacks.
REPO_URL = os.environ.get('REPO_URL', 'https://github.com/Aryan-Seth/sigiq_task.git')
GIT_REF = os.environ.get('GIT_REF', 'main')
WORK_ROOT = os.environ.get('WORK_ROOT', '/content' if os.path.isdir('/content') else '/tmp/colab_ws')
REPO_PARENT = os.environ.get('REPO_PARENT', f'{WORK_ROOT}/sigiq_takehome')
# REPO_ROOT may be repo root or {repo}/tts_ws depending on checkout layout.
REPO_ROOT = os.environ.get('REPO_ROOT', REPO_PARENT)
COSYVOICE_REPO_DIR = os.environ.get('COSYVOICE_REPO_DIR', f'{WORK_ROOT}/CosyVoice')
VENV_DIR = os.environ.get('VENV_DIR', f'{WORK_ROOT}/venvs/tts_ws')

COSYVOICE_MODEL_DIR = os.environ.get('COSYVOICE_MODEL_DIR', 'FunAudioLLM/CosyVoice2-0.5B')
COSYVOICE_MODE = os.environ.get('COSYVOICE_MODE', 'sft')
PORT = os.environ.get('PORT', '8000')
HOST = os.environ.get('HOST', '0.0.0.0')

# Single ordered source of truth for both the generated env file and the
# printed summary, so the two cannot drift apart.
_exported_settings = {
    'REPO_URL': REPO_URL,
    'GIT_REF': GIT_REF,
    'WORK_ROOT': WORK_ROOT,
    'REPO_PARENT': REPO_PARENT,
    'REPO_ROOT': REPO_ROOT,
    'COSYVOICE_REPO_DIR': COSYVOICE_REPO_DIR,
    'VENV_DIR': VENV_DIR,
    'COSYVOICE_MODEL_DIR': COSYVOICE_MODEL_DIR,
    'COSYVOICE_MODE': COSYVOICE_MODE,
    'HOST': HOST,
    'PORT': PORT,
}

# Shell snippet that the %%bash cells `source` to pick up the settings above.
# Values are passed through shlex.quote so that quotes, `$`, backticks or
# spaces in a value reach bash literally instead of breaking the file or being
# re-expanded when it is sourced.
env_file = pathlib.Path('/tmp/tts_ws_colab_env.sh')
env_file.write_text(
    '\n'.join([
        '#!/usr/bin/env bash',
        *(f'export {name}={shlex.quote(value)}' for name, value in _exported_settings.items()),
    ]) + '\n',
    encoding='utf-8',
)

# Echo the effective configuration (surrounded by blank lines) for the record.
_summary_lines = [
    f'Saved env file: {env_file}',
    *(f'{name}={value}' for name, value in _exported_settings.items()),
]
print('\n' + '\n'.join(_summary_lines) + '\n')


In [None]:
%%bash
set -euxo pipefail
source /tmp/tts_ws_colab_env.sh

# Prepare the workspace. The GPU report is informational only, so a missing
# or failing nvidia-smi must not abort the cell.
mkdir -p "$WORK_ROOT"
nvidia-smi || true

# Clone from scratch unless REPO_PARENT already holds a git checkout; anything
# else found at that path is removed first.
if [[ ! -d "$REPO_PARENT/.git" ]]; then
  rm -rf "$REPO_PARENT"
  git clone "$REPO_URL" "$REPO_PARENT"
fi

# Sync the checkout to the requested ref. The fast-forward pull is best-effort
# (presumably because GIT_REF may be a tag or commit with no upstream to pull).
cd "$REPO_PARENT"
git fetch --all --prune
git checkout "$GIT_REF"
git pull --ff-only || true

# Resolve the project root: the first candidate that contains both
# requirements.runtime.txt and an app/ directory wins. The repo may itself be
# tts_ws or contain it as a subdirectory.
PROJECT_ROOT=""
for candidate in "$REPO_PARENT" "$REPO_PARENT/tts_ws"; do
  if [[ -f "$candidate/requirements.runtime.txt" && -d "$candidate/app" ]]; then
    PROJECT_ROOT="$candidate"
    break
  fi
done

if [[ -z "$PROJECT_ROOT" ]]; then
  echo "Could not locate project root under $REPO_PARENT" >&2
  ls -la "$REPO_PARENT"
  exit 1
fi

# Persist the result so later cells can read it back, then show what we got.
printf '%s\n' "$PROJECT_ROOT" > /tmp/tts_ws_project_root.txt
echo "Resolved PROJECT_ROOT=$PROJECT_ROOT"
git -C "$PROJECT_ROOT" rev-parse --short HEAD
ls -la "$PROJECT_ROOT" | head -n 80


In [None]:
%%bash
set -euxo pipefail
source /tmp/tts_ws_colab_env.sh
PROJECT_ROOT="$(cat /tmp/tts_ws_project_root.txt)"

# Create the isolated virtualenv and switch this shell over to it.
python3 -m venv "$VENV_DIR"
source "$VENV_DIR/bin/activate"

# Refresh the packaging toolchain before installing anything else.
python -m pip install --upgrade pip setuptools wheel

# Runtime dependencies of the websocket project itself.
python -m pip install --requirement "$PROJECT_ROOT/requirements.runtime.txt"

# Report which interpreter is active after activation.
python - <<'PY'
import sys

for label, value in (('venv python', sys.executable), ('python version', sys.version)):
    print(label, value)
PY


In [None]:
%%bash
set -euxo pipefail
source /tmp/tts_ws_colab_env.sh
PROJECT_ROOT="$(cat /tmp/tts_ws_project_root.txt)"
source "$VENV_DIR/bin/activate"

# Reuse an existing CosyVoice directory (only refreshing its submodules);
# otherwise clone it together with all submodules.
if [[ -d "$COSYVOICE_REPO_DIR" ]]; then
  cd "$COSYVOICE_REPO_DIR"
  git submodule update --init --recursive
else
  git clone --recursive https://github.com/FunAudioLLM/CosyVoice.git "$COSYVOICE_REPO_DIR"
fi

REQ_IN="$COSYVOICE_REPO_DIR/requirements.txt"
REQ_OUT="/tmp/cosyvoice.requirements.filtered.txt"

# Rewrite upstream requirements for this inference-only workflow:
#   - keep --extra-index-url lines as they are,
#   - relax the exact grpcio / grpcio-tools pins to a minimum version,
#   - drop pinned packages known to break or that are unnecessary here,
#   - pass every other line through unchanged.
awk '
  /^--extra-index-url/ { print; next }
  /^grpcio==/ { print "grpcio>=1.62.2"; next }
  /^grpcio-tools==/ { print "grpcio-tools>=1.62.2"; next }
  /^(deepspeed|openai-whisper|pyworld|tensorrt-cu12|tensorrt-cu12-bindings|tensorrt-cu12-libs)==/ { next }
  { print }
' "$REQ_IN" > "$REQ_OUT"

python -m pip install --prefer-binary --requirement "$REQ_OUT"

# Sanity check: each of these modules must import, or the cell fails.
python - <<'PY'
from importlib import import_module

for name in ('torch', 'torchaudio', 'onnxruntime', 'modelscope', 'hyperpyyaml', 'transformers', 'inflect'):
    import_module(name)
    print('ok', name)
PY


In [None]:
%%bash
set -euxo pipefail
source /tmp/tts_ws_colab_env.sh
PROJECT_ROOT="$(cat /tmp/tts_ws_project_root.txt)"
source "$VENV_DIR/bin/activate"

# Maximum number of seconds to wait for the server port to accept connections.
# Override by exporting READY_TIMEOUT_S before running this cell.
READY_TIMEOUT_S="${READY_TIMEOUT_S:-300}"
SERVER_LOG="$WORK_ROOT/tts_ws_server.log"

# Stop any server left over from a previous run; no match is not an error.
pkill -f "uvicorn app.server:app" >/dev/null 2>&1 || true

cd "$PROJECT_ROOT"
export TTS_BACKEND=cosyvoice
# These are already set by the sourced env file; export them again explicitly
# so the server's environment is visible in this cell.
export COSYVOICE_REPO_DIR COSYVOICE_MODEL_DIR COSYVOICE_MODE
export TTS_PROFILE=1

# Launch uvicorn in the background with all output captured in the log file,
# and record its PID for the shutdown cell.
nohup python -m uvicorn app.server:app --host "$HOST" --port "$PORT" --log-level warning > "$SERVER_LOG" 2>&1 &
SERVER_PID=$!
echo "$SERVER_PID" > /tmp/tts_ws_server.pid
echo "PID=$SERVER_PID"

# Wait until the port accepts TCP connections instead of sleeping a fixed time,
# and fail fast if the process dies during startup. The probe targets
# 127.0.0.1 (the address the later client cells use), so it assumes HOST
# includes the loopback interface. Tracing is paused to keep the loop quiet.
{ set +x; } 2>/dev/null
ready=0
deadline=$(( SECONDS + READY_TIMEOUT_S ))
while (( SECONDS < deadline )); do
  if ! kill -0 "$SERVER_PID" 2>/dev/null; then
    echo "Server process $SERVER_PID exited during startup; last log lines:" >&2
    tail -n 80 "$SERVER_LOG" >&2 || true
    exit 1
  fi
  if (exec 3<>"/dev/tcp/127.0.0.1/$PORT") 2>/dev/null; then
    ready=1
    break
  fi
  sleep 1
done
set -x

# A timeout is reported but not fatal: the server may still be starting.
if [[ "$ready" -ne 1 ]]; then
  echo "WARNING: port $PORT not accepting connections after ${READY_TIMEOUT_S}s" >&2
fi
tail -n 80 "$SERVER_LOG" || true


In [None]:
%%bash
set -euxo pipefail
source /tmp/tts_ws_colab_env.sh
source "$VENV_DIR/bin/activate"

# Smoke test: stream one sentence to the /tts websocket, collect the returned
# audio into a WAV file, and fail the cell if no audio came back.
python - <<'PY'
import asyncio
import base64
import json
import os
import sys
import wave
import websockets

PORT = int(os.environ.get('PORT', '8000'))
WORK_ROOT = os.environ.get('WORK_ROOT', '/tmp')
# Longest allowed silence between two server messages before giving up, so a
# stalled server cannot hang the notebook forever.
RECV_TIMEOUT_S = float(os.environ.get('SMOKE_RECV_TIMEOUT_S', '300'))
# WAV header sample rate. NOTE(review): assumes the server emits 16-bit mono
# PCM at 44.1 kHz - confirm against the server, or override via env.
SAMPLE_RATE = int(os.environ.get('SMOKE_SAMPLE_RATE', '44100'))
out_wav = os.path.join(WORK_ROOT, 'cosy_smoke.wav')

# (text, flush) pairs sent in order; the trailing empty non-flush message is
# presumably the protocol's end-of-stream marker - verify against the server.
MESSAGES = (
    (' ', False),
    ('Hello from explicit Colab notebook.', False),
    ('', True),
    ('', False),
)


async def run():
    """Send the test messages and gather audio chunks until the server closes."""
    chunks = []
    metrics = None
    async with websockets.connect(f'ws://127.0.0.1:{PORT}/tts', max_size=None) as ws:
        for text, flush in MESSAGES:
            await ws.send(json.dumps({'text': text, 'flush': flush, 'run_id': 'smoke'}))
        try:
            while True:
                msg = await asyncio.wait_for(ws.recv(), timeout=RECV_TIMEOUT_S)
                payload = json.loads(msg if isinstance(msg, str) else msg.decode('utf-8'))
                if payload.get('type') == 'metrics':
                    metrics = payload.get('metrics')
                    continue
                # Only decode a real, non-empty string: a JSON null must not be
                # stringified to 'None' and decoded as if it were audio.
                audio_b64 = payload.get('audio')
                if isinstance(audio_b64, str) and audio_b64:
                    chunks.append(base64.b64decode(audio_b64))
        except websockets.ConnectionClosed:
            # Normal end of stream: the server closed the connection.
            pass
        except asyncio.TimeoutError:
            print(f'no server message within {RECV_TIMEOUT_S}s; stopping', file=sys.stderr)

    pcm = b''.join(chunks)
    os.makedirs(os.path.dirname(out_wav), exist_ok=True)
    with wave.open(out_wav, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(SAMPLE_RATE)
        wf.writeframes(pcm)

    print('audio bytes:', len(pcm))
    print('output wav:', out_wav)
    print('server metrics:', metrics)

    # A smoke test that received nothing has not passed.
    if not pcm:
        raise SystemExit('smoke test failed: no audio received from server')


asyncio.run(run())
PY


In [None]:
%%bash
set -euxo pipefail
source /tmp/tts_ws_colab_env.sh
PROJECT_ROOT="$(cat /tmp/tts_ws_project_root.txt)"
source "$VENV_DIR/bin/activate"

# Where the benchmark results are written.
BENCH_JSON="$WORK_ROOT/ttft_cosyvoice_colab_explicit.json"

# Arguments for benchmark_ttft.py, run against the server that is already up
# (hence --no-start-server).
bench_args=(
  --uri "ws://127.0.0.1:${PORT}/tts"
  --backend cosyvoice
  --no-start-server
  --lengths 80,160,320
  --runs-per-length 2
  --chunk-mode ramp
  --chunk-plan 4,8,32
  --delay 0.01
  --math-normalizer rule
  --json-out "$BENCH_JSON"
)

cd "$PROJECT_ROOT"
python benchmark_ttft.py "${bench_args[@]}"

echo "Benchmark written to $BENCH_JSON"


In [None]:
%%bash
set -euxo pipefail

# Stop the server started by the launch cell. Every step is best-effort, so
# running this cell when nothing is running still succeeds.
PID_FILE=/tmp/tts_ws_server.pid
if [[ -f "$PID_FILE" ]]; then
  kill "$(cat "$PID_FILE")" >/dev/null 2>&1 || true
  # Remove the PID file so a later run of this cell cannot signal an unrelated
  # process that has since been given the same PID.
  rm -f "$PID_FILE"
fi

# Fallback: catch a server whose PID was never recorded.
pkill -f "uvicorn app.server:app" >/dev/null 2>&1 || true
echo "server stopped"
