From 2b01aed832b9746beb40891d0509a7cb17502560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sam=20B=C3=BCth?= Date: Thu, 11 May 2023 14:32:22 +0200 Subject: [PATCH] Initial commit --- .dockerignore | 3 ++ .github/workflows/publish.yml | 44 ++++++++++++++++ Dockerfile | 15 ++++++ README.md | 3 ++ docker-compose.yml | 13 +++++ requirements.txt | 2 + wyoming_tts/__init__.py | 1 + wyoming_tts/__main__.py | 98 +++++++++++++++++++++++++++++++++++ wyoming_tts/handler.py | 88 +++++++++++++++++++++++++++++++ 9 files changed, 267 insertions(+) create mode 100644 .dockerignore create mode 100644 .github/workflows/publish.yml create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 docker-compose.yml create mode 100644 requirements.txt create mode 100644 wyoming_tts/__init__.py create mode 100644 wyoming_tts/__main__.py create mode 100644 wyoming_tts/handler.py diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..0cdf809 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,3 @@ +.github +docker-compose.yml +Dockerfile \ No newline at end of file diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml new file mode 100644 index 0000000..7585202 --- /dev/null +++ b/.github/workflows/publish.yml @@ -0,0 +1,44 @@ +--- +# This workflow uses actions that are not certified by GitHub. +# They are provided by a third-party and are governed by +# separate terms of service, privacy policy, and support +# documentation. + +# GitHub recommends pinning actions to a commit SHA. +# To get a newer version, you will need to update the SHA. +# You can also reference a tag or branch, but the action may change without warning. + +name: Publish Docker image + +on: + release: + types: [published] + +jobs: + push_to_registry: + name: Push Docker image to Docker Hub + runs-on: ubuntu-latest + steps: + - name: Check out the repo + uses: actions/checkout@v3 + + - name: Log in to Docker Hub + uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a + with: + username: ${{ secrets.DOCKER_USERNAME }} + password: ${{ secrets.DOCKER_PASSWORD }} + + - name: Extract metadata (tags, labels) for Docker + id: meta + uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7 + with: + images: samboo/wyoming-tts + + - name: Build and push Docker image + uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671 + with: + context: . + file: ./Dockerfile + push: true + tags: ${{ steps.meta.outputs.tags }} + labels: ${{ steps.meta.outputs.labels }} diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..bd8e516 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.7 + +WORKDIR /app + +RUN mkdir /data && mkdir -p /root/.local/share && ln -s /data /root/.local/share/tts + +COPY requirements.txt requirements.txt + +RUN pip3 install --no-cache-dir -r requirements.txt + +COPY . . + +VOLUME [ "/data" ] + +ENTRYPOINT ["python3", "wyoming_tts"] \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..5acefc0 --- /dev/null +++ b/README.md @@ -0,0 +1,3 @@ +# wyoming TTS + +coqui-ai TTS Wyoming protocol implementation \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..92147cb --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,13 @@ +version: '3.0' + +services: + tts: + image: samboo/wyoming-tts + restart: always + command: --uri tcp://0.0.0.0:10201 --voice tts_models/de/thorsten/vits + environment: + - COQUI_STUDIO_TOKEN= #optional + volumes: + - ./tts:/data + ports: + - 10201:10201 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f880d5c --- /dev/null +++ b/requirements.txt @@ -0,0 +1,2 @@ +wyoming==0.0.1 +tts~=0.13.3 \ No newline at end of file diff --git a/wyoming_tts/__init__.py b/wyoming_tts/__init__.py new file mode 100644 index 0000000..fbc599d --- /dev/null +++ b/wyoming_tts/__init__.py @@ -0,0 +1 @@ +"""Wyoming server for tts.""" diff --git a/wyoming_tts/__main__.py b/wyoming_tts/__main__.py new file mode 100644 index 0000000..c4cdfcb --- /dev/null +++ b/wyoming_tts/__main__.py @@ -0,0 +1,98 @@ +#!/usr/bin/env python3 +import argparse +import asyncio +import logging +from functools import partial + +from TTS.api import TTS +from wyoming.info import Attribution, Info, TtsProgram, TtsVoice +from wyoming.server import AsyncServer + +from handler import PiperEventHandler + +_LOGGER = logging.getLogger(__name__) + +async def main() -> None: + """Main entry point.""" + parser = argparse.ArgumentParser() + parser.add_argument( + "--voice", + default=None, + help="The Voice to use for TTS", + ) + parser.add_argument( + "--speaker", + help="Set the target speaker", + ) + parser.add_argument( + "--language", + help="Set the target language", + ) + parser.add_argument("--samples-per-chunk", type=int, default=1024) + parser.add_argument("--uri", required=True, help="unix:// or tcp://") + parser.add_argument("--debug", action="store_true", help="Log DEBUG messages") + args = parser.parse_args() + logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO) + + if (args.voice == None): + _LOGGER.info("The following voices are available (specify with --voice [model_name]): ") + _LOGGER.info("\n".join(TTS.list_models())) + exit() + + tts = TTS(args.voice) + + if (tts.is_multi_lingual and args.language is None): + _LOGGER.error("The following languages are available (specify with --language [lang]): ") + _LOGGER.info("\n".join(tts.languages)) + exit() + if (tts.is_multi_speaker and args.speaker is None): + _LOGGER.error("The following speakers are available (specify with --speakers [speaker]): ") + _LOGGER.info("\n".join(tts.speakers)) + exit() + + language = None + if (tts.is_multi_lingual is False): + language = args.voice.split("/")[1] + _LOGGER.info("Using language: %s", language) + + _LOGGER.info("TTS ready") + + wyoming_info = Info( + tts=[ + TtsProgram( + name="coqui-ai TTS", + attribution=Attribution( + name="coqui-ai", url="https://github.com/coqui-ai/TTS" + ), + installed=True, + voices=[ + TtsVoice( + name=speaker, + attribution=Attribution( + name="coqui-ai", url="https://github.com/coqui-ai/TTS" + ), + installed=True, + languages=tts.languages if tts.is_multi_lingual else [language], + ) for speaker in ([args.speaker] if tts.is_multi_speaker else ["Default"]) # Preparation for multi speaker support in wyoming event + ], + ) + ], + ) + + server = AsyncServer.from_uri(args.uri) + _LOGGER.info("Ready") + await server.run( + partial( + PiperEventHandler, + wyoming_info, + args, + tts + ) + ) + + + +# ----------------------------------------------------------------------------- + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/wyoming_tts/handler.py b/wyoming_tts/handler.py new file mode 100644 index 0000000..9df6d6f --- /dev/null +++ b/wyoming_tts/handler.py @@ -0,0 +1,88 @@ +"""Event handler for clients of the server.""" +import argparse +import logging +import math +import wave + +from TTS.api import TTS +from wyoming.audio import AudioChunk, AudioStart, AudioStop +from wyoming.event import Event +from wyoming.info import Describe, Info +from wyoming.server import AsyncEventHandler +from wyoming.tts import Synthesize + +_LOGGER = logging.getLogger(__name__) + +class PiperEventHandler(AsyncEventHandler): + def __init__( + self, + wyoming_info: Info, + cli_args: argparse.Namespace, + tts: TTS, + *args, + ) -> None: + super().__init__(*args) + self.cli_args = cli_args + self.wyoming_info_event = wyoming_info.event() + self.tts = tts + + async def handle_event(self, event: Event) -> bool: + if Describe.is_type(event.type): + await self.write_event(self.wyoming_info_event) + _LOGGER.debug("Sent info") + return True + + if not Synthesize.is_type(event.type): + _LOGGER.warning("Unexpected event: %s", event) + return True + synthesize = Synthesize.from_event(event) + raw_text = synthesize.text + text = raw_text.strip() + + output_path = "/tmp/output.wav" + _LOGGER.debug(event) + tts_args = dict() + if (self.tts.is_multi_lingual): + tts_args["language"] = self.cli_args.language + + if (self.tts.is_multi_speaker): + tts_args["speaker"] = self.cli_args.speaker + self.tts.tts_to_file(text, **tts_args, file_path=output_path) + wav_file: wave.Wave_read = wave.open(output_path, "rb") + with wav_file: + rate = wav_file.getframerate() + width = wav_file.getsampwidth() + channels = wav_file.getnchannels() + + await self.write_event( + AudioStart( + rate=rate, + width=width, + channels=channels, + ).event(), + ) + + # Audio + audio_bytes = wav_file.readframes(wav_file.getnframes()) + bytes_per_sample = width * channels + bytes_per_chunk = bytes_per_sample * self.cli_args.samples_per_chunk + num_chunks = int(math.ceil(len(audio_bytes) / bytes_per_chunk)) + + # Split into chunks + for i in range(num_chunks): + offset = i * bytes_per_chunk + chunk = audio_bytes[offset : offset + bytes_per_chunk] + await self.write_event( + AudioChunk( + audio=chunk, + rate=rate, + width=width, + channels=channels, + ).event(), + ) + + await self.write_event(AudioStop().event()) + _LOGGER.debug("Completed request") + + + return True \ No newline at end of file