From 2b01aed832b9746beb40891d0509a7cb17502560 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Sam=20B=C3=BCth?= <samuel.bueth@telekom.de>
Date: Thu, 11 May 2023 14:32:22 +0200
Subject: [PATCH] Initial commit

---
 .dockerignore                 |  3 ++
 .github/workflows/publish.yml | 44 ++++++++++++++++
 Dockerfile                    | 15 ++++++
 README.md                     |  3 ++
 docker-compose.yml            | 13 +++++
 requirements.txt              |  2 +
 wyoming_tts/__init__.py       |  1 +
 wyoming_tts/__main__.py       | 98 +++++++++++++++++++++++++++++++++++
 wyoming_tts/handler.py        | 88 +++++++++++++++++++++++++++++++
 9 files changed, 267 insertions(+)
 create mode 100644 .dockerignore
 create mode 100644 .github/workflows/publish.yml
 create mode 100644 Dockerfile
 create mode 100644 README.md
 create mode 100644 docker-compose.yml
 create mode 100644 requirements.txt
 create mode 100644 wyoming_tts/__init__.py
 create mode 100644 wyoming_tts/__main__.py
 create mode 100644 wyoming_tts/handler.py

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..0cdf809
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,3 @@
+.github
+docker-compose.yml
+Dockerfile
\ No newline at end of file
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
new file mode 100644
index 0000000..7585202
--- /dev/null
+++ b/.github/workflows/publish.yml
@@ -0,0 +1,44 @@
+---
+# This workflow uses actions that are not certified by GitHub.
+# They are provided by a third-party and are governed by
+# separate terms of service, privacy policy, and support
+# documentation.
+
+# GitHub recommends pinning actions to a commit SHA.
+# To get a newer version, you will need to update the SHA.
+# You can also reference a tag or branch, but the action may change without warning.
+
+name: Publish Docker image
+
+on:
+  release:
+    types: [published]
+
+jobs:
+  push_to_registry:
+    name: Push Docker image to Docker Hub
+    runs-on: ubuntu-latest
+    steps:
+      - name: Check out the repo
+        uses: actions/checkout@v3
+
+      - name: Log in to Docker Hub
+        uses: docker/login-action@f4ef78c080cd8ba55a85445d5b36e214a81df20a
+        with:
+          username: ${{ secrets.DOCKER_USERNAME }}
+          password: ${{ secrets.DOCKER_PASSWORD }}
+
+      - name: Extract metadata (tags, labels) for Docker
+        id: meta
+        uses: docker/metadata-action@9ec57ed1fcdbf14dcef7dfbe97b2010124a938b7
+        with:
+          images: samboo/wyoming-tts
+
+      - name: Build and push Docker image
+        uses: docker/build-push-action@3b5e8027fcad23fda98b2e3ac259d8d67585f671
+        with:
+          context: .
+          file: ./Dockerfile
+          push: true
+          tags: ${{ steps.meta.outputs.tags }}
+          labels: ${{ steps.meta.outputs.labels }}
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..bd8e516
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.7
+
+WORKDIR /app
+
+# Make /root/.local/share/tts a symlink to /data, so files written there end up in
+# the volume declared below (presumably the TTS model cache -- TODO confirm).
+RUN mkdir /data && mkdir -p /root/.local/share && ln -s /data /root/.local/share/tts
+
+COPY requirements.txt requirements.txt
+RUN pip3 install --no-cache-dir -r requirements.txt
+COPY . .
+
+VOLUME [ "/data" ]
+# Runs the wyoming_tts directory as a script, i.e. wyoming_tts/__main__.py.
+ENTRYPOINT ["python3", "wyoming_tts"]
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..5acefc0
--- /dev/null
+++ b/README.md
@@ -0,0 +1,3 @@
+# wyoming TTS
+
+Wyoming protocol server for [coqui-ai TTS](https://github.com/coqui-ai/TTS).
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
new file mode 100644
index 0000000..92147cb
--- /dev/null
+++ b/docker-compose.yml
@@ -0,0 +1,13 @@
+version: '3.0'
+
+services:
+  tts:
+    image: samboo/wyoming-tts
+    restart: always
+    command: --uri tcp://0.0.0.0:10201 --voice tts_models/de/thorsten/vits
+    environment:
+      - COQUI_STUDIO_TOKEN= #optional
+    volumes:
+      - ./tts:/data
+    ports:
+      - 10201:10201
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f880d5c
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+wyoming==0.0.1
+tts~=0.13.3
\ No newline at end of file
diff --git a/wyoming_tts/__init__.py b/wyoming_tts/__init__.py
new file mode 100644
index 0000000..fbc599d
--- /dev/null
+++ b/wyoming_tts/__init__.py
@@ -0,0 +1 @@
+"""Wyoming server for tts."""
diff --git a/wyoming_tts/__main__.py b/wyoming_tts/__main__.py
new file mode 100644
index 0000000..c4cdfcb
--- /dev/null
+++ b/wyoming_tts/__main__.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+import argparse
+import asyncio
+import logging
+from functools import partial
+
+from TTS.api import TTS
+from wyoming.info import Attribution, Info, TtsProgram, TtsVoice
+from wyoming.server import AsyncServer
+
+from handler import PiperEventHandler
+
+_LOGGER = logging.getLogger(__name__)
+
+async def main() -> None:
+    """Parse the command line, load the TTS model and run the Wyoming server.
+
+    Without ``--voice`` the available models are logged and the process exits
+    with status 0.  If the loaded model is multi-lingual or multi-speaker and
+    the matching ``--language`` / ``--speaker`` option is missing, the valid
+    choices are logged and the process exits with status 1.
+
+    Raises:
+        SystemExit: status 0 after listing the models, 1 for a missing
+            language/speaker, 2 (via argparse) for invalid arguments.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--voice", default=None, help="The Voice to use for TTS")
+    parser.add_argument("--speaker", help="Set the target speaker")
+    parser.add_argument("--language", help="Set the target language")
+    parser.add_argument("--samples-per-chunk", type=int, default=1024)
+    parser.add_argument("--uri", required=True, help="unix:// or tcp://")
+    parser.add_argument("--debug", action="store_true", help="Log DEBUG messages")
+    args = parser.parse_args()
+    logging.basicConfig(level=logging.DEBUG if args.debug else logging.INFO)
+
+    if args.samples_per_chunk < 1:
+        # The handler splits the audio into chunks of this many samples.
+        parser.error("--samples-per-chunk must be a positive integer")
+
+    if args.voice is None:
+        _LOGGER.info("The following voices are available (specify with --voice [model_name]): ")
+        _LOGGER.info("\n".join(TTS.list_models()))
+        # Listing the models is a successful outcome, so exit with status 0.
+        raise SystemExit(0)
+
+    tts = TTS(args.voice)
+
+    # A missing required option is an error, so these paths exit with status 1.
+    if tts.is_multi_lingual and args.language is None:
+        _LOGGER.error("The following languages are available (specify with --language [lang]): ")
+        _LOGGER.info("\n".join(tts.languages))
+        raise SystemExit(1)
+    if tts.is_multi_speaker and args.speaker is None:
+        _LOGGER.error("The following speakers are available (specify with --speaker [speaker]): ")
+        _LOGGER.info("\n".join(tts.speakers))
+        raise SystemExit(1)
+
+    language = None
+    if not tts.is_multi_lingual:
+        # Assumes model names shaped like "tts_models/<language>/<dataset>/<model>".
+        language = args.voice.split("/")[1]
+        _LOGGER.info("Using language: %s", language)
+
+    _LOGGER.info("TTS ready")
+
+    attribution = Attribution(name="coqui-ai", url="https://github.com/coqui-ai/TTS")
+    languages = tts.languages if tts.is_multi_lingual else [language]
+    # Kept as a list in preparation for multi speaker support in the Wyoming event.
+    speakers = [args.speaker] if tts.is_multi_speaker else ["Default"]
+    wyoming_info = Info(
+        tts=[
+            TtsProgram(
+                name="coqui-ai TTS",
+                attribution=attribution,
+                installed=True,
+                voices=[
+                    TtsVoice(
+                        name=speaker,
+                        attribution=attribution,
+                        installed=True,
+                        languages=languages,
+                    )
+                    for speaker in speakers
+                ],
+            )
+        ],
+    )
+
+    server = AsyncServer.from_uri(args.uri)
+    _LOGGER.info("Ready")
+    await server.run(partial(PiperEventHandler, wyoming_info, args, tts))
+
+
+# -----------------------------------------------------------------------------
+
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/wyoming_tts/handler.py b/wyoming_tts/handler.py
new file mode 100644
index 0000000..9df6d6f
--- /dev/null
+++ b/wyoming_tts/handler.py
@@ -0,0 +1,88 @@
+"""Event handler for clients of the server."""
+import argparse
+import logging
+import math
+import wave
+
+from TTS.api import TTS
+from wyoming.audio import AudioChunk, AudioStart, AudioStop
+from wyoming.event import Event
+from wyoming.info import Describe, Info
+from wyoming.server import AsyncEventHandler
+from wyoming.tts import Synthesize
+
+_LOGGER = logging.getLogger(__name__)
+
+class PiperEventHandler(AsyncEventHandler):
+    """Answer Wyoming ``Describe`` and ``Synthesize`` events with a coqui-ai TTS model."""
+
+    def __init__(
+        self,
+        wyoming_info: Info,
+        cli_args: argparse.Namespace,
+        tts: TTS,
+        *args,
+    ) -> None:
+        """Keep the CLI options and TTS model; pre-build the info event."""
+        super().__init__(*args)
+        self.cli_args = cli_args
+        self.wyoming_info_event = wyoming_info.event()
+        self.tts = tts
+
+    async def handle_event(self, event: Event) -> bool:
+        """Answer ``Describe`` with the info event and ``Synthesize`` with audio.
+
+        Always returns ``True``; any other event type is logged and ignored.
+        """
+        import os  # local imports: the fix does not rely on the module's import block
+        import tempfile
+
+        if Describe.is_type(event.type):
+            await self.write_event(self.wyoming_info_event)
+            _LOGGER.debug("Sent info")
+            return True
+
+        if not Synthesize.is_type(event.type):
+            _LOGGER.warning("Unexpected event: %s", event)
+            return True
+
+        text = Synthesize.from_event(event).text.strip()
+        _LOGGER.debug(event)
+        tts_args = {}
+        if self.tts.is_multi_lingual:
+            tts_args["language"] = self.cli_args.language
+        if self.tts.is_multi_speaker:
+            tts_args["speaker"] = self.cli_args.speaker
+
+        # Synthesize into a per-request temporary directory instead of a fixed
+        # /tmp/output.wav: the handler awaits while streaming, so a shared path
+        # could be overwritten by another request mid-read.  All frames are read
+        # before the first await and the directory is removed on exit.
+        with tempfile.TemporaryDirectory() as tmp_dir:
+            output_path = os.path.join(tmp_dir, "output.wav")
+            self.tts.tts_to_file(text, **tts_args, file_path=output_path)
+            with wave.open(output_path, "rb") as wav_file:
+                rate = wav_file.getframerate()
+                width = wav_file.getsampwidth()
+                channels = wav_file.getnchannels()
+                audio_bytes = wav_file.readframes(wav_file.getnframes())
+
+        await self.write_event(
+            AudioStart(rate=rate, width=width, channels=channels).event()
+        )
+
+        # Stream the audio in chunks of ``samples_per_chunk`` frames each.
+        bytes_per_chunk = width * channels * self.cli_args.samples_per_chunk
+        for offset in range(0, len(audio_bytes), bytes_per_chunk):
+            await self.write_event(
+                AudioChunk(
+                    audio=audio_bytes[offset : offset + bytes_per_chunk],
+                    rate=rate,
+                    width=width,
+                    channels=channels,
+                ).event(),
+            )
+
+        await self.write_event(AudioStop().event())
+        _LOGGER.debug("Completed request")
+        return True
\ No newline at end of file