Add Replicate demo and API #790

Open · wants to merge 1 commit into main
README.md (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
[![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners)

## 👉🏻 CosyVoice 👈🏻
-**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
+**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B); [Replicate Demo and API](https://replicate.com/chenxwh/cosyvoice2-0.5b)

**CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M)

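For reference, a minimal sketch of calling the published model from Python via the Replicate client. This is an assumption-laden example, not part of the PR: it assumes the `replicate` package is installed, `REPLICATE_API_TOKEN` is set, the slug from the README link above is usable as-is (a pinned `:<version>` suffix may be required for a community model), and `prompt.wav` is a placeholder for your own reference clip. The input names mirror the `predict()` signature in `predict.py` below.

```python
# Hypothetical usage sketch for the Replicate model added in this PR.
# Assumes `pip install replicate` and REPLICATE_API_TOKEN in the environment.
import replicate

output = replicate.run(
    "chenxwh/cosyvoice2-0.5b",  # slug from the README link; append ":<version>" if a pin is needed
    input={
        "source_audio": open("prompt.wav", "rb"),        # reference voice to clone (placeholder file)
        "source_transcript": "Transcript of the reference audio.",
        "tts_text": "Text to synthesize in the cloned voice.",
        "task": "zero-shot voice clone",                 # or "cross-lingual voice clone" / "Instructed Voice Generation"
        "instruction": "",                               # only used for Instructed Voice Generation
    },
)
print(output)  # URL of the generated WAV hosted by Replicate
```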
cog.yaml (55 changes: 55 additions & 0 deletions)
@@ -0,0 +1,55 @@
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml

build:
  # set to true if your model requires a GPU
  gpu: true

  # a list of ubuntu apt packages to install
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"
    - "libsox-dev"
    - "sox"

  # python version in the form '3.11' or '3.11.4'
  python_version: "3.11"

  # a list of packages in the format <package-name>==<version>
  python_packages:
    - ipython
    - pynini
    - torch==2.3.1
    - torchaudio==2.3.1
    - transformers==4.40.1
    - conformer==0.3.2
    - diffusers==0.27.2
    - gdown==5.1.0
    - grpcio==1.57.0
    - grpcio-tools==1.57.0
    - huggingface-hub==0.23.5
    - hydra-core==1.3.2
    - HyperPyYAML==1.2.2
    - inflect==7.3.1
    - librosa==0.10.2
    - lightning==2.2.4
    - matplotlib==3.7.5
    - modelscope==1.15.0
    - networkx==3.1
    - omegaconf==2.3.0
    - onnx==1.16.0
    - onnxruntime-gpu
    - openai-whisper==20231117
    - protobuf==4.25
    - rich==13.7.1
    - soundfile==0.12.1
    - uvicorn==0.30.0
    - wget==3.2
    - fastapi==0.111.0
    - fastapi-cli==0.0.4
    - WeTextProcessing==1.0.3

  run:
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget

predict: "predict.py:Predictor"
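As a quick sanity check of the configuration above, here is a small sketch that loads the file and prints the build settings. It is only illustrative and assumes PyYAML is installed and that `cog.yaml` sits in the current working directory.

```python
# Sketch: inspect cog.yaml to confirm the GPU flag, Python version, pinned packages, and entrypoint.
# Assumes `pip install pyyaml`.
import yaml

with open("cog.yaml") as f:
    cfg = yaml.safe_load(f)

build = cfg["build"]
print("gpu:", build["gpu"])                        # True -> Replicate schedules a GPU instance
print("python:", build["python_version"])          # "3.11"
print("packages:", len(build["python_packages"]))  # number of pinned pip dependencies
print("entrypoint:", cfg["predict"])               # "predict.py:Predictor"
```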
predict.py (88 changes: 88 additions & 0 deletions)
@@ -0,0 +1,88 @@
# Prediction interface for Cog ⚙️
# https://cog.run/python

import os
import sys
import subprocess
import time
from cog import BasePredictor, Input, Path
import torchaudio

sys.path.insert(0, os.path.abspath("third_party/Matcha-TTS"))

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav


MODEL_CACHE = "pretrained_models"
MODEL_URL = "https://weights.replicate.delivery/default/FunAudioLLM/CosyVoice/model_cache.tar"


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        if not os.path.exists(MODEL_CACHE):
            print("downloading")
            download_weights(MODEL_URL, MODEL_CACHE)

        self.cosyvoice = CosyVoice2(
            "pretrained_models/CosyVoice2-0.5B",
            load_jit=True,
            load_onnx=False,
            load_trt=False,
        )

    def predict(
        self,
        source_audio: Path = Input(description="Source audio"),
        source_transcript: str = Input(
            description="Transcript of the source audio; you can use a model such as Whisper to transcribe it first"
        ),
        tts_text: str = Input(description="Text of the audio to generate"),
        task: str = Input(
            choices=[
                "zero-shot voice clone",
                "cross-lingual voice clone",
                "Instructed Voice Generation",
            ],
            default="zero-shot voice clone",
        ),
        instruction: str = Input(
            description="Instruction for the Instructed Voice Generation task", default=""
        ),
    ) -> Path:
        """Run a single prediction on the model"""
        if task == "Instructed Voice Generation":
            assert len(instruction) > 0, "Please specify the instruction."

        prompt_speech_16k = load_wav(str(source_audio), 16000)

        if task == "zero-shot voice clone":
            output = self.cosyvoice.inference_zero_shot(
                tts_text, source_transcript, prompt_speech_16k, stream=False
            )
        elif task == "cross-lingual voice clone":
            output = self.cosyvoice.inference_cross_lingual(
                tts_text, prompt_speech_16k, stream=False
            )
        else:
            output = self.cosyvoice.inference_instruct2(
                tts_text, instruction, prompt_speech_16k, stream=False
            )

        out_path = "/tmp/out.wav"
        torchaudio.save(
            out_path, list(output)[0]["tts_speech"], self.cosyvoice.sample_rate
        )
        return Path(out_path)
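For local testing outside the Cog runtime, a rough sketch of driving the predictor directly is shown below. This is an assumption, not part of the PR: it presumes the weights already exist under `pretrained_models/` (or that `pget` is on the PATH so `setup()` can fetch them), uses a placeholder `prompt.wav`, and bypasses Cog's input validation, so treat it as a smoke test rather than the supported path.

```python
# Hypothetical local smoke test for predict.py, run from the repository root.
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads the weight tarball if missing, then loads CosyVoice2-0.5B

out = predictor.predict(
    source_audio="prompt.wav",  # placeholder reference clip, passed through load_wav(..., 16000)
    source_transcript="Transcript of the reference audio.",
    tts_text="Hello from CosyVoice 2 behind a Replicate API.",
    task="zero-shot voice clone",
    instruction="",  # only needed for "Instructed Voice Generation"
)
print("wrote", out)  # /tmp/out.wav
```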