Add Replicate demo and API #790

Open · wants to merge 1 commit into main
README.md (2 changes: 1 addition & 1 deletion)
@@ -1,7 +1,7 @@
[![SVG Banners](https://svg-banners.vercel.app/api?type=origin&text1=CosyVoice🤠&text2=Text-to-Speech%20💖%20Large%20Language%20Model&width=800&height=210)](https://github.com/Akshay090/svg-banners)

## 👉🏻 CosyVoice 👈🏻
-**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B)
+**CosyVoice 2.0**: [Demos](https://funaudiollm.github.io/cosyvoice2/); [Paper](https://arxiv.org/abs/2412.10117); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice2-0.5B); [HuggingFace](https://huggingface.co/spaces/FunAudioLLM/CosyVoice2-0.5B); [Replicate Demo and API](https://replicate.com/chenxwh/cosyvoice2-0.5b)

**CosyVoice 1.0**: [Demos](https://fun-audio-llm.github.io); [Paper](https://funaudiollm.github.io/pdf/CosyVoice_v1.pdf); [Modelscope](https://www.modelscope.cn/studios/iic/CosyVoice-300M)

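For reference, a minimal sketch of calling the published model from Python via the Replicate client. This is an assumption-laden example, not part of the PR: it assumes the `replicate` package is installed, `REPLICATE_API_TOKEN` is set, the slug from the README link above is usable as-is (a pinned `:<version>` suffix may be required for a community model), and `prompt.wav` is a placeholder for your own reference clip. The input names mirror the `predict()` signature in `predict.py` below.

```python
# Hypothetical usage sketch for the Replicate model added in this PR.
# Assumes `pip install replicate` and REPLICATE_API_TOKEN in the environment.
import replicate

output = replicate.run(
    "chenxwh/cosyvoice2-0.5b",  # slug from the README link; append ":<version>" if a pin is needed
    input={
        "source_audio": open("prompt.wav", "rb"),        # reference voice to clone (placeholder file)
        "source_transcript": "Transcript of the reference audio.",
        "tts_text": "Text to synthesize in the cloned voice.",
        "task": "zero-shot voice clone",                 # or "cross-lingual voice clone" / "Instructed Voice Generation"
        "instruction": "",                               # only used for Instructed Voice Generation
    },
)
print(output)  # URL of the generated WAV hosted by Replicate
```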
cog.yaml (55 changes: 55 additions & 0 deletions)
@@ -0,0 +1,55 @@
# Configuration for Cog ⚙️
# Reference: https://cog.run/yaml

build:
  # set to true if your model requires a GPU
  gpu: true

  # a list of ubuntu apt packages to install
  system_packages:
    - "libgl1-mesa-glx"
    - "libglib2.0-0"
    - "libsox-dev"
    - "sox"

  # python version in the form '3.11' or '3.11.4'
  python_version: "3.11"

  # a list of packages in the format <package-name>==<version>
  python_packages:
    - ipython
    - pynini
    - torch==2.3.1
    - torchaudio==2.3.1
    - transformers==4.40.1
    - conformer==0.3.2
    - diffusers==0.27.2
    - gdown==5.1.0
    - grpcio==1.57.0
    - grpcio-tools==1.57.0
    - huggingface-hub==0.23.5
    - hydra-core==1.3.2
    - HyperPyYAML==1.2.2
    - inflect==7.3.1
    - librosa==0.10.2
    - lightning==2.2.4
    - matplotlib==3.7.5
    - modelscope==1.15.0
    - networkx==3.1
    - omegaconf==2.3.0
    - onnx==1.16.0
    - onnxruntime-gpu
    - openai-whisper==20231117
    - protobuf==4.25
    - rich==13.7.1
    - soundfile==0.12.1
    - uvicorn==0.30.0
    - wget==3.2
    - fastapi==0.111.0
    - fastapi-cli==0.0.4
    - WeTextProcessing==1.0.3

  run:
    - curl -o /usr/local/bin/pget -L "https://github.com/replicate/pget/releases/download/v0.6.0/pget_linux_x86_64" && chmod +x /usr/local/bin/pget

predict: "predict.py:Predictor"
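As a quick sanity check of the configuration above, here is a small sketch that loads the file and prints the build settings. It is only illustrative and assumes PyYAML is installed and that `cog.yaml` sits in the current working directory.

```python
# Sketch: inspect cog.yaml to confirm the GPU flag, Python version, pinned packages, and entrypoint.
# Assumes `pip install pyyaml`.
import yaml

with open("cog.yaml") as f:
    cfg = yaml.safe_load(f)

build = cfg["build"]
print("gpu:", build["gpu"])                        # True -> Replicate schedules a GPU instance
print("python:", build["python_version"])          # "3.11"
print("packages:", len(build["python_packages"]))  # number of pinned pip dependencies
print("entrypoint:", cfg["predict"])               # "predict.py:Predictor"
```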
predict.py (88 changes: 88 additions & 0 deletions)
@@ -0,0 +1,88 @@
# Prediction interface for Cog ⚙️
# https://cog.run/python

import os
import sys
import subprocess
import time
from cog import BasePredictor, Input, Path
import torchaudio

sys.path.insert(0, os.path.abspath("third_party/Matcha-TTS"))

from cosyvoice.cli.cosyvoice import CosyVoice2
from cosyvoice.utils.file_utils import load_wav


MODEL_CACHE = "pretrained_models"
MODEL_URL = "https://weights.replicate.delivery/default/FunAudioLLM/CosyVoice/model_cache.tar"


def download_weights(url, dest):
    start = time.time()
    print("downloading url: ", url)
    print("downloading to: ", dest)
    subprocess.check_call(["pget", "-x", url, dest], close_fds=False)
    print("downloading took: ", time.time() - start)


class Predictor(BasePredictor):
    def setup(self) -> None:
        """Load the model into memory to make running multiple predictions efficient"""

        if not os.path.exists(MODEL_CACHE):
            print("downloading")
            download_weights(MODEL_URL, MODEL_CACHE)

        self.cosyvoice = CosyVoice2(
            "pretrained_models/CosyVoice2-0.5B",
            load_jit=True,
            load_onnx=False,
            load_trt=False,
        )

    def predict(
        self,
        source_audio: Path = Input(description="Source audio"),
        source_transcript: str = Input(
            description="Transcript of the source audio; you can use a model such as Whisper to transcribe it first"
        ),
        tts_text: str = Input(description="Text of the audio to generate"),
        task: str = Input(
            choices=[
                "zero-shot voice clone",
                "cross-lingual voice clone",
                "Instructed Voice Generation",
            ],
            default="zero-shot voice clone",
        ),
        instruction: str = Input(
            description="Instruction for the Instructed Voice Generation task", default=""
        ),
    ) -> Path:
        """Run a single prediction on the model"""
        if task == "Instructed Voice Generation":
            assert len(instruction) > 0, "Please specify the instruction."

        prompt_speech_16k = load_wav(str(source_audio), 16000)

        if task == "zero-shot voice clone":
            output = self.cosyvoice.inference_zero_shot(
                tts_text, source_transcript, prompt_speech_16k, stream=False
            )
        elif task == "cross-lingual voice clone":
            output = self.cosyvoice.inference_cross_lingual(
                tts_text, prompt_speech_16k, stream=False
            )
        else:
            output = self.cosyvoice.inference_instruct2(
                tts_text, instruction, prompt_speech_16k, stream=False
            )

        out_path = "/tmp/out.wav"
        torchaudio.save(
            out_path, list(output)[0]["tts_speech"], self.cosyvoice.sample_rate
        )
        return Path(out_path)
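For local testing outside the Cog runtime, a rough sketch of driving the predictor directly is shown below. This is an assumption, not part of the PR: it presumes the weights already exist under `pretrained_models/` (or that `pget` is on the PATH so `setup()` can fetch them), uses a placeholder `prompt.wav`, and bypasses Cog's input validation, so treat it as a smoke test rather than the supported path.

```python
# Hypothetical local smoke test for predict.py, run from the repository root.
from predict import Predictor

predictor = Predictor()
predictor.setup()  # downloads the weight tarball if missing, then loads CosyVoice2-0.5B

out = predictor.predict(
    source_audio="prompt.wav",  # placeholder reference clip, passed through load_wav(..., 16000)
    source_transcript="Transcript of the reference audio.",
    tts_text="Hello from CosyVoice 2 behind a Replicate API.",
    task="zero-shot voice clone",
    instruction="",  # only needed for "Instructed Voice Generation"
)
print("wrote", out)  # /tmp/out.wav
```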