<a href="https://colab.research.google.com/github/Dave1475/TTS/blob/dev/OpenVoice_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Environment setup cell: fetch the OpenVoice code, its checkpoints and the
# Python packages the later cells import.

# Clone the `dev` branch of the OpenVoice fork into /content and make the
# checkout the working directory for everything that follows.
%cd /content
!git clone -b dev https://github.com/camenduru/OpenVoice
%cd /content/OpenVoice

# Install aria2 and use it to download the checkpoint archive to
# /content/checkpoints_1226.zip.
!apt -y install -qq aria2
!aria2c --console-log-level=error -c -x 16 -s 16 -k 1M https://huggingface.co/camenduru/OpenVoice/resolve/main/checkpoints_1226.zip -d /content -o checkpoints_1226.zip
# unzip runs from the current directory (/content/OpenVoice), so the archive
# is extracted inside the checkout. NOTE(review): presumably it unpacks to a
# `checkpoints/` folder matching the relative paths used by the server cell;
# confirm against the archive layout.
!unzip /content/checkpoints_1226.zip

# Python dependencies, installed quietly; gradio is pinned to 3.50.2.
!pip install -q gradio==3.50.2 langid faster-whisper whisper-timestamped unidecode eng-to-ipa pypinyin cn2an



In [None]:
from google.colab.output import eval_js

# Evaluate `google.colab.kernel.proxyPort(8000)` in the Colab front end and
# print whatever it returns. Port 8000 is the port the Flask server in this
# notebook listens on. NOTE(review): presumably the result is the externally
# reachable proxy URL for that port; confirm against the Colab docs.
proxy_address = eval_js("google.colab.kernel.proxyPort(8000)")
print(proxy_address)

In [None]:
from flask import Flask, request, send_file
import torch
import langid
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
import os
import io

# Flask application object; routes are registered on it further down.
app = Flask(__name__)

# Initialize paths and device
# All checkpoint paths are relative, so the process must be started from the
# directory that contains `checkpoints/`.
en_ckpt_base = 'checkpoints/base_speakers/EN'
zh_ckpt_base = 'checkpoints/base_speakers/ZH'
ckpt_converter = 'checkpoints/converter'
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Directory that receives generated audio; created up front so later writes
# cannot fail on a missing folder.
output_dir = 'outputs'
os.makedirs(output_dir, exist_ok=True)

# Load models
# One base text-to-speech model per supported language, plus the tone colour
# converter. Each is built from its config.json and then given its weights.
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')

# Load speaker embeddings
# `map_location=device` makes deserialization land on the selected device.
# Without it, a file saved from a CUDA tensor cannot be loaded on a CPU-only
# machine, which would defeat the CPU fallback chosen above. The trailing
# `.to(device)` is kept as a no-op guard so the final placement is unchanged.
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth', map_location=device).to(device)
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth', map_location=device).to(device)
zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth', map_location=device).to(device)

# Reference audio path - Replace with your actual reference audio path
# This recording is the voice whose tone colour the generated speech is
# converted to.
REFERENCE_AUDIO = "resources/demo_speaker2.mp3"

# Supported languages
# Language codes as returned by langid; anything else is rejected.
supported_languages = ['zh', 'en']

def generate_speech(text, style='default'):
    """
    Generate speech from text using OpenVoice

    The language of ``text`` is detected with langid, the matching base
    speaker model synthesizes the audio, and the tone colour converter then
    converts it towards the speaker in ``REFERENCE_AUDIO``.

    Each call writes to its own uniquely named files inside ``output_dir``.
    Fixed file names would let two overlapping requests overwrite each
    other's audio. The intermediate file is removed before returning; the
    returned result file is left on disk and the caller is responsible for
    deleting it when it is no longer needed.

    Args:
        text: The text to speak.
        style: Speaking style. Chinese accepts only ``'default'``; English
            accepts the styles listed in ``valid_styles`` below.

    Returns:
        Path of the generated WAV file.

    Raises:
        ValueError: If the detected language is not supported, or the style
            is not valid for the detected language.
    """
    # Local import: uuid is standard library and only needed here.
    import uuid

    # Detect language
    language_predicted = langid.classify(text)[0].strip()

    if language_predicted not in supported_languages:
        raise ValueError(f"Unsupported language: {language_predicted}")

    # Select appropriate model and settings based on language
    if language_predicted == "zh":
        if style != 'default':
            raise ValueError("Only 'default' style is supported for Chinese")
        tts_model = zh_base_speaker_tts
        source_se = zh_source_se
        language = 'Chinese'
    else:
        valid_styles = ['default', 'whispering', 'shouting', 'excited',
                       'cheerful', 'terrified', 'angry', 'sad', 'friendly']
        if style not in valid_styles:
            raise ValueError(f"Invalid style for English. Must be one of: {valid_styles}")
        tts_model = en_base_speaker_tts
        # The default style has its own source embedding; every other English
        # style shares the "style" embedding.
        source_se = en_source_default_se if style == 'default' else en_source_style_se
        language = 'English'

    # Generate target speaker embedding
    target_se, _ = se_extractor.get_se(REFERENCE_AUDIO, tone_color_converter,
                                      target_dir='processed', vad=True)

    # Per-call unique names so concurrent requests never share a file.
    request_id = uuid.uuid4().hex
    src_path = f'{output_dir}/tmp_{request_id}.wav'
    output_path = f'{output_dir}/output_{request_id}.wav'

    try:
        # Generate initial speech
        tts_model.tts(text, src_path, speaker=style, language=language)

        # Convert tone color
        # NOTE(review): `message` is presumably a watermark tag embedded by
        # the converter; confirm against the OpenVoice API.
        tone_color_converter.convert(
            audio_src_path=src_path,
            src_se=source_se,
            tgt_se=target_se,
            output_path=output_path,
            message="@MyShell"
        )
    finally:
        # Best-effort cleanup of the intermediate file. It may not exist if
        # synthesis failed, and a cleanup problem must not mask the real error.
        try:
            os.remove(src_path)
        except OSError:
            pass

    return output_path

@app.route('/tts')
def text_to_speech():
    """
    Endpoint for text-to-speech conversion
    Usage: /tts?text=hello&style=default

    Query parameters:
        text: Required. The text to speak, at most 900 characters.
        style: Optional speaking style, defaults to 'default'.

    Responses:
        200: The generated audio, served as audio/wav.
        400: Missing or blank text, text over the limit, or a ValueError
            from generate_speech (unsupported language or invalid style).
        500: Any other failure during generation or while sending the file.
    """
    # Single source of truth for the limit, so the check and the message
    # reported to the client cannot drift apart.
    max_text_length = 900

    # Get parameters
    text = request.args.get('text')
    style = request.args.get('style', 'default')

    # Reject missing, empty and whitespace-only text: there is nothing to speak.
    if not text or not text.strip():
        return "No text provided", 400

    if len(text) > max_text_length:
        return f"Text length limited to {max_text_length} characters", 400

    try:
        # Generate speech
        output_path = generate_speech(text, style)

        # Return the audio file
        return send_file(
            output_path,
            mimetype="audio/wav",
        )

    except ValueError as e:
        # Validation problems raised by generate_speech are client errors.
        return str(e), 400
    except Exception as e:
        # Top-level boundary: report any other failure as a server error
        # instead of letting the request crash.
        return f"Error generating speech: {str(e)}", 500

if __name__ == '__main__':
    # Serve on all interfaces on port 8000.
    # Debug mode stays off: with host='0.0.0.0' it would expose the Werkzeug
    # interactive debugger (arbitrary code execution) to the network. Its
    # auto-reloader would also re-import this module and load every model a
    # second time.
    app.run(host='0.0.0.0', port=8000, debug=False)
