In [21]:
pip install gradio transformers torch librosa soundfile werkzeug




In [22]:
import gradio as gr
import torch
from transformers import AutoProcessor, AutoModel
import numpy as np
from scipy.io.wavfile import write
import os
import gc
import secrets
import time


The CFG class serves as a centralized configuration holder for the TTS application. It defines various settings and parameters that dictate how the application behaves.

VOICES: A dictionary that maps human-readable voice labels to their corresponding voice presets. For example:

"male EN" corresponds to an English male voice.
"female Chinese" corresponds to a Chinese female voice

Post-processing Thresholds:

AMPLITUDE_THRESHOLD: Sets the minimum amplitude level below which audio samples are considered noise and can be trimmed from the audio output.
TIME_THRESHOLD: Defines the duration (in samples) to determine how long low-amplitude sections should be to qualify for trimming. Calculated as half a second at a 24kHz sample rate.
IGNORE_INITIAL_STEPS: Specifies the number of initial samples to ignore when applying the amplitude threshold, allowing the model to skip any startup noise.

In [23]:
# Configuration Class
class CFG:
    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

    # Voice Presets
    VOICES = {
        "male EN": "v2/en_speaker_6",    # Example Male Voice 1 (English)
        "male Chinese": "v2/zh_speaker_3",    # Example Male Voice 2 (Chinese)
        "female EN": "v2/en_speaker_9",  # Example Female Voice 1 (English)
        "female Chinese": "v2/zh_speaker_4",  # Example Female Voice 2 (Chinese)
    }

    # Model settings
    MODEL_NAME = 'suno/bark'

    # Post-processing thresholds
    AMPLITUDE_THRESHOLD = 0.05
    TIME_THRESHOLD = int(24_000 * 0.5)  # Half a second at 24kHz
    IGNORE_INITIAL_STEPS = int(24_000 * 0.5)  # Ignore first half-second

    # Paths
    AUDIO_OUTPUT_PATH = '/content/static/audio'  # Directory to store generated audio files


In [24]:
os.makedirs(CFG.AUDIO_OUTPUT_PATH, exist_ok=True)


**Bark Model**

The BARK model by Suno is a state-of-the-art Text-to-Speech (TTS) system designed to produce high-quality, natural-sounding speech. It supports multiple languages and voice presets, allowing users to select different voice characteristics such as gender and language. BARK leverages advanced deep learning techniques to understand and generate speech that closely mimics human intonation and rhythm.

In [26]:
# Load Processor and Model once when the server starts
processor = AutoProcessor.from_pretrained(
    CFG.MODEL_NAME,
    voice_preset=CFG.VOICES["male EN"],  # Temporary default, will be overridden per request
    return_tensors='pt'
)

model = AutoModel.from_pretrained(
    CFG.MODEL_NAME,
    torch_dtype=torch.float16,
).to(CFG.DEVICE)

# Set model to evaluation mode
model.eval()


  WeightNorm.apply(module, name, dim)
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


BarkModel(
  (semantic): BarkSemanticModel(
    (input_embeds_layer): Embedding(129600, 1024)
    (position_embeds_layer): Embedding(1024, 1024)
    (drop): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0-23): 24 x BarkBlock(
        (layernorm_1): BarkLayerNorm()
        (layernorm_2): BarkLayerNorm()
        (attn): BarkSelfAttention(
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
          (att_proj): Linear(in_features=1024, out_features=3072, bias=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=False)
        )
        (mlp): BarkMLP(
          (in_proj): Linear(in_features=1024, out_features=4096, bias=False)
          (out_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
          (gelu): GELU(approximate='none')
        )
      )
    )
    (layernorm_final): BarkLayerNorm()
    (lm_head): Linear(in_feat

In [27]:
# Utility Functions
def split_sentences(text):
    import re
    sentences = re.split(r'\. |\.\n|\.\n\n|!|\?|;', text)
    sentences = [sentence.strip() + '.' for sentence in sentences if sentence.strip()]
    number_of_sentences = len(sentences)
    return sentences, number_of_sentences



In [28]:
def count_tokens(text, processor=processor):
    return len(processor.tokenizer(text)['input_ids'])

In [29]:
def slice_array_wave(input_array, amplitude_threshold, time_threshold, ignore_initial_steps=0):
    low_amplitude_indices = np.abs(input_array) < amplitude_threshold
    consecutive_count = 0
    for i, is_low_amplitude in enumerate(low_amplitude_indices[ignore_initial_steps:]):
        if is_low_amplitude:
            consecutive_count += 1
        else:
            consecutive_count = 0

        if consecutive_count >= time_threshold:
            return input_array[:i + int(time_threshold / 4)]

    return input_array

The perform_inference function is the heart of the TTS application. It orchestrates the conversion of input text into synthesized speech.

In [30]:
def perform_inference(text, voice_preset):
    sentences, number_of_sentences = split_sentences(text)
    print(f'\nNumber of sentences in this text: {number_of_sentences}\n')

    all_audio_arrays = []

    for sentence_number, current_sentence in enumerate(sentences, start=1):
        print(f'Processing sentence {sentence_number}/{number_of_sentences}...')
        start_time = time.time()

        # Prepare input
        inputs = processor(
            text=current_sentence,
            return_tensors="pt",
            return_attention_mask=True,
            max_length=1024,
            voice_preset=voice_preset,
            add_special_tokens=False,
        ).to(CFG.DEVICE)

        # Count tokens
        n_tokens = count_tokens(current_sentence, processor)

        # Model inference
        with torch.inference_mode():
            result = model.generate(
                **inputs,
                do_sample=True,
                semantic_max_new_tokens=512,  # Reduced from 1024
                pad_token_id=processor.tokenizer.pad_token_id,
            )

        # Save results
        audio_array = result.cpu().numpy().squeeze()
        all_audio_arrays.append(audio_array)
        elapsed_time = round((time.time() - start_time), 2)

        print(f'''
              Sentence {sentence_number}/{number_of_sentences} processed:
              	Number of tokens in sentence: {n_tokens}
              	Length of sentence: {len(current_sentence)}
              	Shape of tensor for this sentence: {result.size()}
              	Elapsed time for this sentence: {elapsed_time} s
              ''')

        # Clean up
        del result
        gc.collect()

    # Concatenate and post-process audio without slicing to ensure completeness
    concatenated_array = np.concatenate(all_audio_arrays)

    # Save as .wav file
    filename = 'final_audio.wav'
    filepath = os.path.join(CFG.AUDIO_OUTPUT_PATH, filename)
    write(filepath, rate=24000, data=concatenated_array.astype(np.float32))  # Assuming 24,000 Hz sample rate
    print(f"Final audio saved as {filename}")

    return filepath


The generate_tts function acts as the bridge between the user interface (Gradio) and the core TTS processing logic.

In [31]:
# Define the Inference Function for Gradio
def generate_tts(text, voice):
    audio_path = perform_inference(text, CFG.VOICES[voice])
    return audio_path

This section sets up the user interface (UI) using Gradio, a Python library that simplifies the creation of web-based interfaces for machine learning models.

In [34]:
# Define the Gradio interface using updated API
iface = gr.Interface(
    fn=generate_tts,
    inputs=[
        gr.Textbox(lines=10, label="Enter Text"),
        gr.Radio(choices=list(CFG.VOICES.keys()), label="Select Voice Speaker")
    ],
    outputs=gr.Audio(type="filepath", label="Generated Audio"),
    title="AI-Based Text-to-Speech Tool",
    description="Enter text and select a voice to generate speech."
)

# Launch the interface
iface.launch(share=True, debug=True)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://fbf759d8f00387f660.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)



Number of sentences in this text: 1

Processing sentence 1/1...

              Sentence 1/1 processed:
              	Number of tokens in sentence: 29
              	Length of sentence: 108
              	Shape of tensor for this sentence: torch.Size([1, 246080])
              	Elapsed time for this sentence: 37.52 s
              
Final audio saved as final_audio.wav
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://fbf759d8f00387f660.gradio.live




In [33]:
总结
通过更新  接口组件并确保您的 TTS 模型支持中文（简体）语音生成，您可以创建一个功能强大且用户友好的文本转语音工具。上述示例代码提供了一个全面的框架，您可以根据需要进行调整和优化。如果在实施过程中遇到任何问题，请随时提供详细信息，我将乐意进一步协助您！

SyntaxError: invalid character '（' (U+FF08) (<ipython-input-33-8d1d8c2d714d>, line 2)