Installation

In [None]:
!pip install transformers torch torchaudio

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

Using a Pretrained Model (Bark)

In [None]:
from transformers import AutoProcessor, AutoModel
import re
import scipy.io.wavfile  # import the submodule explicitly; this also binds the top-level `scipy` name
import torch
import numpy as np

# Voice preset shared by every sentence so that separately generated segments
# are conditioned on the same speaker prompt.
VOICE_PRESET = "v2/en_speaker_6"

# Check for GPU
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model
processor = AutoProcessor.from_pretrained("suno/bark")
model = AutoModel.from_pretrained("suno/bark").to(device)


def split_sentences(text):
    """Split text into sentences, keeping the terminal punctuation.

    The split happens on whitespace that follows '.', '!' or '?'. This is a
    simple heuristic: abbreviations such as "e.g. " are also treated as
    sentence ends.

    Parameters:
    - text: Input string (may be empty).

    Returns:
    - List of non-empty sentence strings (empty list for blank input).
    """
    parts = re.split(r"(?<=[.!?])\s+", text.strip())
    return [part for part in parts if part]


# Split long text into sentences
text = "Hello, Welcome to the TEXT TO SPEECH Project! Here, we will convert text to speech."
sentences = split_sentences(text)

# Generate audio for each sentence
audio_arrays = []
for sentence in sentences:
    inputs = processor(
        text=sentence,
        voice_preset=VOICE_PRESET,
        return_tensors="pt",
        truncation=True,
        max_length=256
    )

    # Move every entry (including the voice-preset prompt) to the model's device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Generate without autograd bookkeeping
    with torch.inference_mode():
        audio_array = model.generate(**inputs, do_sample=True, fine_temperature=0.4, coarse_temperature=0.8)
        audio_arrays.append(audio_array.cpu().numpy().squeeze())

# Fail with a clear message instead of np.concatenate's error on an empty list
if not audio_arrays:
    raise ValueError("No non-empty sentences to synthesize.")

# Combine all audio segments
final_audio = np.concatenate(audio_arrays)

# Save output
sample_rate = model.generation_config.sample_rate
scipy.io.wavfile.write("bark_output.wav", rate=sample_rate, data=final_audio)
print("Audio saved successfully!")

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


Audio saved successfully!


Using SpeechT5 for More Control

In [None]:
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
import torch
import soundfile as sf

# Seed for the random speaker embedding so the same embedding is drawn on every run.
SPEAKER_SEED = 42

processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

inputs = processor(text="Hello, Welcome to the TEXT TO SPEECH Project! Here, we will going to give text as a input and We will get audio as output. Amazing right! .", return_tensors="pt")

# Use a default speaker embedding if you don't have specific ones.
# NOTE(review): a random vector is only a placeholder, not a real speaker;
# embeddings extracted from actual speech (x-vectors) are presumably needed
# for a natural-sounding voice - confirm against the SpeechT5 docs.
torch.manual_seed(SPEAKER_SEED)
speaker_embeddings = torch.randn((1, 512))  # Random embedding as example

speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

# Take the sample rate from the vocoder's config instead of hard-coding 16000.
sf.write("speecht5_output.wav", speech.numpy(), samplerate=vocoder.config.sampling_rate)

Using Bark for Expressive Speech

In [None]:
from transformers import AutoProcessor, AutoModel
import scipy
import torch
import numpy as np
from IPython.display import Audio

# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the Bark processor and model, then move the model onto the chosen device.
# Any failure is reported and re-raised so the cell still stops with the original error.
try:
    processor = AutoProcessor.from_pretrained("suno/bark")
    model = AutoModel.from_pretrained("suno/bark")
    model = model.to(device)
    print("Model loaded successfully!")
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    raise

def generate_expressive_speech(text, voice_preset="v2/en_speaker_6", temperature=0.7,
                               semantic_max_new_tokens=500):
    """
    Generate expressive speech with Bark TTS

    Uses the notebook-level `processor`, `model` and `device`.

    Parameters:
    - text: Input text with expressive prompts
    - voice_preset: Voice style preset
    - temperature: Controls randomness; forwarded to `model.generate`
      (typically in (0, 1])
    - semantic_max_new_tokens: Forwarded to `model.generate` under the same
      name (default 500, the value this function previously hard-coded)

    Returns:
    - (audio, sample_rate): the generated audio as a squeezed NumPy array and
      `model.generation_config.sample_rate`, or (None, None) if the text is
      empty or generation fails. Failures are printed, not raised.
    """
    # Nothing to synthesize: skip the model call entirely for blank input.
    if text is None or (isinstance(text, str) and not text.strip()):
        print("Generation error: input text is empty")
        return None, None

    try:
        # Prepare inputs - ensure everything goes to the right device
        inputs = processor(
            text=text,
            return_tensors="pt",
            voice_preset=voice_preset
        )

        # Move all input entries to the same device as model
        inputs = {k: v.to(device) for k, v in inputs.items()}

        # Generate audio
        with torch.inference_mode():
            audio_array = model.generate(
                **inputs,
                do_sample=True,
                temperature=temperature,
                semantic_max_new_tokens=semantic_max_new_tokens
            )

        return audio_array.cpu().numpy().squeeze(), model.generation_config.sample_rate

    except Exception as e:
        # Deliberate best-effort: report the problem and let the caller check for None.
        print(f"Generation error: {e}")
        return None, None

# Demo prompt containing bracketed cues ([laughs], [whispers], ...) as the
# "expressive prompts" that generate_expressive_speech is documented to accept.
expressive_text = """
[clears throat] Welcome to our text-to-speech demo! [laughs]
I can express [whispers] whispering voices,
and [shouts] LOUD VOICES. [music]
"""

# Run generation; the helper returns (None, None) when it fails.
audio, sr = generate_expressive_speech(expressive_text)

if audio is None:
    print("❌ Failed to generate audio")
else:
    # Write the WAV file.
    # NOTE: this is the same filename the earlier Bark cell writes, so that file is overwritten.
    scipy.io.wavfile.write("bark_output.wav", rate=sr, data=audio)

    # Show an inline audio player in the notebook
    display(Audio(audio, rate=sr))
    print("🎧 Audio saved as 'bark_output.wav'")

Using device: cuda
Model loaded successfully!


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:10000 for open-end generation.


🎧 Audio saved as 'bark_output.wav'
