In [1]:
from elevenlabs.client import ElevenLabs
from elevenlabs import play
import os
from dotenv import load_dotenv

# Load environment variables from .env file
load_dotenv()

# Get API key from environment variable
client = ElevenLabs(api_key=os.getenv("ELEVENLABS_API_KEY"))

# Cassidy: 56AoDkrOh6qfVPDXZ7Pt
audio = client.text_to_speech.convert(
    text="Ssssssss",
    voice_id="56AoDkrOh6qfVPDXZ7Pt",
    model_id="eleven_multilingual_v2",
    output_format="mp3_44100_128",
    voice_settings={"speed": 0.8, "stability": 0.95, "similarity_boost": 0.75, "style": 0.0},
)

# Convert generator to bytes
audio_bytes = b"".join(audio)

# save audio to file
with open("sound_samples/s_sound.mp3", "wb") as f:
    f.write(audio_bytes)
play.play(audio_bytes)

In [None]:
def tts(text, voice_id="56AoDkrOh6qfVPDXZ7Pt", speed=0.8, stability=0.95, similarity_boost=0.75, style=0.0):
    audio = client.text_to_speech.convert(
        text=text,
        voice_id=voice_id,
        model_id="eleven_multilingual_v2",
        output_format="mp3_44100_128",
        voice_settings={"speed": speed, "stability": stability, "similarity_boost": similarity_boost, "style": style}
    )
    # Convert generator to bytes
    audio_bytes = b''.join(audio)
    # save audio to file
    with open("sound_samples/output.mp3", "wb") as f:
        f.write(audio_bytes)
    return audio_bytes

In [16]:
from pydub import AudioSegment

def stitch_audios(audio_list):
    curent_audio = AudioSegment.empty()
    for audio in audio_list:
        segment = AudioSegment.from_file(audio, format="mp3")
        pause_duration = 500  # 500 milliseconds pause
        pause = AudioSegment.silent(duration=pause_duration)
        curent_audio += pause
        curent_audio += segment
    return curent_audio

In [3]:
import os

api_key = os.getenv("ELEVENLABS_API_KEY")

# Debug: Check if API key is loaded
print(f"API Key loaded: {api_key[:10]}..." if api_key else "API Key NOT FOUND!")

API Key loaded: sk_1e963ee...


In [7]:
from io import BytesIO

def generate_clone(audio):
    # Convert generator to bytes if needed
    if hasattr(audio, '__iter__') and not isinstance(audio, (bytes, bytearray)):
        audio_bytes = b''.join(audio)
    else:
        audio_bytes = audio
    
    # save audio to file
    with open("sound_samples/clone_input.mp3", "wb") as f:
        f.write(audio_bytes)

    with open("sound_samples/clone_input.mp3", "rb") as audio_file:
        voice = client.voices.ivc.create(
            name="userClone",
            files=[audio_file],  # Pass the open file handle
            description="The user's custom voice clone",
            remove_background_noise=True
        )
    
    # save voice id to file
    with open("sound_samples/clone_voice_id.txt", "w") as f:
        f.write(voice.voice_id)
    return voice

In [21]:
# play.play(tts("This is a test of the text to speech system."))
stitched_audio = stitch_audios(["sound_samples/test_phrase.mp3", "sound_samples/s_sound.mp3"])
stiched_audio_bytes = stitched_audio.export(format="mp3").read()
play.play(stiched_audio_bytes)

In [8]:
# from tts import tts, generate_clone
# from pydub import AudioSegment
# from io import BytesIO
# import librosa
# from tts import generate_clone

# Just read the file directly - no conversion needed!
audio = open("sound_samples/welcome_speech.mp3", "rb")
audio_bytes = audio.read()
voice = generate_clone(audio_bytes)

## Voice Cloning Requirements

ElevenLabs voice cloning requires:
- **Minimum 1 minute** of audio (ideally 3-5 minutes)
- **Clear speech** with one voice only
- **Minimal background noise**

The `welcome_speech.mp3` file might be too short. Let's check its duration first.

In [None]:
# Check audio duration
import librosa

audio_path = "sound_samples/welcome_speech.mp3"
y, sr = librosa.load(audio_path)
duration = librosa.get_duration(y=y, sr=sr)
print(f"Audio duration: {duration:.2f} seconds ({duration/60:.2f} minutes)")
print(f"Sample rate: {sr} Hz")
print(f"Audio shape: {y.shape}")

# ElevenLabs requires at least 60 seconds (1 minute)
if duration < 60:
    print(f"\n⚠️  WARNING: Audio is too short for voice cloning!")
    print(f"   Minimum required: 60 seconds")
    print(f"   Your audio: {duration:.2f} seconds")
    print(f"   You need {60 - duration:.2f} more seconds")
else:
    print(f"\n✅ Audio duration is sufficient for voice cloning")