In [1]:
import sys
import os
import time
import pathlib
import textwrap
import base64
import mimetypes
import re
import struct
from IPython.display import Markdown as md, display, Audio
from google import genai
from google.genai import types

# Sanity check: show the interpreter version and the current working directory
# (the directory the later cells resolve `.env` and the output .wav paths against).
print(f"Python {sys.version.split()[0]} | Working dir: {pathlib.Path.cwd()}")



Python 3.11.5 | Working dir: c:\Users\PCS\Desktop\AI-BookComp\LangChain-Ecosystem-101\PodCast-Agent\notebooks


In [2]:
# --- Load variables from the .env file in the current working directory ---
from dotenv import load_dotenv
load_dotenv(dotenv_path=os.path.join(os.getcwd(), '.env'))

# --- Confirm the Gemini key is available before any client is built ---
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if GEMINI_API_KEY:
    print("GEMINI_API_KEY loaded successfully.")
else:
    print("ERROR: GEMINI_API_KEY not found in .env file.")

GEMINI_API_KEY loaded successfully.


In [None]:
def timed(fn, *a, **kw):
    """
        Call ``fn(*a, **kw)`` and report how long it took.

        Returns a ``(result, label)`` tuple where ``label`` is the elapsed
        wall-clock time rounded to whole milliseconds, e.g. ``"12 ms"``.
        Exceptions raised by ``fn`` propagate unchanged.
    """
    started_at = time.perf_counter()
    result = fn(*a, **kw)
    elapsed_ms = (time.perf_counter() - started_at) * 1000
    return result, f"{elapsed_ms:.0f} ms"

def save_binary_file(file_name, data):
    """
        Save binary data to a file

        Args:
            file_name: Destination path; an existing file is overwritten.
            data: Bytes-like payload to write.

        Returns:
            The ``file_name`` that was passed in.

        Raises:
            OSError: If the file cannot be opened or written.
            TypeError: If ``data`` is not bytes-like.
    """
    # The context manager closes the handle even when write() raises, so a
    # failed save never leaves an open (and on Windows, locked) file behind.
    with open(file_name, "wb") as f:
        f.write(data)
    print(f"File saved to: {file_name}")
    return file_name

def convert_to_wav(audio_data: bytes, mime_type: str) -> bytes:
    """
        Prefix PCM audio bytes with a 44-byte RIFF/WAVE header.

        The sample rate and bit depth come from ``parse_audio_mime_type``;
        the header always declares a single (mono) channel and PCM format.
        Returns the header followed by ``audio_data`` unchanged.
    """
    audio_format = parse_audio_mime_type(mime_type)
    bits_per_sample = audio_format["bits_per_sample"]
    sample_rate = audio_format["rate"]

    channels = 1
    frame_bytes = channels * (bits_per_sample // 8)   # BlockAlign
    payload_len = len(audio_data)

    # Field order follows the canonical little-endian WAV layout.
    riff_fields = (
        b"RIFF",                    # ChunkID
        36 + payload_len,           # ChunkSize (total file size - 8 bytes)
        b"WAVE",                    # Format
        b"fmt ",                    # Subchunk1ID
        16,                         # Subchunk1Size (16 for PCM)
        1,                          # AudioFormat (1 for PCM)
        channels,                   # NumChannels
        sample_rate,                # SampleRate
        sample_rate * frame_bytes,  # ByteRate
        frame_bytes,                # BlockAlign
        bits_per_sample,            # BitsPerSample
        b"data",                    # Subchunk2ID
        payload_len,                # Subchunk2Size (size of audio data)
    )
    return struct.pack("<4sI4s4sIHHIIHH4sI", *riff_fields) + audio_data

def parse_audio_mime_type(mime_type: str) -> dict[str, int | None]:
    """
        Parse bits per sample and rate from an audio MIME type string
    """
    bits_per_sample = 16
    rate = 24000
    
    #  --- Extract rate from parameters ---
    parts = mime_type.split(";")
    for param in parts:
        param = param.strip()
        if param.lower().startswith("rate="):
            try:
                rate_str = param.split("=", 1)[1]
                rate = int(rate_str)
            except (ValueError, IndexError):
                pass
        elif param.startswith("audio/L"):
            try:
                bits_per_sample = int(param.split("L", 1)[1])
            except (ValueError, IndexError):
                pass
    
    return {"bits_per_sample": bits_per_sample, "rate": rate}

In [6]:
# Build the shared Gemini client used by the generation helpers below.
try:
    client = genai.Client(api_key=GEMINI_API_KEY)
except Exception as err:
    print(f"Failed to initialize Gemini client: {err}")
else:
    print("Gemini client initialized successfully.")

Gemini client initialized successfully.


In [7]:
def generate_single_speaker_audio(text, voice_name="Zephyr", output_file="single_speaker.wav"):
    """
        Generate Audio With Single Speaker

        Streams a text-to-speech response from the module-level ``client``,
        collects every audio chunk, and writes the combined audio to
        ``output_file``.

        Args:
            text: The text to be spoken.
            voice_name: Name of the prebuilt voice to use.
            output_file: Path the audio is written to.

        Returns:
            ``output_file`` when at least one audio chunk was received,
            otherwise ``None``.
    """
    # --- Get The Model ---
    model = "gemini-2.5-flash-preview-tts"
    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=text)
            ],
        ),
    ]
    # --- Here We Config the Model ---
    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name=voice_name
                )
            ),
        ),
    )
    # --- Handle Stream ---
    audio_chunks = []
    needs_wav_header = False
    header_mime_type = None

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # Skip chunks without a usable first part (None or empty containers).
        if not chunk.candidates:
            continue
        content = chunk.candidates[0].content
        if content is None or not content.parts:
            continue

        inline_data = content.parts[0].inline_data
        if inline_data and inline_data.data:
            chunk_mime_type = inline_data.mime_type or ""
            # A MIME type with no known file extension is treated as headerless
            # audio. Remember the first such type so ONE header can be written
            # for the whole stream instead of one per chunk.
            if mimetypes.guess_extension(chunk_mime_type) is None:
                needs_wav_header = True
                if header_mime_type is None:
                    header_mime_type = chunk_mime_type
            audio_chunks.append(inline_data.data)
        else:
            print(chunk.text)

    # --- Save The Audio File (only after the whole stream is consumed) ---
    if not audio_chunks:
        return None

    audio_data = b"".join(audio_chunks)
    if needs_wav_header:
        audio_data = convert_to_wav(audio_data, header_mime_type)
    save_binary_file(output_file, audio_data)
    return output_file
# --- Test it: synthesize one sentence with the "Leda" voice and play it back ---
test_text = "Hello , My Name is Amir Abdallah and this my Gemini's Text to Speech Native Audio Capabilities , We're exploring how this can used for Podcast generation ."
result, timing = timed(
    generate_single_speaker_audio,
    test_text,
    "Leda",
    "test_single_speaker.wav",
)
display(md(f"### Single Speaker Test - {timing}"))
# Show an audio player on success, otherwise a short failure note.
display(Audio(result) if result else md(" Failed to generate audio"))

File saved to: test_single_speaker.wav


### Single Speaker Test - 8377 ms

In [8]:
def generate_multi_speaker_audio(script, output_file="multi_speaker.wav"):
    """
        Generate audio with multiple speakers

        Streams a text-to-speech response from the module-level ``client``
        for a two-speaker script, collects every audio chunk, and writes the
        combined audio to ``output_file``. Lines labelled ``Speaker 1`` use
        the "Zephyr" voice and lines labelled ``Speaker 2`` use "Puck".

        Args:
            script: Conversation text using the ``Speaker 1`` / ``Speaker 2``
                labels configured below.
            output_file: Path the audio is written to.

        Returns:
            ``output_file`` when at least one audio chunk was received,
            otherwise ``None``.
    """
    model = "gemini-2.5-flash-preview-tts"

    contents = [
        types.Content(
            role="user",
            parts=[
                types.Part.from_text(text=script),
            ],
        ),
    ]

    generate_content_config = types.GenerateContentConfig(
        temperature=1,
        response_modalities=["audio"],
        speech_config=types.SpeechConfig(
            multi_speaker_voice_config=types.MultiSpeakerVoiceConfig(
                speaker_voice_configs=[
                    types.SpeakerVoiceConfig(
                        speaker="Speaker 1",
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                voice_name="Zephyr"
                            )
                        ),
                    ),
                    types.SpeakerVoiceConfig(
                        speaker="Speaker 2",
                        voice_config=types.VoiceConfig(
                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                voice_name="Puck"
                            )
                        ),
                    ),
                ]
            ),
        ),
    )

    audio_chunks = []
    needs_wav_header = False
    header_mime_type = None

    for chunk in client.models.generate_content_stream(
        model=model,
        contents=contents,
        config=generate_content_config,
    ):
        # Skip chunks without a usable first part (None or empty containers).
        if not chunk.candidates:
            continue
        content = chunk.candidates[0].content
        if content is None or not content.parts:
            continue

        inline_data = content.parts[0].inline_data
        if inline_data and inline_data.data:
            chunk_mime_type = inline_data.mime_type or ""
            # A MIME type with no known file extension is treated as headerless
            # audio. Wrapping each chunk separately would embed extra headers
            # in the middle of the samples and leave the first header's size
            # covering only the first chunk, so the header is added once below.
            if mimetypes.guess_extension(chunk_mime_type) is None:
                needs_wav_header = True
                if header_mime_type is None:
                    header_mime_type = chunk_mime_type
            audio_chunks.append(inline_data.data)
        else:
            print(chunk.text)

    # --- Save It ---
    if not audio_chunks:
        return None

    audio_data = b"".join(audio_chunks)
    if needs_wav_header:
        audio_data = convert_to_wav(audio_data, header_mime_type)
    save_binary_file(output_file, audio_data)
    return output_file

# Two-voice sample dialogue; the "Speaker 1"/"Speaker 2" labels match the
# speaker names configured in generate_multi_speaker_audio.
conversation_script = """
Speaker 1: Welcome to our podcast about AI and content creation! Today we're exploring how AI is transforming the way we create and consume content.

Speaker 2: That's right! It's fascinating how tools like large language models can now help us generate scripts, articles, and even full podcast episodes.

Speaker 1: Absolutely. And with text-to-speech technology, we can even generate the audio for these podcasts without ever recording a human voice.

Speaker 2: The possibilities are endless, but it's important to maintain quality and ensure the content remains engaging and authentic.
"""

result, timing = timed(
    generate_multi_speaker_audio,
    conversation_script,
    "test_multi_speaker.wav",
)

display(md(f"### Multi-Speaker Test - {timing}"))
# Show an audio player on success, otherwise a short failure note.
display(Audio(result) if result else md("Failed to generate audio"))

File saved to: test_multi_speaker.wav


### Multi-Speaker Test - 21604 ms