In [1]:
# Step 1: Install the Google Gemini AI SDK
!pip install -q google-generativeai

In [2]:
# API key: keep a key that is already present in the environment; otherwise
# ask for it interactively so it never has to be written into the notebook.
import os
from getpass import getpass

if not os.environ.get("GOOGLE_API_KEY"):
    # getpass does not echo the typed value into the cell output
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")

In [3]:
from google import genai
from google.genai import types

# Shared client used by every generate_content call below. It is created
# without arguments — presumably it picks up the GOOGLE_API_KEY environment
# variable set in the previous cell (confirm against the SDK docs).
client = genai.Client()

# Speech Generation

In [11]:
"""
1. text to audio
2. config: GenerateContentConfig
3. response modality: audio
4. GenerateContentConfig: SpeechConfig
5. SpeechConfig: VoiceConfig (for generating output in single voice)
6. VoiceConfig: PrebuiltVoiceConfig
7. PrebuiltVoiceConfig: voiceName
"""

# Build the speech configuration from the innermost object outwards, then
# send a single-voice text-to-speech request.
prebuilt_voice_1 = types.PrebuiltVoiceConfig(voice_name="Achernar")
voice_config_1 = types.VoiceConfig(prebuilt_voice_config=prebuilt_voice_1)
speech_config_1 = types.SpeechConfig(voice_config=voice_config_1)
tts_config_1 = types.GenerateContentConfig(
    response_modalities=['AUDIO'],
    speech_config=speech_config_1,
)

response_1 = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say cheerfully: Hey how're you doing ?",
    config=tts_config_1,
)

In [12]:
# Display response_1 as the cell output to inspect its structure (candidates,
# finish_reason, usage_metadata). The repr elides the inline audio payload
# and its mime type as "<... Max depth ...>".
response_1

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            inline_data=Blob(
              data=<... Max depth ...>,
              mime_type=<... Max depth ...>
            )
          ),
        ],
        role='model'
      ),
      finish_reason=<FinishReason.STOP: 'STOP'>,
      index=0
    ),
  ],
  model_version='gemini-2.5-flash-preview-tts',
  response_id='FIjmaP2jK9aaqtsPhfq82As',
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    candidates_token_count=44,
    candidates_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.AUDIO: 'AUDIO'>,
        token_count=44
      ),
    ],
    prompt_token_count=11,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=11
      ),
    ],
    total_token_count=55
  )
)

In [17]:
# Store the binary audio data from the response in data_1:
# first candidate -> first content part -> inline_data.data.
# NOTE(review): there is no guard for a response without candidates or parts;
# this line raises in that case — confirm that is acceptable here.
data_1 = response_1.candidates[0].content.parts[0].inline_data.data

In [18]:
# Converting the binary audio data into a (.wav) file

import wave

def wave_file(filename, input, channels=1, rate=24000, sample_width=2):
  """Write raw audio frames to `filename` as a WAV file.

  Args:
    filename: Target passed straight to `wave.open(..., 'wb')`.
    input: Raw audio frame bytes to write. The name shadows the `input`
      builtin inside this function; it is kept so existing callers that
      pass it by keyword keep working.
    channels: Number of channels: 1 (mono) or 2 (stereo). Defaults to 1.
    rate: Frame rate in hertz. Defaults to 24000.
    sample_width: Bytes per sample: 1 = 8 bits, 2 = 16 bits. Defaults to 2.
  """
  with wave.open(filename, 'wb') as wav_out:
    # The header fields must be set before any frames are written.
    wav_out.setnchannels(channels)
    wav_out.setsampwidth(sample_width)
    wav_out.setframerate(rate)
    wav_out.writeframes(input)

In [24]:
# Call wave_file to write data_1 to Speech_Out_1.wav; the relative name is
# resolved against the kernel's current working directory.
filename_1='Speech_Out_1.wav'
wave_file(filename_1, data_1)

In [25]:
# Play the audio inline. Reuse `filename_1` so the player opens the file at
# the same relative path it was written to, instead of assuming the working
# directory is /content.
from IPython.display import Audio
Audio(filename=filename_1)

In [26]:
# Second single-voice request: same structure as before, with a different
# prebuilt voice and a different tone instruction in the text.
prebuilt_voice_2 = types.PrebuiltVoiceConfig(voice_name="Alnilam")
speech_config_2 = types.SpeechConfig(
    voice_config=types.VoiceConfig(prebuilt_voice_config=prebuilt_voice_2),
)
tts_config_2 = types.GenerateContentConfig(
    response_modalities=['AUDIO'],
    speech_config=speech_config_2,
)

response_2 = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents="Say shocking: What happened ?",
    config=tts_config_2,
)

In [27]:
# Display response_2 as the cell output to inspect its structure; the repr
# elides the inline audio payload as "<... Max depth ...>".
response_2

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            inline_data=Blob(
              data=<... Max depth ...>,
              mime_type=<... Max depth ...>
            )
          ),
        ],
        role='model'
      ),
      finish_reason=<FinishReason.STOP: 'STOP'>,
      index=0
    ),
  ],
  model_version='gemini-2.5-flash-preview-tts',
  response_id='dYnmaM7rOajYqtsPie6RmQs',
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    candidates_token_count=35,
    candidates_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.AUDIO: 'AUDIO'>,
        token_count=35
      ),
    ],
    prompt_token_count=7,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=7
      ),
    ],
    total_token_count=42
  )
)

In [28]:
# Store the binary audio data from the response in data_2:
# first candidate -> first content part -> inline_data.data.
# NOTE(review): raises if the response has no candidates or parts — confirm
# that is acceptable here.
data_2 = response_2.candidates[0].content.parts[0].inline_data.data

In [29]:
# Call wave_file to write data_2 to Speech_Out_2.wav (relative to the
# kernel's current working directory).
filename_2='Speech_Out_2.wav'
wave_file(filename_2, data_2)

In [30]:
# Play the audio inline. Pass the path through the explicit `filename=`
# keyword (the first positional parameter of Audio is `data`) and reuse
# `filename_2` rather than a hard-coded /content path.
from IPython.display import Audio
Audio(filename=filename_2)

In [31]:
# Multi Tone Paragraph: several "Say <tone>: <sentence>" instructions sent in
# a single request and rendered with one prebuilt voice.
# Spelling matters here because the text is what the model reads aloud and
# the word after "Say" is the tone instruction.
content = [
    "Say cheerfully: Good Morning Professor",
    "Say excitedly: Tomorrow we'll go for a vacation",
    "Say angrily: How dare you to touch my laptop",
    "Say encouragingly: You can do it, i know you're very smart",
    "Say sadly: I wish, we could meet"
]

response_3 = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents=content,
    config=types.GenerateContentConfig(
        response_modalities=['AUDIO'],
        speech_config=types.SpeechConfig(
            voice_config=types.VoiceConfig(
                prebuilt_voice_config=types.PrebuiltVoiceConfig(
                    voice_name="Puck"
                )
            )
        )
    )
)

In [32]:
# Display response_3 as the cell output to inspect its structure; the repr
# elides the inline audio payload as "<... Max depth ...>".
response_3

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            inline_data=Blob(
              data=<... Max depth ...>,
              mime_type=<... Max depth ...>
            )
          ),
        ],
        role='model'
      ),
      finish_reason=<FinishReason.STOP: 'STOP'>,
      index=0
    ),
  ],
  model_version='gemini-2.5-flash-preview-tts',
  response_id='jYnmaMDILqbAqtsPhp-OmQc',
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    candidates_token_count=364,
    candidates_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.AUDIO: 'AUDIO'>,
        token_count=364
      ),
    ],
    prompt_token_count=56,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=56
      ),
    ],
    total_token_count=420
  )
)

In [33]:
# Store the binary audio data from the response in data_3:
# first candidate -> first content part -> inline_data.data.
# NOTE(review): raises if the response has no candidates or parts — confirm
# that is acceptable here.
data_3 = response_3.candidates[0].content.parts[0].inline_data.data

In [34]:
# Call wave_file to write data_3 to Speech_Out_3.wav (relative to the
# kernel's current working directory).
filename_3='Speech_Out_3.wav'
wave_file(filename_3, data_3)

In [35]:
# Play the audio inline. Pass the path through the explicit `filename=`
# keyword (the first positional parameter of Audio is `data`) and reuse
# `filename_3` rather than a hard-coded /content path.
from IPython.display import Audio
Audio(filename=filename_3)

In [73]:
# Multiple Voice (like two people talk with each other)

"""
1. text to audio
2. config: GenerateContentConfig
3. response modality: audio
4. GenerateContentConfig: SpeechConfig
5. SpeechConfig: MultiSpeakerVoiceConfig (for generating output in multiple voice)
6. MultiSpeakerVoiceConfig: SpeakerVoiceConfigs (speaker)
7. SpeakerVoiceConfigs: VoiceConfig
8. VoiceConfig: PrebuiltVoiceConfig
9. PrebuiltVoiceConfig: voiceName
"""

# Dialogue script: each line is wrapped in a <speaker name="..."> tag whose
# name matches one of the speakers configured below.
prompt = """
<speaker name="Robert">Hey Elizabeth, can you believe we’re finally in college?</speaker>
<speaker name="Elizabeth">I know, right? It feels so strange but exciting at the same time.</speaker>
<speaker name="Robert">Yeah, I was so nervous this morning. I even forgot my ID card.</speaker>
<speaker name="Elizabeth">(laughs) Classic first-day moment. Don’t worry, I almost got lost finding my class!</speaker>
"""

# (speaker, prebuilt voice name) pairs, in the order they are sent to the API.
speaker_voice_names = (
    ("Robert", "Puck"),
    ("Elizabeth", "Kore"),
)

# One SpeakerVoiceConfig per speaker, each wrapping its prebuilt voice.
speaker_voice_configs_4 = [
    types.SpeakerVoiceConfig(
        speaker=speaker_name,
        voice_config=types.VoiceConfig(
            prebuilt_voice_config=types.PrebuiltVoiceConfig(voice_name=voice_name),
        ),
    )
    for speaker_name, voice_name in speaker_voice_names
]

multi_speaker_config_4 = types.MultiSpeakerVoiceConfig(
    speaker_voice_configs=speaker_voice_configs_4,
)

response_4 = client.models.generate_content(
    model="gemini-2.5-flash-preview-tts",
    contents=prompt,
    config=types.GenerateContentConfig(
        response_modalities=["AUDIO"],
        speech_config=types.SpeechConfig(
            multi_speaker_voice_config=multi_speaker_config_4,
        ),
    ),
)


In [74]:
# Display response_4 as the cell output to inspect its structure; the repr
# elides the inline audio payload as "<... Max depth ...>".
response_4

GenerateContentResponse(
  automatic_function_calling_history=[],
  candidates=[
    Candidate(
      content=Content(
        parts=[
          Part(
            inline_data=Blob(
              data=<... Max depth ...>,
              mime_type=<... Max depth ...>
            )
          ),
        ],
        role='model'
      ),
      finish_reason=<FinishReason.STOP: 'STOP'>,
      index=0
    ),
  ],
  model_version='gemini-2.5-flash-preview-tts',
  response_id='_Y3maNz3H_DZqtsPy4f-yAI',
  sdk_http_response=HttpResponse(
    headers=<dict len=10>
  ),
  usage_metadata=GenerateContentResponseUsageMetadata(
    candidates_token_count=497,
    candidates_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.AUDIO: 'AUDIO'>,
        token_count=497
      ),
    ],
    prompt_token_count=104,
    prompt_tokens_details=[
      ModalityTokenCount(
        modality=<MediaModality.TEXT: 'TEXT'>,
        token_count=104
      ),
    ],
    total_token_count=601
  )
)

In [75]:
# Store the binary audio data from the response in data_4:
# first candidate -> first content part -> inline_data.data.
# NOTE(review): raises if the response has no candidates or parts — confirm
# that is acceptable here.
data_4 = response_4.candidates[0].content.parts[0].inline_data.data

In [76]:
# Call wave_file to write data_4 to Speech_Out_4.wav (relative to the
# kernel's current working directory).
filename_4='Speech_Out_4.wav'
wave_file(filename_4, data_4)

In [77]:
# Play the audio inline. Pass the path through the explicit `filename=`
# keyword (the first positional parameter of Audio is `data`) and reuse
# `filename_4` rather than a hard-coded /content path.
from IPython.display import Audio
Audio(filename=filename_4)