In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Narrate a Multi-character Story with Text-to-Speech and Gemini

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/speech/use-cases/storytelling/storytelling.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fspeech%2Fuse-cases%2Fstorytelling%2Fstorytelling.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Run in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/speech/use-cases/storytelling/storytelling.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/speech/use-cases/storytelling/storytelling.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>


---

* Author: Holt Skinner
* Created: Jan 2024

---

## Overview

This notebook demonstrates how to use the [Text-to-Speech API](https://cloud.google.com/text-to-speech) to read a story aloud, giving each character a distinct voice.

### Objective

This tutorial uses the following Google Cloud AI services and resources:

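- [Cloud Text-to-Speech API](https://cloud.google.com/text-to-speech/docs)
- [Vertex AI](https://cloud.google.com/vertex-ai) (Gemini, used to match each character to a voice)
- Cloud Storage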

The steps performed include:

- Parse the input story text in play script format. (`Character: Lines`)
- Assign each character to a voice.
- Synthesize each line based on character voice.
- Combine audio files into one MP3 file.

Planned expansions:

- Upload audio to Cloud Storage
- Read in story text using [Document AI OCR](https://cloud.google.com/document-ai)
- Convert story to play script format using Gemini.
  - Possibility: Use Gemini to generate [SSML](https://cloud.google.com/text-to-speech/docs/ssml) directly from book text.
- Create alternative implementation using LangChain.
- Add [Journey voices](https://cloud.google.com/text-to-speech/docs/voice-types#journey_voices) once more voices are supported.


### Costs

This tutorial uses billable components of Google Cloud:

* Text-to-Speech
* Vertex AI
* Cloud Storage

Learn about [Text-to-Speech pricing](https://cloud.google.com/text-to-speech/pricing),
[Vertex AI pricing](https://cloud.google.com/vertex-ai/pricing),
and [Cloud Storage pricing](https://cloud.google.com/storage/pricing),
then use the [Pricing Calculator](https://cloud.google.com/products/calculator/)
to generate a cost estimate based on your projected usage.

## Getting Started


### Install Vertex AI SDK, other packages and their dependencies

Install the following packages required to execute this notebook.

In [None]:
# Install the packages this notebook imports: the Vertex AI SDK, the Text-to-Speech
# client library, pydub, pandas and tqdm.
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
%pip install --user --upgrade -q google-cloud-aiplatform google-cloud-texttospeech pydub pandas tqdm

If you're running on a Mac, you will need to install [FFmpeg](https://ffmpeg.org/).

In [None]:
import platform

# platform.system() reports "Darwin" on macOS; FFmpeg is only installed in that case.
# NOTE(review): presumably required by pydub for MP3 handling — confirm.
if platform.system() == "Darwin":
    # Install using Homebrew
    !brew install ffmpeg

### Run the following cell to restart the kernel.

In [None]:
# Automatically restart kernel after installs so that your environment can access the new packages
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>

Set the project and regions.

* Text-to-Speech is only available in certain regions; see the [endpoint documentation](https://cloud.google.com/text-to-speech/docs/endpoints) before changing `TTS_LOCATION`.

In [None]:
# Google Cloud project used by the gcloud configuration and by vertexai.init().
PROJECT_ID = "YOUR_PROJECT_ID"  # @param {type:"string"}

# Prefix of the Text-to-Speech endpoint: "<TTS_LOCATION>-texttospeech.googleapis.com".
TTS_LOCATION = "us"  # @param {type:"string"}
# Location passed to vertexai.init().
VERTEXAI_LOCATION = "us-central1"  # @param {type:"string"}

### Authenticating your notebook environment

* If you are using **Colab** to run this notebook, run the cell below and continue.
* If you are using **Vertex AI Workbench**, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env).

In [None]:
import sys

# Additional authentication is required for Google Colab.
# The check looks for the already-loaded "google.colab" module, so the branch
# is skipped in other environments.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    from google.colab import auth

    auth.authenticate_user()

    # Point gcloud at the configured project, then run the
    # application-default login so client libraries can find credentials.
    ! gcloud config set project {PROJECT_ID}
    ! gcloud auth application-default login -q

Initialize the [Vertex AI SDK](https://cloud.google.com/vertex-ai/docs/python-sdk/use-vertex-ai-python-sdk)

In [None]:
import vertexai

# Initialize Vertex AI with the project and location configured earlier in the notebook
vertexai.init(project=PROJECT_ID, location=VERTEXAI_LOCATION)

### Download source texts from Google Cloud Storage

This public bucket contains some stories generated by PaLM.

In [None]:
# Copy every .txt file under the bucket's storytelling folder into the current working directory.
! gsutil cp gs://github-repo/speech/storytelling/*.txt .

### Import libraries

In [None]:
from IPython.display import Audio

import json
import os
from pathlib import Path
from typing import Dict, List, Tuple

from pydub import AudioSegment
from tqdm import tqdm
import pandas as pd

from google.api_core.client_options import ClientOptions
from google.cloud import texttospeech_v1beta1 as texttospeech
from vertexai.preview.generative_models import GenerativeModel, GenerationConfig

### Define constants

In [None]:
# Language code used when a caller does not pass one explicitly.
DEFAULT_LANGUAGE = "en"
# Voice used for narration, scene details, etc.
# list_voices() excludes it so it is not offered for assignment to a character.
DEFAULT_VOICE = "en-GB-Neural2-B"

# Text-to-Speech client bound to the endpoint derived from TTS_LOCATION.
tts_client = texttospeech.TextToSpeechClient(
    client_options=ClientOptions(
        api_endpoint=f"{TTS_LOCATION}-texttospeech.googleapis.com"
    )
)
# Gemini model used by create_character_map() to match characters to voices.
model = GenerativeModel("gemini-pro")

# Length of the pause inserted around each line in the combined audio.
SILENCE_LENGTH = 200  # In Milliseconds
# NOTE(review): not referenced anywhere else in the visible notebook.
TXT_EXTENSION = ".txt"

### Helper functions

In [None]:
def list_voices(
    language_code: str = DEFAULT_LANGUAGE, voice_type: str = "Neural2"
) -> List[Dict]:
    """List the Text-to-Speech voices of one type that can be given to characters.

    Args:
        language_code: Language filter forwarded to the Text-to-Speech
            ``list_voices`` call.
        voice_type: Substring that must appear in a voice's name for it to be
            included (for example ``"Neural2"``).

    Returns:
        One dict per matching voice with the keys ``"name"`` (the voice name)
        and ``"gender"`` (the lower-cased SSML gender name). ``DEFAULT_VOICE``
        is never included.
    """
    available = tts_client.list_voices(language_code=language_code).voices

    selected = []
    for candidate in available:
        # Skip voices of another type, and the voice reserved as the default.
        if voice_type not in candidate.name or candidate.name == DEFAULT_VOICE:
            continue

        gender_name = texttospeech.SsmlVoiceGender(candidate.ssml_gender).name
        selected.append({"name": candidate.name, "gender": gender_name.lower()})

    return selected


def create_character_map(
    characters: List[str], voices: List[Dict[str, str]]
) -> Dict[str, str]:
    """Ask Gemini to assign a Text-to-Speech voice to every character.

    Args:
        characters: Character names to assign voices to.
        voices: Available voices, e.g. the ``{"name": ..., "gender": ...}``
            dicts returned by ``list_voices``.

    Returns:
        The JSON object produced by the model, decoded into a dict that maps
        character names to voice names.

    Raises:
        ValueError: If the model reply contains no JSON object, or the object
            cannot be decoded (``json.JSONDecodeError`` is a ``ValueError``).
    """
    # The prompt promises JSON lists, so serialize as JSON rather than relying
    # on Python's repr (single quotes are not valid JSON). `default=str` keeps
    # any value that repr could render usable here as well.
    voices_json = json.dumps(voices, ensure_ascii=False, default=str)
    characters_json = json.dumps(characters, ensure_ascii=False, default=str)

    prompt = f"""Your job is to uniquely and appropriately match character names to voices available with Google Cloud Text to Speech.

The following is a list of available voices for Google Cloud Text to Speech in a JSON list.

{voices_json}

The following is a list of character names in a JSON list:

{characters_json}

Output a JSON formatted object mapping Character Names to Voice Names:
"""

    # Request the complete reply in one response: parsing only the first chunk
    # of a streamed reply yields truncated JSON for anything but short outputs.
    response = model.generate_content(
        prompt,
        generation_config=GenerationConfig(
            max_output_tokens=2048, temperature=0.9, top_p=1
        ),
        safety_settings=[],
        stream=False,
    )
    reply = response.text

    # The model may wrap the object in a Markdown code fence (```json ... ```).
    # Slice out the outermost braces instead of deleting every "`" and "json"
    # substring, which would also corrupt names that contain those characters.
    start, end = reply.find("{"), reply.rfind("}")
    if start == -1 or end < start:
        raise ValueError(f"Model reply does not contain a JSON object: {reply!r}")

    return json.loads(reply[start : end + 1])


def synthesize_text(
    text: str, output: str, voice_name: str, language_code: str = DEFAULT_LANGUAGE
):
    """Synthesize ``text`` with one voice and save the result as an MP3 file.

    Args:
        text: Plain text to speak.
        output: Path of the file the audio is written to.
        voice_name: Name of the Text-to-Speech voice to use.
        language_code: Language code sent alongside the voice name.
    """
    synthesis_input = texttospeech.SynthesisInput(text=text)
    voice_params = texttospeech.VoiceSelectionParams(
        language_code=language_code,
        name=voice_name,
    )
    mp3_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    speech = tts_client.synthesize_speech(
        input=synthesis_input,
        voice=voice_params,
        audio_config=mp3_config,
    )

    # audio_content is binary, so the file is opened in binary mode.
    with open(output, "wb") as audio_file:
        audio_file.write(speech.audio_content)


def combine_audio_files(audio_files: List[str], filename: str) -> str:
    """Concatenate MP3 files into one MP3, separated by short pauses.

    The combined audio starts with ``SILENCE_LENGTH`` milliseconds of silence
    and the same pause follows every input file. The input files are deleted,
    but only after the combined file has been exported successfully, so a
    failed export does not lose the synthesized parts.

    Args:
        audio_files: Paths of the MP3 files to join, in playback order.
        filename: Name of the source story; its stem names the output file.

    Returns:
        The name of the combined file, ``"<stem of filename>-complete.mp3"``,
        written to the current working directory.
    """
    # The pause never changes, so build it once instead of once per file.
    pause = AudioSegment.silent(duration=SILENCE_LENGTH)

    full_audio = pause
    for file in audio_files:
        full_audio += AudioSegment.from_mp3(file) + pause

    outfile_name = f"{Path(filename).stem}-complete.mp3"
    full_audio.export(outfile_name, format="mp3")

    # Clean up the per-line files only once the combined file exists.
    for file in audio_files:
        os.remove(file)

    return outfile_name


def get_characters(input_file: str) -> List[str]:
    """Read the character names listed in a story's ``Characters:`` section.

    The section starts at a line consisting of ``Characters:`` (surrounding
    whitespace is ignored), optionally followed by one blank line, and ends
    at the next blank or whitespace-only line.

    Args:
        input_file: Path of the UTF-8 encoded story file.

    Returns:
        The character names in file order, stripped of surrounding whitespace.

    Raises:
        ValueError: If the file has no ``Characters:`` line.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default encoding.
    with open(input_file, "r", encoding="utf-8") as f:
        lines = [line.strip() for line in f]

    try:
        header_index = lines.index("Characters:")
    except ValueError:
        raise ValueError(
            f'No "Characters:" section found in {input_file}'
        ) from None

    # Skip the blank separator after the header when there is one; without it
    # the first name directly follows the header and must not be dropped.
    start = header_index + 1
    if start < len(lines) and not lines[start]:
        start += 1

    character_list = []
    for line in lines[start:]:
        if not line:  # A blank (or whitespace-only) line ends the section
            break
        character_list.append(line)
    return character_list


def _resolve_voice(voice) -> Tuple[str, str]:
    """Normalize a voice entry into a ``(voice_name, language_code)`` pair."""
    if isinstance(voice, str):
        # A bare voice name such as DEFAULT_VOICE ("en-GB-Neural2-B"): its
        # first two hyphen-separated parts form the language code ("en-GB").
        parts = voice.split("-")
        language_code = "-".join(parts[:2]) if len(parts) > 2 else DEFAULT_LANGUAGE
        return voice, language_code

    voice_name, language_code = voice
    return voice_name, language_code


def parse_file(
    input_file: str, character_to_voice: Dict[str, Tuple[str, str]]
) -> List[str]:
    """Synthesize every non-blank line of a play-script file to its own MP3.

    Lines have the form ``Character: dialogue``. The text before the first
    ``": "`` selects the voice; lines without it, and characters missing from
    ``character_to_voice``, are spoken with ``DEFAULT_VOICE``. Lines whose
    prefix contains ``Scene`` are read in full as ``"<prefix>, <rest>"``.

    Args:
        input_file: Path of the UTF-8 encoded script file.
        character_to_voice: Maps a character name to its voice. A value may be
            a ``(voice_name, language_code)`` pair or a plain voice name
            string, in which case the language code is taken from the name.

    Returns:
        The names of the generated files, ``"<input stem>-<n>.mp3"``, in the
        order the lines appear in the script.
    """
    with open(input_file, "r", encoding="utf-8") as f:
        lines = f.readlines()

    output_files = []
    filename = Path(input_file).stem

    for line in tqdm(lines, "Parsing input file"):
        split_line = line.strip().split(": ", 1)

        character = split_line[0]
        if not character:  # Skip blank lines
            continue

        # Indexing the entry directly would take single characters out of a
        # voice name string ("e", "n"), so normalize it to a pair first.
        voice_name, language_code = _resolve_voice(
            character_to_voice.get(character, DEFAULT_VOICE)
        )

        if len(split_line) <= 1:
            dialogue = split_line[0]
        elif "Scene" in split_line[0]:
            dialogue = f"{split_line[0]}, {split_line[1]}"
        else:
            dialogue = split_line[1]

        # Files are numbered by spoken line, starting at 1.
        output_file = f"{filename}-{len(output_files) + 1}.mp3"
        output_files.append(output_file)
        synthesize_text(dialogue, output_file, voice_name, language_code)

    return output_files

## Call the Text-to-Speech API with script content

### Get available voices

In [None]:
all_voices = list_voices()
# Leave the DataFrame as the cell's last expression so the notebook renders it
# as a table instead of printing its plain-text form.
pd.DataFrame(all_voices)

### List all characters

In [None]:
# Story file to narrate.
input_file = "Macbeth.txt"  # @param {type:"string"}

# Names read from the "Characters:" section of the story file.
character_list = get_characters(input_file)

# Only a warning is printed here: with more characters than available voices
# the notebook continues, but the voices cannot all be distinct.
if len(character_list) > len(all_voices):
    print(f"Too many characters {len(character_list)}. Max {len(all_voices)}")

### Map Characters to Voices

In [None]:
character_to_voice = create_character_map(character_list, all_voices)

# pd.DataFrame rejects a dict whose values are all scalars ("If using all scalar
# values, you must pass an index"), so build one row per character instead.
# The frame is the cell's last expression, so the notebook renders it as a table.
pd.DataFrame(list(character_to_voice.items()), columns=["character", "voice"])

### Parse input text and output each line as audio

In [None]:
# Synthesize one MP3 per non-blank line of the story; output_files keeps their names in order.
output_files = parse_file(input_file, character_to_voice)

### Combine audio files into a single file


In [None]:
# Join the per-line MP3 files into one file named after the input story.
# combine_audio_files also deletes the per-line files.
outfile_name = combine_audio_files(output_files, input_file)
print(f"Audio content written to file {outfile_name}")

### Listen to the audio

In [None]:
# Display the combined file with IPython's Audio object as the cell output.
Audio(outfile_name)