# MailCast: Email-to-Audio Generation

In [1]:
import os
import base64
import google
from google.oauth2.credentials import Credentials
from google_auth_oauthlib.flow import InstalledAppFlow
from googleapiclient.discovery import build
from google.auth.transport.requests import Request
from email.mime.text import MIMEText
import requests
from google.cloud import texttospeech
from dotenv import load_dotenv
from Keys import Gemini_api, OpenAi_api

# Part 1: User Authentication

In [2]:
# Function to authenticate Gmail
def authenticate_gmail():
    """Authenticate the user and return a Gmail API service object.

    A cached token is read from ``token.json`` when that file exists. If the
    cached credentials are missing or invalid they are refreshed, or, when a
    refresh is not possible or fails, obtained again through the local OAuth
    consent flow using ``Auth_creds.json``. Whenever the credentials were
    refreshed or re-created they are written back to ``token.json``.

    Returns:
        The object returned by ``build('gmail', 'v1', credentials=creds)``.
    """
    # Imported locally so the block does not depend on a new file-level import.
    from google.auth.exceptions import RefreshError

    SCOPES = ['https://www.googleapis.com/auth/gmail.readonly']  # Gmail read-only
    creds = None
    # Reuse the cached token if it already exists.
    if os.path.exists('token.json'):
        creds = Credentials.from_authorized_user_file('token.json', SCOPES)
    # The token is missing, expired or otherwise invalid.
    if not creds or not creds.valid:
        refreshed = False
        if creds and creds.expired and creds.refresh_token:
            try:
                creds.refresh(Request())  # refresh the token
                refreshed = True
            except RefreshError as error:
                # A revoked/expired refresh token cannot be repaired by
                # retrying; fall back to the interactive flow instead of
                # crashing.
                print(f"Token refresh failed ({error}); re-running authorization.")
        if not refreshed:
            # Get a new token with the client-secrets file obtained from the
            # Google Cloud console.
            flow = InstalledAppFlow.from_client_secrets_file('Auth_creds.json', SCOPES)
            creds = flow.run_local_server(port=0)
        with open('token.json', 'w', encoding='utf-8') as token:
            token.write(creds.to_json())
    return build('gmail', 'v1', credentials=creds)

# Part 2: Data Preprocessing
### Loading and cleaning emails

In [3]:
from bs4 import BeautifulSoup
from email.utils import parseaddr
import base64
import re 

def _header_value(headers, name, default=''):
    """Return the value of the first header called *name* (case-insensitive)."""
    wanted = name.lower()
    return next(
        (
            header.get('value', default)
            for header in headers
            if header.get('name', '').lower() == wanted
        ),
        default,
    )


def _decode_body_data(part):
    """Decode the base64url ``body.data`` of a message part, or return ''."""
    data = part.get('body', {}).get('data')
    if not data:
        # Parts such as attachments carry no inline data.
        return ''
    # Restore any padding that was stripped so the decoder does not fail.
    padded = data + '=' * (-len(data) % 4)
    return base64.urlsafe_b64decode(padded).decode('utf-8', errors='ignore')


def _html_to_text(html_data):
    """Extract readable text from HTML, dropping lines that look like footers."""
    footer_markers = ('Update your email preferences', 'Unsubscribe', '©')
    soup = BeautifulSoup(html_data, 'html.parser')
    relevant_text = []
    # NOTE: nested matches (e.g. a <p> inside a <div>) are each collected, so
    # the same text can appear more than once in the result.
    for element in soup.find_all(['div', 'p', 'h1', 'h2', 'h3', 'li'], recursive=True):
        text = element.get_text(strip=True)
        if text and not any(marker in text for marker in footer_markers):
            relevant_text.append(text)
    return '\n'.join(relevant_text)


def _collect_text_parts(part, plain_text, html_text):
    """Walk a (possibly nested) message part and gather its text content."""
    sub_parts = part.get('parts')
    if sub_parts:
        # Multipart container: the text lives in its children.
        for sub_part in sub_parts:
            _collect_text_parts(sub_part, plain_text, html_text)
        return

    mime_type = part.get('mimeType')
    if mime_type == 'text/plain':
        text = _decode_body_data(part).strip()
        if text:
            plain_text.append(text)
    elif mime_type == 'text/html':
        html_data = _decode_body_data(part)
        if html_data:
            html_text.append(_html_to_text(html_data))


def _clean_body(body):
    """Strip links and markup remnants and collapse excess blank lines."""
    # 1. Remove bracketed links first. If bare URLs were removed first, the
    #    closing "]" / ">" would be swallowed and a stray "[" / "<" left behind.
    body = re.sub(r'\[https?://[^\]\s]*\]', '', body)  # markdown-style links

    # 2. Remove any text within angle brackets (like unsubscribe links)
    body = re.sub(r'<.*?>', '', body)

    # 3. Remove the remaining bare URLs
    body = re.sub(r'https?://\S+', '', body)

    # 4. Collapse multiple newlines into two, for readability
    return re.sub(r'\n{3,}', '\n\n', body).strip()


def get_email_content(service, max_results=1):
    """Fetch the latest emails and return their subject, sender and cleaned body.

    Args:
        service: Object exposing ``users().messages().list(...)`` and
            ``users().messages().get(...)`` (presumably the Gmail API service
            returned by ``authenticate_gmail``).
        max_results: Maximum number of messages requested from the API.

    Returns:
        A list of dicts with the keys ``'subject'``, ``'sender'`` and
        ``'body'``. Messages whose body is empty after cleaning are skipped.
    """
    # get the latest emails
    results = service.users().messages().list(userId='me', maxResults=max_results).execute()
    messages = results.get('messages', [])
    emails = []

    for message in messages:
        msg = service.users().messages().get(userId='me', id=message['id'], format='full').execute()
        payload = msg.get('payload', {})
        headers = payload.get('headers', [])

        # A missing Subject/From header must not abort the whole run.
        subject = _header_value(headers, 'Subject', '(no subject)')
        full_sender = _header_value(headers, 'From', '')
        sender_name, sender_email = parseaddr(full_sender)
        sender = sender_name or sender_email or 'Unknown sender'

        # Prefer the plain-text parts and fall back to text extracted from HTML.
        plain_text = []
        html_text = []
        _collect_text_parts(payload, plain_text, html_text)

        body = '\n'.join(plain_text) if plain_text else '\n'.join(html_text)
        if not body or "You are reading a plain text version" in body:
            body = '\n'.join(plain_text + html_text)
        if not body and 'parts' not in payload:
            # Single-part message of another type (or HTML that yielded no
            # text): decode the raw body as a last resort.
            body = _decode_body_data(payload).strip()

        if body:
            body = _clean_body(body)

        if body:
            emails.append({'subject': subject, 'sender': sender, 'body': body})

    return emails

# Part 3: Script generation

###### This part passes the email body to an LLM to get the script used to generate the audio file.
###### The script is generated in two parts.
###### First, it reads the whole email verbatim.
###### Second, it generates a concise summary of the email; both parts are combined to form the script.

In [4]:
import re
import google.generativeai as genai

def generate_podcast_script(email_content, api_key=None):
    """Ask the Gemini model for a two-part podcast script about one email.

    Args:
        email_content: Mapping with the keys ``'sender'``, ``'subject'`` and
            ``'body'``.
        api_key: Gemini API key; when falsy, ``Gemini_api`` is used instead.

    Returns:
        The stripped response text, or a fixed host apology line when the
        response contains no parts.
    """
    genai.configure(api_key=api_key if api_key else Gemini_api)

    email_body = email_content['body']

    prompt = f"""
    You are a podcast host for "Inbox Insights." Your task is to create a script from an email.
    The sender is "{email_content['sender']}" and the subject is "{email_content['subject']}".

    Your script MUST follow this exact two-part structure:

    **Part 1: The Verbatim Reading**
    1.  Start with a host introduction, mentioning the sender and subject.
    2.  Use a transition phrase like, "And now, here is the full text of the email."
    3.  After the transition, you MUST reproduce the email content below **exactly as it is written**. Insert the text directly into the host's speech. Do NOT use stage directions like (Host reads email).

    **Part 2: The Summary**
    1.  After reproducing the full email text, use a transition phrase like, "And that was the email. Now for a quick summary."
    2.  Provide a concise summary of the email's key points.
    3.  Conclude the episode.

    **Formatting:** The entire output must be dialogue for the host, starting with "**Host:**".

    --- EMAIL CONTENT TO REPRODUCE VERBATIM ---
    {email_body}
    --- END OF EMAIL CONTENT ---
    """

    gemini = genai.GenerativeModel('gemini-1.5-flash')

    # Every harm category gets the same permissive threshold, to reduce the
    # chance of the model refusing to process the email content.
    harm_categories = (
        "HARM_CATEGORY_HARASSMENT",
        "HARM_CATEGORY_HATE_SPEECH",
        "HARM_CATEGORY_SEXUALLY_EXPLICIT",
        "HARM_CATEGORY_DANGEROUS_CONTENT",
    )
    relaxed_filters = [
        {"category": category, "threshold": "BLOCK_NONE"}
        for category in harm_categories
    ]
    reply = gemini.generate_content(prompt, safety_settings=relaxed_filters)

    if response_parts := reply.parts:
        # The model produced content: hand back the trimmed script.
        del response_parts
        return reply.text.strip()

    return "**Host:** I'm sorry, but the content of this email could not be processed for the podcast. This may be due to safety filters or other processing issues."

# Part 4: Main text to speech generation
###### This part of the code generates the audio file from the script produced by the LLM.
###### The model used to generate the audio is OpenAI TTS, which has a limit of 4096 characters per request.
###### So if the script is longer than that, a single request cannot generate the audio.
###### To address this, the input is divided into chunks of at most 4000 characters, and each chunk is used to generate audio.
###### Finally, the audio of all chunks is combined into one single file.

In [5]:
import os
from openai import OpenAI
from pathlib import Path
import re
from pydub import AudioSegment 
import io 

def _extract_dialogue(script):
    """Return the spoken lines of *script*, one per line.

    Speaker labels such as ``**Host:**`` are removed; blank lines and lines
    starting with ``(`` (stage directions) are dropped.
    """
    dialogue_lines = []
    for line in (script or '').split('\n'):
        cleaned_line = re.sub(r'^\*\*\w+:\*\*\s*', '', line).strip()
        if cleaned_line and not cleaned_line.startswith('('):
            dialogue_lines.append(cleaned_line)
    return '\n'.join(dialogue_lines)


def _split_long_line(line, max_len):
    """Split *line* into pieces of at most *max_len* characters.

    Cuts are made after the last sentence end inside the window when there is
    one, otherwise at the last space, otherwise at *max_len* exactly.
    """
    pieces = []
    while len(line) > max_len:
        window = line[:max_len]
        cut = max(window.rfind('. '), window.rfind('! '), window.rfind('? '))
        if cut != -1:
            cut += 1  # keep the punctuation with the sentence it ends
        else:
            cut = window.rfind(' ')
        if cut <= 0:
            cut = max_len  # no usable boundary: hard split
        pieces.append(line[:cut].strip())
        line = line[cut:].strip()
    pieces.append(line)
    return [piece for piece in pieces if piece]


def _chunk_text(text, char_limit):
    """Group the lines of *text* into chunks no longer than *char_limit*."""
    chunks = []
    current_chunk = ""
    for line in text.split('\n'):
        # Reserve one character for the newline appended to every piece.
        for piece in _split_long_line(line, char_limit - 1):
            if current_chunk and len(current_chunk) + len(piece) + 1 > char_limit:
                chunks.append(current_chunk)
                current_chunk = ""
            current_chunk += piece + '\n'
    if current_chunk.strip():
        chunks.append(current_chunk)
    return chunks


def text_to_speech(script, output_file="output/podcast.mp3", api_key=None):
    """Convert a podcast script to a single MP3 file using OpenAI TTS.

    The script is reduced to its dialogue lines, split into chunks below the
    per-request character limit, synthesised chunk by chunk and concatenated.

    Args:
        script: Script text; speaker labels like ``**Host:**`` are stripped.
        output_file: Path of the MP3 file to write.
        api_key: OpenAI API key; when falsy, ``OpenAi_api`` is used instead.

    Raises:
        ValueError: If the script has no dialogue, if no chunk could be
            synthesised, or if the output file is missing or empty afterwards.
        Exception: Any error raised while exporting the audio is re-raised.
    """
    # Validate the input before touching the API so bad input fails fast.
    full_text = _extract_dialogue(script)
    if not full_text.strip():
        raise ValueError("Script contains no dialogue to convert to speech.")

    # Chunk the text into smaller pieces under the 4096 character limit
    # We use 4000 as a safe buffer
    char_limit = 4000
    text_chunks = _chunk_text(full_text, char_limit)

    print(f"Text has been split into {len(text_chunks)} chunks to handle API limits.")

    if not api_key:
        api_key = OpenAi_api
    client = OpenAI(api_key=api_key)

    # Process each chunk and combine the audio
    combined_audio = AudioSegment.empty()
    generated_chunks = 0

    for i, chunk in enumerate(text_chunks, start=1):
        print(f"Generating audio for chunk {i}/{len(text_chunks)}...")
        try:
            # Get the audio data for the current chunk
            response = client.audio.speech.create(
                model="tts-1",
                voice="nova",
                input=chunk,
            )

            # Load the returned bytes as an in-memory MP3 and append it
            audio_segment = AudioSegment.from_mp3(io.BytesIO(response.content))
            combined_audio += audio_segment
            generated_chunks += 1
        except Exception as e:
            # Best effort: one failed chunk should not discard the others.
            print(f"Error processing chunk {i}: {e}")
            continue

    if generated_chunks == 0:
        raise ValueError("No audio could be generated. Check API key, model access, or input text.")

    # Export the final, combined audio file
    try:
        print("Exporting combined audio file...")
        output_dir = os.path.dirname(output_file)
        if output_dir:
            # A bare filename has no directory part; makedirs('') would fail.
            os.makedirs(output_dir, exist_ok=True)
        combined_audio.export(output_file, format="mp3")
        print("Audio generation completed.")
    except Exception as e:
        print(f"Error exporting audio file: {e}")
        raise

    # Verify the output file
    if os.path.exists(output_file) and os.path.getsize(output_file) > 0:
        print(f"Audio file created successfully at {output_file}.")
    else:
        raise ValueError("Audio file is empty or not created. Check API key, model access, or input text.")

# Main Pipeline
###### This is the main part of the program; it uses every function defined above.
###### It fetches the cleaned email, generates the script, and finally uses that script to generate the audio file.

In [6]:
def main():
    """Run the full pipeline: fetch the latest email, script it, and voice it.

    The newest email is fetched and cleaned, turned into a podcast script,
    and the script is converted to ``output/podcast.mp3``. Errors raised by
    the text-to-speech step are reported on stdout rather than propagated.
    """
    service = authenticate_gmail()
    emails = get_email_content(service, max_results=1)
    if not emails:
        # Nothing came back (empty inbox, or no message had a readable body);
        # indexing emails[0] would raise IndexError.
        print("No emails with readable content were found; nothing to convert.")
        return

    email = emails[0]
    print(f"Processing email: {email['subject']} from {email['sender']}")

    script = generate_podcast_script(email)
    print(script)

    output_file = "output/podcast.mp3"
    try:
        text_to_speech(script, output_file)
        print(f"\nPodcast audio saved to {output_file}")
    except Exception as e:
        # Top-level boundary: report the failure and what, if anything, was
        # written to disk.
        print(f"Error in text_to_speech: {e}")
        if os.path.exists(output_file):
            print(f"File size: {os.path.getsize(output_file)} bytes")
        else:
            print("Output file not created.")

if __name__ == "__main__":
    main()

Please visit this URL to authorize this application: https://accounts.google.com/o/oauth2/auth?response_type=code&client_id=110272889934-0uej1eu4cjdt94c1oog5qvqqncsou8dv.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A55402%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fgmail.readonly&state=UP8rYNMIJnfomKxB3bsVvSFHAvFZoS&access_type=offline
Processing email: Fwd: Self-driving cars eye NY from Abhishek Dhanani
**Host:** Welcome back to Inbox Insights, the podcast that dives deep into the most interesting emails from your inbox. Today we're looking at an email forwarded to our listener, Abhishek Dhanani.  The subject line is, "Fwd: Self-driving cars eye NY."  And now, here is the full text of the email.

---------- Forwarded message ---------
From: Superhuman – Zain Kahn 
Date: Sat, 21 Jun 2025, 1:07 pm
Subject: Self-driving cars eye NY
To: cap.prince11@gmail.com 


June 21, 2025   |   Read Online
<

*Welcome back, Superhuman.* Researchers in Italy dropped the world’