In [1]:
import os
import json
import base64
import requests
from msal import ConfidentialClientApplication
from datetime import datetime
from PIL import Image
from io import BytesIO
import fitz 
from unidecode import unidecode
from gtts import gTTS

In [None]:
# Define constants
CLIENT_ID = os.getenv("CLIENT_ID")
TENANT_ID = os.getenv("TENANT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
EMAIL = os.getenv("EMAIL")

In [3]:
# Ensure downloads folder exists
downloads_folder = "downloads"
os.makedirs(downloads_folder, exist_ok=True)

In [4]:
def sanitize_filename(filename):
    """Sanitize filenames to avoid illegal characters."""
    return "".join(c for c in filename if c.isalnum() or c in (' ', '_')).rstrip()

In [5]:
def get_access_token():
    """Fetch an access token from Azure."""
    app = ConfidentialClientApplication(
        CLIENT_ID,
        authority=f"https://login.microsoftonline.com/{TENANT_ID}",
        client_credential=CLIENT_SECRET,
    )
    token_response = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    if "access_token" in token_response:
        return token_response["access_token"]
    else:
        raise Exception("Access token acquisition failed.")

In [7]:
def fetch_emails():
    """Fetch emails using Microsoft Graph API."""
    token = get_access_token()
    headers = {"Authorization": f"Bearer {token}"}
    endpoint = f"https://graph.microsoft.com/v1.0/users/{EMAIL}/messages"
    params = {
        "$select": "subject,sender,receivedDateTime,bodyPreview",
        "$top": 10,
        "$expand": "attachments"
    }
    response = requests.get(endpoint, headers=headers, params=params)
    response.raise_for_status()
    emails = response.json()

    for email in emails.get("value", []):
        save_email(email)

In [8]:
def text_to_speech_with_gtts(text, output_file="audio_files/output_audio.mp3"):
    try:
        # Initialize gTTS (Google Text-to-Speech)
        tts = gTTS(text, lang='en')
        
        # Ensure the directory exists where the file is to be saved
        output_directory = os.path.dirname(output_file)
        if output_directory and not os.path.exists(output_directory):
            os.makedirs(output_directory)  # Create directory if it doesn't exist
        
        # Save the speech to a file
        tts.save(output_file)
        print(f"Audio saved to {output_file}.")
        return output_file
    except Exception as e:
        print(f"Error during TTS: {e}")
        return None

In [9]:
def save_email(email_data):
    """Save email content, attachments, and generate TTS."""
    subject = email_data.get('subject', 'No Subject')
    email_folder_path = os.path.join(downloads_folder, subject)
    document_id = subject
    counter = 1
    while os.path.exists(os.path.join(downloads_folder, f"{subject}_{counter}")):
        counter += 1

    document_id = f"{subject}_{counter}"
    email_folder_path = os.path.join(downloads_folder, document_id)



    # Check if the folder already exists
    if os.path.exists(email_folder_path):
        print(f"Email with document ID {document_id} already processed. Skipping.")
        return

    os.makedirs(email_folder_path, exist_ok=True)

    # Use bodyPreview instead of full body
    email_metadata = {
        "document_id": document_id,  
        "subject": email_data.get('subject', 'No Subject'),
        "sender": email_data["sender"]["emailAddress"]["address"],
        "received_time": email_data["receivedDateTime"],
        "body_preview": email_data.get("bodyPreview", "No preview available")  # Extracting bodyPreview
    }

    metadata_path = os.path.join(email_folder_path, "metadata.json")
    with open(metadata_path, 'w') as metadata_file:
        json.dump(email_metadata, metadata_file, indent=4)

    # Generate Text-to-Speech using bodyPreview
    combined_text = (
        f"Subject: {email_metadata['subject']}. "
        f"Content: {email_metadata['body_preview']}. "  # Use bodyPreview
        f"Sender: {email_metadata['sender']}. "
        f"Received Time: {email_metadata['received_time']}."
    )
    audio_output_path = os.path.join(email_folder_path, "email_audio.mp3")
    text_to_speech_with_gtts(combined_text, audio_output_path)

    # Save attachments
    for attachment in email_data.get("attachments", []):
        attachment_name = attachment.get('name', 'Unnamed_Attachment')
        attachment_data = attachment.get('contentBytes')
        if not attachment_data:
            continue

        content = base64.b64decode(attachment_data)
        if attachment_name.lower().endswith('.pdf'):
            save_pdf_attachment(content, email_folder_path, attachment_name)
        elif any(attachment_name.lower().endswith(ext) for ext in ['jpg', 'jpeg', 'png', 'gif']):
            save_image_attachment(content, email_folder_path, attachment_name)

In [10]:
def save_pdf_attachment(content, folder_path, filename):
    """Save PDF attachments as .pdf files."""
    try:
        base_name = sanitize_filename(os.path.splitext(filename)[0])
        sanitized_filename = f"{base_name}.pdf"
        pdf_path = os.path.join(folder_path, sanitized_filename)
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(content)
        print(f"Saved PDF: {pdf_path}")
    except Exception as e:
        print(f"Failed to save PDF {filename}: {e}")

In [11]:
def save_image_attachment(content, folder_path, filename):
    """Save image attachments in their original format without conversion."""
    try:
        image = Image.open(BytesIO(content))
        original_format = image.format.lower()
        valid_formats = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff']
        if original_format not in valid_formats:
            print(f"Unsupported image format {original_format}. Saving as PNG.")
            original_format = "png"
        sanitized_filename = sanitize_filename(os.path.splitext(filename)[0])
        image_filename = f"{sanitized_filename}.{original_format}"
        image_path = os.path.join(folder_path, image_filename)
        image.save(image_path, format=image.format.upper())
        print(f"Saved image: {image_path}")
    except Exception as e:
        print(f"Failed to save image {filename}: {e}")

In [12]:
def process_pdfs_in_downloads():
    """Process all PDFs in the downloads folder and extract content."""
    for root, dirs, files in os.walk(downloads_folder):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_path = os.path.join(root, file)
                base_dir = os.path.splitext(pdf_path)[0] 
                print(f"Processing PDF: {pdf_path}")
                extract_content_from_pdf(pdf_path, base_dir)

In [13]:
def extract_content_from_pdf(pdf_path, base_dir):
    """Extract content from a PDF and save text, images, and tables."""
    try:
        document = fitz.open(pdf_path)
        json_dir = os.path.join(base_dir, 'JSON')
        img_dir = os.path.join(base_dir, 'Image')
        csv_dir = os.path.join(base_dir, 'CSV')
        os.makedirs(json_dir, exist_ok=True)
        os.makedirs(img_dir, exist_ok=True)
        os.makedirs(csv_dir, exist_ok=True)
        for page_num in range(document.page_count):
            try:
                page = document[page_num]
                page_id = page_num + 1
                page_content = {
                    "page_id": page_id,
                    "content": {}
                }
                text = page.get_text("text")
                page_content['content']['text'] = unidecode(text)
                image_list = []
                images = page.get_images(full=True)
                for img_index, img in enumerate(images):
                    try:
                        xref = img[0]
                        img_data = document.extract_image(xref)
                        img_ext = img_data["ext"]
                        img_name = f"{page_id}_image_{img_index}.{img_ext}"
                        img_path = os.path.join(img_dir, img_name)
                        with open(img_path, 'wb') as img_file:
                            img_file.write(img_data["image"])
                        image_list.append(img_name)
                    except Exception as exception:
                        print(f"Error extracting image on Page {page_id} of PDF {pdf_path}: {exception}")
                page_content['content']['image'] = image_list
                table_list = []
                page_content['content']['table'] = table_list
                json_file_path = os.path.join(json_dir, f"{page_id}.json")
                with open(json_file_path, 'w') as json_file:
                    json.dump(page_content, json_file, indent=4)
            except Exception as exception:
                print(f"Error processing Page {page_num + 1} of PDF {pdf_path}: {exception}")
    except Exception as exception:
        print(f"Failed to open the PDF document {pdf_path}: {exception}")

In [14]:
if __name__ == "__main__":
    fetch_emails()
    process_pdfs_in_downloads()

Audio saved to downloads/Test Email_7/email_audio.mp3.
Audio saved to downloads/Test Mail from Different Domain Mail_2/email_audio.mp3.
Audio saved to downloads/test (with pdf)_2/email_audio.mp3.
Saved PDF: downloads/test (with pdf)_2/risk_compilation_2018.pdf
Saved image: downloads/test (with pdf)_2/docling.jpeg
Audio saved to downloads/Mail with image_2/email_audio.mp3.
Saved PDF: downloads/Mail with image_2/Final Project Proposaledited.pdf
Saved image: downloads/Mail with image_2/docling.jpeg
Audio saved to downloads/Test Email_8/email_audio.mp3.
Audio saved to downloads/Manual mail_2/email_audio.mp3.
Saved PDF: downloads/Manual mail_2/Final Project Proposaledited.pdf
Audio saved to downloads/Test Email_9/email_audio.mp3.
Audio saved to downloads/Test Email_10/email_audio.mp3.
Audio saved to downloads/Test Email_11/email_audio.mp3.
Audio saved to downloads/Test Email_12/email_audio.mp3.
Processing PDF: downloads/Manual mail_1/Final Project Proposaledited.pdf
Processing PDF: download