# Fetching email content and PDF Handling
The code integrates with the Microsoft Graph API to interact with email data, fetching email content and attachments. It processes the retrieved emails to extract relevant attachments and transforms the data for further use. For attachments in PDF format, the code employs fitz (PyMuPDF) to handle operations such as extracting text, images, and tables.

In [64]:
import os
import json
import base64
import requests
from msal import ConfidentialClientApplication
from datetime import datetime
from PIL import Image
from io import BytesIO
import fitz 
from unidecode import unidecode


In [65]:
# Define constants
CLIENT_ID = os.getenv("CLIENT_ID")
TENANT_ID = os.getenv("TENANT_ID")
CLIENT_SECRET = os.getenv("CLIENT_SECRET")
EMAIL = os.getenv("EMAIL")

In [66]:
# Ensure downloads folder exists
downloads_folder = "downloads"
os.makedirs(downloads_folder, exist_ok=True)

In [67]:
def sanitize_filename(filename):
    """Sanitize filenames to avoid illegal characters."""
    return "".join(c for c in filename if c.isalnum() or c in (' ', '_')).rstrip()

In [68]:
def get_access_token():
    """Fetch an access token from Azure."""
    app = ConfidentialClientApplication(
        CLIENT_ID,
        authority=f"https://login.microsoftonline.com/{TENANT_ID}",
        client_credential=CLIENT_SECRET,
    )
    token_response = app.acquire_token_for_client(scopes=["https://graph.microsoft.com/.default"])
    if "access_token" in token_response:
        return token_response["access_token"]
    else:
        raise Exception("Access token acquisition failed.")

In [69]:
def fetch_emails():
    """Fetch emails using Microsoft Graph API."""
    token = get_access_token()
    headers = {"Authorization": f"Bearer {token}"}
    endpoint = f"https://graph.microsoft.com/v1.0/users/{EMAIL}/messages"
    params = {
        "$select": "subject,sender,receivedDateTime,bodyPreview",
        "$top": 10,
        "$expand": "attachments"
    }
    response = requests.get(endpoint, headers=headers, params=params)
    response.raise_for_status()
    emails = response.json()

    for email in emails.get("value", []):
        save_email(email)

In [70]:
def save_email(email_data):
    """Save email content, attachments, and generate TTS."""
    subject = email_data.get('subject', 'No Subject')
    email_folder_path = os.path.join(downloads_folder, subject)
    document_id = subject
    counter = 1
    while os.path.exists(os.path.join(downloads_folder, f"{subject}_{counter}")):
        counter += 1

    document_id = f"{subject}_{counter}"
    email_folder_path = os.path.join(downloads_folder, document_id)



    # Check if the folder already exists
    if os.path.exists(email_folder_path):
        print(f"Email with document ID {document_id} already processed. Skipping.")
        return

    os.makedirs(email_folder_path, exist_ok=True)

    # Use bodyPreview instead of full body
    email_metadata = {
        "document_id": document_id,  
        "subject": email_data.get('subject', 'No Subject'),
        "sender": email_data["sender"]["emailAddress"]["address"],
        "received_time": email_data["receivedDateTime"],
        "body_preview": email_data.get("bodyPreview", "No preview available")  # Extracting bodyPreview
    }

    metadata_path = os.path.join(email_folder_path, "metadata.json")
    with open(metadata_path, 'w') as metadata_file:
        json.dump(email_metadata, metadata_file, indent=4)

    # Generate Text-to-Speech using bodyPreview
    combined_text = (
        f"Subject: {email_metadata['subject']}. "
        f"Content: {email_metadata['body_preview']}. "  # Use bodyPreview
        f"Sender: {email_metadata['sender']}. "
        f"Received Time: {email_metadata['received_time']}."
    )

    # Save attachments
    for attachment in email_data.get("attachments", []):
        attachment_name = attachment.get('name', 'Unnamed_Attachment')
        attachment_data = attachment.get('contentBytes')
        if not attachment_data:
            continue

        content = base64.b64decode(attachment_data)
        if attachment_name.lower().endswith('.pdf'):
            save_pdf_attachment(content, email_folder_path, attachment_name)
        elif any(attachment_name.lower().endswith(ext) for ext in ['jpg', 'jpeg', 'png', 'gif']):
            save_image_attachment(content, email_folder_path, attachment_name)

In [71]:
def save_pdf_attachment(content, folder_path, filename):
    """Save PDF attachments as .pdf files."""
    try:
        base_name = sanitize_filename(os.path.splitext(filename)[0])
        sanitized_filename = f"{base_name}.pdf"
        pdf_path = os.path.join(folder_path, sanitized_filename)
        with open(pdf_path, 'wb') as pdf_file:
            pdf_file.write(content)
        print(f"Saved PDF: {pdf_path}")
    except Exception as e:
        print(f"Failed to save PDF {filename}: {e}")

In [72]:
def save_image_attachment(content, folder_path, filename):
    """Save image attachments in their original format without conversion."""
    try:
        image = Image.open(BytesIO(content))
        original_format = image.format.lower()
        valid_formats = ['jpg', 'jpeg', 'png', 'gif', 'bmp', 'tiff']
        if original_format not in valid_formats:
            print(f"Unsupported image format {original_format}. Saving as PNG.")
            original_format = "png"
        sanitized_filename = sanitize_filename(os.path.splitext(filename)[0])
        image_filename = f"{sanitized_filename}.{original_format}"
        image_path = os.path.join(folder_path, image_filename)
        image.save(image_path, format=image.format.upper())
        print(f"Saved image: {image_path}")
    except Exception as e:
        print(f"Failed to save image {filename}: {e}")

In [73]:
def process_pdfs_in_downloads():
    """Process all PDFs in the downloads folder and extract content."""
    print("Starting to process PDFs in downloads folder...")
    for root, dirs, files in os.walk(downloads_folder):
        print(f"Checking directory: {root}")
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_path = os.path.join(root, file)
                base_dir = os.path.splitext(pdf_path)[0]
                print(f"Found PDF: {pdf_path}")
                extract_content_from_pdf(pdf_path, base_dir)

In [74]:
def extract_content_from_pdf(pdf_path, base_dir):
    """Extract content from a PDF and save text, images, and tables."""
    print(f"Opening PDF: {pdf_path}")
    try:
        document = fitz.open(pdf_path)
        json_dir = os.path.join(base_dir, 'JSON')
        img_dir = os.path.join(base_dir, 'Image')
        csv_dir = os.path.join(base_dir, 'CSV')

        print(f"Creating directories for extracted content:")
        print(f"  JSON directory: {json_dir}")
        print(f"  Image directory: {img_dir}")
        print(f"  CSV directory: {csv_dir}")
        
        os.makedirs(json_dir, exist_ok=True)
        os.makedirs(img_dir, exist_ok=True)
        os.makedirs(csv_dir, exist_ok=True)

        for page_num in range(document.page_count):
            print(f"Processing page {page_num + 1} of {pdf_path}")
            try:
                page = document[page_num]
                page_id = page_num + 1
                page_content = {
                    "page_id": page_id,
                    "content": {}
                }

                # Extract text
                text = page.get_text("text")
                print(f"Extracted text for page {page_id}: {text[:100]}...")  # Print first 100 chars
                page_content['content']['text'] = unidecode(text)

                # Extract images
                image_list = []
                images = page.get_images(full=True)
                print(f"Found {len(images)} images on page {page_id}")
                for img_index, img in enumerate(images):
                    try:
                        xref = img[0]
                        img_data = document.extract_image(xref)
                        img_ext = img_data["ext"]
                        img_name = f"{page_id}_image_{img_index}.{img_ext}"
                        img_path = os.path.join(img_dir, img_name)
                        with open(img_path, 'wb') as img_file:
                            img_file.write(img_data["image"])
                        print(f"  Extracted image: {img_name}")
                        image_list.append(img_name)
                    except Exception as exception:
                        print(f"Error extracting image on Page {page_id} of PDF {pdf_path}: {exception}")
                page_content['content']['image'] = image_list

                # Extract tables (dummy for now)
                table_list = []
                print(f"No table extraction implemented yet for page {page_id}")
                page_content['content']['table'] = table_list

                # Save JSON
                json_file_path = os.path.join(json_dir, f"{page_id}.json")
                with open(json_file_path, 'w') as json_file:
                    json.dump(page_content, json_file, indent=4)
                print(f"Saved JSON for page {page_id}: {json_file_path}")
            except Exception as exception:
                print(f"Error processing Page {page_num + 1} of PDF {pdf_path}: {exception}")
    except Exception as exception:
        print(f"Failed to open the PDF document {pdf_path}: {exception}")


In [75]:
if __name__ == "__main__":
    fetch_emails()
    process_pdfs_in_downloads()

Saved PDF: downloads/test (with pdf)_1/risk_compilation_2018.pdf
Saved image: downloads/test (with pdf)_1/docling.jpeg
Saved PDF: downloads/Mail with image_1/Final Project Proposaledited.pdf
Saved image: downloads/Mail with image_1/docling.jpeg
Saved PDF: downloads/Manual mail_1/Final Project Proposaledited.pdf
Starting to process PDFs in downloads folder...
Checking directory: downloads
Checking directory: downloads/Manual mail_1
Found PDF: downloads/Manual mail_1/Final Project Proposaledited.pdf
Opening PDF: downloads/Manual mail_1/Final Project Proposaledited.pdf
Creating directories for extracted content:
  JSON directory: downloads/Manual mail_1/Final Project Proposaledited/JSON
  Image directory: downloads/Manual mail_1/Final Project Proposaledited/Image
  CSV directory: downloads/Manual mail_1/Final Project Proposaledited/CSV
Processing page 1 of downloads/Manual mail_1/Final Project Proposaledited.pdf
Extracted text for page 1: Final Project Proposal 
Due: 22nd Nov 24 03:59 pm 