# 📧 Automated Email Summarization Pipeline
This notebook demonstrates an end-to-end pipeline for summarizing emails stored in MBOX files using Hugging Face Transformers. It parses the raw messages, cleans the body text, summarizes each body, and exports the results to a structured JSON file.

In [None]:
!pip install transformers torch mailbox email six --quiet

In [None]:
import os
import re
import json
import glob
import mailbox
from email.message import EmailMessage
from typing import List, Dict, Any
from datetime import datetime
from transformers import pipeline

In [None]:
# Build the summarization pipeline once; `summarizer` is a notebook-level
# object that the summarization helper below calls.
print("[INFO] Loading summarization model...")
summarizer = pipeline(task="summarization", model="facebook/bart-large-cnn")

In [None]:
def clean_email_text(text: str) -> str:
    """Normalize an email body for summarization.

    Removes quoted-reply lines (lines whose first non-blank character is
    ``>``) and collapses every run of whitespace, including newlines, into a
    single space.

    Args:
        text: Raw decoded email body.

    Returns:
        The cleaned body as a single line with no leading or trailing
        whitespace. Empty input yields an empty string.
    """
    # Anchor to the start of a line so that a '>' used inside ordinary text
    # ("a > b", "<tag>") does not truncate the rest of that line. The
    # trailing alternative also catches a quoted line at the very end of the
    # text that has no terminating newline.
    text = re.sub(r'^[ \t]*>.*(?:\n|$)', '', text, flags=re.MULTILINE)
    # Newlines are whitespace too, so this single pass flattens the text.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [None]:
def summarize_text(text: str, min_length: int = 40, max_length: int = 150) -> str:
    """Generate a summary for the input text.

    Args:
        text: Text to summarize.
        min_length: Lower bound passed to the summarizer as ``min_length``.
        max_length: Upper bound passed to the summarizer as ``max_length``.

    Returns:
        The ``summary_text`` of the first result returned by the notebook's
        ``summarizer`` pipeline, or an empty string when ``text`` is empty or
        whitespace-only (the model is not called in that case).
    """
    if not text or not text.strip():
        return ""
    result = summarizer(
        text,
        max_length=max_length,
        min_length=min_length,
        do_sample=False,
        # Ask the pipeline to truncate inputs that exceed the model's maximum
        # input length instead of passing the over-long sequence through.
        truncation=True,
    )
    return result[0]['summary_text']

In [None]:
def _extract_plain_text(msg) -> str:
    """Return the decoded body text of ``msg``, or '' if none is found.

    For a multipart message the first non-attachment ``text/plain`` part with
    a non-empty payload is used. A single-part message is used as-is,
    whatever its content type.
    """
    if msg.is_multipart():
        candidates = [
            part for part in msg.walk()
            if not part.is_multipart()
            and part.get_content_type() == 'text/plain'
            and part.get_content_disposition() != 'attachment'
        ]
    else:
        candidates = [msg]

    for part in candidates:
        payload = part.get_payload(decode=True)
        if not payload:
            continue
        # Honor the charset declared by the part; fall back to UTF-8 when it
        # is absent or names a codec Python does not know.
        charset = part.get_content_charset() or 'utf-8'
        try:
            return payload.decode(charset, errors='ignore')
        except LookupError:
            return payload.decode('utf-8', errors='ignore')
    return ''


def parse_mbox(filepath: str) -> List[Dict[str, str]]:
    """Extract subject and cleaned body text from every email in an MBOX file.

    Args:
        filepath: Path to the MBOX file.

    Returns:
        One ``{"subject": ..., "body": ...}`` dict per message that has a
        non-empty text body. The body is passed through ``clean_email_text``.
        A missing subject becomes ``'No Subject'``. Messages that raise
        while being processed are reported with a printed ``[ERROR]`` line
        and skipped.
    """
    mbox = mailbox.mbox(filepath)
    emails = []
    try:
        for msg in mbox:
            try:
                # str() guarantees a plain string even when the header is
                # returned as a header object, so the result stays
                # JSON-serializable.
                subject = str(msg['subject'] or 'No Subject')
                text = _extract_plain_text(msg)
                if text:
                    emails.append({"subject": subject, "body": clean_email_text(text)})
            except Exception as e:
                print(f"[ERROR] Skipping email: {e}")
    finally:
        # Release the file handle the mailbox opened.
        mbox.close()
    return emails

In [None]:
def summarize_emails(emails: List[Dict[str, str]], min_words: int = 50) -> List[Dict[str, str]]:
    """Summarize a list of parsed emails.

    Args:
        emails: Dicts with ``"subject"`` and ``"body"`` keys.
        min_words: Bodies with fewer whitespace-separated words than this are
            kept verbatim instead of being sent to ``summarize_text``.
            Defaults to 50.

    Returns:
        One ``{"subject": ..., "summary": ...}`` dict per input email, in the
        same order.
    """
    results = []
    for message in emails:
        subject = message['subject']
        body = message['body']
        print(f"\n📨 {subject}")
        if len(body.split()) < min_words:
            # Short bodies are used as their own summary.
            summary = body
        else:
            summary = summarize_text(body)
        results.append({"subject": subject, "summary": summary})
    return results

In [None]:
def save_summaries(summaries: List[Dict[str, str]], filename: str = None):
    """Save summaries to disk as indented JSON.

    Args:
        summaries: The summary records to serialize.
        filename: Destination path. When omitted or empty, a name of the form
            ``email_summaries_YYYYMMDD_HHMMSS.json`` is generated from the
            current local time and written to the working directory.
    """
    if not filename:
        filename = f"email_summaries_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    with open(filename, 'w', encoding='utf-8') as f:
        # The file is UTF-8, so write non-ASCII characters as themselves
        # rather than as \uXXXX escapes; the JSON stays equivalent when loaded.
        json.dump(summaries, f, indent=4, ensure_ascii=False)
    print(f"[INFO] Saved to {filename}")

In [None]:
def create_sample_mbox(path: str, count: int = 5):
    """Write a small demo mailbox of "Project Alpha" status emails.

    Args:
        path: Location of the MBOX file. Messages are added to the mailbox at
            this path, so an existing mailbox gains ``count`` more messages.
        count: Number of sample messages to add. Defaults to 5.
    """
    mbox = mailbox.mbox(path)
    try:
        for i in range(count):
            msg = EmailMessage()
            msg['Subject'] = f"Update {i+1}: Project Alpha"
            msg.set_content(
                f"""
Team,

We have completed milestone {i+1} of Project Alpha. Frontend has pushed new UI changes. Backend team integrated the service endpoints. Next step is QA and documentation.

Regards,
Product Owner
""")
            mbox.add(msg)
    finally:
        # close() flushes pending changes and releases the open file handle.
        mbox.close()
    print(f"[INFO] Sample MBOX created at {path}")

In [None]:
# Orchestrate the full pipeline: make sure an input mailbox exists, parse it,
# summarize each message, and write the results to JSON.
mbox_path = "sample_emails.mbox"
# Generate the demo mailbox only when nothing exists at this path, so a
# mailbox that is already there is used as-is.
if not os.path.exists(mbox_path):
    create_sample_mbox(mbox_path)

emails = parse_mbox(mbox_path)
summaries = summarize_emails(emails)
# No filename is passed, so save_summaries falls back to its default name.
save_summaries(summaries)