In [51]:
import imaplib, email, os, json
from email.header import decode_header

from dotenv import load_dotenv
from langchain import LLMChain
from langchain_openai import ChatOpenAI
from langchain.prompts import PromptTemplate
# Load variables from a local .env file into the process environment so that
# the os.environ lookups in the login cell below (EMAIL, EMAIL_PASSWORD) resolve.
load_dotenv()

True

Log in to the IMAP server

In [52]:
# Connection settings. The account name and password are read from the
# environment so that no credentials are stored in the notebook itself.
IMAP_SERVER = "imap.gmx.com"
IMAP_PORT = 993
EMAIL_ACCOUNT = os.environ["EMAIL"]
EMAIL_PASSWORD = os.environ["EMAIL_PASSWORD"]

# Open an SSL-wrapped IMAP connection and authenticate. The login call is
# kept as the last expression so its server response is shown as cell output.
mail = imaplib.IMAP4_SSL(host=IMAP_SERVER, port=IMAP_PORT)
mail.login(user=EMAIL_ACCOUNT, password=EMAIL_PASSWORD)

('OK', [b'LOGIN completed'])

In [53]:
# Open the inbox and ask the server for every message in it. SEARCH answers
# with a single space-separated byte string of message numbers, which is
# converted to a list of ints for the processing loop further down.
mail.select('INBOX')
status, messages = mail.search(None, 'ALL')
email_ids = list(map(int, messages[0].split()))

In [54]:
# Deterministic (temperature 0) chat model used to rate every email.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Prompt text. {header}, {sender}, {date} and {content} are filled in per
# email; the doubled braces are literal braces in the rendered prompt.
CLASSIFICATION_TEMPLATE = '''
        Task:
        You are my (florian kenner's) personal assistant who is instructed to cleanup his mailbox. Rate the following email, whether to keep or delete it. You want to generally delete all emails which will not be of any importance in the future, such as ads, spam, newsletters outdated information, such as one time authentication codes etc.
        Keep all emails which are personal communications, bookings, invoices, important notifications etc.
        If you are not totally sure, keep the email.

        Email Title: {header}
        Email Sender: {sender}
        Date: {date}
        Email Content: {content}


           Format of Expected Output: Please provide your classification and justification in the following structured format:
            {{
                "classification": "KEEP" or "DELETE",
                "justification": "A one short sentence explanation of why you made this classification based on the provided email"
            }}
    '''

classification_prompt = PromptTemplate.from_template(CLASSIFICATION_TEMPLATE)

In [55]:
def get_email_body(msg):
    """Return the text body of an email message as a string.

    For a multipart message the first non-attachment ``text/plain`` part is
    returned; if there is none, the last non-attachment ``text/html`` part is
    used instead. For a single-part message the payload is returned only when
    its content type is ``text/plain`` or ``text/html``.

    Args:
        msg: An ``email.message.Message`` (or subclass) instance.

    Returns:
        The decoded body text, or an empty string when the message has no
        usable text part or that part carries no payload.
    """
    def decode_payload(payload, charset):
        """Decode raw payload bytes, tolerating wrong or unknown charsets."""
        # get_payload(decode=True) yields None for parts without a payload;
        # calling .decode on it would raise AttributeError.
        if payload is None:
            return ""
        # Try the declared charset first, then UTF-8 (covers mails declaring
        # an unknown or incorrect charset), then windows-1252.
        for encoding in (charset or "utf-8", "utf-8", "windows-1252"):
            try:
                return payload.decode(encoding)
            except (UnicodeDecodeError, LookupError):
                continue
        # latin1 maps every byte to a code point, so this cannot fail. It must
        # come last, otherwise the candidates above would never be reached.
        return payload.decode("latin1")

    text_types = ("text/plain", "text/html")
    body = ""
    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            content_disposition = str(part.get("Content-Disposition"))
            if content_type not in text_types or "attachment" in content_disposition:
                continue

            payload = part.get_payload(decode=True)
            if payload is None:
                # Nothing to decode in this part; keep looking.
                continue

            body = decode_payload(payload, part.get_content_charset())
            if content_type == "text/plain":
                # Plain text is preferred over HTML, so stop at the first hit.
                break
    elif msg.get_content_type() in text_types:
        body = decode_payload(msg.get_payload(decode=True), msg.get_content_charset())
    return str(body)

def decode_subject(subject, encoding):
    """Decode a raw subject chunk (bytes) into text.

    Args:
        subject: The undecoded header bytes.
        encoding: The charset reported for the chunk, or ``None``. Values
            that Python has no codec for (for example ``unknown-8bit``) are
            tolerated.

    Returns:
        The decoded string. The declared charset is tried first, then UTF-8,
        then windows-1252; latin1 is the final fallback.
    """
    for enc in (encoding or "utf-8", "utf-8", "windows-1252"):
        try:
            return subject.decode(enc)
        except (UnicodeDecodeError, LookupError):
            continue
    # latin1 accepts every byte value, so it has to be tried last: placed
    # earlier it would hide all remaining candidates.
    return subject.decode("latin1")

In [56]:
import os
import re
import json

# Folder that receives the .eml / .json backup written for every email.
BACKUP_FOLDER = "./email_backups"

def save_email_backup(email_id, message, metadata):
    """Back up one email to BACKUP_FOLDER before it is classified.

    Two files named ``<email_id>_<subject>`` are written: the raw message as
    ``.eml`` and ``metadata`` as ``.json``. Afterwards the message's
    attachments are stored via ``save_attachments``.

    Args:
        email_id: Identifier used as the file name prefix.
        message: The parsed email; must provide ``as_bytes()`` and ``walk()``.
        metadata: JSON-serialisable dict containing at least the keys
            ``"subject"`` and ``"body"``. It is not modified.
    """
    # Create backup folder if it doesn't exist
    os.makedirs(BACKUP_FOLDER, exist_ok=True)

    # Keep only the alphanumeric characters of the first 50 subject
    # characters; the result is always safe to use inside a file name.
    subject = ''.join(c for c in metadata["subject"][:50] if c.isalnum())

    # Save raw email content as .eml
    filename = f"{BACKUP_FOLDER}/{email_id}_{subject}.eml"
    with open(filename, 'wb') as out:  # Binary mode for raw email
        out.write(message.as_bytes())

    def _encodable(value):
        # Replace characters UTF-8 cannot represent (e.g. lone surrogates) so
        # writing the JSON file cannot fail; all other text, including
        # non-ASCII characters, is preserved.
        if isinstance(value, str):
            return value.encode('utf-8', 'replace').decode('utf-8')
        return value

    # Work on a copy so the caller's dict is left untouched.
    record = {key: _encodable(value) for key, value in metadata.items()}
    record["body"] = _encodable(metadata["body"])

    # Save metadata as .json
    json_filename = f"{BACKUP_FOLDER}/{email_id}_{subject}.json"
    with open(json_filename, "w", encoding = 'utf-8') as f:
        json.dump(record, f, ensure_ascii=False)

    save_attachments(message)

def save_attachments(message, attachments_folder='./attachments'):
    """Write every named attachment of ``message`` into ``attachments_folder``.

    A part is treated as an attachment when it is not a multipart container,
    has a Content-Disposition header and a file name. Files are opened in
    ``"wb"`` mode, so an existing file with the same name is overwritten.

    Args:
        message: The parsed email; must provide ``walk()``.
        attachments_folder: Target directory; created on demand.
    """
    for part in message.walk():
        if part.get_content_maintype() == 'multipart':
            continue
        if part.get('Content-Disposition') is None:
            continue

        filename = part.get_filename()
        if not filename:
            continue

        payload = part.get_payload(decode=True)
        if payload is None:
            # No decodable content in this part; writing None would raise.
            continue

        # Sanitize the filename
        filename = re.sub(r'[\\/*?:"<>|\r\n\t]', "_", filename)
        if filename in (".", ".."):
            # These names refer to directories, not files.
            continue

        # Create the folder lazily so that messages without attachments do
        # not leave an empty directory behind.
        os.makedirs(attachments_folder, exist_ok=True)
        filepath = os.path.join(attachments_folder, filename)
        with open(filepath, "wb") as f:
            f.write(payload)
    

Classify each email and delete the ones rated DELETE

In [None]:
from tqdm import tqdm

# File that remembers which ids were already processed, so an interrupted
# run can be resumed.
# NOTE(review): the ids stored here are the message numbers returned by the
# earlier SEARCH. They are renumbered by the server once messages are
# expunged, so this file is only reliable until the next expunge. Consider
# switching the whole notebook to IMAP UIDs -- TODO confirm.
CHECKED_FILE = './checked.txt'


def _full_subject(message):
    """Decode all chunks of the Subject header and join them into one string."""
    raw_subject = message['subject']
    if raw_subject is None:
        # Mails without a Subject header would make decode_header fail.
        return ''
    pieces = []
    for chunk, charset in decode_header(raw_subject):
        if isinstance(chunk, bytes):
            pieces.append(decode_subject(chunk, charset))
        else:
            pieces.append(str(chunk))
    return ''.join(pieces)


def _parse_classification(text):
    """Extract the JSON verdict from the model reply; fall back to KEEP.

    The reply may be wrapped in a Markdown code fence or surrounded by extra
    prose, so only the span from the first '{' to the last '}' is parsed.
    """
    start, end = text.find('{'), text.rfind('}')
    if start != -1 and end > start:
        try:
            parsed = json.loads(text[start:end + 1])
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict) and parsed.get('classification') in ('KEEP', 'DELETE'):
            return parsed
    # The prompt asks to keep mails when in doubt; an unreadable answer
    # counts as doubt.
    return {
        'classification': 'KEEP',
        'justification': 'Model reply could not be parsed; kept to be safe.',
    }


deleted = 0
kept = 0
checked = set()

if os.path.exists(CHECKED_FILE):
    with open(CHECKED_FILE) as f:
        checked = set(json.loads(f.read()))

# The chain does not depend on the individual email, so build it once.
agent = LLMChain(llm = llm, prompt = classification_prompt)

try:
    for email_id in tqdm(email_ids):
        if email_id in checked:
            continue

        fetch_status, msg_data = mail.fetch(str(email_id), '(RFC822)')
        if fetch_status != 'OK' or not msg_data or not isinstance(msg_data[0], tuple):
            # Nothing usable came back for this id; leave it unchecked so a
            # later run can retry it.
            continue

        message = email.message_from_bytes(msg_data[0][1])
        subject = _full_subject(message)
        sender = message.get('FROM')
        date = message.get('Date')
        body = get_email_body(message)

        # Save the email to the backup folder before anything is deleted.
        save_email_backup(email_id, message, {
            "email_id": email_id,
            "subject": str(subject),
            "sender": str(sender),
            "date": str(date),
            "body": body
        })

        result_raw = agent.invoke({
            'header': subject,
            'sender': sender,
            'date': date,
            'content': body
        })
        result = _parse_classification(result_raw.get('text', ''))

        if result['classification'] == 'DELETE':
            # Only flag the message here. Expunging inside the loop would
            # renumber the remaining messages, so the ids still to be
            # processed would point at different mails.
            mail.store(str(email_id), '+FLAGS', '\\Deleted')
            deleted += 1
        else:
            kept += 1

        checked.add(email_id)

        # Persist progress after every mail so an interrupted run can resume.
        with open(CHECKED_FILE, 'w') as f:
            f.write(json.dumps(list(checked)))
finally:
    # Remove everything flagged \Deleted in one go, even when the loop was
    # interrupted part-way through.
    mail.expunge()

 15%|█▌        | 797/5286 [00:41<08:06,  9.22it/s]  