In [1]:
import yaml
import logging
import imaplib
import email
from email.header import decode_header
from email import message_from_bytes
from datetime import date
from dateutil.relativedelta import relativedelta
import json
import pandas as pd


In [2]:
def load_credentials(filepath):
    """Read the login name and password from a YAML credentials file.

    Args:
        filepath: Path of a YAML document whose top-level mapping has the
            keys ``'user'`` and ``'password'``.

    Returns:
        A ``(user, password)`` tuple taken from those two keys.

    Raises:
        Exception: Whatever went wrong (unreadable file, invalid YAML,
            missing key, ...) is logged via ``logging.error`` and then
            re-raised unchanged.
    """
    try:
        with open(filepath, 'r') as handle:
            config = yaml.safe_load(handle)
        # Key lookups stay inside the try so a missing key is logged too.
        return config['user'], config['password']
    except Exception as exc:
        logging.error("Failed to load credentials: {}".format(exc))
        raise

def connect_to_gmail_imap(user, password):
    """Open an authenticated IMAP-over-SSL session to Gmail with the inbox selected.

    Args:
        user: Account name passed to the IMAP LOGIN command.
        password: Password passed to the IMAP LOGIN command.

    Returns:
        The logged-in ``imaplib.IMAP4_SSL`` object with ``'inbox'`` selected.

    Raises:
        imaplib.IMAP4.error: If the server answers SELECT with a non-OK status.
        Exception: Any other error raised while connecting or logging in.
            Every failure is logged via ``logging.error`` and re-raised; the
            half-open connection is closed first so the socket is not leaked.
    """
    imap_url = 'imap.gmail.com'
    mail = None
    try:
        mail = imaplib.IMAP4_SSL(imap_url)
        mail.login(user, password)
        # select() reports some failures through its status instead of raising;
        # without this check the caller would get a session with no mailbox.
        status, response = mail.select('inbox')
        if status != 'OK':
            raise imaplib.IMAP4.error(
                "Could not select inbox: {} {}".format(status, response)
            )
        return mail
    except Exception as e:
        logging.error("Connection failed: {}".format(e))
        if mail is not None:
            # The connection was opened but is unusable: close the socket
            # instead of leaving it to the garbage collector.
            try:
                mail.shutdown()
            except Exception as close_error:
                logging.debug("Ignoring error while closing connection: {}".format(close_error))
        raise

def clean_header(header_value):
    """Decode an email header value into a plain string.

    Args:
        header_value: Raw header as returned by ``msg.get(...)``; may be
            ``None`` or empty.

    Returns:
        The concatenation of all decoded fragments, or ``""`` when
        ``header_value`` is falsy. Bytes that cannot be decoded are dropped.
    """
    if not header_value:
        return ""
    pieces = []
    for part, encoding in decode_header(header_value):
        if isinstance(part, bytes):
            try:
                pieces.append(part.decode(encoding or 'utf-8', errors='ignore'))
            except LookupError:
                # The header names a charset Python has no codec for; fall
                # back to UTF-8 instead of aborting the whole export.
                pieces.append(part.decode('utf-8', errors='ignore'))
        else:
            pieces.append(part)
    return "".join(pieces)

def _decode_payload(part):
    """Return the transfer-decoded payload of ``part`` as text, or None if it has none."""
    payload = part.get_payload(decode=True)
    if payload is None:
        return None
    # Honour the charset declared in Content-Type; default to UTF-8.
    charset = part.get_content_charset() or 'utf-8'
    try:
        return payload.decode(charset, errors='ignore')
    except (LookupError, ValueError):
        # Unknown or unusable charset label: fall back to UTF-8.
        return payload.decode('utf-8', errors='ignore')


def get_body(msg):
    """Extract the plain-text body from a message object.

    For a multipart message, the first ``text/plain`` part that is not marked
    as an attachment is returned. For a non-multipart message, its payload is
    returned whatever its content type.

    Args:
        msg: An ``email.message.Message`` instance.

    Returns:
        The decoded body text, or ``""`` when no suitable part is found or
        the payload cannot be read.
    """
    if msg.is_multipart():
        for part in msg.walk():
            content_type = part.get_content_type()
            content_disposition = str(part.get("Content-Disposition"))
            if content_type == "text/plain" and "attachment" not in content_disposition:
                try:
                    text = _decode_payload(part)
                except Exception:
                    # Best effort: a broken part must not stop the export;
                    # try the next candidate part instead.
                    continue
                if text is not None:
                    return text
    else:
        try:
            text = _decode_payload(msg)
        except Exception:
            text = None
        if text is not None:
            return text
    return ""


In [3]:
# Connect to mail client
credentials = load_credentials('credentials.yaml')
mail = connect_to_gmail_imap(*credentials)
print('Connected!')

# Define date range: the last 12 months up to today.
# NOTE: IMAP "BEFORE" is exclusive, so messages dated today are not matched.
# NOTE(review): %b is locale-dependent while IMAP expects English month
# abbreviations — confirm the kernel runs under the default "C" locale.
end_date = date.today()
start_date = end_date - relativedelta(months=12)
since = start_date.strftime("%d-%b-%Y")
before = end_date.strftime("%d-%b-%Y")

emails = []
try:
    # Search for emails in date range
    status, data = mail.search(None, f'(SINCE {since} BEFORE {before})')
    if status != 'OK':
        raise RuntimeError(f"IMAP search failed: {status} {data}")
    email_ids = data[0].split() if data and data[0] else []

    for num in email_ids:
        # BODY.PEEK[] returns the full raw message like RFC822 does, but
        # without setting the \Seen flag, so the export leaves unread mail unread.
        status, msg_data = mail.fetch(num, '(BODY.PEEK[])')
        # A usable reply is a (envelope, raw_bytes) tuple; anything else
        # (e.g. a message that vanished mid-run) is skipped.
        if status != 'OK' or not msg_data or not isinstance(msg_data[0], tuple):
            continue
        msg = message_from_bytes(msg_data[0][1])

        email_data = {
            "from": clean_header(msg.get("From")),
            "to": clean_header(msg.get("To")),
            "subject": clean_header(msg.get("Subject")),
            "date": msg.get("Date"),
            "body": get_body(msg)
        }
        emails.append(email_data)
finally:
    # Logout mail — always, so a failure mid-export does not leave the
    # IMAP session open.
    mail.logout()

Connected!


('BYE', [b'LOGOUT Requested'])

In [4]:
# Write the collected messages to disk as indented, UTF-8 encoded JSON.
with open("emails.json", "w", encoding='utf-8') as json_file:
    json.dump(emails, json_file, indent=4, ensure_ascii=False)

# Report how many messages were written.
print(f"Saved {len(emails)} emails to emails.json.")

Saved 2709 emails to emails.json.
