# Enron Email Extraction

This notebook extracts various fields from the Enron email dataset, including:
- Message-ID
- Date
- From
- To
- Cc / Bcc
- Subject
- Mime-Version, Content-Type, Content-Transfer-Encoding
- Body (plain text/HTML content)
- Attachments

In [6]:
# Import required libraries
import os
import email
import email.header
import pandas as pd
from email import policy
from email.parser import BytesParser
import warnings
warnings.filterwarnings('ignore')

print("Imported libraries successfully.")

Imported libraries successfully.


In [7]:
# Function to parse individual EML files
def parse_eml_file(file_path):
    """
    Parse an EML file and extract all requested fields
    """
    with open(file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)
    
    # Extract standard email headers
    subject = msg.get('Subject', 'No Subject')
    sender = msg.get('From', 'No Sender')
    recipients = msg.get('To', 'No Recipients')
    cc = msg.get('Cc', '')
    bcc = msg.get('Bcc', '')
    date = msg.get('Date', 'No Date')
    message_id = msg.get('Message-ID', 'No Message-ID')
    
    # Extract MIME-related headers
    mime_version = msg.get('Mime-Version', '')
    content_type = msg.get('Content-Type', '')
    content_transfer_encoding = msg.get('Content-Transfer-Encoding', '')
    
    # Decode subject if needed
    try:
        if subject:
            decoded_subject = email.header.decode_header(subject)
            subject = ''.join(part[0].decode(part[1] or 'utf-8') if isinstance(part[0], bytes) else str(part[0]) for part in decoded_subject)
    except:
        pass
    
    # Extract email body and attachments
    body = ''
    attachments = []
    
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == 'text/plain' and not part.get_filename():
                body = part.get_payload(decode=True).decode('utf-8', errors='ignore')
            elif part.get_filename():
                attachments.append(part.get_filename())
    else:
        body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
    
    return {
        'file': file_path,
        'message_id': message_id,
        'date': date,
        'from': sender,
        'to': recipients,
        'cc': cc,
        'bcc': bcc,
        'subject': subject,
        'body': body,
        'mime_version': mime_version,
        'content_type': content_type,
        'content_transfer_encoding': content_transfer_encoding,
        'attachments': '; '.join(attachments) if attachments else ''
    }

print("parse_eml_file function defined!")

parse_eml_file function defined!


In [8]:
# Function to extract emails from folder
def extract_emails_from_folder(folder_path, limit=1000):
    """
    Extract email data from all files in the maildir folder structure
    """
    emails_data = []
    email_count = 0
    
    print(f"Starting extraction from {folder_path}...")
    
    for root, dirs, files in os.walk(folder_path):
        if email_count >= limit:
            break
            
        for file in files:
            if file not in ['.', '..'] and not os.path.isdir(os.path.join(root, file)):
                if email_count >= limit:
                    break
                    
                file_path = os.path.join(root, file)
                try:
                    email_data = parse_eml_file(file_path)
                    emails_data.append(email_data)
                    email_count += 1
                    
                    if email_count % 500 == 0:
                        print(f'Processed {email_count} emails...')
                        
                except Exception as e:
                    print(f'Error processing {file_path}: {e}')
                    
        if email_count >= limit:
            break
    
    print(f"Extraction completed! Processed {len(emails_data)} emails.")
    return emails_data

print("extract_emails_from_folder function defined!")

extract_emails_from_folder function defined!


In [9]:
# Main extraction process
import json

enron_folder = 'enron_dataset/maildir'

if os.path.exists(enron_folder):
    print('Extracting email data from Enron dataset...')
    emails = extract_emails_from_folder(enron_folder, 5000)
    
    # Convert to DataFrame for analysis
    df = pd.DataFrame(emails)
    print(f'\nExtraction complete! Found {len(df)} emails.')
    
    # Display columns
    print('\nColumns extracted:', list(df.columns))
    
    # Save to JSON format for easier indexing
    output_file = 'enron_emails_extracted.json'
    
    # Save as JSON with proper formatting
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump({'emails': emails, 'total_count': len(emails)}, f, indent=2, ensure_ascii=False)
    
    print(f'\nData saved to {output_file}')
else:
    print(f'Enron dataset folder {enron_folder} not found.')
    print('Please download the dataset using the instructions in ENRON_DOWNLOAD_GUIDE.md')

Extracting email data from Enron dataset...
Starting extraction from enron_dataset/maildir...
Processed 500 emails...
Processed 1000 emails...
Processed 1500 emails...
Processed 2000 emails...
Processed 2500 emails...
Processed 3000 emails...
Processed 3500 emails...
Processed 4000 emails...
Processed 4500 emails...
Processed 5000 emails...
Extraction completed! Processed 5000 emails.

Extraction complete! Found 5000 emails.

Columns extracted: ['file', 'message_id', 'date', 'from', 'to', 'cc', 'bcc', 'subject', 'body', 'mime_version', 'content_type', 'content_transfer_encoding', 'attachments']

Data saved to enron_emails_extracted.json


In [10]:
# Display sample of extracted data
if 'df' in locals():
    print("Sample of extracted emails:")
    display(df.head())
    
    print("\nDataFrame Info:")

Sample of extracted emails:


Unnamed: 0,file,message_id,date,from,to,cc,bcc,subject,body,mime_version,content_type,content_transfer_encoding,attachments
0,enron_dataset/maildir/arnold-j/notes_inbox/36.,<17334447.1075857585446.JavaMail.evans@thyme>,"Thu, 16 Nov 2000 09:30:00 -0800",msagel@home.com,jarnold@enron.com,,,Status,John:\n?\nI'm not really sure what happened be...,1.0,"text/plain; charset=""ANSI_X3.4-1968""",7bit,
1,enron_dataset/maildir/arnold-j/notes_inbox/19.,<19171686.1075857585034.JavaMail.evans@thyme>,"Fri, 08 Dec 2000 05:05:00 -0800",slafontaine@globalp.com,john.arnold@enron.com,,,re:summer inverses,i suck-hope youve made more money in natgas la...,1.0,"text/plain; charset=""us-ascii""",7bit,
2,enron_dataset/maildir/arnold-j/notes_inbox/50.,<29887033.1075857630725.JavaMail.evans@thyme>,"Tue, 15 May 2001 09:43:00 -0700",iceoperations@intcx.com,"icehelpdesk@intcx.com, internalmarketing@intcx...",,,The WTI Bullet swap contracts,"Hi,\n\n\n Following the e-mail you have rece...",1.0,"text/plain; charset=""us-ascii""",7bit,
3,enron_dataset/maildir/arnold-j/notes_inbox/3.,<29084893.1075849630138.JavaMail.evans@thyme>,"Mon, 27 Nov 2000 01:49:00 -0800",jeff.youngflesh@enron.com,"anthony.gilmore@enron.com, colleen.koenig@enro...",,,Invitation: EBS/GSS Meeting w/Bristol Babcock ...,Conference Room TBD. \n\nThis meeting will be...,1.0,"text/plain; charset=""us-ascii""",7bit,
4,enron_dataset/maildir/arnold-j/notes_inbox/9.,<30248874.1075857584813.JavaMail.evans@thyme>,"Tue, 12 Dec 2000 09:33:00 -0800",caroline.abramo@enron.com,mike.grigsby@enron.com,john.arnold@enron.com,john.arnold@enron.com,Harvard Mgmt,Mike- I have their trader coming into the offi...,1.0,"text/plain; charset=""us-ascii""",7bit,



DataFrame Info:
