In [1]:
import os
import email
import email.header
import pandas as pd
from email import policy
from email.parser import BytesParser

In [2]:
def parse_eml_file(file_path):
    with open(file_path, 'rb') as f:
        msg = BytesParser(policy=policy.default).parse(f)
    
    # Extract key information
    subject = msg.get('Subject', 'No Subject')
    sender = msg.get('From', 'No Sender')
    recipients = msg.get('To', 'No Recipients')
    date = msg.get('Date', 'No Date')
    
    # Decode subject if needed
    try:
        if subject:
            decoded_subject = email.header.decode_header(subject)
            subject = ''.join(part[0].decode(part[1] or 'utf-8') if isinstance(part[0], bytes) else str(part[0]) for part in decoded_subject)
    except:
        pass
    
    # Get email body
    body = ""
    if msg.is_multipart():
        for part in msg.walk():
            if part.get_content_type() == 'text/plain':
                body = part.get_payload(decode=True).decode('utf-8', errors='ignore')
                break
    else:
        body = msg.get_payload(decode=True).decode('utf-8', errors='ignore')
    
    return {
        'subject': subject,
        'sender': sender,
        'recipients': recipients,
        'date': date,
        'body': body
    }

In [3]:
def extract_emails_from_folder(folder_path):
    emails_data = []
    email_count = 0
    
    for root, dirs, files in os.walk(folder_path):
        if email_count >= 5000:
            break
        for file in files:
            # For maildir, files are numbered without .eml extension
            # Also check if it's a file, not directory
            if file not in ['.', '..'] and not os.path.isdir(os.path.join(root, file)):
                if email_count >= 5000:
                    break
                file_path = os.path.join(root, file)
                try:
                    email_data = parse_eml_file(file_path)
                    email_data['file_path'] = file_path
                    emails_data.append(email_data)
                    email_count += 1
                    
                    if email_count % 1000 == 0:
                        print(f"Processed {email_count} emails...")
                        
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
        if email_count >= 5000:
            break
    
    return emails_data

In [4]:
enron_folder = 'enron_dataset/maildir'

if os.path.exists(enron_folder):
    print("Extracting email data from Enron dataset...")
    emails = extract_emails_from_folder(enron_folder)
    
    # Convert to DataFrame
    df = pd.DataFrame(emails)
    print(f"\nExtraction complete! Found {len(df)} emails.")
    print(df.head())
    
    # Save to CSV for further analysis
    df.to_csv('enron_emails.csv', index=False)
    print("\nData saved to enron_emails.csv")
else:
    print(f"Enron dataset folder '{enron_folder}' not found.")
    print("Please download the dataset using the instructions in ENRON_DOWNLOAD_GUIDE.md")

Extracting email data from Enron dataset...
Processed 1000 emails...
Processed 2000 emails...
Processed 3000 emails...
Processed 4000 emails...
Processed 5000 emails...

Extraction complete! Found 5000 emails.
                                             subject  \
0                                             Status   
1                                 re:summer inverses   
2                      The WTI Bullet swap contracts   
3  Invitation: EBS/GSS Meeting w/Bristol Babcock ...   
4                                       Harvard Mgmt   

                      sender  \
0            msagel@home.com   
1    slafontaine@globalp.com   
2    iceoperations@intcx.com   
3  jeff.youngflesh@enron.com   
4  caroline.abramo@enron.com   

                                          recipients  \
0                                  jarnold@enron.com   
1                              john.arnold@enron.com   
2  icehelpdesk@intcx.com, internalmarketing@intcx...   
3  anthony.gilmore@enron.com, collee