In [1]:
import re
import os
import string
from email.parser import Parser
import pandas as pd

In [2]:
# Root directory of the maildir tree that the os.walk loop further down traverses.
# The trailing backslash matters: that loop strips this exact prefix from every
# directory path when it builds the per-file identifiers.
# NOTE(review): hard-coded, machine-specific Windows path - adjust before running elsewhere.
email_root_folder = 'W:\\Documents\\CPEN\\L400S1\\AI\\project\\maildir\\'

In [3]:
# don_t_read_from = ['family', 'fun', 'contacts', 'calendar', 'personal', 'funny']

In [4]:
# Empty table, one column per extracted field; it is filled in by the
# directory-walk cell further down.
email_df = pd.DataFrame(
    columns=[
        'file',
        'date',
        'subject',
        'from',
        'to',
        'email_body',
    ],
)

In [5]:
# pattern2 example dasovich-j/sent/3184
# remove attached-file lines, e.g. ' - Undercollection assessment final.doc' (see dasovich-j/sent/3395)

# remove anything between <>, [], {}, ()

# the body of the email is between X-FileName: ddavis2.nsf and the patterns defined above

# what not to read
# any folder named 'family'
# any folder whose name begins with 'mba' should be ignored
# any folder named 'fun'
# contacts

# don_t_read_from = ['family', 'mba', 'fun', 'contacts', 'calendar', 'personal', 'funny']

# I think I should exclude emails with fewer than 10 words

In [6]:
# Patterns used by extract_clean_body, compiled once instead of on every call.
# Everything from the first 'X-FileName: ' line to the end of the text.
_BODY_FROM_HEADER_RE = re.compile(r'X-FileName: .*\n*(.*\n*)*')
# The 'X-FileName: ...' header line itself.
_FILENAME_HEADER_RE = re.compile(r'X-FileName: .*')
# A forwarded/original-message/inline-attachment marker, or four or more
# consecutive newlines, together with everything that follows it.
_QUOTED_TAIL_RE = re.compile(
    r'(-{5,}\s*Forwarded\s*by|-{5,}\s*Original\s*Message-{5,}|-+\s*Inline\s*attachment\s*follows|\n{4,})(.*\n*)+'
)
# <...> on a single line.
_HTML_TAG_RE = re.compile(r'<.*?>')
# (...), {...}, [...] on a single line, and '&word' entities. All three bracket
# forms are non-greedy so that text between two separate groups is kept.
_BRACKETED_RE = re.compile(r'\(.*?\)|\{.*?\}|\[.*?\]|\&\w+')
_EMAIL_ADDRESS_RE = re.compile(r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+(\.[a-zA-Z]{2,4}))')


def extract_clean_body(email):
    """Extract and normalise the main message text of a raw email.

    The body is taken to start after the first ``X-FileName: ...`` line and to
    end at the first forwarded/original-message/inline-attachment marker or at
    a run of four or more newlines. From that text the function removes email
    addresses, ``<...>`` tags, bracketed text, ``&word`` entities, links,
    tokens containing digits and ASCII punctuation, then collapses whitespace
    and lower-cases the result.

    Args:
        email: Full raw text of one email (headers and body).

    Returns:
        The cleaned, lower-cased body. It may still carry a single leading or
        trailing space (callers strip it). An empty string is returned when
        the text contains no ``X-FileName: `` line.
    """
    body_match = _BODY_FROM_HEADER_RE.search(email)
    if body_match is None:
        # No X-FileName line, so there is no recognisable body to extract.
        return ''

    email_body = body_match.group()
    # Cut the quoted/forwarded tail first, then drop the X-FileName line(s).
    email_body = _QUOTED_TAIL_RE.sub('', email_body)
    email_body = _FILENAME_HEADER_RE.sub('', email_body)
    email_body = _EMAIL_ADDRESS_RE.sub('', email_body)
    # TODO: disclaimer footers and attachment lines (' - name.doc') are not stripped yet.
    email_body = _HTML_TAG_RE.sub('', email_body)
    email_body = _BRACKETED_RE.sub('', email_body)

    email_body = re.sub(r'https?://\S+', ' ', email_body)  # links
    email_body = re.sub(r'\n', ' ', email_body)  # newlines
    email_body = re.sub(r'\t', ' ', email_body)  # tabs
    email_body = re.sub(r'\w*\d+\w*', '', email_body)  # tokens containing digits
    email_body = re.sub(r'[%s]' % re.escape(string.punctuation), '', email_body)  # ASCII punctuation
    email_body = re.sub(r'\s{2,}', ' ', email_body)  # runs of two or more whitespace characters

    return email_body.lower()

In [7]:
def extract_clean_metadata(email):
    """Parse a raw email and return its From, To, Date and Subject headers.

    Args:
        email: Full raw text of one email (headers and body).

    Returns:
        A dict with the keys ``'from'``, ``'to'``, ``'date'`` and
        ``'subject'``. ``'from'`` and ``'to'`` have all whitespace removed.
        ``'date'`` and ``'subject'`` have every whitespace run (including the
        newline/tab of a folded header line) collapsed to one space and are
        trimmed. A header that is absent from the email yields an empty string.
    """
    message = Parser().parsestr(email)

    def header_text(name):
        # message[name] is None for an absent header; str(None) would put the
        # literal text 'None' into the dataset, so map it to '' instead.
        value = message[name]
        return '' if value is None else str(value)

    def without_whitespace(text):
        return re.sub(r'\s', '', text)

    def single_spaced(text):
        # Collapse rather than delete, so words on either side of a folded
        # line or a double space stay separated.
        return re.sub(r'\s+', ' ', text).strip()

    return {
        'from': without_whitespace(header_text('from')),
        'to': without_whitespace(header_text('to')),
        'date': single_spaced(header_text('date')),
        'subject': single_spaced(header_text('subject')),
    }

In [9]:
# Spot-check the body cleaner on the single raw email held in `ds`.
# NOTE(review): `ds` is not assigned in any cell shown here (execution count 8 is
# missing), so this cell fails on a fresh kernel unless `ds` is defined first.
# Despite its name, `df` holds the cleaned body string, not a DataFrame.
df = extract_clean_body(ds).strip()
df

'this in an automated email sent out from the commissionercom web site do not reply to this email but instead please visit your leagues site at mark friedman requests the following transaction add jacquez green to approve this transaction please go to your web site and select transactions requests in the front office'

In [10]:
# Spot-check the header extraction on the same raw email held in `ds`.
# NOTE(review): `ds` is not assigned in any cell shown here - define it before running.
th = extract_clean_metadata(ds)
th

{'from': 'mark.friedman@enron.com',
 'to': 'mcuilla@enron.com',
 'date': 'Fri, 15 Sep 2000 07:02:00 -0700 (PDT)',
 'subject': 'Commissioner.COM Transaction Request'}

In [11]:
# Walk every directory under email_root_folder, clean each email file and
# rebuild email_df with one row per file.
# Rows are collected in a plain list and the frame is built once at the end:
# growing a DataFrame with .loc row by row copies data on every insert and is
# quadratic in the number of files. Rebuilding also makes this cell idempotent
# (re-running it no longer duplicates rows).
email_rows = []
for root, dirs, files in os.walk(email_root_folder):
    # Turn the path below the root into an identifier prefix:
    # '\' becomes '__' (two underscores) and '-' becomes '___' (three underscores).
    dir_name = root.replace(email_root_folder, '').replace('\\', '__').replace('-', '___')
    print(dir_name)

    if not files:
        print('no files in this directory')

    # Read every file in the directory, clean its contents and record it.
    for file in files:
        # NOTE(review): the file is decoded with the platform default encoding,
        # as before - confirm that this suits the corpus on the target machine.
        with open(os.path.join(root, file)) as f:
            email = f.read()

        file_name = dir_name + '_' + str(file)
        metadata = extract_clean_metadata(email)
        email_body = extract_clean_body(email).strip()

        email_rows.append([
            file_name,
            metadata['date'],
            metadata['subject'],
            metadata['from'],
            metadata['to'],
            email_body,
        ])
    print('--------------------------------')

email_df = pd.DataFrame(email_rows, columns=email_df.columns)


no files in this directory
--------------------------------
allen___p
no files in this directory
--------------------------------
allen___p__all_documents
--------------------------------
allen___p__contacts
--------------------------------
allen___p__deleted_items
--------------------------------
allen___p__discussion_threads
--------------------------------
allen___p__inbox
--------------------------------
allen___p__notes_inbox
--------------------------------
allen___p__sent
--------------------------------
allen___p__sent_items
--------------------------------
allen___p__straw
--------------------------------
allen___p___sent_mail
--------------------------------
arnold___j
no files in this directory
--------------------------------
arnold___j__2000_conference
--------------------------------
arnold___j__active_international
--------------------------------
arnold___j__all_documents
--------------------------------
arnold___j__avaya
--------------------------------
arnold___j__bm

In [12]:
# Preview the first rows of the assembled table.
email_df.head()

Unnamed: 0,file,date,subject,from,to,email_body
0,allen___p__all_documents_1,"Wed, 13 Dec 2000 18:41:00 -0800 (PST)","December 14, 2000 - Bear Stearns' predictions ...",1.11913372.-2@multexinvestornetwork.com,pallen@enron.com,in todays daily update youll find free reports...
1,allen___p__all_documents_10,"Wed, 13 Dec 2000 08:35:00 -0800 (PST)",Bloomberg Power Lines Report,messenger@ecm.bloomberg.com,,here is todays copy of bloomberg power lines a...
2,allen___p__all_documents_100,"Mon, 9 Oct 2000 07:16:00 -0700 (PDT)",Consolidated positions: Issues & To Do list,phillip.allen@enron.com,keith.holst@enron.com,
3,allen___p__all_documents_101,"Mon, 9 Oct 2000 07:00:00 -0700 (PDT)",Consolidated positions: Issues & To Do list,phillip.allen@enron.com,keith.holst@enron.com,
4,allen___p__all_documents_102,"Thu, 5 Oct 2000 06:26:00 -0700 (PDT)",,phillip.allen@enron.com,david.delainey@enron.com,dave here are the names of the west desk membe...


In [14]:
# Per-column summary (count / unique / top / freq) of the assembled table.
email_df.describe()

Unnamed: 0,file,date,subject,from,to,email_body
count,517401,517401,517401.0,517401,517401.0,517401.0
unique,517401,224128,159237.0,20328,58564.0,199250.0
top,allen___p__all_documents_1,"Wed, 27 Jun 2001 16:02:00 -0700 (PDT)",,kay.mann@enron.com,,
freq,1,1118,19187.0,16735,21847.0,57011.0


In [15]:
# (rows, columns) of the assembled table.
email_df.shape

(517401, 6)

In [16]:
# Write the dataset to a CSV file.
# Create the output folder first: to_csv does not create missing directories
# and fails when 'dataset/' does not exist.
os.makedirs('dataset', exist_ok=True)
email_df.to_csv('dataset/enron_emails.csv', index=False)