In [None]:
import mailbox
import re
import xlwings as xw
import pickle
import os
from pathlib import Path
from pdfminer.high_level import extract_text
import pymongo

## Configuration parameters

In [None]:
# Input locations are injected through the environment so that no local paths
# are hard-coded in the notebook. Either name is None when its variable is
# not set.
mboxfile = os.getenv("MBOX_FILE")
xls_file = os.getenv("XLS_FILE")

## Identification of the *fake* e-mails from my mbox by querying the master Excel table

In [None]:
# Read the master list of sender addresses from the second column (B) of the
# first worksheet.
with xw.App() as app:
    # Open the workbook through the App created above so that it lives in this
    # Excel instance (and is shut down with it) instead of in whichever
    # instance happens to be active.
    book = app.books.open(xls_file)
    sheet = book.sheets[0]
    # The last row is derived from column A, while the values are read from
    # column B.
    # NOTE(review): this assumes column A is filled without gaps for every
    # row that has an address in column B - confirm against the workbook.
    num_row = sheet.range('A1').end('down').row

    # collect data
    # ndim=1 makes xlwings return a list even when the range is a single cell
    # (by default a one-cell range is returned as a bare scalar).
    content_list = sheet.range((1, 2), (num_row, 2)).options(ndim=1).value

    book.close()

In [None]:
# Normalise the addresses read from the spreadsheet (lower-case, no
# surrounding whitespace) so they compare equal to the normalised senders
# extracted from the mbox.
# A single-cell read can arrive as one bare string; wrap it so it is not
# iterated character by character. A missing value becomes an empty list.
_cells = [content_list] if isinstance(content_list, str) else (content_list or [])
# Empty cells (None) and non-text cells (e.g. numbers) cannot be addresses and
# would make .lower() fail, so they are skipped.
emails = [cell.lower().strip() for cell in _cells if isinstance(cell, str)]

In [None]:
# Split the distinct sender addresses found in the mbox into two sets:
#   badones        - senders that appear in the spreadsheet list (`emails`)
#   not_identified - every other sender
f = re.compile(r'[\w.-]+@[\w.-]+\.\w+')
badones = set()
not_identified = set()

# Hash-based lookup instead of scanning the list once per message.
known_senders = set(emails)

# create=False: a wrong or missing path raises instead of silently creating an
# empty mailbox file (mailbox.mbox defaults to create=True).
inbox = mailbox.mbox(mboxfile, create=False)
try:
    for thisemail in inbox:
        # A missing From header stringifies to 'None', which does not match.
        match = f.search(str(thisemail['From']))
        if match is None:
            continue
        msender = match.group().lower().strip()
        if msender in known_senders:
            badones.add(msender)
        else:
            not_identified.add(msender)
finally:
    # The mailbox keeps its file open until it is closed explicitly.
    inbox.close()

In [None]:
# Persist both sender sets as pickles so later runs can reuse them.
# Note that the senders that were not identified are stored under the
# name 'goodones.pickle'.
path = './pickles'
Path(path).mkdir(parents=True, exist_ok=True)

for pickle_name, sender_set in (
    ('badones.pickle', badones),
    ('goodones.pickle', not_identified),
):
    with open(os.path.join(path, pickle_name), 'wb') as f:
        pickle.dump(sender_set, f, pickle.HIGHEST_PROTOCOL)

## Extraction of the attached PDFs from the *bad* and the unidentified e-mails

**Structure of the messages**

*Either:*

One part message with content type: text/plain


*Or*

Multipart message

*Subparts of the multipart message:*

Content type multipart/mixed

Content type text/plain

Content type application/octet-stream

## Helper functions

In [None]:
def getcharsets(msg):
    """Return the set of charsets declared in ``msg``.

    ``msg.get_charsets()`` is consulted and every ``None`` entry (a part that
    declares no charset) is dropped.
    """
    return {declared for declared in msg.get_charsets() if declared is not None}

def handleerror(errmsg, emailmsg, cs):
    """Print a short report about a decoding failure.

    The report consists of an empty line, ``errmsg``, the charset ``cs`` that
    was in use, all charsets found in ``emailmsg`` and the message's subject
    and sender headers.
    """
    report = (
        (),
        (errmsg,),
        ("This error occurred while decoding with ", cs, " charset."),
        ("These charsets were found in the one email.", getcharsets(emailmsg)),
        ("This is the subject:", emailmsg['subject']),
        ("This is the sender:", emailmsg['From']),
    )
    # One print call per report line; the fields are joined by print's
    # default single-space separator.
    for fields in report:
        print(*fields)

def getbodyfromemail(msg, msender, status):
    """Save the binary attachment(s) of ``msg`` to disk, named after the sender.

    Every ``application/octet-stream`` part found anywhere in a multipart
    message is written to ``<directory>/<msender>.pdf``. For a single-part
    message only its content type is reported; nothing is written.

    Args:
        msg: The message to inspect. It must offer ``is_multipart()``,
            ``walk()``, ``get_content_type()`` and header access via
            ``msg['From']`` (e.g. a message yielded by ``mailbox.mbox``).
        msender: Sender address; used as the base name of the saved file.
        status: ``'bad'`` stores into ``./badones``, ``'not_identified'`` into
            ``./not_identified``; any other value stores into the current
            working directory.

    Returns:
        ``msender``, unchanged. Despite the function's name, no body text is
        extracted or returned.
    """
    path = {'bad': './badones', 'not_identified': './not_identified'}.get(status, '')
    Path(path).mkdir(parents=True, exist_ok=True)

    if not msg.is_multipart():
        print('One part message with content type:', msg.get_content_type(), 'from:', msg['From'])
        return msender

    print('Multipart message from:', msender)
    # walk() already visits the message and all nested subparts depth-first,
    # so a single pass reaches every attachment; walking each multipart
    # container again would only rewrite the same file repeatedly.
    for part in msg.walk():
        if part.is_multipart():
            print('Part in multipart message from:', msg['From'])
            continue
        if part.get_content_type() != 'application/octet-stream':
            continue
        # We save the attachment (pdf-file) as [e-mail].pdf.
        # NOTE: the file name depends on the sender only, so a later
        # attachment (in this message or in another message from the same
        # sender) overwrites an earlier one.
        filename = msender + '.pdf'
        # The context manager closes the file even if the write fails.
        with open(os.path.join(path, filename), 'wb') as fb:
            fb.write(part.get_payload(decode=True))

    return msender

In [None]:
# Second pass over the mbox: hand every message with a recognisable sender
# address to getbodyfromemail, tagged by whether that sender is in `badones`.
f = re.compile(r'[\w.-]+@[\w.-]+\.\w+')

# create=False: a wrong or missing path raises instead of silently creating an
# empty mailbox file (mailbox.mbox defaults to create=True).
inbox = mailbox.mbox(mboxfile, create=False)
try:
    for thisemail in inbox:
        # A missing From header stringifies to 'None', which does not match.
        match = f.search(str(thisemail['From']))
        if match is None:
            continue
        msender = match.group(0).lower().strip()
        status = 'bad' if msender in badones else 'not_identified'
        getbodyfromemail(thisemail, msender, status)
finally:
    # The mailbox keeps its file open until it is closed explicitly.
    inbox.close()