In [26]:
import mailbox
import re
import xlwings as xw
import pickle
import os
from pathlib import Path
from pdfminer.high_level import extract_text
import pymongo

## Configuration parameters

In [35]:
# Both input locations are taken from the environment so that no local path
# is hard-coded in the notebook. Either value is None when its variable is
# not set.
mboxfile = os.getenv("MBOX_FILE")
xls_file = os.getenv("XLS_FILE")

## Identification of the *fake* e-mails from my mbox by querying the master Excel table

In [36]:
# Read the sender addresses stored in column B of the first sheet of the
# master workbook into `content_list`.
with xw.App() as app:
    # Open the workbook through the Excel instance created above. xw.Book()
    # would use whichever instance is currently active, which is not
    # necessarily the one this context manager owns and quits.
    book = app.books.open(xls_file)
    sheet = book.sheets[0]

    # The extent of the table is taken from column A: last filled row when
    # walking down from A1.
    num_row = sheet.range('A1').end('down').row

    # collect data: column B, rows 1..num_row. ndim=1 makes xlwings return a
    # list even when the range holds a single cell.
    content_list = sheet.range((1, 2), (num_row, 2)).options(ndim=1).value

    book.close()

In [37]:
# Normalise the addresses read from the workbook (lower case, no surrounding
# whitespace) so they can be compared with the senders found in the mailbox.
# Cells that do not hold text (xlwings returns None for empty cells and
# numbers for numeric ones) are skipped instead of raising AttributeError.
emails = [
    email.lower().strip()
    for email in content_list
    if isinstance(email, str)
]

In [38]:
# Finds the first address-like token (local-part@domain.tld) in a From header.
f = re.compile(r'[\w.-]+@[\w.-]+\.\w+')
badones = set()          # senders that are listed in the master table
not_identified = set()   # senders that are not listed there

# Build a set once so each membership test is a hash lookup instead of a scan
# of the whole address list for every message.
known_senders = set(emails)

# create=False: a wrong path raises mailbox.NoSuchMailboxError instead of
# silently creating an empty mbox file and producing empty result sets.
inbox = mailbox.mbox(mboxfile, create=False)
try:
    for thisemail in inbox:
        match = f.search(str(thisemail['From']))
        if match is None:
            # No address could be extracted from the From header.
            continue
        msender = match.group().lower().strip()
        if msender in known_senders:
            badones.add(msender)
        else:
            not_identified.add(msender)
finally:
    # Release the file handle held by the mailbox object.
    inbox.close()

In [39]:
# Persist both sender sets as pickle files under ./pickles for further use.
# The set of unlisted senders is stored as 'goodones.pickle'.
path = './pickles'
Path(path).mkdir(parents=True, exist_ok=True)

for filename, addresses in (
    ('badones.pickle', badones),
    ('goodones.pickle', not_identified),
):
    with open(os.path.join(path, filename), 'wb') as f:
        pickle.dump(addresses, f, pickle.HIGHEST_PROTOCOL)

## Extraction of the attached PDFs from the *bad* e-mails

**Structure of the messages**

*Either:*

Single-part message with content type: text/plain


*Or*

Multipart message

*Subparts of the multipart message:*

Content type multipart/mixed

Content type text/plain

Content type application/octet-stream

## Helper functions

In [41]:
def getcharsets(msg):
    """Return the set of charsets declared in ``msg``.

    ``msg.get_charsets()`` may contain ``None`` entries (parts without a
    declared charset); those are left out of the result.
    """
    return {charset for charset in msg.get_charsets() if charset is not None}

def handleerror(errmsg, emailmsg, cs):
    """Print a short report about a decoding problem.

    Args:
        errmsg: the error text to show.
        emailmsg: the message being processed; its charsets, subject and
            From header are printed.
        cs: the charset that was in use when the error occurred.
    """
    print()
    print(errmsg)
    print("This error occurred while decoding with ", cs, " charset.")

    found_charsets = getcharsets(emailmsg)
    print("These charsets were found in the one email.", found_charsets)

    subject = emailmsg['subject']
    print("This is the subject:", subject)

    sender = emailmsg['From']
    print("This is the sender:", sender)

def getbodyfromemail(msg, msender, status):
    """Save the ``application/octet-stream`` attachment of ``msg`` to disk.

    Despite its name the function does not return the message body: it writes
    attachments and returns the sender address unchanged.

    Args:
        msg: the e-mail message to inspect.
        msender: sender address; the attachment is stored as
            ``<msender>.pdf``.
        status: ``'bad'`` stores the file under ``./badones``,
            ``'not_identified'`` under ``./not_identified``; any other value
            stores it in the current working directory.

    Returns:
        ``msender``.

    Notes:
        * Only parts whose content type is ``application/octet-stream`` are
          saved; parts declared as e.g. ``application/pdf`` are ignored.
        * The file name depends on the sender only, so when a message carries
          several matching parts (or a sender wrote several messages) each
          write replaces the previous one; within one message the last
          matching part in document order is the one that remains.
    """
    if status == 'bad':
        path = './badones'
    elif status == 'not_identified':
        path = './not_identified'
    else:
        path = ''
    Path(path).mkdir(parents=True, exist_ok=True)

    if not msg.is_multipart():
        # if msg.get_content_type() == 'text/plain':
        print('One part message with content type:', msg.get_content_type(), 'from:', msg['From'])
        return msender

    # Walk through the parts of the e-mail.
    # We save the attachment (pdf-file) as [e-mail].pdf
    print('Multipart message from:', msender)
    target = os.path.join(path, msender + '.pdf')

    # msg.walk() already descends into nested multipart containers, so one
    # pass visits every part; walking each container again would only rewrite
    # the same attachments.
    for part in msg.walk():
        if part.is_multipart():
            print('Part in multipart message from:', msg['From'])
            continue
        if part.get_content_type() != 'application/octet-stream':
            continue
        payload = part.get_payload(decode=True)
        if payload is None:
            # Nothing to write for a part without content.
            continue
        # The context manager closes the file even if the write fails.
        with open(target, 'wb') as fb:
            fb.write(payload)

    return msender

In [42]:
# Second pass over the mailbox: hand every message with a recognisable sender
# address to getbodyfromemail(), labelled by whether the sender is in `badones`.
f = re.compile(r'[\w.-]+@[\w.-]+\.\w+')

# create=False: a wrong path raises mailbox.NoSuchMailboxError instead of
# silently creating an empty mbox file.
inbox = mailbox.mbox(mboxfile, create=False)
try:
    for thisemail in inbox:
        match = f.search(str(thisemail['From']))
        if match is None:
            # No address could be extracted from the From header.
            continue
        msender = match.group(0).lower().strip()
        status = 'bad' if msender in badones else 'not_identified'
        getbodyfromemail(thisemail, msender, status)
finally:
    # Release the file handle held by the mailbox object.
    inbox.close()

Multipart message from: andrea0walter11@gmail.com
Part in multipart message from: andrea0walter11@gmail.com
Multipart message from: mariana.rafael.savian@gmail.com
Part in multipart message from: "Mariana R. Savian" <mariana.rafael.savian@gmail.com>
Part in multipart message from: "Mariana R. Savian" <mariana.rafael.savian@gmail.com>
Multipart message from: wenliehue@gmail.com
Part in multipart message from: wenliehue@gmail.com
One part message with content type: text/plain from: hchong333@gmail.com
Multipart message from: kasiaorblin@gmail.com
Part in multipart message from: kasiaorblin@gmail.com
Multipart message from: maria.charlotte550@gmail.com
Part in multipart message from: maria.charlotte550@gmail.com
Multipart message from: ulla3bergstrom@gmail.com
Part in multipart message from: ulla3bergstrom@gmail.com
Multipart message from: mohammad.ali4translation@gmail.com
Part in multipart message from: Mohammad Ali <mohammad.ali4translation@gmail.com>
Part in multipart message from: Mo

One part message with content type: text/plain from: kayhanjoan484@gmail.com
Multipart message from: russelljuskova89@gmail.com
Part in multipart message from: Russell Juskova <russelljuskova89@gmail.com>
Part in multipart message from: Russell Juskova <russelljuskova89@gmail.com>
One part message with content type: text/plain from: lfaust258@gmail.com
Multipart message from: contact@qualtransloc.com
Part in multipart message from: Andrei Ionut Carp <contact@qualtransloc.com>
Multipart message from: ellisbennick@gmail.com
Part in multipart message from: ellisbennick@gmail.com
Multipart message from: juliephan101@gmail.com
Part in multipart message from: juliephan101@gmail.com
Multipart message from: danieljacobbbbb@gmail.com
Part in multipart message from: danieljacobbbbb@gmail.com
Multipart message from: iryanaegger00@gmail.com
Part in multipart message from: iryanaegger00@gmail.com
Multipart message from: emmav5870@gmail.com
Part in multipart message from: Van Damme Emma <emmav5870@g

Multipart message from: danielparrott525@gmail.com
Part in multipart message from: danielparrott525@gmail.com
Multipart message from: hi@bergentext.one
Part in multipart message from: "Bergen Text Company" <hi@bergentext.one>
Multipart message from: vardmary03@gmail.com
Part in multipart message from: Mary Vard <vardmary03@gmail.com>
Part in multipart message from: Mary Vard <vardmary03@gmail.com>
Multipart message from: luckma.dorj@gmail.com
Part in multipart message from: luckma.dorj@gmail.com
One part message with content type: text/plain from: kayhanjoan484@gmail.com
Multipart message from: hana.abusalim8@gmail.com
Part in multipart message from: hana.abusalim8@gmail.com
Multipart message from: deborah.nicolas810@gmail.com
Part in multipart message from: deborah.nicolas810@gmail.com
Multipart message from: rinarinabakbak@gmail.com
Part in multipart message from: rinarinabakbak@gmail.com
Multipart message from: nam33yoona@gmail.com
Part in multipart message from: Yoona Nam <nam33yoo

Multipart message from: nicoleneolzer10@gmail.com
Part in multipart message from: nicoleneolzer10@gmail.com
Multipart message from: annikaa.falk@gmail.com
Part in multipart message from: annikaa.falk@gmail.com
Multipart message from: viviendadiyanvie@gmail.com
Part in multipart message from: viviendadiyanvie@gmail.com
Multipart message from: ann0maresca@gmail.com
Part in multipart message from: ann0maresca@gmail.com
Multipart message from: ellisbennick@gmail.com
Part in multipart message from: ellisbennick@gmail.com
Multipart message from: marikimura.89@gmail.com
Part in multipart message from: Mari Kimura <marikimura.89@gmail.com>
Part in multipart message from: Mari Kimura <marikimura.89@gmail.com>
Multipart message from: christina197418@gmail.com
Part in multipart message from: christina197418@gmail.com
Multipart message from: christina197418@gmail.com
Part in multipart message from: christina197418@gmail.com
Multipart message from: elham.jarad111@gmail.com
Part in multipart message

Multipart message from: teresajory63@gmail.com
Part in multipart message from: teresajory63@gmail.com
Multipart message from: sugaranna055@gmail.com
Part in multipart message from: sugaranna055@gmail.com
Multipart message from: danielberry1234567@gmail.com
Part in multipart message from: danielberry1234567@gmail.com
Multipart message from: leahkleivenes@gmail.com
Part in multipart message from: leahkleivenes@gmail.com
Multipart message from: danilaanderson4@gmail.com
Part in multipart message from: danilaanderson4@gmail.com
Multipart message from: arvidroli12@gmail.com
Part in multipart message from: arvidroli12@gmail.com
Multipart message from: loredana.rico.italian@gmail.com
Part in multipart message from: Loredana Rico <loredana.rico.italian@gmail.com>
Part in multipart message from: Loredana Rico <loredana.rico.italian@gmail.com>
Multipart message from: helenwang640@gmail.com
Part in multipart message from: helenwang640@gmail.com
Multipart message from: diana.olmec@gmail.com
Part i

Multipart message from: luciemaruniakova.m@gmail.com
Part in multipart message from: luciemaruniakova.m@gmail.com
Multipart message from: annabehrmann12@gmail.com
Part in multipart message from: annabehrmann12@gmail.com
Multipart message from: rrosmeilanarifin@gmail.com
Part in multipart message from: rrosmeilanarifin@gmail.com
Multipart message from: yokotaki77@gmail.com
Part in multipart message from: yokotaki77@gmail.com
Multipart message from: baharalpan414@gmail.com
Part in multipart message from: baharalpan414@gmail.com
Multipart message from: nazlipiastnana@gmail.com
Part in multipart message from: nazlipiastnana@gmail.com
Multipart message from: brigittesylvain53@gmail.com
Part in multipart message from: brigittesylvain53@gmail.com
Multipart message from: tanacarthy27@gmail.com
Part in multipart message from: tanacarthy27@gmail.com
Multipart message from: for80536@gmail.com
Part in multipart message from: for80536@gmail.com
Multipart message from: claudialebrun09@gmail.com
Part

Multipart message from: emilieknutsen2@gmail.com
Part in multipart message from: emilieknutsen2@gmail.com
Multipart message from: ricciardiraymond5@gmail.com
Part in multipart message from: ricciardiraymond5@gmail.com
Multipart message from: jack.transs.selkov@gmail.com
Part in multipart message from: Jack Selkov <jack.transs.selkov@gmail.com>
Part in multipart message from: Jack Selkov <jack.transs.selkov@gmail.com>
Multipart message from: melinasolja77@gmail.com
Part in multipart message from: melinasolja77@gmail.com
Multipart message from: julieandersen798@gmail.com
Part in multipart message from: julieandersen798@gmail.com
Multipart message from: arvidroli12@gmail.com
Part in multipart message from: arvidroli12@gmail.com
Multipart message from: sand66toro@gmail.com
Part in multipart message from: sand66toro@gmail.com
Multipart message from: yokotaki77@gmail.com
Part in multipart message from: yokotaki77@gmail.com
Multipart message from: caskew834@gmail.com
Part in multipart message

Multipart message from: peter.steneitaly10@gmail.com
Part in multipart message from: Peter Steneitaly <peter.steneitaly10@gmail.com>
Part in multipart message from: Peter Steneitaly <peter.steneitaly10@gmail.com>
Multipart message from: camillalysholm70@gmail.com
Part in multipart message from: camillalysholm70@gmail.com
Multipart message from: angelapetersen070@gmail.com
Part in multipart message from: angelapetersen070@gmail.com
Multipart message from: trans.annebrandt74@gmail.com
Part in multipart message from: trans.annebrandt74@gmail.com
Multipart message from: gannetprediger@gmail.com
Part in multipart message from: gannetprediger@gmail.com
Multipart message from: irene0jansson@gmail.com
Part in multipart message from: irene0jansson@gmail.com
Multipart message from: tolmanhiroshi@gmail.com
Part in multipart message from: Tolman Hiroshi <tolmanhiroshi@gmail.com>
Part in multipart message from: Tolman Hiroshi <tolmanhiroshi@gmail.com>
Multipart message from: mirkatom159@gmail.com
P

Multipart message from: ireneshang686@gmail.com
Part in multipart message from: ireneshang686@gmail.com
Multipart message from: luciemaruniakova.m@gmail.com
Part in multipart message from: luciemaruniakova.m@gmail.com
Multipart message from: peters77sm@gmail.com
Part in multipart message from: peters77sm@gmail.com
Multipart message from: sandytyssens011@gmail.com
Part in multipart message from: sandytyssens011@gmail.com
Multipart message from: hisaokada4@gmail.com
Part in multipart message from: hisaokada4@gmail.com
Multipart message from: joanna.przybysz@rehau.com
Part in multipart message from: "Joanna Przybysz, BSCPZ 5411, ACC-BSC" <joanna.przybysz@rehau.com>
Part in multipart message from: "Joanna Przybysz, BSCPZ 5411, ACC-BSC" <joanna.przybysz@rehau.com>
Part in multipart message from: "Joanna Przybysz, BSCPZ 5411, ACC-BSC" <joanna.przybysz@rehau.com>
Multipart message from: joanna.przybysz@rehau.com
Part in multipart message from: "Joanna Przybysz, BSCPZ 5411, ACC-BSC" <joanna.pr

Multipart message from: emma9furuheim@gmail.com
Part in multipart message from: emma9furuheim@gmail.com
Multipart message from: emma9furuheim@gmail.com
Part in multipart message from: emma9furuheim@gmail.com
Multipart message from: suse0ballis@gmail.com
Part in multipart message from: suse0ballis@gmail.com
Multipart message from: hinatran099@gmail.com
Part in multipart message from: hinatran099@gmail.com
One part message with content type: text/plain from: marleendetelder@gmail.com
Multipart message from: ireneshellyi@gmail.com
Part in multipart message from: ireneshellyi@gmail.com
One part message with content type: text/plain from: jjohannawalter@gmail.com
Multipart message from: lucyrainerio03@gmail.com
Part in multipart message from: lucyrainerio03@gmail.com
Multipart message from: marta.borkowska@rehau.com
Part in multipart message from: "Marta Borkowska, BSCPZ 5426, ACC-BSC" <MARTA.BORKOWSKA@REHAU.COM>
Part in multipart message from: "Marta Borkowska, BSCPZ 5426, ACC-BSC" <MARTA.

Multipart message from: emma0berg0@gmail.com
Part in multipart message from: emma0berg0@gmail.com
One part message with content type: text/plain from: michaellemel70@gmail.com
Multipart message from: taniagreer352@gmail.com
Part in multipart message from: taniagreer352@gmail.com
Multipart message from: taniagreer352@gmail.com
Part in multipart message from: taniagreer352@gmail.com
Multipart message from: annaolesen80@gmail.com
Part in multipart message from: annaolesen80@gmail.com
Multipart message from: elinahanssone0@gmail.com
Part in multipart message from: elinahanssone0@gmail.com
Multipart message from: elinahanssone0@gmail.com
Part in multipart message from: elinahanssone0@gmail.com
Multipart message from: sharon11koning11@gmail.com
Part in multipart message from: sharon11koning11@gmail.com
Multipart message from: katrinberger934@gmail.com
Part in multipart message from: katrinberger934@gmail.com
Multipart message from: rondabird7@gmail.com
Part in multipart message from: rondabi

Multipart message from: nicole0runge1@gmail.com
Part in multipart message from: nicole0runge1@gmail.com
Multipart message from: marianne0jensen0@gmail.com
Part in multipart message from: marianne0jensen0@gmail.com
Multipart message from: kristinurban94@gmail.com
Part in multipart message from: kristinurban94@gmail.com
Multipart message from: juliephan333@gmail.com
Part in multipart message from: juliephan333@gmail.com
Multipart message from: ireneshellyi@gmail.com
Part in multipart message from: ireneshellyi@gmail.com
Multipart message from: claudiamaydorn78@gmail.com
Part in multipart message from: claudiamaydorn78@gmail.com
Multipart message from: williammaria241@gmail.com
Part in multipart message from: williammaria241@gmail.com
Multipart message from: neguyenhuyen@gmail.com
Part in multipart message from: Neguyen Huyen <neguyenhuyen@gmail.com>
Part in multipart message from: Neguyen Huyen <neguyenhuyen@gmail.com>
Multipart message from: mariascheeler0@gmail.com
Part in multipart me

Multipart message from: lynda.lotz01@gmail.com
Part in multipart message from: lynda.lotz01@gmail.com
Multipart message from: mozo.allia@gmail.com
Part in multipart message from: mozo.allia@gmail.com
Multipart message from: phai.trans.tai@gmail.com
Part in multipart message from: Phan Tai <phai.trans.tai@gmail.com>
Part in multipart message from: Phan Tai <phai.trans.tai@gmail.com>
Multipart message from: grietstenger5@gmail.com
Part in multipart message from: grietstenger5@gmail.com
Multipart message from: sophiebill84@gmail.com
Part in multipart message from: sophiebill84@gmail.com
Multipart message from: sofialarsen953@gmail.com
Part in multipart message from: sofialarsen953@gmail.com
Multipart message from: alvaiversen76@gmail.com
Part in multipart message from: alvaiversen76@gmail.com
Multipart message from: helenamichel86@gmail.com
Part in multipart message from: helenamichel86@gmail.com
Multipart message from: joonlee1981@gmail.com
Part in multipart message from: joonlee1981@gma

Multipart message from: chantalkroon800@gmail.com
Part in multipart message from: Chantal De Kroon <chantalkroon800@gmail.com>
Part in multipart message from: Chantal De Kroon <chantalkroon800@gmail.com>
Multipart message from: pasimaria9@gmail.com
Part in multipart message from: Maria Pasi <pasimaria9@gmail.com>
Part in multipart message from: Maria Pasi <pasimaria9@gmail.com>
Multipart message from: martinabraun6333@gmail.com
Part in multipart message from: martinabraun6333@gmail.com
Multipart message from: annabehrmann12@gmail.com
Part in multipart message from: annabehrmann12@gmail.com
Multipart message from: annabehrmann12@gmail.com
Part in multipart message from: annabehrmann12@gmail.com
Multipart message from: amelieanthony.a81@gmail.com
Part in multipart message from: amelieanthony.a81@gmail.com
Multipart message from: piotrdean1@gmail.com
Part in multipart message from: piotrdean1@gmail.com
Multipart message from: yukinakamura050@gmail.com
Part in multipart message from: yukin

Multipart message from: tina.fukataki1999@gmail.com
Part in multipart message from: tina.fukataki1999@gmail.com
Multipart message from: elenora0michel@gmail.com
Part in multipart message from: elenora0michel@gmail.com
One part message with content type: text/plain from: iglesiasani514@gmail.com
Multipart message from: meretblome6@gmail.com
Part in multipart message from: meretblome6@gmail.com
Multipart message from: bojansim78@gmail.com
Part in multipart message from: bojansim78@gmail.com
Multipart message from: amelieanthony.a81@gmail.com
Part in multipart message from: amelieanthony.a81@gmail.com
Multipart message from: helene.vetillard12@gmail.com
Part in multipart message from: helene.vetillard12@gmail.com
Multipart message from: hi@bergentext.one
Part in multipart message from: "Bergen Text Company" <hi@bergentext.one>
Multipart message from: mohsen.hasany1@gmail.com
Part in multipart message from: Mohsen Hasani <mohsen.hasany1@gmail.com>
Part in multipart message from: Mohsen Has