In [3]:
def imap_login(address=None, password=None):
    """Log in to a Gmail or Yahoo mailbox over IMAP (SSL) and select the inbox.

    ARGS:
    address - str (default: None, prompt input). Email address
    being connected to.

    password - str (default: None, prompt input). Password for email address.
    When prompted for, it is read with getpass so it is not echoed.

    Returns:
    Mail object (imaplib.IMAP4_SSL) logged in to the server matching the
    address, with 'inbox' selected.

    Raises:
    NameError - the address is neither a gmail nor a yahoo address.
    Any error raised by imaplib while connecting, logging in or selecting
    the inbox propagates unchanged."""

    import getpass
    import imaplib

    if not address:
        address = input('Enter your email address: ')

    if not password:
        password = getpass.getpass('Enter your password: ')

    # Decide on the provider from the domain part only, so that a local part
    # such as "gmailfan@yahoo.com" is not mistaken for a Gmail account.
    domain = address.rsplit('@', 1)[-1].lower()
    if 'gmail' in domain:
        imap_server = 'imap.gmail.com'
    elif 'yahoo' in domain:
        imap_server = 'imap.mail.yahoo.com'
    else:
        raise NameError('Please enter a gmail or yahoo email address')

    imap_port = 993
    mail = imaplib.IMAP4_SSL(imap_server, imap_port)
    try:
        mail.login(address, password)
        mail.select('inbox')
    except Exception:
        # Do not leave a half-open connection behind when login/select fails.
        mail.shutdown()
        raise

    return mail

In [4]:
def search_mailbox(mail, inbox='inbox'):
    """Select a mailbox and return the ids of every message in it.

    ARGS:
    mail - logged in mail object

    inbox - str (default: 'inbox'). Name of the mailbox to select. Must be
    a valid imap mailbox.

    Returns:
    tup (mail object, list of mail ids as strings)
    When the ids are not needed, discard them with an underscore:
    mail, _ = search_mailbox(mail)"""

    mail.select(inbox)

    # The search reply carries all ids in its first element as one
    # space-separated bytes string.
    _status, reply = mail.search(None, 'ALL')
    id_list = reply[0].decode().split()

    return mail, id_list

In [5]:
def print_mail(mail, i_d=None, mail_part='(RFC822)'):
    """Prints out mail messages to screen

    ARGS:
    mail - logged in mail object

    i_d - single id, or list/tuple of ids
    id(s) of email(s) to print

    mail_part - str (default: '(RFC822)'). Message part specification
    handed to mail.fetch.

    Returns: None; prints message to screen"""

    import email

    # Treat a lone id as a one-element collection so that a single code path
    # fetches and prints every message.
    ids = i_d if isinstance(i_d, (list, tuple)) else [i_d]

    for i in ids:
        typ, data = mail.fetch(str(i).encode(), mail_part)
        # data[0] is a (response header, raw message bytes) pair.
        meta = email.message_from_bytes(data[0][1])
        print(meta)

In [2]:
def save_mail(mail, i_d, filename='email_data.csv', verbose=False):
    """Writes email data to csv

    ARGS:
    mail - logged in mail object

    i_d - list of i_ds
    ids of messages to get

    filename - string ending in .csv (default: 'email_data.csv')
    name of file to write to (an existing file is overwritten)

    verbose - bool (default: False). When True, the extracted fields of
    every message are printed as well.

    Returns: None, saves data to csv with the columns
    id, uid, from_, subject, msg, content_type.
    A message that cannot be fetched or parsed is skipped; its error is
    printed and processing continues with the next id."""

    import csv
    import email
    import re

    uid_pattern = re.compile(rb'UID (\d+)')

    # newline='' lets the csv module control line endings itself; the with
    # block guarantees the file is flushed and closed even on errors.
    with open(filename, 'w', encoding='UTF-8', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['id', 'uid', 'from_', 'subject', 'msg', 'content_type'])

        for i in i_d:
            try:
                typ, data = mail.fetch(str(i).encode(), '(UID RFC822)')

                # data[0] is a (response header, raw message bytes) pair.
                response_header, raw_message = data[0][0], data[0][1]

                # Look the UID up by name rather than by position: the order
                # of the items in a FETCH response is chosen by the server.
                uid_match = uid_pattern.search(response_header)
                if uid_match is None:
                    trailer = b' '.join(
                        part for part in data[1:] if isinstance(part, bytes)
                    )
                    uid_match = uid_pattern.search(trailer)
                uid = uid_match.group(1).decode() if uid_match else ''

                meta = email.message_from_bytes(raw_message)
                from_ = meta['From']
                subject = meta['Subject']
                # The header may be absent; keep the message instead of
                # losing it to an AttributeError on None.
                content_type = (meta['Content-Type'] or '').split(';')[0]

                # For multipart messages descend into the first part until a
                # plain string payload is reached.
                msg = meta.get_payload()
                while not isinstance(msg, str):
                    msg = msg[0].get_payload()

                print(i)
                if verbose:
                    print('UID: ', uid)
                    print('From: ', from_)
                    print('Subject: ', subject)
                    print('Content-Type: ', content_type)
                    print('Message: ', msg)
                csv_writer.writerow([i, uid, from_, subject, msg, content_type])
                print('Message saved')
            except Exception as e:
                # Best effort: report the problem and move on to the next id.
                print(e)
            
        

In [9]:
def clean_emails(df, drop_html=True):
    """Cleans our emails from csv (assumes columns of save_mail func)

    ARGS:
    df - pandas dataframe
    dataframe from csv with columns of save_mail()

    drop_html - bool (default: True). When True, rows whose msg starts
    with '<' are removed.

    Returns:
    New dataframe with emails cleaned up: rows without a msg are dropped,
    the content type 'Multipart/Alternative' is written in lower case and
    quoted-printable residue, line breaks and tabs are stripped from msg.
    The dataframe passed in is left unmodified."""

    # Applied in this order; removing soft line breaks ("=\r\n") first
    # re-joins escape sequences that were split across two lines.
    replacements = (
        ('=\r\n', ''),
        ('=E2=80=99', ''),
        ('=0A', ' '),
        ('=0D', ' '),
        ('\r\n', ' '),
        ('\n', ' '),
        ('\r', ' '),
        ('\t', ' '),
    )

    def tidy(text):
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    cleaned = df.dropna(subset=['msg'])

    if drop_html:
        cleaned = cleaned[~cleaned['msg'].str.startswith('<')]

    # Work on an independent copy: the column assignments below then neither
    # touch the caller's dataframe nor write into a slice of it.
    cleaned = cleaned.copy()

    cleaned['content_type'] = cleaned['content_type'].map(
        lambda x: 'multipart/alternative' if x == 'Multipart/Alternative' else x
    )
    cleaned['msg'] = cleaned['msg'].apply(tidy)
    return cleaned