In [1]:
import imaplib

In [25]:
def imap_login(address=None, password=None):
    '''Wrapper for logging into to email through IMAP
    
    ARGS: 
    address - str (defaul: None, prompt input). Email address 
    being connected to.
    
    password - str (default: None, prompt input). Password for email address.
    
    Returns:
    Mail object connected to corresponding server for email address'''
    
    import imaplib
    
    if not address:
        address = input('Enter you email address: ')
        
    if not password:
        password = input('Enter your password')
        
    if 'gmail' in address:
        SMTP_SERVER = 'imap.gmail.com'
    elif 'yahoo' in password:
        SMTP_SERVER = 'imap.mail.yahoo.com'
    else:
        raise NameError('Please enter a gmail or yahoo email address')
        
    SMTP_PORT = 993
    try:
        mail = imaplib.IMAP4_SSL(SMTP_SERVER)
        mail.login(address, password)
        mail.select('inbox')
    except Exception as e:
        raise
        
    return mail

In [61]:
mail = imap_login(address = '', password = '')

In [51]:
def search_mailbox(mail, inbox='inbox'):
    """Connects to mailbox and collects a list of ids from mailbox
    
    ARGS:
    mail - logged in mail object
    
    inbox - str (defauls: 'inbox'). Mailbox to connect to. Must be valid
    imap mailbox.
    
    Returns:
    tup (mail object, list of mail_ids)
    If you don't need the ids, you can use an underscore like so:
    mail, _ = search_mailbox(mail)"""
    
    import imaplib
    
    mail.select(inbox)
    
    typ, data = mail.search(None, 'ALL')
    
    mail_ids = data[0].decode()
    mail_ids = mail_ids.split()
    
    return mail, mail_ids

In [62]:
mail, mail_ids = search_mailbox(mail)

In [63]:
mail

<imaplib.IMAP4_SSL at 0x202b7ee5688>

In [88]:
mail_ids[0:5]

['1', '2', '3', '4', '5']

In [89]:
def print_mail(mail, i_d=None, mail_part='(RFC822)'):
    """Prints out mail messages to screen
    
    ARGS:
    mail - logged in mail object
    
    i_d - single id or list of ids
    id(s) of email(s) to print
    
    Returns: None; prints message to screen"""
    
    import imaplib
    import email
    
    if type(i_d) == list:
        for i in i_d:
            typ, data = mail.fetch(str(i).encode(), mail_part)
            meta = email.message_from_bytes(data[0][1])
            print(meta)
    
    else:
        typ, data = mail.fetch(str(i_d).encode(), mail_part)
        meta = email.message_from_bytes(data[0][1])
        print(meta)

In [39]:
example1 = print_mail(mail, mail_ids[0])

Delivered-To: taggerhq@gmail.com
Received: by 2002:a2e:99d4:0:0:0:0:0 with SMTP id l20csp9770442ljj;
        Wed, 13 Nov 2019 09:10:12 -0800 (PST)
X-Received: by 2002:a92:8108:: with SMTP id e8mr4877134ild.209.1573664547771;
        Wed, 13 Nov 2019 09:02:27 -0800 (PST)
ARC-Seal: i=1; a=rsa-sha256; t=1573664547; cv=none;
        d=google.com; s=arc-20160816;
        b=TL/umDTkZUelCDVKGo83kkeNgCobO21sNG/p2lV4SezXrl6V1Xl5vvQSep5e7ZX4H3
         3A/QPxSJ/arTUPZPXKFknxQrCdiwmITfaQRQSZTOTbKGicxgxyOQL9Eym5G+SZHJvX1+
         yiIjcI6nmzytaxK217elUeGClyBQ3PmhtFCuqEfd19K+sfWG0fnhjNLnIEjcTdXJet4z
         gN4taVMageIB6WUJiG6OfJm7ctuXnTKxyCbT9bsFVG4UNqSykidzUuILLyEbbm5SZjhP
         mitYuQYYnIj3Z6euhpXT+ls83hsl6cJMUgWAkpVaf+zh867BYQzytiUO/0v6fGWug3x8
         0ZPA==
ARC-Message-Signature: i=1; a=rsa-sha256; c=relaxed/relaxed; d=google.com; s=arc-20160816;
        h=to:from:subject:message-id:feedback-id:reply-to:date:mime-version
         :dkim-signature;
        bh=/r5N7YLGKda5iqfAlgz4owxryRsdLa

In [66]:
def save_mail(mail, i_d, filename='email_data.csv', verbose=False):
    """Writes email data to csv
    
    ARGS: 
    mail - logged in mail object
    
    i_d - list of i_ds
    ids of messages to get
    
    filename - string ending in .csv (default: 'email_data.csv')
    name of file to write to 
    
    Returns: None, saves data to csv"""
    
    import imaplib
    import email
    import csv
    
    csv_file = open(filename, 'w', encoding='UTF-8')
    csv_writer = csv.writer(csv_file)
    csv_writer.writerow(['id', 'uid', 'from_', 'subject', 'msg', 'content_type'])
    
    for i in i_d:
        try:
            typ, data = mail.fetch(str(i).encode(), '(UID RFC822)')

            uid = email.message_from_bytes(data[0][0])
            uid = uid.get_payload()
            uid = uid.split()[-3]

            meta = email.message_from_bytes(data[0][1])
            from_ = meta['From']
            subject = meta['Subject']
            content_type = meta['Content-Type'].split(';')[0]
            
            msg = meta.get_payload()
            while type(msg) != str:
                msg = msg[0].get_payload()
            
            print(i)
            if verbose:
                print('UID: ', uid)
                print('From: ', from_)
                print('Subject: ', subject)
                print('Content-Type: ', content_type)
                print('Message: ', msg)
            csv_writer.writerow([i, uid, from_, subject, msg, content_type])
            print('Message saved')
        except Exception as e:
            print(e)

In [67]:
save_mail(mail, mail_ids)

1
Message saved
2
Message saved
3
Message saved
4
Message saved
5
Message saved
6
Message saved
7
Message saved
8
Message saved
9
Message saved
10
Message saved
11
Message saved
12
Message saved
13
Message saved
14
Message saved
15
Message saved
16
Message saved
17
Message saved
18
Message saved
19
Message saved
20
Message saved
21
Message saved
22
Message saved
23
Message saved
24
Message saved
25
Message saved
26
Message saved
27
Message saved
28
Message saved
29
Message saved
30
Message saved
31
Message saved
32
Message saved
33
Message saved
34
Message saved
35
Message saved
36
Message saved
37
Message saved
38
Message saved
39
Message saved
40
Message saved
41
Message saved
42
Message saved
43
Message saved
44
Message saved
45
Message saved
46
Message saved
47
Message saved
48
Message saved
49
Message saved
50
Message saved
51
Message saved
52
Message saved
53
Message saved
54
Message saved
55
Message saved
56
Message saved
57
Message saved
58
Message saved


KeyboardInterrupt: 

In [31]:
import pandas as pd
df = pd.read_csv('email_data.csv')

In [41]:
print(list(df[df['uid'] == 96]['msg']))

['<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.=\r\r\nw3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\r\r\n<html xmlns=3D"http://www.w3.org/1999/xhtml">\r\r\n<head>\r\r\n<meta http-equiv=3D"Content-Type" content=3D"text/html; charset=3Dutf-8" />\r\r\n<meta name=3D"viewport" content=3D"width=3Ddevice-width, initial-scale=3D1.=\r\r\n0" />\r\r\n<meta name=3D"format-detection" content=3D"telephone=3Dno" />\r\r\n<meta http-equiv=3D"X-UA-Compatible" content=3D"IE=3Dedge" />\r\r\n<meta name=3D"x-apple-disable-message-reformatting" />\r\r\n<title>BarkBox</title>\r\r\n<style type=3D"text/css">\r\r\nbody {\r\r\nmin-width: 100%;\r\r\nbackground-color: #ffffff;\r\r\nmargin: 0px !important;\r\r\npadding: 0px !important;\r\r\nwidth: 100% !important;\r\r\nmin-width: 100% !important;\r\r\n-webkit-text-size-adjust: 100% !important;\r\r\n-ms-text-size-adjust: 100% !important;\r\r\n-webkit-font-smoothing: antialiased !important;\r\r\nfont-family: Arial, Helvetica Neue, Helvet

In [485]:
df.shape

(154, 6)

In [71]:
df[1:2]

Unnamed: 0,id,uid,from_,subject,msg,content_type
1,2,2,"""Namecheap Support"" <support@namecheap.com>",IMMEDIATE VERIFICATION required for your domai...,"Hi Shane, <br /><br />\r\r\n\r\r\nAs of Januar...",multipart/alternative


In [409]:
from timeit import default_timer as timer
start = timer()
# ...

import basilica
API_KEY = 'SLOW_DEMO_KEY'
sentences = df[['from_','subject', 'msg','content_type']]
# [
#     "This is a sentence!",
#     "This is a similar sentence!",
#     "I don't think this sentence is very similar at all...",
#     "I don't think this sentence is very similar at all...",
#     "I don't think this sentence is very similar at all...",
#     "I don't think this sentence is very similar at all...",
#     "I don't think this sentence is very similar at all..."
# ]
with basilica.Connection(API_KEY) as c:
    embedding = list(c.embed_sentences(sentences, model='email', version='default', opts={}, timeout=5))
    #print(embedding)
    
end = timer()
print(end - start)


0.8528249000009964


In [429]:
df.id[:]

0        1
1        2
2        3
3        4
4        5
      ... 
149    150
150    151
151    152
152    153
153    154
Name: id, Length: 154, dtype: int64

In [97]:
df[]

KeyError: 0

In [463]:
def embed_basilica_to_df(df):
    ids_email = df['uid']
    df_new = df[['from_','subject', 'msg','content_type']].copy()
    from timeit import default_timer as timer
    start = timer()
    import basilica
    API_KEY = 'SLOW_DEMO_KEY'
    df_new['joined_columns'] = df_new[df_new.columns[1:]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
    df_final = pd
    column_embedded = []
    for column in df_new['joined_columns']:
        sentences = column
        with basilica.Connection(API_KEY) as c:
            embedding = list(c.embed_sentence(sentences, model='email', version='default', opts={}, timeout=5))
        column_embedded.append(embedding)
        #print(start)
    df_new['embedded'] = column_embedded
    df_new['id_email'] = ids_email
    end = timer()
    print(end - start)
    return df_new

In [464]:
df_test = embed_basilica_to_df(df)

92.16939840000123


In [488]:
df_test

Unnamed: 0,from_,subject,msg,content_type,joined_columns,embedded,id_email
0,Google Community Team <googlecommunityteam-nor...,Finish setting up your new Google Account,R29vZ2xlICANCjxodHRwczovL3d3dy5nb29nbGUuY29tL2...,multipart/alternative,"Finish setting up your new Google Account,R29v...","[-0.83379, -0.281114, -0.0980531, -0.13849, 0....",1
1,"""Namecheap Support"" <support@namecheap.com>",IMMEDIATE VERIFICATION required for your domai...,Hi Shane br br As of January 1 2014 the Intern...,multipart/alternative,IMMEDIATE VERIFICATION required for your domai...,"[-0.258877, -0.381759, 0.879337, -0.107925, 0....",2
2,Namecheap <hello@namecheap.com>,Welcome to Namecheap,DOCTYPE html html xmlns http www w3 org 1999 x...,text/html,"Welcome to Namecheap,DOCTYPE html html xmlns h...","[-0.993557, -1.10535, 0.367852, 0.0831669, 0.0...",3
3,"""Namecheap.com Support"" <support@namecheap.com>","Namecheap.com Order Summary (Order# 50229187, ...",DQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS...,multipart/alternative,"Namecheap.com Order Summary (Order# 50229187, ...","[-1.21567, -0.398846, -0.247979, -0.0636585, -...",4
4,Netlify <team@netlify.com>,=?utf-8?Q?[Netlify]=20Welcome=20to=20Netlify.=...,li text indent 1em table td border collapse co...,multipart/alternative,=?utf-8?Q?[Netlify]=20Welcome=20to=20Netlify.=...,"[0.134237, -0.483953, 1.38002, -0.127382, 1.42...",5
...,...,...,...,...,...,...,...
149,Google <no-reply@accounts.google.com>,Critical security alert,W2ltYWdlOiBHb29nbGVdDQpTaWduLWluIGF0dGVtcHQgd2...,multipart/alternative,"Critical security alert,W2ltYWdlOiBHb29nbGVdDQ...","[-0.908398, -0.424279, 0.0300707, -0.16903, 0....",154
150,Google <no-reply@accounts.google.com>,Security alert,W2ltYWdlOiBHb29nbGVdDQpSZWNvdmVyeSBwaG9uZSB3YX...,multipart/alternative,"Security alert,W2ltYWdlOiBHb29nbGVdDQpSZWNvdmV...","[-0.882989, -0.331768, 0.0611199, -0.19062, 0....",155
151,Google <no-reply@accounts.google.com>,Security alert,W2ltYWdlOiBHb29nbGVdDQpSZWNvdmVyeSBwaG9uZSB3YX...,multipart/alternative,"Security alert,W2ltYWdlOiBHb29nbGVdDQpSZWNvdmV...","[-0.873831, -0.334034, 0.0759848, -0.118473, 0...",156
152,Google <no-reply@accounts.google.com>,Critical security alert,W2ltYWdlOiBHb29nbGVdDQpBY2Nlc3MgZm9yIGxlc3Mgc2...,multipart/alternative,"Critical security alert,W2ltYWdlOiBHb29nbGVdDQ...","[-0.977607, -0.294551, 0.0754415, -0.105849, 0...",157


In [433]:
sentences

Unnamed: 0,from_,subject,msg,content_type
0,Google Community Team <googlecommunityteam-nor...,Finish setting up your new Google Account,R29vZ2xlICANCjxodHRwczovL3d3dy5nb29nbGUuY29tL2...,multipart/alternative
1,"""Namecheap Support"" <support@namecheap.com>",IMMEDIATE VERIFICATION required for your domai...,Hi Shane br br As of January 1 2014 the Intern...,multipart/alternative
2,Namecheap <hello@namecheap.com>,Welcome to Namecheap,DOCTYPE html html xmlns http www w3 org 1999 x...,text/html
3,"""Namecheap.com Support"" <support@namecheap.com>","Namecheap.com Order Summary (Order# 50229187, ...",DQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS...,multipart/alternative
4,Netlify <team@netlify.com>,=?utf-8?Q?[Netlify]=20Welcome=20to=20Netlify.=...,li text indent 1em table td border collapse co...,multipart/alternative
...,...,...,...,...
149,Google <no-reply@accounts.google.com>,Critical security alert,W2ltYWdlOiBHb29nbGVdDQpTaWduLWluIGF0dGVtcHQgd2...,multipart/alternative
150,Google <no-reply@accounts.google.com>,Security alert,W2ltYWdlOiBHb29nbGVdDQpSZWNvdmVyeSBwaG9uZSB3YX...,multipart/alternative
151,Google <no-reply@accounts.google.com>,Security alert,W2ltYWdlOiBHb29nbGVdDQpSZWNvdmVyeSBwaG9uZSB3YX...,multipart/alternative
152,Google <no-reply@accounts.google.com>,Critical security alert,W2ltYWdlOiBHb29nbGVdDQpBY2Nlc3MgZm9yIGxlc3Mgc2...,multipart/alternative


In [82]:
test2 = embedding[0]

In [83]:
test2 == test1

True

In [45]:
df['from_'][0:5]

0    Google Community Team <googlecommunityteam-nor...
1          "Namecheap Support" <support@namecheap.com>
2                      Namecheap <hello@namecheap.com>
3      "Namecheap.com Support" <support@namecheap.com>
4                           Netlify <team@netlify.com>
Name: from_, dtype: object

In [48]:
df['subject'][0]

'Finish setting up your new Google Account'

In [49]:
df['subject'][1]

'IMMEDIATE VERIFICATION required for your domain(s)'

In [50]:
df['subject'][2]

'Welcome to Namecheap'

In [51]:
df['subject'][3]

'Namecheap.com Order Summary (Order# 50229187, Order Ref#\r\r\n taggerhq-98.210.195.55-f49c52697f7b460e9049bf88cf0d3498 );'

In [407]:
df['msg'][60]

'View Web Version https view e lowes com qsec61c35496d808ecb7ff876a5052b44843dafc431c7683fc62f477e322a418fcaa3ad8441e707d606817b9d8c769acbe426288e145e5e0bc7fb199e5f1f93f25bbb6b6e5ef133dfb97ee74771ffe19 Unfortunately your email client cannot display HTML or your settings are turned off To view this email please copy and paste the link above into your browser Click here to unsubscribe http click e lowes com qs9c0f795a75ada51acc03296b4d9f3971f0902f09bf96f787e4b6c631e6d01e0fd0c1b504ac952385dc0e5fbcc23c0ac4c1d4ecc899b6f9547c630aec2bea32 You can also contact us here http click e lowes com qs9c0f795a75ada5ca577ce4a39390e689d2593ef93228421bf9033d4ff2e7eb87dab51b1e7202a877d2fd08cce871e8004e6f6497d3697a Lowes Companies Inc 1000 Lowes Blvd Mooresville NC 28117 View our Privacy Policy http click e lowes com qs9c0f795a75ada5b50fdda79ce0e3be4f41602f209575d034ee0783064ebe337e24af6bf7ac4d0ba53be0f46faec2c1dfd1108212fe9136 C 2019 by Lowe s R All rights reserved Lowe s and the gable design are registere

In [61]:
embedding[5]

[0.254256,
 -0.474575,
 1.33103,
 -0.271874,
 1.17263,
 -0.165119,
 -0.587641,
 -0.314974,
 0.0830585,
 -0.600357,
 -0.162125,
 -0.0516132,
 -0.407519,
 -0.245937,
 -0.637593,
 -0.168057,
 -0.804933,
 -0.220954,
 -0.318795,
 -0.312209,
 -0.387674,
 0.120713,
 -0.729393,
 -0.0260139,
 0.62427,
 -0.269808,
 0.314676,
 0.226492,
 -0.740952,
 0.388876,
 -0.232444,
 -1.64847,
 -0.156164,
 0.233963,
 -0.932272,
 0.0599625,
 -0.54156,
 0.16557,
 -0.0838702,
 -0.0785121,
 -0.678266,
 -0.5706,
 -0.50689,
 -0.0161251,
 -0.787368,
 0.082466,
 -2.71827,
 0.192935,
 -0.0394947,
 -0.150446,
 -0.00888205,
 -0.00815404,
 0.0983866,
 0.977929,
 0.64477,
 -0.509619,
 0.594352,
 0.09659,
 -1.50234,
 -1.70457,
 0.609077,
 -0.0567993,
 0.261185,
 0.0296685,
 -0.365417,
 -0.151173,
 1.38316,
 -0.348478,
 0.286381,
 -0.350487,
 0.667856,
 0.46728,
 -0.71493,
 0.0684683,
 -0.0842288,
 0.849118,
 0.194398,
 -0.186597,
 0.142835,
 1.56821,
 -0.00381372,
 -0.370742,
 -0.136413,
 -0.470927,
 -1.2126,
 -0.457647,


In [466]:
from sklearn.neighbors import NearestNeighbors, KNeighborsRegressor, KNeighborsClassifier
nn = KNeighborsClassifier()

import numpy as np

In [467]:
test_100 = np.vstack(np.array(df_test.embedded[100:101]))

In [497]:
df_test[100:101]

Unnamed: 0,from_,subject,msg,content_type,joined_columns,embedded,id_email
100,Heroku <no-reply@heroku.com>,"Debugging Prod DBs, Graceful Shutdowns, and Dr...",December 2019 Newsletter View in Browser https...,multipart/alternative,"Debugging Prod DBs, Graceful Shutdowns, and Dr...","[-0.660411, -0.644085, -0.216238, 0.0171638, -...",102


In [469]:
df_test[63:64]

Unnamed: 0,from_,subject,msg,content_type,joined_columns,embedded,id_email
63,Heroku <no-reply@heroku.com>,Extend your app in a couple of clicks with Her...,Heroku Postgres provides integrated tools such...,multipart/alternative,Extend your app in a couple of clicks with Her...,"[-1.01369, -0.608849, -0.232221, -0.057915, -0...",65


In [489]:
X = np.vstack(np.array(df_test['embedded']))
y = df_test[['id_email']]

In [490]:
y

Unnamed: 0,id_email
0,1
1,2
2,3
3,4
4,5
...,...
149,154
150,155
151,156
152,157


In [219]:
X[0]


array([-8.29169e-01, -2.86474e-01, -5.33251e-02, -1.29674e-01,
        1.60342e-01,  8.11307e-01, -8.24302e-02,  2.47094e-01,
       -4.74104e-01, -4.04739e-01, -1.49701e-01,  6.39366e-01,
        5.02851e-01, -4.67005e-02,  7.56267e-02,  3.48291e-01,
        2.30641e-01,  5.07231e-01, -3.07716e-01, -6.11254e-01,
        1.27446e-01, -2.81927e-01, -2.41977e-02, -1.33096e-01,
        3.43884e-01,  9.33474e-01,  3.52724e-01,  2.92959e-02,
        1.01734e-01,  3.24678e-02,  5.23186e-01, -2.36646e-01,
        4.46409e-01,  7.52639e-02, -3.16180e-02,  1.05938e-01,
        2.18992e-01,  1.46872e-02,  7.19437e-01,  1.91201e-02,
        2.41664e-01,  4.66771e-01, -4.66632e-01, -2.86754e-01,
       -1.08034e-01, -1.59961e-01, -4.79914e+00,  1.06629e+00,
        2.31618e-01, -5.12702e-01,  4.94732e-01,  1.16810e-01,
        1.23109e-01,  5.41103e-01,  7.69312e-01,  3.08303e-01,
       -1.09727e-01, -3.30516e-02, -2.22562e-01, -6.64866e-03,
        6.01869e-01, -2.26786e-01,  1.23609e-01,  1.638

In [491]:
nn.fit(X,y)

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [438]:
import basilica
API_KEY = 'SLOW_DEMO_KEY'
sentences = 'google'
# [
#     "This is a sentence!",
#     "This is a similar sentence!",
#     "I don't think this sentence is very similar at all...",
#     "I don't think this sentence is very similar at all...",
#     "I don't think this sentence is very similar at all...",
#     "I don't think this sentence is very similar at all...",
#     "I don't think this sentence is very similar at all..."
# ]
with basilica.Connection(API_KEY) as c:
    embedding_tested = list(c.embed_sentences(sentences, model='email', version='default', opts={}, timeout=5))
    #print(embedding)

HTTPError: 429 Client Error: Too Many Requests for url: https://api.basilica.ai/embed/text/email/default

In [None]:
X_test = df_test[]

In [311]:
np.vstack(np.array(embedding_tested))

array([[-0.451287, -0.215444,  1.8912  , ...,  0.324049, -0.363744,
        -0.105432],
       [-0.509728,  0.925858,  1.16073 , ..., -0.412699, -0.383742,
         0.147448],
       [-0.509728,  0.925858,  1.16073 , ..., -0.412699, -0.383742,
         0.147448],
       [-0.451287, -0.215444,  1.8912  , ...,  0.324049, -0.363744,
        -0.105432],
       [ 0.257372, -0.43574 ,  1.58261 , ...,  0.267864, -0.595431,
        -0.19945 ],
       [-0.229938, -0.041547,  1.53044 , ...,  0.165429, -1.1708  ,
         0.253264]])

In [492]:
nn.kneighbors(test_100, 10)[1][0]

array([100, 138, 126,  78, 101, 132,  71, 108,   8,  63], dtype=int64)

In [24]:
df[df['uid'] == 95]

Unnamed: 0,id,uid,from_,subject,msg,content_type
93,94,95,=?utf-8?Q?The=20FullStory=20Team?= <thefuture@...,=?utf-8?Q?New=20sessions=20are=20ready=20for=2...,You ve got sessions now what Search segment an...,multipart/alternative


In [511]:
df_test.subject[134]

'New Deals Have Arrived'

In [2]:
import imaplib
import email
import csv
import quopri
from bs4 import BeautifulSoup
import re
class IMap():
    def __init__(self, address=None, password=None):
        self.address = address
        self.password = password
        self.login(self.address, self.password)
    def login(self, address=None, password=None):
        if not address:
            raise Exception("Missing email address.")
        if not password:
            raise Exception("Missing email password.")
        if "gmail" in address:
            server = "imap.gmail.com"
        elif "@yahoo" in password:
            server = "imap.mail.yahoo.com"
        else:
            raise Exception("Please use a Yahoo or Gmail email account.")
        try:
            self.mail = imaplib.IMAP4_SSL(server)
            self.mail.login(address, password)
        except Exception as e:
            raise
        print("Logged in as {}!".format(address))
    def search_mailbox(self, inbox="inbox"):
        self.mail.select(inbox)
        typ, data = self.mail.search(None, "ALL")
        mail_ids = data[0].decode()
        mail_ids = mail_ids.split()
        return self.mail, mail_ids
    def clean_text(self, text):
        text = text.decode("utf-8", errors="ignore")
        html = BeautifulSoup(text, "html.parser")
        removals = html.find_all("style")
        for match in removals:
            match.decompose()
        text = re.sub(r"\\n|\\r", "", text)
        text = " ".join(re.findall(r"\b\w+\b", text))
        return text
    def save_mail(self, i_d, filename='email_data.csv', verbose=False):
        csv_file = open(filename, 'w', encoding='utf-8')
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['id', 'uid', 'from_', 'subject', 'msg', 'content_type'])
        for i in i_d:
            try:
                typ, data = self.mail.fetch(str(i).encode(), '(UID RFC822)')
                uid = email.message_from_bytes(data[0][0])
                uid = uid.get_payload()
                uid = uid.split()[-3]
                meta = email.message_from_bytes(data[0][1])
                from_ = meta['From']
                subject = meta['Subject']
                content_type = meta['Content-Type'].split(';')[0]
                msg = meta.get_payload()
                while type(msg) != str:
                    msg = msg[0].get_payload()
                print(i)
                if verbose:
                    print('UID: ', uid)
                    print('From: ', from_)
                    print('Subject: ', subject)
                    print('Content-Type: ', content_type)
                    print('Message: ', quopri.decodestring(msg))
                csv_writer.writerow([i, uid, from_, subject, self.clean_text(quopri.decodestring(msg)), content_type])
                print('Message saved')
            except Exception as e:
                print(e)
mail = IMap("", "")
m,i = mail.search_mailbox()
mail.save_mail(i)

Logged in as taggerhq@gmail.com!
1
Message saved
2
Message saved
3
Message saved
4
Message saved
5
Message saved
6
Message saved
7
Message saved
8
Message saved
9
Message saved
10
Message saved
11
Message saved
12
Message saved
13
Message saved
14
Message saved
15
Message saved
16
Message saved
17
Message saved
18
Message saved
19
Message saved
20
Message saved
21
Message saved
22
Message saved
23
Message saved
24
Message saved
25
Message saved
26
Message saved
27
Message saved
28
Message saved
29
Message saved
30
Message saved
31
Message saved
32
Message saved
33
Message saved
34
Message saved
35
Message saved
36
Message saved
37
Message saved
38
Message saved
39
Message saved
40
Message saved
41
Message saved
42
Message saved
43
Message saved
44
Message saved
45
Message saved
46
Message saved
47
Message saved
48
Message saved
49
Message saved
50
Message saved
51
Message saved
52
Message saved
53
Message saved
54
Message saved
55
Message saved
56
Message saved
57
Message saved
58
Mess

In [15]:
class Basilica_api():

    columns = ['from_','subject', 'msg','content_type']

    def __init__(self, df, API_KEY='SLOW_DEMO_KEY', columns=columns):
        self.df = df
        self.API_KEY = API_KEY
        self.columns = columns

    def make_one_column(self):
        """
        This function will make a new column named 'joined_columns' from the columns given to the class.
        It will also look for a column named 'uid' to return that along with the other columns but not joined.
        """
        ids_email = self.df[['uid']]
        df_new = self.df[self.columns].copy()
        df_new['joined_columns'] = df_new[df_new.columns[1:]].apply(lambda x: ','.join(x.dropna().astype(str)), axis=1)
        df_new['id_email'] = ids_email 
        self.df = df_new
        return None

    def embed_basilica_to_df(self):
        """
        This function will time how long basilica takes to run. For faster performance pass an API key that is functional.
        Returns the df with a column named 'embedded'.
        """
        import basilica
        self.make_one_column()
        from timeit import default_timer as timer
        start = timer()
        column_embedded = []
        for column in self.df['joined_columns']:
            sentence = column
            with basilica.Connection(self.API_KEY) as c:
                embedding = list(c.embed_sentence(sentence, model='email', version='default', opts={}, timeout=5))
            column_embedded.append(embedding)
        self.df['embedded'] = column_embedded
        end = timer()
        print(end - start)
        return self.df

In [16]:
import pandas as pd
df = pd.read_csv('email_data.csv')

In [17]:
df.head()

Unnamed: 0,id,uid,from_,subject,msg,content_type
0,1,1,Google Community Team <googlecommunityteam-nor...,Finish setting up your new Google Account,R29vZ2xlICANCjxodHRwczovL3d3dy5nb29nbGUuY29tL2...,multipart/alternative
1,2,2,"""Namecheap Support"" <support@namecheap.com>",IMMEDIATE VERIFICATION required for your domai...,Hi Shane br br As of January 1 2014 the Intern...,multipart/alternative
2,3,3,Namecheap <hello@namecheap.com>,Welcome to Namecheap,DOCTYPE html html xmlns http www w3 org 1999 x...,text/html
3,4,4,"""Namecheap.com Support"" <support@namecheap.com>","Namecheap.com Order Summary (Order# 50229187, ...",DQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS...,multipart/alternative
4,5,5,Netlify <team@netlify.com>,=?utf-8?Q?[Netlify]=20Welcome=20to=20Netlify.=...,li text indent 1em table td border collapse co...,multipart/alternative


In [20]:
ba = Basilica_api(df, API_KEY='')

In [21]:
ba.embed_basilica_to_df()

95.25190839999993


Unnamed: 0,from_,subject,msg,content_type,joined_columns,id_email,embedded
0,Google Community Team <googlecommunityteam-nor...,Finish setting up your new Google Account,R29vZ2xlICANCjxodHRwczovL3d3dy5nb29nbGUuY29tL2...,multipart/alternative,"Finish setting up your new Google Account,R29v...",1,"[-0.83379, -0.281114, -0.0980531, -0.13849, 0...."
1,"""Namecheap Support"" <support@namecheap.com>",IMMEDIATE VERIFICATION required for your domai...,Hi Shane br br As of January 1 2014 the Intern...,multipart/alternative,IMMEDIATE VERIFICATION required for your domai...,2,"[-0.258877, -0.381759, 0.879337, -0.107925, 0...."
2,Namecheap <hello@namecheap.com>,Welcome to Namecheap,DOCTYPE html html xmlns http www w3 org 1999 x...,text/html,"Welcome to Namecheap,DOCTYPE html html xmlns h...",3,"[-0.993557, -1.10535, 0.367852, 0.0831669, 0.0..."
3,"""Namecheap.com Support"" <support@namecheap.com>","Namecheap.com Order Summary (Order# 50229187, ...",DQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS...,multipart/alternative,"Namecheap.com Order Summary (Order# 50229187, ...",4,"[-1.21567, -0.398846, -0.247979, -0.0636585, -..."
4,Netlify <team@netlify.com>,=?utf-8?Q?[Netlify]=20Welcome=20to=20Netlify.=...,li text indent 1em table td border collapse co...,multipart/alternative,=?utf-8?Q?[Netlify]=20Welcome=20to=20Netlify.=...,5,"[0.134237, -0.483953, 1.38002, -0.127382, 1.42..."
...,...,...,...,...,...,...,...
151,Google <no-reply@accounts.google.com>,Security alert,W2ltYWdlOiBHb29nbGVdDQpSZWNvdmVyeSBwaG9uZSB3YX...,multipart/alternative,"Security alert,W2ltYWdlOiBHb29nbGVdDQpSZWNvdmV...",156,"[-0.873831, -0.334034, 0.0759848, -0.118473, 0..."
152,Google <no-reply@accounts.google.com>,Critical security alert,W2ltYWdlOiBHb29nbGVdDQpBY2Nlc3MgZm9yIGxlc3Mgc2...,multipart/alternative,"Critical security alert,W2ltYWdlOiBHb29nbGVdDQ...",157,"[-0.977607, -0.294551, 0.0754415, -0.105849, 0..."
153,Google <no-reply@accounts.google.com>,Security alert,W2ltYWdlOiBHb29nbGVdDQpOb24tR29vZ2xlIGFwcHMgaG...,multipart/alternative,"Security alert,W2ltYWdlOiBHb29nbGVdDQpOb24tR29...",158,"[-1.02572, -0.26337, -0.0326564, -0.176213, 0...."
154,Google <no-reply@accounts.google.com>,Security alert,W2ltYWdlOiBHb29nbGVdDQpUYWdnZXIgd2FzIGdyYW50ZW...,multipart/alternative,"Security alert,W2ltYWdlOiBHb29nbGVdDQpUYWdnZXI...",159,"[-0.876648, -0.337646, -0.061159, -0.245319, 0..."
