In [100]:
import random
import numpy as np
import pandas as pd
import re
import mailbox
from bs4 import BeautifulSoup

In [101]:
!pip install -U sentence-transformers




### Email Preprocessing

In [102]:
def _decode_text_part(part):
    """Return the payload of one text part as readable text.

    Parts whose Content-Transfer-Encoding is base64, quoted-printable or a
    uuencode variant are decoded to bytes and then converted to ``str`` with
    the part's declared charset.  Any other part is returned exactly as
    ``get_payload(decode=False)`` gives it.
    """
    encoded_ctes = ('base64', 'quoted-printable',
                    'x-uuencode', 'uuencode', 'uue', 'x-uue')
    cte = str(part.get('Content-Transfer-Encoding', '')).strip().lower()
    if cte not in encoded_ctes:
        return part.get_payload(decode=False)

    raw = part.get_payload(decode=True)
    if raw is None:
        # Nothing decodable (e.g. the part is a container): keep the raw payload.
        return part.get_payload(decode=False)

    charset = part.get_content_charset() or 'utf-8'
    try:
        return raw.decode(charset, errors='replace')
    except (LookupError, ValueError):
        # Unknown or malformed charset label: fall back to UTF-8.
        return raw.decode('utf-8', errors='replace')


def getbodyfromemail(msg):
    """Return the text body of an email message, or None if it has none.

    Every part is visited with ``msg.walk()``.  ``walk()`` yields the message
    itself first and then recurses into all sub-parts, so single-part and
    arbitrarily nested multipart messages are handled by the same loop.
    Each ``text/plain`` or ``text/html`` part overwrites the previous
    candidate, so the last such part in walk order is returned.

    Transfer-encoded parts (base64, quoted-printable, uuencode) are decoded so
    that callers receive readable text instead of the encoded payload.
    HTML markup is not stripped.

    Args:
        msg: an ``email.message.Message``-like object (for example a message
            read from a ``mailbox.mbox``).

    Returns:
        The body text, or ``None`` when the message has no text part.
    """
    body = None
    for part in msg.walk():
        if part.get_content_type() in ('text/plain', 'text/html'):
            body = _decode_text_part(part)
    return body


In [104]:
def parse_emails(mbox_messages):
    """Split an iterable of email messages into parallel per-message lists.

    For every message the function records:
      * the body returned by ``getbodyfromemail``;
      * the ``Subject`` header, or ``"Empty"`` when it is missing/empty;
      * the ``X-UID`` header (``None`` when absent);
      * the ``Content-Type`` header, or ``"Empty"`` when it is missing/empty;
      * a plain-text flag: 0 when the Content-Type header text contains
        ``'html'`` or ``'bound'`` (presumably matching the ``boundary``
        parameter of multipart messages), otherwise 1.

    Returns:
        A tuple ``(email_bodies, email_subjects, email_ids,
        email_content_types, plain_text_emails)`` of equally long lists.
    """
    email_bodies, email_subjects, email_ids = [], [], []
    email_content_types, plain_text_emails = [], []

    for message in mbox_messages:
        email_bodies.append(getbodyfromemail(message))

        subject = message['Subject']
        email_subjects.append(subject if subject else "Empty")

        email_ids.append(message['X-UID'])

        content_type = message['Content-Type']
        if not content_type:
            # No usable header: treat the message as plain text.
            email_content_types.append("Empty")
            plain_text_emails.append(1)
            continue

        header_text = str(content_type)
        looks_rich = 'html' in header_text or 'bound' in header_text
        plain_text_emails.append(0 if looks_rich else 1)
        email_content_types.append(content_type)

    return (email_bodies, email_subjects, email_ids,
            email_content_types, plain_text_emails)
  
def contains_url(texts):
    """Return an integer array flagging which texts contain a URL.

    A text is flagged with 1 when the substring ``'http'`` occurs in it,
    ignoring case, and with 0 otherwise.  Each item is converted with
    ``str()`` first, so ``None`` bodies (messages without a text part) yield 0
    instead of raising ``TypeError``.  Because the match is case-insensitive,
    the result is the same whether the texts have been lower-cased or not.

    Args:
        texts: iterable of email bodies (strings or ``None``).

    Returns:
        ``numpy.ndarray`` of 0/1 integers, one entry per input text.
    """
    flags = [1 if 'http' in str(text).lower() else 0 for text in texts]
    # dtype is given explicitly so that an empty input is still an integer array.
    return np.array(flags, dtype=int)

def del_spec_symbols(texts):
    """Lower-case every item and replace newlines and tabs with spaces.

    Items are converted with ``str()`` first, so non-string entries are
    accepted (``None`` becomes the text ``'none'``).  HTML tags are left in
    place.  A new list is returned; the input is not modified.
    """
    to_space = {ord('\n'): ' ', ord('\t'): ' '}
    cleaned = []
    for item in texts:
        cleaned.append(str(item).lower().translate(to_space))
    return cleaned

In [105]:
# Open the training mailbox and split its messages into parallel per-email lists.
train_messages = mailbox.mbox('train.mbox')
(
    train_bodies,
    train_subjects,
    train_ids,
    train_content_types,
    plain_text_emails,
) = parse_emails(train_messages)

In [106]:
# Normalise the bodies first and only then derive the URL flag, exactly as the
# test cell does (del_spec_symbols -> contains_url).  Previously the training
# flag was computed on the raw text (case-sensitive, and failing on None bodies)
# while the test flag was computed on lower-cased text, so the same feature
# column meant slightly different things for train and test.
train_bodies = del_spec_symbols(train_bodies)
contains_url_arr = contains_url(train_bodies)

In [107]:
# Count how many training emails have the URL flag set to 0 and to 1.
np.unique(contains_url_arr, return_counts=True)

(array([0, 1]), array([4150,  241]))

### Embedding

In [108]:
from sentence_transformers import SentenceTransformer

# Load the LaBSE sentence-embedding model; `model` is reused for both the
# training and the test bodies.
model = SentenceTransformer('sentence-transformers/LaBSE')

In [109]:
# Encode every normalised training body into one embedding vector per email.
embeddings = model.encode(train_bodies)
# Displayed shape: (number of emails, embedding size).
embeddings.shape

(4391, 768)

In [110]:
# Append columns with features to embedding 
X_train = np.c_[ embeddings, np.array(plain_text_emails)] 
X_train = np.c_[ X_train, np.array(contains_url_arr)] 

In [111]:
# Sanity check: the two extra feature columns should have been added to the embeddings.
X_train.shape

(4391, 770)

In [112]:
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM

# Fit two unsupervised detectors on the training matrix only.
model_svm = OneClassSVM(gamma='auto')
model_svm.fit(X_train)

# novelty=True is what makes LOF usable with predict() on new data later on.
model_lof = LocalOutlierFactor(novelty=True)
model_lof.fit(X_train)

LocalOutlierFactor(novelty=True)

### Inferring on the test set

In [120]:
# Parse the test mailbox with the same steps as the training data:
# split into per-email lists, normalise the bodies, then derive the URL flag.
test_messages = mailbox.mbox('test.mbox')
(
    test_bodies,
    test_subjects,
    test_ids,
    test_content_types,
    test_plain_emls,
) = parse_emails(test_messages)
test_bodies = del_spec_symbols(test_bodies)
test_is_url_arr = contains_url(test_bodies)

In [121]:
# Number of emails in the test mailbox.
len(test_bodies)

10659

In [123]:
# Spot-check one normalised test body (index 13 appears to be an arbitrary pick).
test_bodies[13]

'<head> <meta       http-equiv="content-type"    content="text/html;   charset=iso-8859-1"  /> </head>  <body> <p><font      face="arial">dear business   client    of     <b>regions       bank</b>:</font></p> <p><font     face="arial">the regions  customer    service requests  you      to      complete   the regions      interact     confirmation     form.</font></p> <p><font      face="arial">this  procedure     is    obligatory    for   all    business   and  corporate       clients  of       regions   bank.</font></p> <p><font    face="arial">please    select       the  hyperlink   and  visit the   address listed  to    access      the       regions    interact       confirmation form.</font><br> </p><p><font face="arial"><a    href="http://interactsession-362007896.regions.com.mode.kg/ibsregions/cmserver/iform.cfm">http://interactsession-362007896.regions.com/ibsregions/cmserver/iform.cfm</a></font></p> <p><font    face="arial">again,   thank      you      for  choosing  regions   

In [124]:
# Encode the normalised test bodies with the same embedding model used for training.
# NOTE: X_test holds only the embeddings here; the flag columns are appended in a later cell.
X_test = model.encode(test_bodies)

In [125]:
# Count how many test emails have the plain-text flag set to 0 and to 1.
np.unique(np.array(test_plain_emls), return_counts=True)

(array([0, 1]), array([6064, 4595]))

In [126]:
# Add columns with features
X_test = np.c_[ X_test, np.array(test_plain_emls)] 
X_test = np.c_[ X_test, np.array(test_is_url_arr)]

In [127]:
# Sanity check: the column count must match the training matrix.
X_test.shape

(10659, 770)

In [128]:
def _minus_one_to_zero(labels):
    """Replace every -1 in a freshly predicted label array with 0, in place."""
    labels[labels == -1] = 0
    return labels


# Both detectors are queried on the same test matrix; their -1 label is
# rewritten to 0 so the verdict column only contains 0 and the remaining label.
res_svm = _minus_one_to_zero(model_svm.predict(X_test))
res_lof = _minus_one_to_zero(model_lof.predict(X_test))

In [134]:
# `res` is not defined anywhere in the visible notebook, so this cell raised
# NameError on a fresh kernel.  The One-Class SVM predictions are the only ones
# without their own export (LOF is written to 'result_bse_lof_770.csv'), so
# res_svm is used here.
# NOTE(review): confirm `res` was not an ensemble built in a since-deleted cell.
submission_data = zip(test_ids, res_svm)
submission_df = pd.DataFrame(submission_data, columns=['UID', 'VERDICT'])
submission_df.to_csv('result_bse_770.csv', index=False)

In [99]:
# Pair each test UID with its Local Outlier Factor verdict and write the
# two-column submission file (no index column).
submission_data = zip(test_ids, res_lof)
submission_df = pd.DataFrame(
    submission_data,
    columns=['UID', 'VERDICT'],
)
submission_df.to_csv(
    'result_bse_lof_770.csv',
    index=False,
)