In [1]:
import random
import numpy as np
import pandas as pd
import gensim
import re
import mailbox
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [2]:
random.seed(432)
np.random.seed(432)

### Text preprocessing

In [3]:
def getcharsets(msg):
    charsets = set({})
    for c in msg.get_charsets():
        if c is not None:
            charsets.update([c])
    return charsets

def handleerror(errmsg, emailmsg,cs):
    print()
    print(errmsg)
    print("This error occurred while decoding with ",cs," charset.")
    print("These charsets were found in the one email.",getcharsets(emailmsg))
    print("This is the subject:",emailmsg['subject'])
    print("This is the sender:",emailmsg['From'])
    
def getbodyfromemail(msg):
    body = None
    #Walk through the parts of the email to find the text body.    
    if msg.is_multipart():    
        for part in msg.walk():

            # If part is multipart, walk through the subparts.            
            if part.is_multipart(): 

                for subpart in part.walk():
                    if subpart.get_content_type() == 'text/plain':
                        # Get the subpart payload (i.e the message body)
                        body = subpart.get_payload(decode=True) 
                        #charset = subpart.get_charset()
                    elif subpart.get_content_type() == 'text/html':
                        body = subpart.get_payload(decode=True) 
                        #body = BeautifulSoup(body, "lxml").text

            # Part isn't multipart so get the email body
            elif part.get_content_type() == 'text/plain':
                body = part.get_payload(decode=True)
                #charset = part.get_charset()
            elif part.get_content_type() == 'text/html':
                body = part.get_payload(decode=True) 
                #body = BeautifulSoup(body, "lxml").text

    # If this isn't a multi-part message then get the payload (i.e the message body)
    elif msg.get_content_type() == 'text/plain':
        body = msg.get_payload(decode=True) 
    elif msg.get_content_type() == 'text/html':
        body = msg.get_payload(decode=True) 
        #body = BeautifulSoup(body, "lxml").text
    

   # No checking done to match the charset with the correct part. 
#     for charset in getcharsets(msg):
#         try:
#             body = body.decode(charset)
#         except UnicodeDecodeError:
#             handleerror("UnicodeDecodeError: encountered.",msg,charset)
#         except AttributeError:
#              handleerror("AttributeError: encountered" ,msg,charset)
    return body    


In [4]:
def parse_emails(mbox_messages):
    email_bodies = []
    email_subjects = []
    email_ids = []
    email_content_types = []
    for i, message in enumerate(mbox_messages):
        body = getbodyfromemail(message)
        email_bodies.append(body)
#         body = message.get_payload()
#         if isinstance(body, str):
#             email_bodies.append(body)
#         else:
#             new_body = ' '.join(body)
#             email_bodies.append(new_body)
        if message['Subject']:
            email_subjects.append(message['Subject'])
        else:
            email_subjects.append("Empty")
        
        email_ids.append(message['X-UID'])
        if message['Content-Type']:
            email_content_types.append(message['Content-Type'])
        else:
            email_content_types.append("Empty")
    return email_bodies, email_subjects, email_ids, email_content_types


def del_punct_symbols(texts):
    texts = [str(text).lower().replace('\n',' ').replace('\t',' ') for text in texts]
    texts = [re.sub(r'<.*?>','',str(text)) for text in texts]
    texts = [re.sub(r'0x*?','',str(text)) for text in texts]
    texts = [re.sub(r'[^\w\s]','',str(text)) for text in texts]
    return texts


def del_stop_words(texts, stop_words):
    return [[word for word in email.split() if word not in stop_words] for email in texts]


def lemmatize_text(texts, lemmatizer):
    return [[lemmatizer.lemmatize(word) for word in email] for email in texts]

In [5]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dmitriy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dmitriy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [6]:
train_messages = mailbox.mbox('train.mbox')
train_bodies, train_subjects, train_ids, train_content_types = parse_emails(train_messages)
train_bodies = lemmatize_text(del_stop_words(del_punct_symbols(train_bodies), stop_words), lemmatizer)

In [7]:
import re
s = 'fdsf ds 0x215443'
re.sub(r'0x*?',r'', s) 

'fdsf ds x215443'

### DOC2VEC

In [8]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

#train_data_bds, test_data_bds = train_bodies[:3000], train_bodies[3000:] 
train_data_bds = train_bodies
train_data_bds_tagged = list(create_tagged_document(train_data_bds))

In [9]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_data_bds_tagged)

In [10]:
model.train(train_data_bds_tagged, total_examples=model.corpus_count, epochs=model.epochs)

In [11]:
import collections

ranks = []
second_ranks = []
for doc_id in range(len(train_data_bds_tagged)):
    inferred_vector = model.infer_vector(train_data_bds_tagged[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])
counter = collections.Counter(ranks)
print(counter)

Counter({0: 4228, 1: 46, 3: 16, 2: 9, 7: 4, 9: 3, 6: 3, 4: 3, 44: 2, 15: 2, 11: 2, 18: 2, 2969: 1, 1134: 1, 23: 1, 17: 1, 60: 1, 587: 1, 876: 1, 2700: 1, 1659: 1, 399: 1, 3005: 1, 251: 1, 1203: 1, 4143: 1, 1695: 1, 641: 1, 70: 1, 4307: 1, 2622: 1, 1757: 1, 128: 1, 1302: 1, 3627: 1, 749: 1, 63: 1, 3534: 1, 2886: 1, 3681: 1, 26: 1, 5: 1, 76: 1, 1565: 1, 13: 1, 205: 1, 37: 1, 2858: 1, 1733: 1, 121: 1, 141: 1, 555: 1, 4145: 1, 4069: 1, 1845: 1, 10: 1, 247: 1, 697: 1, 35: 1, 8: 1, 2304: 1, 374: 1, 617: 1, 3839: 1, 4231: 1, 1549: 1, 4093: 1, 2734: 1, 50: 1, 31: 1, 89: 1, 3313: 1, 16: 1, 3120: 1, 1420: 1, 24: 1, 990: 1, 3615: 1, 3668: 1, 4257: 1, 2400: 1, 3532: 1, 3898: 1})


In [12]:
X_train = np.array([model.infer_vector(vec) for vec in train_data_bds])

### OneClassSVM, LocalOutlierFactor

In [13]:
from sklearn.svm import OneClassSVM

clf = OneClassSVM(gamma='auto').fit(X_train)

In [14]:
from sklearn.neighbors import LocalOutlierFactor

model_lof = LocalOutlierFactor(n_neighbors=20,leaf_size=30, contamination = 0.5, novelty=True)
model_lof.fit(X_train)

LocalOutlierFactor(contamination=0.5, novelty=True)

### Binary classifier with external data

In [58]:
len(phish_bodies[]

4391

In [None]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42, shuffle=True)

In [56]:
from sklearn.ensemble import RandomForestClassifier

model_rf = RandomForestClassifier()
model_rf.

### Saving models

In [16]:
model.save('doc2vec_model')


2021-12-12 00:09:06,187 - gensim.utils - INFO - Doc2Vec lifecycle event {'fname_or_handle': 'doc2vec_model', 'separately': 'None', 'sep_limit': 10485760, 'ignore': frozenset(), 'datetime': '2021-12-12T00:09:06.187873', 'gensim': '4.1.2', 'python': '3.8.8 (default, Apr 13 2021, 19:58:26) \n[GCC 7.3.0]', 'platform': 'Linux-5.4.0-91-generic-x86_64-with-glibc2.10', 'event': 'saving'}
2021-12-12 00:09:06,190 - gensim.utils - INFO - not storing attribute cum_table
2021-12-12 00:09:06,235 - gensim.utils - INFO - saved doc2vec_model


In [18]:
import joblib

svm_filename = 'oneclass_svm_model.sav'
joblib.dump(clf, svm_filename)
lof_filename = 'lof_model.sav'
joblib.dump(model_lof, lof_filename)

['lof_model.sav']

### Test data prediction

In [127]:
test_messages = mailbox.mbox('test.mbox')
test_bodies, test_subjects, test_ids, test_content_types = parse_emails(test_messages)
test_bodies = lemmatize_text(del_stop_words(del_punct_symbols(test_bodies), stop_words), lemmatizer)

In [46]:
test_messages = mailbox.mbox('test.mbox')
test_bodies, test_subjects, test_ids, test_content_types = parse_emails(test_messages)


In [128]:
test_bodies = [[word for word in text if word[0]!='x'] for text in test_bodies]

In [129]:
test_bodies[13][:-10]

['bnnnnndear',
 'business',
 'client',
 'region',
 'banknthe',
 'region',
 'customer',
 'service',
 'request',
 'complete',
 'region',
 'interact',
 'confirmation',
 'formnthis',
 'procedure',
 'obligatory',
 'business',
 'corporate',
 'client',
 'region',
 'banknplease',
 'select',
 'hyperlink',
 'visit',
 'address',
 'listed',
 'access',
 'region',
 'interact',
 'confirmation',
 'formnhttpinteractsession3627896regionscomibsregionscmserveriformcfmnagain',
 'thank',
 'choosing',
 'region',
 'bank',
 'business',
 'need',
 'look',
 'forward',
 'working',
 'youn',
 'please',
 'respond',
 'email',
 'mail',
 'generated',
 'automated',
 'servicenreplies',
 'mail',
 'read',
 'region',
 'bank',
 'customer',
 'service',
 'technical',
 'supportnncvs',
 'start',
 'exe',
 'qtq7',
 'hgnw',
 'tmp',
 'p9r',
 'z433',
 'engine',
 'update',
 'api',
 'revision',
 'juh',
 'o2q',
 'e9n',
 'gbv',
 'hex',
 'serv',
 'rcs',
 '7eob',
 'revision',
 'revision',
 '6233494',
 'nmx4',
 'hex',
 'hex',
 'file',
 'dvk'

In [130]:
X_test = np.array([model.infer_vector(vec) for vec in test_bodies])

In [131]:
res = clf.predict(X_test)
res[res==-1] = 0

In [138]:
res_lof = model_lof.predict(X_test)
res_lof[res_lof==-1] = 0

In [139]:
unique, counts = np.unique(res, return_counts=True)

print(np.asarray((unique, counts)).T)

[[   0 8648]
 [   1 2011]]


In [140]:
unique, counts = np.unique(res_lof, return_counts=True)

print(np.asarray((unique, counts)).T)

[[   0 9383]
 [   1 1276]]


In [None]:
test_bodies[9:13]

In [75]:
test_content_types[5002]

'multipart/related; \n\tboundary="D4RXVN07GRQFT4S3W"'

## Accuracy estimation

In [85]:
targets = []
test_ids_2 = []
for i, test_message in enumerate(test_bodies):
    is_real = 1
    try:
        if 'html' in test_content_types[i] or 'bound' in test_content_types[i]:
            is_real = 0
    except:
        is_real = 0
    targets.append(is_real)
    if i%1000==0:
        print(str(i)+"out of "+str(len(test_messages)))

0out of 10659
1000out of 10659
2000out of 10659
3000out of 10659
4000out of 10659
5000out of 10659
6000out of 10659
7000out of 10659
8000out of 10659
9000out of 10659
10000out of 10659


In [77]:
# target_data = zip(test_ids, np.array(targets))
# target_data_df = pd.DataFrame(target_data, columns = ['UID', 'VERDICT'])
# target_data_df.to_csv('target.csv', index=False)

In [86]:
targets = np.array(targets)
unique, counts = np.unique(targets, return_counts=True)

print(np.asarray((unique, counts)).T)

[[   0 6064]
 [   1 4595]]


In [141]:
from sklearn.metrics import accuracy_score

accuracy_score(targets, res)

0.6725771648372267

In [142]:
accuracy_score(targets, res_lof)

0.6347687400318979

### Making submission csv

In [113]:
submission_data = zip(test_ids, res)
submission_df = pd.DataFrame(submission_data, columns = ['UID', 'VERDICT'])
submission_df.to_csv('result.csv', index=False)

In [153]:
submission_data = zip(test_ids, res_lof)
submission_df = pd.DataFrame(submission_data, columns = ['UID', 'VERDICT'])
submission_df.to_csv('result_lof.csv', index=False)