In [89]:
import random
import numpy as np
import pandas as pd
import gensim
import re
import mailbox
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


In [90]:
random.seed(432)
np.random.seed(432)

### Text preprocessing

In [91]:
def getcharsets(msg):
    charsets = set({})
    for c in msg.get_charsets():
        if c is not None:
            charsets.update([c])
    return charsets

def handleerror(errmsg, emailmsg,cs):
    print()
    print(errmsg)
    print("This error occurred while decoding with ",cs," charset.")
    print("These charsets were found in the one email.",getcharsets(emailmsg))
    print("This is the subject:",emailmsg['subject'])
    print("This is the sender:",emailmsg['From'])
    
def getbodyfromemail(msg):
    body = None
    #Walk through the parts of the email to find the text body.    
    if msg.is_multipart():    
        for part in msg.walk():

            # If part is multipart, walk through the subparts.            
            if part.is_multipart(): 

                for subpart in part.walk():
                    if subpart.get_content_type() == 'text/plain':
                        # Get the subpart payload (i.e the message body)
                        body = subpart.get_payload(decode=True) 
                        #charset = subpart.get_charset()

            # Part isn't multipart so get the email body
            elif part.get_content_type() == 'text/plain':
                body = part.get_payload(decode=True)
                #charset = part.get_charset()

    # If this isn't a multi-part message then get the payload (i.e the message body)
    elif msg.get_content_type() == 'text/plain':
        body = msg.get_payload(decode=True) 

   # No checking done to match the charset with the correct part. 
#     for charset in getcharsets(msg):
#         try:
#             body = body.decode(charset)
#         except UnicodeDecodeError:
#             handleerror("UnicodeDecodeError: encountered.",msg,charset)
#         except AttributeError:
#              handleerror("AttributeError: encountered" ,msg,charset)
    return body    


In [92]:
def parse_emails(mbox_messages):
    email_bodies = []
    email_subjects = []
    email_ids = []
    email_content_types = []
    for i, message in enumerate(mbox_messages):
        body = getbodyfromemail(message)
        email_bodies.append(body)
#         body = message.get_payload()
#         if isinstance(body, str):
#             email_bodies.append(body)
#         else:
#             new_body = ' '.join(body)
#             email_bodies.append(new_body)
        if message['Subject']:
            email_subjects.append(message['Subject'])
        else:
            email_subjects.append("Empty")
        email_ids.append(message['X-UID'])
        if message['Content-Type']:
            email_content_types.append(message['Content-Type'])
        else:
            email_content_types.append("Empty")
    return email_bodies, email_subjects, email_ids, email_content_types


def del_punct_symbols(texts):
    texts = [str(text).lower().replace('\n',' ').replace('\t',' ') for text in texts]
    texts = [re.sub(r'[^\w\s]','',str(text)) for text in texts]
    return texts


def del_stop_words(texts, stop_words):
    return [[word for word in email.split() if word not in stop_words] for email in texts]


def lemmatize_text(texts, lemmatizer):
    return [[lemmatizer.lemmatize(word) for word in email] for email in texts]

In [93]:
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dmitriy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dmitriy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [94]:
train_messages = mailbox.mbox('train.mbox')
train_bodies, train_subjects, train_ids, train_content_types = parse_emails(train_messages)
train_bodies = lemmatize_text(del_stop_words(del_punct_symbols(train_bodies), stop_words), lemmatizer)

### DOC2VEC

In [95]:
def create_tagged_document(list_of_list_of_words):
    for i, list_of_words in enumerate(list_of_list_of_words):
        yield gensim.models.doc2vec.TaggedDocument(list_of_words, [i])

#train_data_bds, test_data_bds = train_bodies[:3000], train_bodies[3000:] 
train_data_bds = train_bodies
train_data_bds_tagged = list(create_tagged_document(train_data_bds))

In [96]:
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_data_bds_tagged)

In [97]:
model.train(train_data_bds_tagged, total_examples=model.corpus_count, epochs=model.epochs)

In [98]:
import collections

ranks = []
second_ranks = []
for doc_id in range(len(train_data_bds_tagged)):
    inferred_vector = model.infer_vector(train_data_bds_tagged[doc_id].words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    rank = [docid for docid, sim in sims].index(doc_id)
    ranks.append(rank)

    second_ranks.append(sims[1])
counter = collections.Counter(ranks)
print(counter)

Counter({0: 4205, 1: 54, 2: 14, 4: 10, 3: 9, 5: 7, 10: 3, 6: 3, 8: 2, 25: 2, 71: 2, 4332: 2, 23: 2, 19: 2, 21: 2, 210: 2, 55: 2, 11: 2, 9: 2, 373: 1, 1127: 1, 3069: 1, 13: 1, 4390: 1, 385: 1, 682: 1, 270: 1, 3424: 1, 1328: 1, 628: 1, 3196: 1, 699: 1, 1379: 1, 3263: 1, 311: 1, 335: 1, 3342: 1, 3157: 1, 1915: 1, 78: 1, 871: 1, 34: 1, 3902: 1, 1234: 1, 2367: 1, 305: 1, 2052: 1, 12: 1, 2048: 1, 30: 1, 36: 1, 1084: 1, 43: 1, 1723: 1, 626: 1, 3783: 1, 1664: 1, 724: 1, 904: 1, 29: 1, 24: 1, 37: 1, 14: 1, 4107: 1, 1367: 1, 4366: 1, 3306: 1, 3096: 1, 3365: 1, 397: 1, 414: 1, 17: 1, 126: 1, 45: 1, 3300: 1, 4303: 1, 565: 1, 3367: 1, 374: 1, 2108: 1, 356: 1, 3885: 1, 157: 1})


In [106]:
X_train = np.array([model.infer_vector(vec) for vec in train_data_bds])

### OneClassSVM

In [107]:
from sklearn.svm import OneClassSVM

clf = OneClassSVM(gamma='auto').fit(X_train)

### Test data prediction

In [108]:
test_messages = mailbox.mbox('test.mbox')
test_bodies, test_subjects, test_ids, test_content_types = parse_emails(test_messages)
test_bodies = lemmatize_text(del_stop_words(del_punct_symbols(test_bodies), stop_words), lemmatizer)

In [109]:
X_test = np.array([model.infer_vector(vec) for vec in test_bodies])

In [110]:
res = clf.predict(X_test)
res[res==-1] = 0

In [111]:
unique, counts = np.unique(res, return_counts=True)

print(np.asarray((unique, counts)).T)

[[   0 3329]
 [   1 7330]]


### Making submission csv

In [113]:
submission_data = zip(test_ids, res)
submission_df = pd.DataFrame(submission_data, columns = ['UID', 'VERDICT'])
submission_df.to_csv('result.csv', index=False)