In [85]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
import gensim
import re
import mailbox

### Text preprocessing

In [96]:
def parse_emails(mbox_messages):
    """Extract body, subject, UID and content type from each message.

    Parameters
    ----------
    mbox_messages : iterable of email.message.Message-like objects
        Typically a ``mailbox.mbox`` instance; any iterable of objects that
        support ``get_payload()`` and header lookup via ``msg['Header']``.

    Returns
    -------
    tuple of four lists, aligned by message order:
        (email_bodies, email_subjects, email_ids, email_content_types)
        A missing ``Subject`` or ``Content-Type`` header is recorded as the
        string "Empty"; a missing ``X-UID`` is kept as ``None``.
    """
    email_bodies = []
    email_subjects = []
    email_ids = []
    email_content_types = []
    # Iterate over the argument, not a notebook-level global, so the function
    # parses whichever mailbox the caller passes in.
    for message in mbox_messages:
        # NOTE(review): for multipart messages get_payload() returns a list of
        # sub-messages rather than a string - confirm that the downstream text
        # cleaning never receives such bodies.
        email_bodies.append(message.get_payload())
        email_subjects.append(message['Subject'] if message['Subject'] else "Empty")
        email_ids.append(message['X-UID'])
        email_content_types.append(
            message['Content-Type'] if message['Content-Type'] else "Empty"
        )
    return email_bodies, email_subjects, email_ids, email_content_types


def del_punct_symbols(texts):
    """Lower-case each text, flatten newlines/tabs to spaces, strip punctuation.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is removed. Returns a new list of strings
    in the same order as ``texts``.
    """
    cleaned = []
    for text in texts:
        flattened = text.lower().replace('\n',' ').replace('\t',' ')
        cleaned.append(re.sub(r'[^\w\s]','',flattened))
    return cleaned

def del_stop_words(texts, stop_words):
    """Tokenise each text on whitespace and drop tokens found in ``stop_words``.

    Returns a list with one token list per input text, in input order.
    """
    tokenized = []
    for email in texts:
        kept = []
        for word in email.split():
            if word in stop_words:
                continue
            kept.append(word)
        tokenized.append(kept)
    return tokenized

def lemmatize_text(texts, lemmatizer):
    """Apply ``lemmatizer.lemmatize`` to every token of every document.

    ``texts`` is an iterable of token sequences; the result is a list of
    lists with the same nesting and order.
    """
    lemmatized = []
    for email in texts:
        lemmas = []
        for word in email:
            lemmas.append(lemmatizer.lemmatize(word))
        lemmatized.append(lemmas)
    return lemmatized

In [97]:
# WordNetLemmatizer was used below without ever being imported, which raises
# NameError on a fresh kernel.
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources needed for stop-word removal and lemmatisation.
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))  # set => O(1) membership tests
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/dmitriy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/dmitriy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Load the training mailbox and run the full text-preprocessing pipeline.
# The mailbox handle is named `train_messages` because it holds training
# data; the previous name `test_messages` was misleading.
train_messages = mailbox.mbox('train.mbox')
train_bodies, train_subjects, train_ids, train_content_types = parse_emails(train_messages)
# `train` is a list of token lists: punctuation stripped, stop words removed, lemmatised.
train = lemmatize_text(del_stop_words(del_punct_symbols(train_bodies), stop_words), lemmatizer)

### Doc2Vec

In [66]:
def create_tagged_document(list_of_list_of_words):
    """Lazily wrap each token list in a gensim ``TaggedDocument``.

    Each document is tagged with its zero-based position in the input.
    """
    yield from (
        gensim.models.doc2vec.TaggedDocument(tokens, [position])
        for position, tokens in enumerate(list_of_list_of_words)
    )

# Split the preprocessed *training* corpus: the first 3000 documents train
# Doc2Vec, the remainder is held out. This previously sliced `test_bodies`,
# which is not defined until the test-prediction cell further down.
train_data_bds, test_data_bds = train[:3000], train[3000:]
train_data_bds_tagged = list(create_tagged_document(train_data_bds))

In [68]:
# Create a Doc2Vec model with 50-dimensional vectors, min_count=2 and 40 epochs,
# then build its vocabulary from the tagged training documents.
# NOTE(review): no seed is passed, so results will presumably differ between runs - confirm
# whether reproducibility matters here.
model = gensim.models.doc2vec.Doc2Vec(vector_size=50, min_count=2, epochs=40)
model.build_vocab(train_data_bds_tagged)

In [69]:
# Train Doc2Vec on the tagged documents, reusing the document count and epoch count
# already stored on the model.
model.train(train_data_bds_tagged, total_examples=model.corpus_count, epochs=model.epochs)

In [70]:
import collections

# Sanity check: re-infer a vector for every training document and record at
# which position the document itself appears in the similarity ranking over
# all trained document vectors (position 0 = most similar to itself).
ranks = []
second_ranks = []
for doc_id, tagged_doc in enumerate(train_data_bds_tagged):
    inferred_vector = model.infer_vector(tagged_doc.words)
    sims = model.dv.most_similar([inferred_vector], topn=len(model.dv))
    ranked_ids = [docid for docid, sim in sims]
    rank = ranked_ids.index(doc_id)
    ranks.append(rank)
    # Keep the runner-up (tag, similarity) pair for later inspection.
    second_ranks.append(sims[1])

counter = collections.Counter(ranks)
print(counter)


Counter({0: 2932, 1: 29, 2: 6, 3: 5, 16: 2, 449: 1, 877: 1, 2034: 1, 74: 1, 565: 1, 2851: 1, 2321: 1, 2013: 1, 611: 1, 2996: 1, 93: 1, 77: 1, 64: 1, 5: 1, 8: 1, 2733: 1, 49: 1, 1051: 1, 419: 1, 6: 1, 10: 1, 45: 1, 1708: 1, 89: 1, 1802: 1, 164: 1})


In [82]:
# Infer a Doc2Vec vector for each document in `test_data_bds`; the resulting
# matrix is what the one-class SVM is fitted on.
# (The closing parenthesis of np.array(...) was missing, a SyntaxError.)
X_train = np.array([model.infer_vector(vec) for vec in test_data_bds])

### OneClassSVM

In [83]:
from sklearn.svm import OneClassSVM

# Build the one-class SVM first, then fit it on the inferred document vectors.
clf = OneClassSVM(gamma='auto')
clf = clf.fit(X_train)

### Test data prediction

In [99]:
# Load the test mailbox and push the bodies through the same preprocessing
# steps as the training data, one stage at a time.
test_messages = mailbox.mbox('test.mbox')
test_bodies, test_subjects, test_ids, test_content_types = parse_emails(test_messages)
test_bodies = del_punct_symbols(test_bodies)
test_bodies = del_stop_words(test_bodies, stop_words)
test_bodies = lemmatize_text(test_bodies, lemmatizer)

In [100]:
# One inferred Doc2Vec vector per preprocessed test document.
X_test = np.array(list(map(model.infer_vector, test_bodies)))

In [101]:
# Predict with the one-class SVM, then relabel -1 as 0 so that the verdicts are 0/1.
res = clf.predict(X_test)
res = np.where(res == -1, 0, res)

In [110]:
# Show how many test messages received each verdict, one (label, count) pair per row.
unique, counts = np.unique(res, return_counts=True)
print(np.column_stack((unique, counts)))

[[   0 2279]
 [   1 2112]]


### Writing the submission CSV

In [111]:
# Pair each test message UID with its predicted verdict and write the result to
# result.csv without the DataFrame index column.
# Note: zip() stops at the shorter input, so a length mismatch between test_ids and res
# would silently truncate the output.
submission_data = zip(test_ids, res)
submission_df = pd.DataFrame(submission_data, columns = ['UID', 'VERDICT'])
submission_df.to_csv('result.csv', index=False)