In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import wikipedia
import spacy
from textblob import TextBlob

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

In [4]:
import os
from os import listdir
from os.path import isfile, join
from bs4 import BeautifulSoup
import codecs

MINIMUM_DOC_SIZE = 20

df = pd.DataFrame(columns=['name', 'content'])

data_directory = os.path.join('.', 'mails')
files = [f for f in listdir(data_directory) if isfile(join(data_directory, f))]

mail_count = 0

# open each mail file
for mail_file in files:

    #print join(data_directory, mail_file)
    with codecs.open(join(data_directory, mail_file), encoding='utf_8') as mf:

        soup = BeautifulSoup(mf.read(), "lxml")

        # kill all script and style elements
        for script in soup(["script", "style"]):
            script.extract()    # rip it out

        # get text
        text = soup.get_text()
        trimmed_text = text.replace('\r', '').strip().replace('\n', '\\n')
        if len(trimmed_text) < MINIMUM_DOC_SIZE:
            print("content of email, '{}' is too short, so skip it : {}".format(mail_file, trimmed_text))
            continue

        # write the mail content as a line in the new file
        # escape newline characters in the original mail text
        df = df.append({'name': mail_file, 'content': trimmed_text}, ignore_index=True)


        mail_count += 1
        #print mail_count

print u'''Text from {:,} mails added to a dataframe.'''.format(mail_count)

content of email, '15b6e00b1da5a8c6-0.0' is too short, so skip it : API Key Compromised
content of email, '15b6e00b1da5a8c6-0.1' is too short, so skip it : API Key Compromised
content of email, '15b6e021f56e921a-0.0' is too short, so skip it : AWS Cases
content of email, '15b6e021f56e921a-0.1' is too short, so skip it : AWS Cases
content of email, '15b6e0a06d9f892c-0.0' is too short, so skip it : Important - 5/3/16
content of email, '15b6e0a06d9f892c-0.1' is too short, so skip it : Important - 5/3/16
content of email, '15b6e0b9c03d19c9-0.0' is too short, so skip it : Important - rest
content of email, '15b6e0b9c03d19c9-0.1' is too short, so skip it : Important - rest
content of email, '15b6e12104423626-0.0' is too short, so skip it : Inbox - 3/20/17
content of email, '15b6e12104423626-0.1' is too short, so skip it : Inbox - 3/20/17
content of email, '15b6e158ff1ad6a6-0.0' is too short, so skip it : Inbox - 2/10/17
content of email, '15b6e158ff1ad6a6-0.1' is too short, so skip it : Inbo

In [5]:
df.head()

Unnamed: 0,name,content
0,15b6e00b1da5a8c6-1.0.0,"API Key Compromised Hello, We've received a ..."
1,15b6e00b1da5a8c6-1.0.1,Your Amazon EC2 Abuse Report [16415975652-2] [...
2,15b6e00b1da5a8c6-10.0,RE: [Case 1454568521] Your AWS account is com...
3,15b6e00b1da5a8c6-11.0,Amazon Web Services: New Support case: 1454568...
4,15b6e00b1da5a8c6-12.0,RE: [Case 1429709441] Your AWS account is com...


In [8]:
# Convert words to vectors
cv = CountVectorizer(stop_words='english')
cv.fit(df['content'].values)
X = cv.transform(df['content'].values)
X

<12403x35655 sparse matrix of type '<type 'numpy.int64'>'
	with 1365709 stored elements in Compressed Sparse Row format>

In [36]:
# Generates topics using LDA

#num_topics = 5
num_topics = 10
#num_topics = 20

feature_names = cv.get_feature_names()
lda = LatentDirichletAllocation(n_components=num_topics)
lda.fit(X)



LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7, learning_method=None,
             learning_offset=10.0, max_doc_update_iter=100, max_iter=10,
             mean_change_tol=0.001, n_components=10, n_jobs=1,
             n_topics=None, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [37]:
# see the result, num_topics * num_words
print lda.components_.shape

(10, 35655)


In [38]:
# build a topic dataframe
results = pd.DataFrame(lda.components_, columns=feature_names)
results.shape

(10, 35655)

In [39]:
# Let's see each topic with its most used 25 words
for topic in range(num_topics):
    print('Topic', topic)
    word_list = results.T[topic].sort_values(ascending=False).index
    print(' '.join(word_list[0:25]), '\n')

('Topic', 0)
(u'com sungardas aws marketplace product domain meg amazon mail email seller io amazonaws ramsey information city thanks account png iam pm ops need sungard awscto', '\n')
('Topic', 1)
(u'amazon services web aws com message account wa subsidiary 410 ave terry seattle distributed 98109 north 5210 produced trademark registered view new reserved emails privacy', '\n')
('Topic', 2)
(u'amazon com https aws url case v2 urldefense gkv5bqa9zrq1gfcvg gybc proofpoint vl services link web dwicaq following support contact mail issue use http html using', '\n')
('Topic', 3)
(u'00 2017 usd amazon invoice email aws account com number payment services web 03 ca 12 17 receivables past sep 20 invoices 18 aug support', '\n')
('Topic', 4)
(u'amazon aws service available new learn series rds supports data s3 cloud announcements region blog elastic server cloudwatch redshift container read api support storage hope', '\n')
('Topic', 5)
(u'amazon ec2 instances instance aws com windows latest data

In [40]:
# Now cluster all email documents to topics
prop = lda.transform(X)

In [41]:
# Persist models, cv & lda
import pickle
#s = pickle.dumps(lda)
from sklearn.externals import joblib
joblib.dump(cv, 'cv.%02d.pkl' % num_topics) 
joblib.dump(lda, 'lda.%02d.pkl' % num_topics) 

['lda.10.pkl']

In [42]:
# Create another dataframe with all email documents with their topic proportions
columns = []
for tidx in range(num_topics):
    columns.append('t%02d' % tidx)
    columns.append('p%02d' % tidx)
df_prop = pd.DataFrame(columns=columns)

for doc_idx in range(len(prop)):
    prop_topics = []
    for idx, prop_val in enumerate(prop[doc_idx]):
        prop_topics.append({'topic': str(idx), 'prop': prop_val})
    prop_topics.sort(key=lambda k : k['prop'], reverse=True)
    #print prop_topics
    topic_prop = {}
    for tidx in range(num_topics):
        topic_prop['t%02d' % tidx] = prop_topics[tidx]['topic']
        topic_prop['p%02d' % tidx] = prop_topics[tidx]['prop']
    df_prop = df_prop.append(topic_prop, ignore_index=True)
print u'completed to create a dataframe with topics'

completed to create a dataframe with topics


In [43]:
df_prop.head()

Unnamed: 0,t00,p00,t01,p01,t02,p02,t03,p03,t04,p04,t05,p05,t06,p06,t07,p07,t08,p08,t09,p09
0,5,0.968858,2,0.028374,9,0.000346,3,0.000346,1,0.000346,4,0.000346,7,0.000346,0,0.000346,6,0.000346,8,0.000346
1,5,0.982762,2,0.014588,9,0.000331,3,0.000331,1,0.000331,0,0.000331,7,0.000331,6,0.000331,4,0.000331,8,0.000331
2,2,0.734974,0,0.098499,1,0.072154,5,0.053202,9,0.039325,6,0.000369,4,0.000369,3,0.000369,7,0.000369,8,0.000369
3,2,0.42952,0,0.236991,1,0.219291,5,0.08865,9,0.024304,4,0.000249,7,0.000249,3,0.000249,6,0.000249,8,0.000249
4,2,0.63829,0,0.212442,5,0.101161,9,0.044025,1,0.000681,3,0.00068,7,0.00068,6,0.00068,4,0.00068,8,0.00068


In [44]:
# Now merge two dataframes, email documents & topic proportions
df_res = pd.concat([df, df_prop], axis=1)
df_res.head()

Unnamed: 0,name,content,t00,p00,t01,p01,t02,p02,t03,p03,...,t05,p05,t06,p06,t07,p07,t08,p08,t09,p09
0,15b6e00b1da5a8c6-1.0.0,"API Key Compromised Hello, We've received a ...",5,0.968858,2,0.028374,9,0.000346,3,0.000346,...,4,0.000346,7,0.000346,0,0.000346,6,0.000346,8,0.000346
1,15b6e00b1da5a8c6-1.0.1,Your Amazon EC2 Abuse Report [16415975652-2] [...,5,0.982762,2,0.014588,9,0.000331,3,0.000331,...,0,0.000331,7,0.000331,6,0.000331,4,0.000331,8,0.000331
2,15b6e00b1da5a8c6-10.0,RE: [Case 1454568521] Your AWS account is com...,2,0.734974,0,0.098499,1,0.072154,5,0.053202,...,6,0.000369,4,0.000369,3,0.000369,7,0.000369,8,0.000369
3,15b6e00b1da5a8c6-11.0,Amazon Web Services: New Support case: 1454568...,2,0.42952,0,0.236991,1,0.219291,5,0.08865,...,4,0.000249,7,0.000249,3,0.000249,6,0.000249,8,0.000249
4,15b6e00b1da5a8c6-12.0,RE: [Case 1429709441] Your AWS account is com...,2,0.63829,0,0.212442,5,0.101161,9,0.044025,...,3,0.00068,7,0.00068,6,0.00068,4,0.00068,8,0.00068


In [45]:
# Save this new dataframe into csv file for each topic
for idx in range(num_topics):
    df_res.loc[df_res['t00'] == '%d' % idx].to_csv('./results/topic%02d/topic_%02d.csv' % (num_topics, idx), header=True, index=False, encoding='utf-8')