In [18]:
import spacy
import pandas as pd
from gensim.corpora.dictionary import Dictionary# build model
from gensim.models import LdaMulticore
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt
import pyLDAvis.gensim_models

In [None]:
# Turn on pyLDAvis notebook mode (presumably so its visualisations render inline).
# NOTE(review): no pyLDAvis visualisation is produced in the cells visible here — confirm it is still needed.
pyLDAvis.enable_notebook()

In [19]:
# Load spaCy's small English pipeline. It is used further down (through
# en_model.pipe) to tokenize, POS-tag and lemmatize the email bodies.
en_model = spacy.load('en_core_web_sm')

In [20]:
# Load the email dataset. The preview that follows shows the columns
# 'Unnamed: 0', 'file', 'from' and 'email_body'.
# NOTE(review): 'Unnamed: 0' looks like an index column written by an earlier
# to_csv call — confirm whether it is needed.
emails = pd.read_csv('dataset/train_test_tm.csv')

In [21]:
# Preview the first rows to check that the dataset loaded with the expected columns.
emails.head()

Unnamed: 0,file,from,email_body
0,mann___k___sent_mail_608,kay.mann@enron.com,hi warren please print all of these documents ...
1,farmer___d__wellhead_195,mary.poorman@enron.com,i have measurement actuals for the first days ...
2,dasovich___j__notes_inbox_1882,susan.mara@enron.com,ill check around with some people today
3,dasovich___j__notes_inbox_1821,susan.mara@enron.com,this is pretty much an allout attack against t...
4,keavey___p__deleted_items_80,postmaster@dowjones.com,your message to omeara dina edmiston john gas ...


In [22]:
# Part-of-speech tags to discard: during tokenization any token whose `pos_`
# is in this list is dropped (adverbs, pronouns, coordinating conjunctions,
# punctuation, particles, determiners, adpositions, whitespace, numerals, symbols).
tags_rm = ['ADV','PRON','CCONJ','PUNCT','PART','DET','ADP','SPACE','NUM','SYM']

In [23]:
# Run every email body through the spaCy pipeline and keep only the informative words.
# A token is kept when it is purely alphabetic, is not a stopword and its
# part-of-speech tag is not listed in tags_rm; kept tokens are stored as
# lower-cased lemmas.
def _keep_token(token):
    """Return True when the spaCy token should be kept for topic modelling."""
    return token.is_alpha and not token.is_stop and token.pos_ not in tags_rm


# One list of lemmas per email, in the same order as emails['email_body'].
tokens = [
    [token.lemma_.lower() for token in doc if _keep_token(token)]
    for doc in en_model.pipe(emails['email_body'])
]

In [24]:
# Add the token lists to the dataset as a new column. `tokens` was built by
# iterating over emails['email_body'], so it holds one list per email.
emails['e_b_token'] = tokens

In [25]:
# Preview again to confirm the new 'e_b_token' column holds the lemmatized token lists.
emails.head()

Unnamed: 0,file,from,email_body,e_b_token
0,mann___k___sent_mail_608,kay.mann@enron.com,hi warren please print all of these documents ...,"[hi, warren, print, document, place, individua..."
1,farmer___d__wellhead_195,mary.poorman@enron.com,i have measurement actuals for the first days ...,"[measurement, actual, day, meter, deal, system..."
2,dasovich___j__notes_inbox_1882,susan.mara@enron.com,ill check around with some people today,"[ill, check, people, today]"
3,dasovich___j__notes_inbox_1821,susan.mara@enron.com,this is pretty much an allout attack against t...,"[allout, attack, position, generator, take]"
4,keavey___p__deleted_items_80,postmaster@dowjones.com,your message to omeara dina edmiston john gas ...,"[message, omeara, dina, edmiston, john, gas, d..."


In [26]:
# Persist the tokenized dataset without writing the DataFrame index.
# NOTE(review): CSV has no list type, so each 'e_b_token' list is presumably
# written as its string representation; anything reading this file back must
# parse it into a list again — confirm downstream handling.
emails.to_csv('dataset/enron_email_body_tokenized_tm.csv', index=False)

In [27]:
# Build a gensim Dictionary from the token lists, assigning each unique token an integer id.
# NOTE(review): the saved output of this cell is a NameError for `Dictionary`,
# even though the import cell at the top imports it — the output looks stale.
# Restart the kernel and run all cells to confirm.
dictionary = Dictionary(emails['e_b_token'])

NameError: name 'Dictionary' is not defined

In [None]:
# Inspect the token -> id mapping held by the dictionary.
# NOTE(review): this displays the whole vocabulary, which may be very long.
dictionary.token2id

In [None]:
# filter tokens: drop tokens that appear in fewer than 5 emails (too rare) or in
# more than 50% of the emails (too common), then keep at most the 1000 most
# frequent of what remains.
# The size cap keyword is `keep_n`; `keep_m` is not a filter_extremes parameter
# and raises a TypeError.
dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=1000)

In [None]:
# construct corpus: turn each email's token list into a bag-of-words, i.e. a list
# of (token_id, count) pairs, using the filtered dictionary.
# The Dictionary method is `doc2bow`; `doc3bow` does not exist and raises AttributeError.
corpus = [dictionary.doc2bow(doc) for doc in emails['e_b_token']]

In [None]:
# Fit an initial 10-topic LDA model on the bag-of-words corpus.
# random_state is fixed to the same seed the topic-count search cells use, so
# repeated runs start from the same initialisation instead of a random one.
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=10, workers=4, passes=5, random_state=7)

In [None]:
# Search for the best number of topics.

# Pass 1: UMass coherence for every candidate topic count from 1 to 19.
topics = []
score = []

# Training settings shared by every candidate model; only num_topics varies.
lda_kwargs = dict(
    corpus=corpus,
    id2word=dictionary,
    iterations=50,
    workers=4,
    passes=5,
    random_state=7,
)

for i in range(1, 20):
    lda_model = LdaMulticore(num_topics=i, **lda_kwargs)

    # u_mass is computed from the bag-of-words corpus itself.
    cm = CoherenceModel(model=lda_model, corpus=corpus, dictionary=dictionary, coherence='u_mass')

    topics.append(i)
    score.append(cm.get_coherence())

In [None]:
# using c_v
# Score every candidate topic count from 1 to 19 with c_v coherence.
# Note: this resets `topics` and `score`, so the plot cell that follows shows
# the c_v scores.
topics = []
score = []

# c_v is a sliding-window measure, so CoherenceModel needs the tokenized
# documents through `texts`; with only a bag-of-words corpus it raises
# ValueError ("'texts' should be provided for c_v coherence").
texts = emails['e_b_token'].tolist()

for i in range(1,20,1):
    lda_model = LdaMulticore(
        corpus=corpus, 
        id2word=dictionary, 
        iterations=50,
        num_topics=i,
        workers=4,
        passes=5,
        random_state=7
    )

    cm = CoherenceModel(model=lda_model, texts=texts, corpus=corpus, dictionary=dictionary, coherence='c_v')
    topics.append(i)
    score.append(cm.get_coherence())

In [None]:
# Plot coherence score against number of topics, using whatever `topics` and
# `score` currently hold (i.e. the search cell that ran last).
fig, ax = plt.subplots()
ax.plot(topics, score)
ax.set_xlabel('Number of Topics')
ax.set_ylabel('Coherence Score')
plt.show()

In [None]:
# Fit the final LDA model with 5 topics (presumably the count picked from the
# coherence plot — confirm). random_state uses the same seed as the topic-count
# search, so the saved model is fitted from the same initialisation that was
# scored there rather than from an unseeded one.
lda_model = LdaMulticore(corpus=corpus, id2word=dictionary, iterations=50, num_topics=5, workers=4, passes=5, random_state=7)

In [None]:
# save model
import os
from gensim.test.utils import datapath

# NOTE(review): datapath() resolves relative to gensim's bundled test-data
# directory, not this project, and 'moodels' looks like a typo for 'models'.
# The path is left unchanged so anything that loads the model from it keeps
# working — confirm the intended location.
model_file = datapath('moodels/tm/lda_model_1')

# save() does not create missing parent directories, so make sure they exist;
# otherwise writing the model fails with FileNotFoundError.
os.makedirs(os.path.dirname(model_file), exist_ok=True)
lda_model.save(model_file)