In [1]:
import pandas as pd
import nltk
import pickle
from gensim.models.ldamodel import LdaModel
from gensim import corpora



# Text pre-processing

We perform a similar text pre-processing pipeline (e.g., tokenization, stopword removal, stemming, etc.) as in task 1. The difference here is that we need to handle every row of data one by one, generating a nested list of a list of words.

In [2]:
emails = pd.read_csv('hillary-clinton-emails/Emails.csv')

We merge the ExtractBodyText column and MetadataSubject column into a new column named MergedText.

In [3]:
emails['MergedText'] = emails.apply(lambda x: str(x.ExtractedBodyText) + str(x.MetadataSubject), axis=1)

We define stop words and stemmer as in task 1.

In [4]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
local_stop = {"Re", "RE", "Fw", "FW", "Cc", "also", "PM", "AM", "pm", "am", "btw", "fyi", "FYI", "to", "from", "To", "From"}
stop_words = stop_words | local_stop

In [5]:
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [8]:
def preprocess(text):
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token.isalpha()]  # remove punctuations
    filtered_content = [w for w in tokens if w not in stop_words]
    stemmed_content = [ps.stem(word) for word in filtered_content]
    return stemmed_content

In [9]:
texts = [ preprocess(text) for text in emails.MergedText]

# Topic modeling

In [10]:
dictionary = corpora.Dictionary(texts)

In [11]:
corpus = [dictionary.doc2bow(text) for text in texts]

In [13]:
def print_topics(topics):
    for topic in topics:
        print("Topic ", topic[0], end="    ")
        for word_tuple in topic[1]:
            print(word_tuple[0], end="     ")
        print()

Now, let's train the LDA model with different number of topics.
TODO: try different number of topics, number of words, number of passes and find a good fit.

In [14]:
model = LdaModel(corpus, num_topics=5, id2word=dictionary, passes=5)
topics = model.show_topics(num_topics=5, num_words=10, formatted=False)

In [15]:
topics[1]

(1,
 [('I', 0.019280088026253587),
  ('call', 0.015950882385789081),
  ('H', 0.015017886536467769),
  ('D', 0.008837346584936465),
  ('Cheryl', 0.0074708472886379756),
  ('Huma', 0.0072314524025088138),
  ('CALL', 0.0069496356249629529),
  ('IN', 0.0062789655466385305),
  ('Will', 0.006235758404736216),
  ('OF', 0.0061655219045239525)])

In [16]:
print_topics(topics)

Topic  0    I     work     would     get     want     We     think     know     go     call     
Topic  1    I     call     H     D     Cheryl     Huma     CALL     IN     Will     OF     
Topic  2    The     Obama     American     said     would     In     govern     polit     State     one     
Topic  3    Secretari     TO     Offic     MEET     FOR     THE     Room     AND     ON     nan     
Topic  4    TO     No     ON     NO     Depart     INFORM     BENGHAZI     AGREEMENT     SUBJECT     HOUS     
