# Imports

In [1]:
import matplotlib.pyplot as plt
from os import path
from wordcloud import WordCloud
import nltk
import pandas as pd

### Example of usage

In [2]:
d = path.dirname('hillary-clinton-emails/')

# Read the whole text.
text = open(path.join(d, 'Emails.csv'),encoding='utf8').read()

# Generate a word cloud image
wordcloud = WordCloud().generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud)
plt.axis("off")

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [7]:
type(text)

str

# Read database

In [3]:
df=pd.read_csv('hillary-clinton-emails/Emails.csv')

In [4]:
df.shape

(7945, 22)

In [5]:
df.head(1)

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...


## Tokenization of RawText

In [6]:
df.ExtractedBodyTextString=df.ExtractedBodyText.apply(lambda x: str(x))

In [7]:
sentence="".join(df.ExtractedBodyTextString)

In [8]:
len(sentence)

3645959

In [9]:
tokens = nltk.word_tokenize(sentence)

In [10]:
len(tokens)

689100

In [11]:
text = nltk.Text(tokens)

In [12]:
text.collocations()

United States; White House; State Department; FOIA WAIVER; BENGHAZI
COMM; SELECT BENGHAZI; HOUSE SELECT; SENSITIVE INFORMATION;
F-2015-04841 Doc; STATE DEPT; 05/13/2015 STATE; Private Residence; New
York; *En route; State Case; OFFICE TIME; U.S. Department; health
care; Middle East; Conference Room


## Stop words removal

In [12]:
from nltk.corpus import stopwords

In [13]:
stop = stopwords.words('english')

In [14]:
cont=0
for i in tokens:
    if i in stop:
        cont+=1

In [15]:
cont

220707

In [16]:
good_tokens=[x for x in tokens if x not in stop]

In [17]:
len(good_tokens)

468393

In [18]:
text = nltk.Text(tokens)

In [20]:
text.collocations()

United States; White House; State Department; FOIA WAIVER; BENGHAZI
COMM; SELECT BENGHAZI; HOUSE SELECT; SENSITIVE INFORMATION;
F-2015-04841 Doc; STATE DEPT; 05/13/2015 STATE; Private Residence; New
York; *En route; State Case; OFFICE TIME; U.S. Department; health
care; Middle East; Conference Room


## Stemming

In [19]:
from nltk.stem.wordnet import WordNetLemmatizer

In [20]:
lmtzr = WordNetLemmatizer()

In [21]:
stemmed_tokens=[lmtzr.lemmatize(x) for x in good_tokens]

In [22]:
len(stemmed_tokens)

468393

In [23]:
text = nltk.Text(tokens)

In [26]:
text.collocations()

United States; White House; State Department; FOIA WAIVER; BENGHAZI
COMM; SELECT BENGHAZI; HOUSE SELECT; SENSITIVE INFORMATION;
F-2015-04841 Doc; STATE DEPT; 05/13/2015 STATE; Private Residence; New
York; *En route; State Case; OFFICE TIME; U.S. Department; health
care; Middle East; Conference Room


## Wordcloud

In [27]:
from wordcloud import WordCloud

In [25]:
final_string=' '.join(stemmed_tokens)

In [26]:
# Generate a word cloud image
wordcloud = WordCloud().generate(final_string)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud)
plt.axis("off")

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(final_string)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# LDA

Import gensim library and define a Latent Dirichlet Allocation (**LDA**) on the corpus to perform topic analysis:

In [27]:
#NOTE: there is some punctuation left in the stemmed tokens list

At first we define a list of stopwords to be wiped out from the documents; this words are typical English language stopwords or some trivial expressions which are contained in the *RawText*, such as 'fw:', 'sent:', 'from:', 'to:' (mail parameters) or 'u.s.', 'state', 'department' and so on:

In [58]:
# clear the documents from trivial recurrent words
# NOTE: 08/31/2015 is a recurrent date
# f-2014-20439 is related to the department identification code or something similar
stop = stopwords.words('english')

trivial_topics = ['date:','unclassified','from:','sent:','subject:','to:','state','department',
                  'no.','doc','case','u.s.','fw:','f-2014-20439','08/31/2015',
                 're:','cc:','h','j']

punctuation_symbols = ", : ; . ' % & $ @ - ( ) — <  > • * / + -".split()

# add punctuation symbols to the stopwords into a new list called stoplist
stoplist = list(set(stop).union(set(punctuation_symbols).union(trivial_topics)))

# apply the stoplist to each document in RawText
documents = [[word for word in text.lower().split() if word not in stoplist]
            for text in df.RawText]

Now we import the *gensim* library and define a **dictionary**, which matches any word in each text with a numeric ID; notice that the documents are treated as *bows* (numeric vectors); the output of this operation is the **corpus** we will perform analysis on:

In [59]:
# define a dictionary to associate ad Id to each token and build the corpus
from gensim import corpora, models
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(text) for text in documents]

Now we define the Latent Dirichlet Allocation using the dictionary and the corpus. Num_topics = **20** seems a reasonable choice:

In [62]:
# define an lda model using the previously defined dictionary
no_topics = 20
lda = models.ldamodel.LdaModel(corpus, id2word = dictionary, num_topics=no_topics)

We perform now some technical operations to print the topics in a sufficiently readable way:

In [63]:
topics = []
for num in range(no_topics):
    topic_prob = lda.show_topic(num)
    topic = []
    for word in range(len(topic_prob)):
        topic.append(topic_prob[word][0])
    topic = '  '.join(topic)
    topics.append(topic)
topics

['mchale,  2010  a;  judith  david  aid  <mchaleja@state.gov>  t;  goldman  family',
 'cheryl  mills,  pm  2010  release  secretary  said  full  would  also',
 '2010  original  huma  message  call  release  abedin,  pm  part  b6',
 '2010  1.4(d)  (reuters)  release  may  us  would  b1  part  said',
 '2  1  release  b5  3  part  pm  b1  2010  1.4(d)',
 'obama  said  would  american  president  one  new  could  political  people',
 '2010  sullivan,  jacob  pm  message  original  release  speech  part  lissa',
 '2010  abedin,  huma  release  original  full  message  <abedinh@state.gov>  pm  07/31/2015',
 'release  2010  message  original  part  <slaughtera@state.gov>  may  b6  2009  cheryl',
 'message  release  original  2010  pm  2009  07/31/2015  may  cheryl  part',
 "pm  office  secretary's  meeting  arrive  route  depart  room  private  residence",
 '2010  message  original  cheryl  pm  release  mills,  b6  part  2009',
 'government  people  2010  president  united  new  ventures  hai