# Imports

In [1]:
import matplotlib.pyplot as plt
from os import path
from wordcloud import WordCloud
import nltk
import pandas as pd

### Example of usage

In [2]:
d = path.dirname('hillary-clinton-emails/')

# Read the whole text.
text = open(path.join(d, 'Emails.csv'),encoding='utf8').read()

# Generate a word cloud image
wordcloud = WordCloud().generate(text)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud)
plt.axis("off")

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(text)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

In [7]:
type(text)

str

# Read database

In [3]:
df=pd.read_csv('hillary-clinton-emails/Emails.csv')

In [4]:
df.shape

(7945, 22)

In [5]:
df.head(1)

Unnamed: 0,Id,DocNumber,MetadataSubject,MetadataTo,MetadataFrom,SenderPersonId,MetadataDateSent,MetadataDateReleased,MetadataPdfLink,MetadataCaseNumber,...,ExtractedTo,ExtractedFrom,ExtractedCc,ExtractedDateSent,ExtractedCaseNumber,ExtractedDocNumber,ExtractedDateReleased,ExtractedReleaseInPartOrFull,ExtractedBodyText,RawText
0,1,C05739545,WOW,H,"Sullivan, Jacob J",87.0,2012-09-12T04:00:00+00:00,2015-05-22T04:00:00+00:00,DOCUMENTS/HRC_Email_1_296/HRCH2/DOC_0C05739545...,F-2015-04841,...,,"Sullivan, Jacob J <Sullivan11@state.gov>",,"Wednesday, September 12, 2012 10:16 AM",F-2015-04841,C05739545,05/13/2015,RELEASE IN FULL,,UNCLASSIFIED\r\nU.S. Department of State\r\nCa...


## Tokenization of RawText

In [6]:
df.ExtractedBodyTextString=df.ExtractedBodyText.apply(lambda x: str(x))

In [7]:
sentence="".join(df.ExtractedBodyTextString)

In [8]:
len(sentence)

3645959

In [9]:
tokens = nltk.word_tokenize(sentence)

In [10]:
len(tokens)

689100

In [11]:
text = nltk.Text(tokens)

In [12]:
text.collocations()

United States; White House; State Department; FOIA WAIVER; BENGHAZI
COMM; SELECT BENGHAZI; HOUSE SELECT; SENSITIVE INFORMATION;
F-2015-04841 Doc; STATE DEPT; 05/13/2015 STATE; Private Residence; New
York; *En route; State Case; OFFICE TIME; U.S. Department; health
care; Middle East; Conference Room


## Stop words removal

In [9]:
from nltk.corpus import stopwords

In [10]:
stop = stopwords.words('english')

In [14]:
cont=0
for i in tokens:
    if i in stop:
        cont+=1

In [15]:
cont

220707

In [16]:
good_tokens=[x for x in tokens if x not in stop]

In [17]:
len(good_tokens)

468393

In [18]:
text = nltk.Text(tokens)

In [20]:
text.collocations()

United States; White House; State Department; FOIA WAIVER; BENGHAZI
COMM; SELECT BENGHAZI; HOUSE SELECT; SENSITIVE INFORMATION;
F-2015-04841 Doc; STATE DEPT; 05/13/2015 STATE; Private Residence; New
York; *En route; State Case; OFFICE TIME; U.S. Department; health
care; Middle East; Conference Room


## Stemming

In [19]:
from nltk.stem.wordnet import WordNetLemmatizer

In [20]:
lmtzr = WordNetLemmatizer()

In [21]:
stemmed_tokens=[lmtzr.lemmatize(x) for x in good_tokens]

In [22]:
len(stemmed_tokens)

468393

In [23]:
text = nltk.Text(tokens)

In [26]:
text.collocations()

United States; White House; State Department; FOIA WAIVER; BENGHAZI
COMM; SELECT BENGHAZI; HOUSE SELECT; SENSITIVE INFORMATION;
F-2015-04841 Doc; STATE DEPT; 05/13/2015 STATE; Private Residence; New
York; *En route; State Case; OFFICE TIME; U.S. Department; health
care; Middle East; Conference Room


## Wordcloud

In [27]:
from wordcloud import WordCloud

In [25]:
final_string=' '.join(stemmed_tokens)

In [26]:
# Generate a word cloud image
wordcloud = WordCloud().generate(final_string)

# Display the generated image:
# the matplotlib way:
import matplotlib.pyplot as plt
plt.imshow(wordcloud)
plt.axis("off")

# lower max_font_size
wordcloud = WordCloud(max_font_size=40).generate(final_string)
plt.figure()
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# LDA

Import at first the cleaned body text from a csv file:

In [2]:
df = pd.read_csv('hillary-clinton-emails/sentimentEmails.csv')

We take a look at the dataset before proceeding:

In [11]:
df.ProcessedData.isnull().value_counts()

False    6453
True        3
Name: ProcessedData, dtype: int64

Given that there are NaNs left, we will wipe them out when building the corpus.

At first we define a list of stopwords to be wiped out from the documents; this words are typical English language stopwords or some trivial expressions which are contained in the *RawText*, such as 'fw:', 'sent:', 'from:', 'to:' (mail parameters) or 'u.s.', 'state', 'department' and so on:

In [90]:
# clear the documents from trivial recurrent words and from punctuation symbols

punctuation_symbols = ['.',',',';',':','-','•','"',"'",'?','!','@','#','/','*','+','(',')','—','{','}',
                      '."',',"','),','(,','<','>','%','&','$','---','----','-----','------','[',']',
                      '■','--','...','://']
trivial_words = ['u','w','h','j','us','fyi','would','fw','get']

# here we decide to eliminate small numbers, as they prevent from a clear interpretation of the text
# only 'great' ones (that is dates, likely) remain in the corpus
numbs = range(100)
numbers = [str(n) for n in numbs]
numbers = list(set(numbers).union(set(['00'])))

stoplist = list(set(trivial_words).union(set(punctuation_symbols).union(set(numbers))))

# apply the stoplist to each document in RawText
documents = [[word for word in text.lower().split() if word not in stoplist]
            for text in df.ProcessedData.dropna()]

Now we import the *gensim* library and define a **dictionary**, which matches any word in each text with a numeric ID; notice that the documents are treated as *bows* (numeric vectors); the output of this operation is the **corpus** we will perform analysis on:

In [91]:
# define a dictionary to associate ad Id to each token and build the corpus
from gensim import corpora, models
dictionary = corpora.Dictionary(documents)
corpus = [dictionary.doc2bow(text) for text in documents]

Now we define the Latent Dirichlet Allocation using the dictionary and the corpus. Num_topics = **20** seems a reasonable choice:

In [92]:
# define an lda model using the previously defined dictionary
no_topics = 10
lda = models.ldamodel.LdaModel(corpus, id2word = dictionary, num_topics=no_topics)
lda.show_topics()

[(0,
  '0.011*"today" + 0.009*"letter" + 0.007*"state" + 0.006*"vote" + 0.006*"2010" + 0.005*"party" + 0.004*"gov" + 0.004*"good" + 0.004*"branch" + 0.003*"dan"'),
 (1,
  '0.007*"time" + 0.005*"try" + 0.005*"last" + 0.004*"issue" + 0.004*"state" + 0.004*"qddr" + 0.004*"come" + 0.003*"email" + 0.003*"like" + 0.003*"russia"'),
 (2,
  '0.005*"right" + 0.005*"send" + 0.004*"ask" + 0.004*"speech" + 0.004*"give" + 0.003*"report" + 0.003*"tell" + 0.003*"follow" + 0.003*"bill" + 0.003*"come"'),
 (3,
  '0.010*"obama" + 0.009*"president" + 0.009*"secretary" + 0.009*"israel" + 0.007*"office" + 0.006*"time" + 0.006*"state" + 0.005*"nuclear" + 0.005*"white" + 0.005*"israeli"'),
 (4,
  '0.027*"secretary" + 0.026*"office" + 0.023*"state" + 0.013*"department" + 0.011*"room" + 0.008*"arrive" + 0.008*"route" + 0.008*"en" + 0.007*"depart" + 0.007*"time"'),
 (5,
  '0.006*"bill" + 0.005*"good" + 0.005*"clinton" + 0.005*"book" + 0.005*"obama" + 0.004*"state" + 0.004*"could" + 0.004*"like" + 0.003*"time" + 0

We perform now some technical operations to print the topics in a sufficiently readable way:

In [93]:
topics = []
for num in range(no_topics):
    topic_prob = lda.show_topic(num)
    topic = []
    for word in range(len(topic_prob)):
        topic.append(topic_prob[word][0])
    topic = '  '.join(topic)
    topics.append(topic)
topics

['today  letter  state  vote  2010  party  gov  good  branch  dan',
 'time  try  last  issue  state  qddr  come  email  like  russia',
 'right  send  ask  speech  give  report  tell  follow  bill  come',
 'obama  president  secretary  israel  office  time  state  nuclear  white  israeli',
 'secretary  office  state  department  room  arrive  route  en  depart  time',
 'bill  good  clinton  book  obama  state  could  like  time  diplomacy',
 'palin  care  state  republicans  obama  health  2010  percent  republican  company',
 'state  unite  people  conflict  american  force  government  world  support  security',
 'time  party  state  china  may  take  first  leave  obama  labour',
 'state  2010  gov  b6  pls  com  clintonemail  cheryl  print  hrod17']