In [1]:
import os
# Pandas
import pandas as pd
#NTK
import nltk
from nltk import word_tokenize

## Part 3.

> Using the models.ldamodel module from the gensim library, run topic modeling over the corpus. Explore different numbers of topics (varying from 5 to 50), and settle for the parameter which returns topics that you consider to be meaningful at first sight

In [2]:
import gensim
from gensim.models.ldamodel import LdaModel
from gensim.corpora.dictionary import Dictionary

from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

import re



We will perform the same data preprocessing steps as in part I:

- Tokenization with *RegexpTokenizer(r'\w+')
- Stopwords removal 
- Punctuation removal

In [3]:
# SAME PREPROCESSING AS IN STEP I

data_path = "hillary-clinton-emails/"
emails  = pd.read_csv(data_path+"Emails.csv",index_col=0)

# Email Body
cleanBodyText = emails.ExtractedBodyText.dropna()
emailBodyText = ["".join(x) for x in cleanBodyText]

# Email Subject
cleanSubject = emails.ExtractedSubject.dropna()
emailSubject = ["".join(x) for x in cleanSubject]

# Data = Body + Subject
emailRawText = emailBodyText + emailSubject

# Tokens + Stop words
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'])

def stepI_preprocessing(data):
    token = tokenizer.tokenize(str(data))
    lower = [word.lower() for word in token]
    words_no_stop = [w for w in lower if w not in stop_words]
    
    return words_no_stop

# Final preprocessing
words_clean = [stepI_preprocessing(x) for x in emailRawText]

We start by computing two datas, *dico* and *corpus*, needed by the gensim library.

In [4]:
dico = Dictionary(words_clean)
corpus = [dico.doc2bow(text) for text in words_clean]

In [5]:
def clean_lda_string (string):
    """
        Clean the lda topic string. Only keeps the alphanumerical characters.
    """
    return " ".join(re.split("[^a-zA-Z]*", string))

### Now we compute the *LdaModel*. There is no precise way to choose the best number of topics to search for. Therefore, we try several numbers of topics, from 5 to 50 (step of 5).

In [6]:
for nbr_topics in range(5,51,5):
    s = "======================="+str(nbr_topics)+"==============================="
    print(s)
    
    lda = LdaModel(corpus, num_topics=nbr_topics, id2word=dico)
    for i in range(0, lda.num_topics):
        string = lda.print_topic(i)
        topic = clean_lda_string(string)
        print(topic)
    
    print()



  return _compile(pattern, flags).split(string, maxsplit)


 obama new u one israel said president would state american 
 pm call secretary office w meeting 
 fw h sid reuters ap schedule state update tomorrow 
 haiti sid would h health memo care bill new 
 call fyi speech talk u list new qddr today one 

 said obama new us mr afghanistan one would iran party 
 call ni fw sheet haiti tomorrow calls state 
 reuters ashton afghan fw state talks security sudan gaza obama 
 fw h sid schedule ap update state mini 
 would one israel new u netanyahu president also china government 
 pm office secretary call w meeting state 
 call talk tomorrow list today tonight followup confirmed email jim 
 call speech get statement good draft u see report want 
 fyi haiti care health would new happy work senate stone 
 u calls israel state israeli rights human us report statement 

 tomorrow talk today call happy u get mubarak trip update 
 obama new said rights one president mr people political human 
 pm office secretary meeting w 
 call schedule list update conf

As we can see, the more number of topics we have, to more diverse topics the algorithm outputs. 

With *num_topics=50* for example, we have topics that are too many detailed.

With *num_topics=5* there is topics about meetings, obama, afganhistan, and israel.

We think that with a value of num_topics around 15 was a good choice. Therefor, there is enough topics and not too much detailed. It represents well the politics subject that Hillary Clinton dealed with while she was secretary of state: Some U.S. politics, international subject, like Israel, Middle East, and Haiti, but also a lot of meetings and organisational issues.
