In [1]:
% matplotlib inline
from __future__ import division
import gensim.corpora as corpora
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel
from helpers import *
import nltk
import json
import os

In [7]:
NUM_TOPICS = 50
# Path to the MALLET executable handed to LdaMallet. Set the MALLET_BIN
# environment variable to override the default, which only exists on the
# original author's machine.
MALLET = os.environ.get("MALLET_BIN", "/Users/ddemszky/mallet-2.0.8/bin/mallet")
RANDOM_SEED = 42
output_dir = "topics/gensim_" + str(NUM_TOPICS)
# Later cells open files inside output_dir without creating it first, so make
# sure the directory exists before anything is written.
os.makedirs(output_dir, exist_ok=True)

In [4]:
# Load every textbook found under data/coref_resolved_txts. Later cells iterate
# `books.items()` as (title, text) pairs and sentence-tokenise each text, so each
# value is presumably one whole-book string (splitlines=False) — confirm in helpers.
books = get_book_txts("data/coref_resolved_txts", splitlines=False)

Getting books...
America_A_Narrative_History_WWNorton_10th
America_Past_And_Present_Pearson_10th
Americas_History_Bedford_8th
Give_Me_Liberty_An_American_History_WWNorton_3rd
The_American_Pageant_Cengage_14th
The_Unfinished_Nation_A_Concise_History_of_the_American_People_McGraw-Hill_8th
Visions_of_America_A_History_of_the_United_States_Pearson_2nd
american_history_connecting_with_the_past
by_the_people
history_alive_united_states_thru_industrialism
hmh_the_americans_us_history_since_1877
mastering_the_teks
pearson_us_history
teks_us_history
us_history_early_colonial_period_through_reconstruction


In [11]:
print("Cleaning and combining texts...")
all_sentences = []  # one cleaned entry per sentence, pooled across all books
start_end = []      # (title, first_index, last_index) positions into all_sentences
prev = 0            # index at which the next book's sentences will begin
for title, book in books.items():
    print(title)
    sents = nltk.sent_tokenize(book)
    # Clean each sentence and append the results in their original order.
    all_sentences.extend(
        clean_text(sentence, stem=True, remove_short=True) for sentence in sents
    )
    # Inclusive index range occupied by this book within all_sentences.
    start, end = prev, prev + len(sents) - 1
    start_end.append((title, start, end))
    prev = end + 1

Cleaning and combining texts...
America_A_Narrative_History_WWNorton_10th
America_Past_And_Present_Pearson_10th
Americas_History_Bedford_8th
Give_Me_Liberty_An_American_History_WWNorton_3rd
The_American_Pageant_Cengage_14th
The_Unfinished_Nation_A_Concise_History_of_the_American_People_McGraw-Hill_8th
Visions_of_America_A_History_of_the_United_States_Pearson_2nd
american_history_connecting_with_the_past
by_the_people
history_alive_united_states_thru_industrialism
hmh_the_americans_us_history_since_1877
mastering_the_teks
pearson_us_history
teks_us_history
us_history_early_colonial_period_through_reconstruction


In [9]:
# Re-key the per-book bookkeeping as {title: (first_index, last_index)} and
# store it as JSON inside output_dir.
start_end_dict = {title: (first, last) for title, first, last in start_end}
with open(output_dir + '/book_start_end.json', 'w') as out_file:
    json.dump(start_end_dict, out_file)

In [10]:
# Sanity check: report how many cleaned sentences were collected across all books.
print("%d sentences total" % len(all_sentences))


333134 sentences total


In [12]:
print("Creating dictionary...")
# Build the gensim Dictionary over all cleaned sentences.
id2word = corpora.Dictionary(all_sentences)
# Persist the dictionary inside output_dir so it can be reused later.
id2word.save(output_dir + '/dictionary.dict')

Creating dictionary...


In [13]:
print("Getting term-document frequencies...")
# Run every cleaned sentence through the dictionary's doc2bow, keeping the
# results in the same order as all_sentences.
corpus = list(map(id2word.doc2bow, all_sentences))

Getting term-document frequencies...


In [22]:
def get_topics(num, corpus, id2word, output_dir, all_sentences):
    """Train a MALLET LDA model, score its coherence, and persist the results.

    Relies on the notebook-level constants ``MALLET`` (path to the MALLET
    executable) and ``RANDOM_SEED``.

    Args:
        num: number of topics to fit.
        corpus: the corpus passed to ``LdaMallet`` (built in this notebook
            with ``id2word.doc2bow``).
        id2word: the gensim ``Dictionary`` used to build ``corpus``.
        output_dir: directory used as MALLET's file prefix and as the
            destination for ``topic_names.json`` and ``model.mallet``.
            It is created if it does not exist yet.
        all_sentences: the cleaned sentences the corpus was built from;
            passed to ``CoherenceModel`` as the reference texts.

    Returns:
        The ``c_v`` coherence score of the trained model.
    """
    print(num)
    # MALLET and the writes below all target output_dir, so it must exist.
    os.makedirs(output_dir, exist_ok=True)
    ldamallet = LdaMallet(MALLET,
                          corpus=corpus,
                          num_topics=num,
                          prefix=output_dir + "/",
                          workers=1,   # workers has to be 1, otherwise you get a java out of bounds exception
                          id2word=id2word,
                          iterations=1000,
                          random_seed=RANDOM_SEED,
                          alpha=5,
                          optimize_interval=20)
    coherence_model_ldamallet = CoherenceModel(model=ldamallet,
                                               texts=all_sentences,
                                               dictionary=id2word,
                                               coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    print('\nCoherence Score: ', coherence_ldamallet)
    # Label each topic with a comma-separated list of the words show_topic returns for it.
    keywords = {i: ", ".join([word for word, prop in ldamallet.show_topic(i)])
                for i in range(ldamallet.num_topics)}
    with open(output_dir + "/topic_names.json", 'w') as f:
        f.write(json.dumps(keywords))
    ldamallet.save(output_dir + "/model.mallet")
    return coherence_ldamallet

In [23]:
print("Running topic model with %d topics..." % NUM_TOPICS)
# As the cell's last expression, the returned coherence score is shown as the cell output.
get_topics(NUM_TOPICS, corpus, id2word, output_dir, all_sentences)

Running topic model with 50 topics...
50

Coherence Score:  0.6760558828930116


0.6760558828930116

In [27]:
# Reload the trained model from output_dir + "/model.mallet", the path get_topics() saves to.
ldamallet = LdaMallet.load(output_dir + "/model.mallet")

In [28]:
# Display all NUM_TOPICS topics as formatted 'weight*"word" + ...' strings.
ldamallet.show_topics(num_topics=NUM_TOPICS, formatted=True)

[(0,
  '0.049*"percent" + 0.047*"million" + 0.039*"year" + 0.030*"american" + 0.027*"popul" + 0.022*"number" + 0.014*"half" + 0.014*"rate" + 0.013*"increas" + 0.013*"peopl"'),
 (1,
  '0.025*"american" + 0.018*"peopl" + 0.018*"freedom" + 0.014*"liberti" + 0.012*"govern" + 0.010*"idea" + 0.010*"equal" + 0.009*"individu" + 0.009*"nation" + 0.008*"independ"'),
 (2,
  '0.036*"peopl" + 0.011*"man" + 0.010*"time" + 0.010*"make" + 0.008*"countri" + 0.008*"good" + 0.007*"live" + 0.007*"person" + 0.007*"thing" + 0.006*"made"'),
 (3,
  '0.108*"slave" + 0.041*"black" + 0.032*"white" + 0.027*"african" + 0.026*"slaveri" + 0.024*"free" + 0.017*"southern" + 0.017*"south" + 0.015*"american" + 0.013*"enslav"'),
 (4,
  '0.044*"south" + 0.035*"cotton" + 0.030*"slave" + 0.026*"planter" + 0.023*"southern" + 0.017*"plantat" + 0.015*"carolina" + 0.013*"farmer" + 0.012*"labor" + 0.012*"tobacco"'),
 (5,
  '0.018*"kill" + 0.016*"soldier" + 0.014*"die" + 0.012*"thousand" + 0.011*"peopl" + 0.011*"diseas" + 0.010*"