In [4]:
% matplotlib inline
from __future__ import division
import matplotlib.pyplot as plt
import seaborn as sns
import gensim.corpora as corpora
from gensim.models.wrappers import LdaMallet
from gensim.models import CoherenceModel
from helpers import *
import nltk
import json
import numpy as np
import operator
import pandas as pd
from scipy.stats import pearsonr
from scipy.stats import linregress
import matplotlib.gridspec as gridspec

In [5]:
import os

# Number of LDA topics; also selects the output sub-directory below.
NUM_TOPICS = 50
# Path to the MALLET launcher script. Set the MALLET_PATH environment variable
# to override the machine-specific default without editing the notebook.
MALLET = os.environ.get("MALLET_PATH", "/Users/ddemszky/mallet-2.0.8/bin/mallet")
# Passed to LdaMallet as random_seed.
RANDOM_SEED = 42
# Directory that receives the dictionary, model and MALLET output files.
output_dir = "topics/gensim_" + str(NUM_TOPICS)

In [9]:
# Load the textbooks through the project helper. The next cell iterates
# `books.items()` as (title, text) pairs.
# NOTE(review): splitlines=False presumably keeps each book as one string —
# confirm in helpers.get_book_txts.
books = get_book_txts("data/coref_resolved_txts", splitlines=False)

Getting books...
America_A_Narrative_History_WWNorton_10th
America_Past_And_Present_Pearson_10th
Americas_History_Bedford_8th
Give_Me_Liberty_An_American_History_WWNorton_3rd
The_American_Pageant_Cengage_14th
The_Unfinished_Nation_A_Concise_History_of_the_American_People_McGraw-Hill_8th
Visions_of_America_A_History_of_the_United_States_Pearson_2nd
american_history_connecting_with_the_past
by_the_people
history_alive_united_states_thru_industrialism
hmh_the_americans_us_history_since_1877
mastering_the_teks
pearson_us_history
teks_us_history
us_history_early_colonial_period_through_reconstruction


In [10]:
print("Cleaning and combining texts...")
all_sentences = []   # cleaned sentences (as returned by clean_text), all books concatenated
start_end = []       # (title, first_index, last_index) into all_sentences, both inclusive
prev = 0
for title, book in books.items():
    print(title)
    sents = nltk.sent_tokenize(book)
    start = prev
    for s in sents:
        # Fragments shorter than 15 characters are dropped entirely.
        if len(s) < 15:
            continue
        all_sentences.append(clean_text(s, stem=True, remove_short=True))
    # Derive the end index from the sentences actually kept. Using len(sents)
    # would also count the skipped fragments, so every later book's range
    # would drift away from its rows in all_sentences (and in anything indexed
    # the same way, such as the per-document topic rows).
    end = len(all_sentences) - 1
    start_end.append((title, start, end))
    prev = end + 1

Cleaning and combining texts...
America_A_Narrative_History_WWNorton_10th
America_Past_And_Present_Pearson_10th
Americas_History_Bedford_8th
Give_Me_Liberty_An_American_History_WWNorton_3rd
The_American_Pageant_Cengage_14th
The_Unfinished_Nation_A_Concise_History_of_the_American_People_McGraw-Hill_8th
Visions_of_America_A_History_of_the_United_States_Pearson_2nd
american_history_connecting_with_the_past
by_the_people
history_alive_united_states_thru_industrialism
hmh_the_americans_us_history_since_1877
mastering_the_teks
pearson_us_history
teks_us_history
us_history_early_colonial_period_through_reconstruction


In [11]:
# Re-key the (title, start, end) triples by title and store them as JSON
# in the output directory.
start_end_dict = {title: (first, last) for title, first, last in start_end}
with open(output_dir + '/book_start_end.json', 'w') as f:
    json.dump(start_end_dict, f)

In [12]:
print("%d sentences total" % len(all_sentences))


316124 sentences total


In [24]:
print("Creating dictionary...")
# Build the gensim dictionary from every cleaned sentence.
id2word = corpora.Dictionary(all_sentences)
# Save it in the output directory.
id2word.save(output_dir + '/dictionary.dict')

Creating dictionary...


In [25]:
print("Getting term-document frequencies...")
# Bag-of-words encode each cleaned sentence with the dictionary.
corpus = list(map(id2word.doc2bow, all_sentences))

Getting term-document frequencies...


In [26]:
def get_topics(num, corpus, id2word, output_dir, all_sentences):
    """Train a MALLET LDA model, score its coherence, and save its artifacts.

    Args:
        num: Number of topics to fit.
        corpus: Bag-of-words corpus handed to LdaMallet.
        id2word: gensim dictionary matching ``corpus``.
        output_dir: Directory used (with a trailing "/") as the LdaMallet file
            prefix; ``topic_names.json`` and ``model.mallet`` are written there too.
        all_sentences: Texts passed to the coherence model.

    Returns:
        The ``c_v`` coherence score of the fitted model.
    """
    print(num)
    ldamallet = LdaMallet(MALLET,
                          corpus=corpus,
                          num_topics=num,
                          prefix=output_dir + "/",
                          workers=1,   # workers has to be 1, otherwise you get a java out of bounds exception
                          id2word=id2word,
                          iterations=1000,
                          random_seed=RANDOM_SEED,
                          alpha=5)
    coherence_model_ldamallet = CoherenceModel(model=ldamallet,
                                               texts=all_sentences,
                                               dictionary=id2word,
                                               coherence='c_v')
    coherence_ldamallet = coherence_model_ldamallet.get_coherence()
    print('\nCoherence Score: ', coherence_ldamallet)

    # Label every topic with its top words joined by ", ". The integer keys
    # become strings once serialized to JSON.
    keywords = {i: ", ".join(word for word, _ in ldamallet.show_topic(i))
                for i in range(ldamallet.num_topics)}
    with open(output_dir + "/topic_names.json", 'w') as f:
        json.dump(keywords, f)
    ldamallet.save(output_dir + "/model.mallet")
    # The former trailing ldamallet.show_topics(...) call was removed: its
    # return value was discarded, so it had no visible effect inside a function.
    return coherence_ldamallet

In [27]:
print("Running topic model with %d topics..." % NUM_TOPICS)
# Trains the model and writes its files under output_dir. As the cell's last
# expression, the returned coherence score is displayed.
get_topics(NUM_TOPICS, corpus, id2word, output_dir, all_sentences)

Running topic model with 50 topics...
50

Coherence Score:  0.6697524666698742


0.6697524666698742

In [18]:
# Reload the model file that get_topics() saved (presumably so the cells below
# can run without retraining).
ldamallet = LdaMallet.load(output_dir + "/model.mallet")

In [19]:
# Rebuild the c_v coherence model for the reloaded LDA model.
# NOTE(review): none of the visible cells calls get_coherence() on this object.
coherence_model_ldamallet = CoherenceModel(model=ldamallet,
                                               texts=all_sentences,
                                               dictionary=id2word,
                                               coherence='c_v')

In [None]:
ldamallet.show_topics(num_topics=NUM_TOPICS, formatted=True)

In [None]:
# Per-document topic rows, one text line per document (presumably written by
# MALLET under the output prefix — confirm).
doc_topic_file = output_dir + '/doctopics.txt'
# Read inside a context manager so the file handle is closed promptly.
with open(doc_topic_file) as f:
    doc_topics = f.read().splitlines()
print(len(doc_topics), 'articles total')

In [None]:
# Topic id (as a string key) -> comma-separated keyword label.
# Opened with a context manager so the handle is not left dangling.
with open(output_dir + '/topic_names.json', 'r') as f:
    topic_names = json.load(f)

In [None]:
topic_names

### Prominence as measured by topic keys

In [None]:
# Read the topic-keys file with a context manager so the handle is closed.
with open(output_dir + '/topickeys.txt') as f:
    topic_keys = f.read().splitlines()
# Each line is whitespace-separated: field 0 is the topic id, field 1 a numeric weight.
topic2weight = {}
for t in topic_keys:
    fields = t.split()   # split once instead of once per field
    topic2weight[topic_names[fields[0]]] = float(fields[1])

In [None]:
# (label, weight) pairs ordered from the largest weight to the smallest.
sorted_topics = sorted(topic2weight.items(), key=lambda pair: pair[1], reverse=True)

In [None]:
sorted_topics

### Prominence as measured by averaging the topic matrix

In [None]:
# Parse every doc-topics row into floats, skipping its first two tab-separated fields.
doc_topic_mat = np.array([
    [float(value) for value in row.strip().split("\t")[2:]]
    for row in doc_topics
])

In [None]:
doc_topic_mat.shape

In [None]:
# Column means over all documents, computed once; the original recomputed the
# full-matrix mean on every loop iteration.
topic_means = doc_topic_mat.mean(axis=0)
for t in topic_means.argsort()[::-1]:
    print(topic_means[t], topic_names[str(t)])

### Prominence based on averaging across books

In [167]:
# Book titles in the order used to index book_means.
titles = [
    "America_A_Narrative_History_WWNorton_10th",
    "America_Past_And_Present_Pearson_10th",
    "american_history_connecting_with_the_past",
    "Americas_History_Bedford_8th",
    "by_the_people",
    "Give_Me_Liberty_An_American_History_WWNorton_3rd",
    "history_alive_united_states_thru_industrialism",
    "hmh_the_americans_us_history_since_1877",
    "mastering_the_teks",
    "pearson_us_history",
    "teks_us_history",
    "The_American_Pageant_Cengage_14th",
    "The_Unfinished_Nation_A_Concise_History_of_the_American_People_McGraw-Hill_8th",
    "us_history_early_colonial_period_through_reconstruction",
    "Visions_of_America_A_History_of_the_United_States_Pearson_2nd",
]

In [None]:
#titles = [t for t in titles if not ('early' in t or 'industr' in t or 'since' in t)]

In [None]:
# Mean topic vector per book, in the same order as `titles`.
book_means = []
for title in titles:
    start, end = start_end_dict[title]
    # `end` is the inclusive index of the book's last sentence, so the slice
    # has to run to end + 1; doc_topics[start:end] dropped that last sentence.
    doc_topics_book = doc_topics[start:end + 1]
    book_rows = np.array([[float(n) for n in l.strip().split("\t")[2:]]
                          for l in doc_topics_book])
    book_means.append(book_rows.mean(axis=0))

In [None]:
# Average the per-book means once (every book weighted equally) instead of
# rebuilding the array and its mean on each iteration.
overall_means = np.array(book_means).mean(axis=0)
for t in overall_means.argsort()[::-1]:
    print(overall_means[t], topic_names[str(t)])

In [None]:
def get_topic_for_doc(doc_id, printout=True):
    """Count the topics whose value exceeds 0.1 for one document.

    Looks ``doc_id`` up in the module-level ``all_sentences`` and
    ``doc_topics``. When ``printout`` is true, prints the document, then
    "Topics:" and the label of every qualifying topic.

    Returns:
        The number of qualifying topics.
    """
    sentence = all_sentences[doc_id]
    if printout:
        print(sentence)
    # Skip the first two whitespace-separated fields of the row; the rest are
    # the per-topic values.
    shares = doc_topics[doc_id].strip().split()[2:]
    prominent = {idx for idx, share in enumerate(shares) if float(share) > 0.1}
    if not printout:
        return len(prominent)
    print("Topics:")
    for idx in prominent:
        print(topic_names[str(idx)])
    return len(prominent)

In [None]:
get_topic_for_doc(9)

In [None]:
def get_topic_for_word(word):
    """Find the topics whose keyword label contains ``word`` as a substring.

    Prints the id and label of each match from the module-level
    ``topic_names`` and returns the matching ids as a list of ints.
    """
    matches = []
    for topic_id, label in topic_names.items():
        if word not in label:
            continue
        print(topic_id, label)
        matches.append(int(topic_id))
    return matches

In [None]:
get_topic_for_word("wom")

In [None]:
# The trailing comma ties the match to the end of a keyword (labels join
# keywords with ", "). NOTE(review): the lookup is a plain substring test, so
# this would still match any keyword that merely ends in "men", such as "women".
get_topic_for_word("men,")

### By most prominent topic(s)

In [None]:
# For each topic, the number of documents where its value exceeds 0.1.
topic_counts = np.array(
    [(doc_topic_mat[:, topic_idx] > 0.1).sum() for topic_idx in range(NUM_TOPICS)],
    dtype=float,
)

In [None]:
topic_counts

In [None]:
# List topics from the highest document count to the lowest.
ranking = topic_counts.argsort()[::-1]
for topic_idx in ranking:
    print(topic_counts[topic_idx], topic_names[str(topic_idx)])

### Topic ratios

In [None]:
# Lookups provided by project helpers, requested in their abbr=True variants.
# NOTE(review): presumably per-book plot markers and colors — confirm in helpers.
shape_mapper = get_shapes(abbr=True)
color_mapper = get_colors(abbr=True)
# Indexed by full book title in the ratio functions below.
title_abbr = get_title_abbr()

In [None]:
shape_mapper

In [None]:
def get_ratio_of_topic_prominence(topic1, topic2, name1, name2):
    """Per-book ratio of one topic's mean value to another's.

    Args:
        topic1: Index of the numerator topic in each ``book_means`` vector.
        topic2: Index of the denominator topic.
        name1: Accepted but not used by the body.
        name2: Accepted but not used by the body.

    Returns:
        DataFrame with columns 'Book' (``title_abbr`` entry), 'ratio' and
        'dem' (``dem_per_book`` entry), one row per retained book.
    """
    partial_markers = ('early', 'industr', 'since')
    rows = []
    for idx, title in enumerate(titles):
        # skip books that only cover half of US history
        if any(marker in title for marker in partial_markers):
            continue
        rows.append((
            title_abbr[title],
            dem_per_book[title],
            book_means[idx][topic1] / book_means[idx][topic2],
        ))
    return pd.DataFrame({
        'Book': [abbr for abbr, _, _ in rows],
        'ratio': [ratio for _, _, ratio in rows],
        'dem': [dem for _, dem, _ in rows],
    })

In [None]:
get_topic_for_word("slaveri")

In [None]:
get_topic_for_word("milit")

In [None]:
# Per-book 'dem' values, looked up by full book title in the ratio functions.
with open('data/dem_per_book.json', 'r') as f:
    dem_per_book = json.loads(f.read())

In [None]:
name1 = "Slavery"
name2 = "Military"
# NOTE(review): 29 and 45 are hard-coded topic ids, presumably read off the
# get_topic_for_word("slaveri") / get_topic_for_word("milit") probes above;
# re-check them whenever the model is retrained.
slavery_df = get_ratio_of_topic_prominence(29, 45, name1, name2)

In [None]:
def get_ratio_of_topic_group_prominence(topics_1, topics_2):
    """Per-book ratio between the summed mean values of two topic groups.

    Books whose title contains 'early', 'industr' or 'since' are skipped.

    Args:
        topics_1: Topic indices whose book means form the numerator sum.
        topics_2: Topic indices whose book means form the denominator sum.

    Returns:
        DataFrame with columns 'Book', 'ratio', 'dem', 'min_ratio' and
        'max_ratio'. The last two are the smallest and largest ratio over
        every single-topic pairing (one topic from each group).
    """
    skip_markers = ('early', 'industr', 'since')
    columns = {'Book': [], 'ratio': [], 'dem': [], 'min_ratio': [], 'max_ratio': []}
    for idx, title in enumerate(titles):
        if any(marker in title for marker in skip_markers):
            continue
        columns['Book'].append(title_abbr[title])
        columns['dem'].append(dem_per_book[title])

        means = book_means[idx]
        numerator = np.sum([means[a] for a in topics_1])
        denominator = np.sum([means[b] for b in topics_2])
        columns['ratio'].append(numerator / denominator)

        # Range of the ratio across all one-to-one topic pairings.
        pairwise = [means[a] / means[b] for a in topics_1 for b in topics_2]
        columns['min_ratio'].append(min(pairwise))
        columns['max_ratio'].append(max(pairwise))

    return pd.DataFrame(columns)

In [None]:
# Compare the combined prominence of topics whose label contains "women"
# with that of topics whose label contains "presid" (substring matches).
women_topics = get_topic_for_word("women")
president_topics = get_topic_for_word("presid")
women_df = get_ratio_of_topic_group_prominence(women_topics, president_topics)

In [None]:
women_df