# APWA Latent Topic Analysis

### Import necessary libraries

In [34]:
import chardet
import csv
import gensim
import logging
import nltk
import os
import pickle
import string

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from itertools import cycle
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import corpora
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

from sklearn.feature_extraction.text import TfidfVectorizer


### Load essays into hash table

In [2]:
root = os.path.dirname(os.path.realpath('__file__'))
essay_path = root + '/../essays/'


In [3]:
files = os.listdir(essay_path)

essays = {}
for file in files:
    # attempt to confidently guess encoding; otherwise, default to ISO-8859-1
    encoding = "ISO-8859-1"
    guess = chardet.detect(open(essay_path + file, "rb").read())
    if (guess["confidence"] >= 0.95):
        encoding = guess["encoding"]
    
    with open(essay_path + file, "r", encoding=encoding) as f:
        essays[file] = f.read()


### Tokenize and preprocess

In [4]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
tokenized_essays = {label: gensim.utils.simple_preprocess(corpus, deacc=True, min_len=2, max_len=20) for (label, corpus) in essays.items()}


In [6]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

lemmatizer = nltk.stem.WordNetLemmatizer()
tokenized_essays = {label: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in token_lst if w not in string.punctuation] for (label, token_lst) in tokenized_essays.items()}


In [7]:
english_stopwords = nltk.corpus.stopwords.words('english')
custom_stopwords = open("custom_stopwords.txt", "r").read().splitlines()
tokenized_essays = {label: [w for w in token_lst if (w not in english_stopwords and w not in custom_stopwords)] for (label, token_lst) in tokenized_essays.items()}


### Vectorize with doc2vec

In [8]:
documents = [TaggedDocument(doc, [i]) for i, doc in enumerate(tokenized_essays.values())]
d2v_model = Doc2Vec(documents, vector_size=100)


2019-10-28 19:11:37,658 : INFO : collecting all words and their counts
2019-10-28 19:11:37,660 : INFO : PROGRESS: at example #0, processed 0 words (0/s), 0 word types, 0 tags
2019-10-28 19:11:38,173 : INFO : collected 57681 word types and 1573 unique tags from a corpus of 1573 examples and 963713 words
2019-10-28 19:11:38,174 : INFO : Loading a fresh vocabulary
2019-10-28 19:11:38,583 : INFO : effective_min_count=5 retains 13711 unique words (23% of original 57681, drops 43970)
2019-10-28 19:11:38,584 : INFO : effective_min_count=5 leaves 902048 word corpus (93% of original 963713, drops 61665)
2019-10-28 19:11:38,645 : INFO : deleting the raw counts dictionary of 57681 items
2019-10-28 19:11:38,650 : INFO : sample=0.001 downsamples 9 most-common words
2019-10-28 19:11:38,650 : INFO : downsampling leaves estimated 896167 word corpus (99.3% of prior 902048)
2019-10-28 19:11:38,762 : INFO : estimated required memory for 13711 words and 100 dimensions: 18453500 bytes
2019-10-28 19:11:38,7

# LDA before clustering

In [87]:
essays_matt = list(tokenized_essays.values())

dictionary_matt = corpora.Dictionary(essays_matt)

corpus_matt = [dictionary.doc2bow(essay) for essay in essays_matt]

# m2 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=2, passes=10)
# m3 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=3, passes=10)
# m4 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=4, passes=10)
# m5 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=5, passes=10)
# m6 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=6, passes=10)
# m7 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=7, passes=10)
# m8 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=8, passes=10)
# m9 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=9, passes=10)
# m10 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=10, passes=10)
m11 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=11, passes=10)
# m12 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=12, passes=10)
# m13 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=13, passes=10)
# m14 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=14, passes=10)
# m15 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=15, passes=10)
# m16 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=16, passes=10)
# m17 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=17, passes=10)
# m18 = models.ldamodel.LdaModel(corpus=corpus_matt, id2word=dictionary, num_topics=18, passes=10)


2019-10-28 21:17:09,853 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-10-28 21:17:11,239 : INFO : built Dictionary(57681 unique tokens: ['aggressive', 'air', 'anger', 'anyone', 'anything']...) from 1573 documents (total 963713 corpus positions)
2019-10-28 21:17:12,314 : INFO : using symmetric alpha at 0.09090909090909091
2019-10-28 21:17:12,316 : INFO : using symmetric eta at 0.09090909090909091
2019-10-28 21:17:12,328 : INFO : using serial LDA version on this node
2019-10-28 21:17:12,363 : INFO : running online (multi-pass) LDA training, 11 topics, 10 passes over the supplied corpus of 1573 documents, updating model once every 1573 documents, evaluating perplexity every 1573 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-28 21:17:17,445 : INFO : -10.446 per-word bound, 1395.0 perplexity estimate based on a held-out corpus of 1573 documents with 852913 words
2019-10-28 21:17:17,446 : INFO : PROGRESS: pass 0, at document #1573/1573
2019-1

2019-10-28 21:17:48,521 : INFO : topic #8 (0.091): 0.006*"inmate" + 0.005*"officer" + 0.004*"state" + 0.004*"unit" + 0.004*"write" + 0.004*"staff" + 0.003*"money" + 0.003*"guard" + 0.003*"food" + 0.003*"yard"
2019-10-28 21:17:48,523 : INFO : topic #7 (0.091): 0.007*"state" + 0.007*"system" + 0.005*"law" + 0.004*"right" + 0.004*"staff" + 0.003*"court" + 0.003*"tha" + 0.003*"black" + 0.003*"justice" + 0.003*"criminal"
2019-10-28 21:17:48,525 : INFO : topic #9 (0.091): 0.009*"sentence" + 0.008*"crime" + 0.008*"state" + 0.007*"program" + 0.007*"offender" + 0.006*"parole" + 0.005*"system" + 0.005*"inmate" + 0.005*"law" + 0.005*"family"
2019-10-28 21:17:48,527 : INFO : topic diff=0.403387, rho=0.377964
2019-10-28 21:17:52,369 : INFO : -8.137 per-word bound, 281.5 perplexity estimate based on a held-out corpus of 1573 documents with 852913 words
2019-10-28 21:17:52,370 : INFO : PROGRESS: pass 6, at document #1573/1573
2019-10-28 21:17:54,249 : INFO : topic #9 (0.091): 0.010*"sentence" + 0.008

In [85]:
cm = CoherenceModel(model=m11, texts=essays_matt, coherence='c_v')
coherence = cm.get_coherence()
coherence


2019-10-28 20:54:27,909 : INFO : using ParallelWordOccurrenceAccumulator(processes=3, batch_size=64) to estimate probabilities from sliding windows
2019-10-28 20:54:27,993 : INFO : 1 batches submitted to accumulate stats from 64 documents (38321 virtual)
2019-10-28 20:54:28,043 : INFO : 2 batches submitted to accumulate stats from 128 documents (72346 virtual)
2019-10-28 20:54:28,097 : INFO : 3 batches submitted to accumulate stats from 192 documents (104099 virtual)
2019-10-28 20:54:28,194 : INFO : 4 batches submitted to accumulate stats from 256 documents (137126 virtual)
2019-10-28 20:54:28,262 : INFO : 5 batches submitted to accumulate stats from 320 documents (177345 virtual)
2019-10-28 20:54:28,318 : INFO : 6 batches submitted to accumulate stats from 384 documents (216371 virtual)
2019-10-28 20:54:30,195 : INFO : 7 batches submitted to accumulate stats from 448 documents (250140 virtual)
2019-10-28 20:54:30,295 : INFO : 8 batches submitted to accumulate stats from 512 documents 

0.41437948400493707

### some notes
2: 0.3992337026308589
3: 0.371993080904979
4: 0.40025270112726735
5: 0.4174421259723099
6: 0.37134097989863984
7: 0.42337526139475695
8: 0.39162248222910934
9: 0.379028631160457
10: 0.4137827965771722
11: 0.4500098234881914
12: 0.4318264581457793
13: 0.4127596451219076
14: 0.4039530315591046
15: 0.4117885886817264
16: 0.41180308515646974
17: 0.41060383798197586
18: 0.4321508008523154


In [9]:
vectorized_df = pd.DataFrame(d2v_model.docvecs.vectors_docs)
vectorized_df


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
0,0.147509,-0.014784,-0.011532,0.024092,0.127395,0.085249,0.138889,0.096642,-0.029476,-0.004723,...,0.008632,-0.047249,-0.017429,-0.007457,0.062582,0.022023,0.085102,0.196062,0.048372,-0.250067
1,0.336653,0.249148,-0.043525,0.706586,-0.233412,0.320358,0.295057,-0.073638,0.076726,-0.360512,...,0.013873,0.582860,-0.063119,-0.274440,-0.174419,0.134983,0.172573,-0.251397,0.078798,-0.403040
2,0.265048,0.034929,-0.143668,0.082906,-0.179479,0.300218,0.028328,0.027059,-0.208596,-0.336358,...,0.087104,0.156282,0.074614,-0.152939,-0.126736,0.070535,0.031246,0.155973,0.047020,-0.156064
3,0.255031,0.146072,-0.028686,0.045554,0.538038,-0.046723,0.412369,0.086602,0.085714,0.103039,...,0.001472,-0.231739,-0.091940,-0.061719,0.060482,0.107179,0.110714,0.380833,0.094955,-0.390728
4,0.213440,-0.071343,-0.144156,-0.248104,0.478187,-0.186624,0.323790,0.397954,0.111181,0.156982,...,-0.130110,-0.255869,0.064748,-0.137551,0.240025,0.097368,0.145716,0.605082,0.207671,-0.496480
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1568,0.344591,0.289569,0.018266,0.122705,-0.734210,0.738716,0.212049,-0.188311,-0.145424,-0.807569,...,0.100896,0.090312,0.048385,-0.260574,-0.413471,0.172159,0.087545,0.064364,-0.199121,-0.041960
1569,0.167273,0.163935,-0.092415,-0.060433,0.213361,-0.010032,0.184890,0.044825,0.166070,-0.033748,...,0.032203,-0.115270,0.050044,-0.100978,0.039179,0.094837,0.095134,0.153526,0.131756,-0.229616
1570,0.225892,0.031512,-0.134012,-0.058413,0.240788,-0.042440,0.290294,0.264335,0.151973,0.036415,...,-0.067169,-0.171892,0.011226,-0.155270,0.095842,0.093631,0.148598,0.370317,0.119133,-0.386080
1571,0.166018,0.086796,0.046333,0.045508,-0.083053,0.337955,0.130337,-0.031834,-0.177276,-0.198782,...,0.056631,-0.030734,-0.055120,-0.001568,-0.095165,0.070136,0.056693,0.085705,-0.054171,-0.165933


### Feature scaling through standardization

In [10]:
stdsclr = StandardScaler()
standardized_df = pd.DataFrame(stdsclr.fit_transform(vectorized_df.astype(float)))


# Principle component analysis

In [11]:
pca = PCA(n_components=3)
reduced_df = pd.DataFrame(pca.fit_transform(standardized_df))


# Output to visualize effectiveness of vectors

In [12]:
reduced_df.to_csv('new1.csv', sep='\t', index=False, header=False)


# Clustering w/ k-means 

In [13]:
num_clusters = 7

km = KMeans(n_clusters=num_clusters, init="k-means++", max_iter=100)

%time km.fit(reduced_df.values)

CPU times: user 139 ms, sys: 8.55 ms, total: 147 ms
Wall time: 183 ms


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
       n_clusters=7, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

# Essays per cluster / Theme(s) per cluster

In [18]:
output = reduced_df
output['cluster'] = km.labels_
output['essay'] = tokenized_essays.values()
output['title'] = tokenized_essays.keys()

output['cluster'].value_counts()

0    570
3    341
5    336
4     94
2     85
6     80
1     67
Name: cluster, dtype: int64

In [19]:
theme_index = {
                'Autobiographical, Paths to Prison': 'parent, damage, abuse, gang, alcohol, drug, neighborhood, hood, youth, pressure, fit, broken', 
                'Family': 'family, abandonment, relation, visit, partner, mother, father, sibling, child, wife, husband, abuse, support', 
                'Physical Conditions and Security': 'physical, condition, security, search, censorship, food, cold, hygiene, heat, misfunction, infestation, solitary, strip, search, fear, filth, violence, staff, abuse', 
                'Prison Culture/Community/Society': 'violence, fear, staff, sexual, crime, outcasts, racial, cellmate, gay, LGBTQ, dehumanize, uniform, pecking, order, hierarchy, solid, dirty, skin, chomo',
                'Staff/prison Abuse of IP': 'abuse, sexual, torture, humiliation, racist, assault, antagonism, exacerbation, right, violation, food, hygiene, environment, legal, extraction, search, taunt',
                'Personal/Intern Change/Coping': 'survival, art, reading, writing, peace, faith, prayer, meditation, practice, community, activities, hobbies, cooking, remorse, motivation, education, discipline, coping, adjustment, responsibility, god, redemption, transformation',
                'Judicial Misconduct and Legal Remediation': 'judicial, incompetence, corruption, witness, evidence, excessive, political, jailhouse, lawyer, misconduct, unfair, pretender, plra, plea, grievance',
                'Political and Intellectual Labor among IP': 'activism, resistence, critique, race, class, change, policies, practices, write, organize, strike, solidarity',
                'Prison Industry/Prison as Business': 'labor, slave, condition, safety, health, profit, job, budget, tax, taxpayer, exploitation, corruption, mismanagement, nepotism',
                'Education, Re-entry, Other Programs': 'rehabilitation, entry, education, indifference, college, vocation',
                'Health Care': 'health, care, negligence, hostility, incompetence, indifference, death, injury, treatment, medication',
                'Social Alienation, Indifference, Hostility': 'public, misperception, identity, stigma'
              }

In [28]:
from __future__ import print_function
import random

order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

print("***** Random sample essays per cluster *****")
print()

terms_per_essays = {}
themes_per_essays = {}
for i in range(num_clusters):
    
    ten = [random.choice(output[output.cluster == i]['title'].values.tolist()) for _ in range(10)]
    print()
    print("** Cluster %d: **\n" % i)
    
    context = []
    for lst in [tokenized_essays[title] for title in ten]:
        context += lst
    
    tfidf_vectorizer = TfidfVectorizer(max_features=50)
    tfidf_vectorizer.fit(context)
    
    terms_per_essays[i] = tfidf_vectorizer.get_feature_names()
    
    print("Top terms #{} : {}".format(i, terms_per_essays[i]))
    print()
    
    themes_per_essays[i] = set()
    theme_term_count = {}
    for term in terms_per_essays[i]:
        for key, value in theme_index.items():
            if term in value:
                if key in theme_term_count:
                    theme_term_count[key] += 1
                else:
                    theme_term_count[key] = 1
                themes_per_essays[i].add(key)
    
    themes_per_essays[i] = [theme for theme, count in theme_term_count.items() if count > 1]
    print("Dominant theme(s): {}".format(themes_per_essays[i]))
    print()
    print(sorted(theme_term_count.items(), key=lambda x : x[1])[::-1])
    
    print()
    print("Randomly selected essays:", ten)
    print()


***** Random sample essays per cluster *****


** Cluster 0: **

Top terms #0 : ['african', 'air', 'alone', 'along', 'black', 'business', 'condition', 'consciousness', 'custody', 'deal', 'emotional', 'environment', 'everyday', 'exercise', 'existence', 'experience', 'extreme', 'family', 'force', 'free', 'house', 'inmate', 'intelligence', 'level', 'market', 'move', 'nature', 'official', 'order', 'outside', 'pm', 'policy', 'reflection', 'roughly', 'sentence', 'set', 'side', 'speak', 'state', 'strong', 'symbolic', 'system', 'talk', 'tax', 'think', 'threat', 'undifferentiated', 'value', 'victim', 'wrong']

Dominant theme(s): ['Judicial Misconduct and Legal Remediation', 'Prison Industry/Prison as Business']

[('Prison Industry/Prison as Business', 2), ('Judicial Misconduct and Legal Remediation', 2), ('Prison Culture/Community/Society', 1), ('Family', 1), ('Staff/prison Abuse of IP', 1), ('Physical Conditions and Security', 1)]

Randomly selected essays: ['apw_12347608.txt', 'apw_12343099.t

# Save model/Load from pickle

In [30]:
filename = 'word2vec_cluster.pkl'
pickle.dump(km, open(filename, 'wb'))

# LDA

In [31]:
terms_per_cluster = {}
themes_per_cluster = {}

#Apply LDA for each cluster (and for each essay in cluster)
for i in range(num_clusters):
    print("************ Cluster # {} ************".format(i))
    essays_in_cluster = [tokens for tokens in list(output[output.cluster == i].essay)]
    
    dictionary = corpora.Dictionary(essays_in_cluster)
    
    cluster_corpus = [dictionary.doc2bow(essay) for essay in essays_in_cluster]
    
    lda = models.ldamodel.LdaModel(corpus=cluster_corpus, id2word=dictionary, num_topics=1, passes=10)
    
    term_score = {}
    terms_per_cluster[i] = []
    for idx, terms in lda.print_topics(i, 20):
        #terms_per_cluster[i] = terms
        
        print('Top terms: {}'.format(terms.split('+')))
        print()
        print()
        
        for term_with_score in terms.split('+'):
            term = term_with_score.split('*')[1][1:-2]
            score = term_with_score.split('*')[0]
            
            #print("term is {} with score {}".format(term, score))
            
            terms_per_cluster[i].append(term)
            term_score[term] = float(score)
    
    print("Terms per cluster:", terms_per_cluster)
    print()
    print("Terms/Scores:", term_score)
    print()
    
    themes_per_cluster[i] = {}
    theme_term_score = {}
    for term in terms_per_cluster[i]:
        for theme, defining_terms in theme_index.items():
            if term in defining_terms:
                if theme in theme_term_score:
                    theme_term_score[theme] += term_score[term]
                else:
                    theme_term_score[theme] = term_score[term]
                
    themes_per_cluster[i] = sorted(theme_term_score.items(), key = lambda x : x[1])[::-1]
    print("Themes ranked strongest to weakest: {}".format(themes_per_cluster[i]))
    print()

2019-10-28 19:27:39,185 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-10-28 19:27:39,386 : INFO : built Dictionary(14822 unique tokens: ['aggressive', 'air', 'anger', 'anyone', 'anything']...) from 570 documents (total 125193 corpus positions)


************ Cluster # 0 ************


2019-10-28 19:27:39,512 : INFO : using symmetric alpha at 1.0
2019-10-28 19:27:39,513 : INFO : using symmetric eta at 1.0
2019-10-28 19:27:39,517 : INFO : using serial LDA version on this node
2019-10-28 19:27:39,522 : INFO : running online (multi-pass) LDA training, 1 topics, 10 passes over the supplied corpus of 570 documents, updating model once every 570 documents, evaluating perplexity every 570 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-28 19:27:40,081 : INFO : -9.890 per-word bound, 948.9 perplexity estimate based on a held-out corpus of 570 documents with 125193 words
2019-10-28 19:27:40,082 : INFO : PROGRESS: pass 0, at document #570/570
2019-10-28 19:27:40,194 : INFO : topic #0 (1.000): 0.009*"inmate" + 0.005*"state" + 0.004*"system" + 0.004*"officer" + 0.003*"write" + 0.003*"right" + 0.003*"family" + 0.003*"help" + 0.003*"law" + 0.003*"society"
2019-10-28 19:27:40,194 : INFO : topic diff=1.208399, rho=1.000000
2019-10-28 19:27:40,746 : INFO : -

Top terms: ['0.009*"inmate" ', ' 0.005*"state" ', ' 0.004*"system" ', ' 0.004*"officer" ', ' 0.003*"write" ', ' 0.003*"right" ', ' 0.003*"family" ', ' 0.003*"help" ', ' 0.003*"law" ', ' 0.003*"society" ', ' 0.003*"think" ', ' 0.003*"sentence" ', ' 0.002*"crime" ', ' 0.002*"staff" ', ' 0.002*"feel" ', ' 0.002*"love" ', ' 0.002*"program" ', ' 0.002*"criminal" ', ' 0.002*"correctional" ', ' 0.002*"problem"']


Terms per cluster: {0: ['inmate', 'state', 'system', 'officer', 'write', 'right', 'family', 'help', 'law', 'society', 'think', 'sentence', 'crime', 'staff', 'feel', 'love', 'program', 'criminal', 'correctional', 'proble']}

Terms/Scores: {'inmate': 0.009, 'state': 0.005, 'system': 0.004, 'officer': 0.004, 'write': 0.003, 'right': 0.003, 'family': 0.003, 'help': 0.003, 'law': 0.003, 'society': 0.003, 'think': 0.003, 'sentence': 0.003, 'crime': 0.002, 'staff': 0.002, 'feel': 0.002, 'love': 0.002, 'program': 0.002, 'criminal': 0.002, 'correctional': 0.002, 'proble': 0.002}

Themes rank

2019-10-28 19:27:46,518 : INFO : using symmetric alpha at 1.0
2019-10-28 19:27:46,519 : INFO : using symmetric eta at 1.0
2019-10-28 19:27:46,523 : INFO : using serial LDA version on this node
2019-10-28 19:27:46,526 : INFO : running online (multi-pass) LDA training, 1 topics, 10 passes over the supplied corpus of 67 documents, updating model once every 67 documents, evaluating perplexity every 67 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-28 19:27:46,822 : INFO : -9.800 per-word bound, 891.2 perplexity estimate based on a held-out corpus of 67 documents with 112957 words
2019-10-28 19:27:46,823 : INFO : PROGRESS: pass 0, at document #67/67
2019-10-28 19:27:46,897 : INFO : topic #0 (1.000): 0.004*"society" + 0.004*"black" + 0.004*"think" + 0.004*"experience" + 0.003*"mind" + 0.003*"body" + 0.003*"social" + 0.003*"write" + 0.003*"state" + 0.003*"system"
2019-10-28 19:27:46,900 : INFO : topic diff=1.156554, rho=1.000000
2019-10-28 19:27:47,240 : INFO : -8.4

Top terms: ['0.004*"society" ', ' 0.004*"black" ', ' 0.004*"think" ', ' 0.004*"experience" ', ' 0.003*"mind" ', ' 0.003*"body" ', ' 0.003*"social" ', ' 0.003*"write" ', ' 0.003*"state" ', ' 0.003*"system" ', ' 0.003*"criminal" ', ' 0.003*"consciousness" ', ' 0.003*"learn" ', ' 0.002*"appear" ', ' 0.002*"human" ', ' 0.002*"behavior" ', ' 0.002*"process" ', ' 0.002*"problem" ', ' 0.002*"power" ', ' 0.002*"member"']


Terms per cluster: {0: ['inmate', 'state', 'system', 'officer', 'write', 'right', 'family', 'help', 'law', 'society', 'think', 'sentence', 'crime', 'staff', 'feel', 'love', 'program', 'criminal', 'correctional', 'proble'], 1: ['society', 'black', 'think', 'experience', 'mind', 'body', 'social', 'write', 'state', 'system', 'criminal', 'consciousness', 'learn', 'appear', 'human', 'behavior', 'process', 'problem', 'power', 'membe']}

Terms/Scores: {'society': 0.004, 'black': 0.004, 'think': 0.004, 'experience': 0.004, 'mind': 0.003, 'body': 0.003, 'social': 0.003, 'write': 0.00

2019-10-28 19:27:50,403 : INFO : using symmetric alpha at 1.0
2019-10-28 19:27:50,404 : INFO : using symmetric eta at 1.0
2019-10-28 19:27:50,410 : INFO : using serial LDA version on this node
2019-10-28 19:27:50,412 : INFO : running online (multi-pass) LDA training, 1 topics, 10 passes over the supplied corpus of 85 documents, updating model once every 85 documents, evaluating perplexity every 85 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-28 19:27:50,714 : INFO : -9.927 per-word bound, 973.5 perplexity estimate based on a held-out corpus of 85 documents with 110007 words
2019-10-28 19:27:50,714 : INFO : PROGRESS: pass 0, at document #85/85
2019-10-28 19:27:50,765 : INFO : topic #0 (1.000): 0.010*"state" + 0.008*"law" + 0.006*"court" + 0.006*"sentence" + 0.005*"crime" + 0.004*"system" + 0.004*"criminal" + 0.004*"parole" + 0.003*"offender" + 0.003*"right"
2019-10-28 19:27:50,766 : INFO : topic diff=1.118190, rho=1.000000
2019-10-28 19:27:51,052 : INFO : -8

Top terms: ['0.010*"state" ', ' 0.008*"law" ', ' 0.006*"court" ', ' 0.006*"sentence" ', ' 0.005*"crime" ', ' 0.004*"system" ', ' 0.004*"criminal" ', ' 0.004*"parole" ', ' 0.003*"offender" ', ' 0.003*"right" ', ' 0.003*"justice" ', ' 0.003*"inmate" ', ' 0.002*"federal" ', ' 0.002*"government" ', ' 0.002*"judge" ', ' 0.002*"program" ', ' 0.002*"child" ', ' 0.002*"public" ', ' 0.002*"issue" ', ' 0.002*"conviction"']


Terms per cluster: {0: ['inmate', 'state', 'system', 'officer', 'write', 'right', 'family', 'help', 'law', 'society', 'think', 'sentence', 'crime', 'staff', 'feel', 'love', 'program', 'criminal', 'correctional', 'proble'], 1: ['society', 'black', 'think', 'experience', 'mind', 'body', 'social', 'write', 'state', 'system', 'criminal', 'consciousness', 'learn', 'appear', 'human', 'behavior', 'process', 'problem', 'power', 'membe'], 2: ['state', 'law', 'court', 'sentence', 'crime', 'system', 'criminal', 'parole', 'offender', 'right', 'justice', 'inmate', 'federal', 'government'

2019-10-28 19:27:54,007 : INFO : built Dictionary(22467 unique tokens: ['accomplishment', 'accord', 'across', 'active', 'actual']...) from 341 documents (total 205305 corpus positions)
2019-10-28 19:27:54,235 : INFO : using symmetric alpha at 1.0
2019-10-28 19:27:54,236 : INFO : using symmetric eta at 1.0
2019-10-28 19:27:54,245 : INFO : using serial LDA version on this node
2019-10-28 19:27:54,250 : INFO : running online (multi-pass) LDA training, 1 topics, 10 passes over the supplied corpus of 341 documents, updating model once every 341 documents, evaluating perplexity every 341 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-28 19:27:54,996 : INFO : -10.304 per-word bound, 1263.8 perplexity estimate based on a held-out corpus of 341 documents with 205305 words
2019-10-28 19:27:54,996 : INFO : PROGRESS: pass 0, at document #341/341
2019-10-28 19:27:55,131 : INFO : topic #0 (1.000): 0.006*"state" + 0.005*"system" + 0.005*"inmate" + 0.004*"society" + 0.004*"c

Top terms: ['0.006*"state" ', ' 0.005*"system" ', ' 0.005*"inmate" ', ' 0.004*"society" ', ' 0.004*"crime" ', ' 0.004*"program" ', ' 0.003*"sentence" ', ' 0.003*"law" ', ' 0.003*"right" ', ' 0.002*"criminal" ', ' 0.002*"parole" ', ' 0.002*"justice" ', ' 0.002*"release" ', ' 0.002*"family" ', ' 0.002*"human" ', ' 0.002*"death" ', ' 0.002*"help" ', ' 0.002*"incarcerate" ', ' 0.002*"court" ', ' 0.002*"public"']


Terms per cluster: {0: ['inmate', 'state', 'system', 'officer', 'write', 'right', 'family', 'help', 'law', 'society', 'think', 'sentence', 'crime', 'staff', 'feel', 'love', 'program', 'criminal', 'correctional', 'proble'], 1: ['society', 'black', 'think', 'experience', 'mind', 'body', 'social', 'write', 'state', 'system', 'criminal', 'consciousness', 'learn', 'appear', 'human', 'behavior', 'process', 'problem', 'power', 'membe'], 2: ['state', 'law', 'court', 'sentence', 'crime', 'system', 'criminal', 'parole', 'offender', 'right', 'justice', 'inmate', 'federal', 'government', 'ju

2019-10-28 19:28:03,419 : INFO : using symmetric alpha at 1.0
2019-10-28 19:28:03,420 : INFO : using symmetric eta at 1.0
2019-10-28 19:28:03,427 : INFO : using serial LDA version on this node
2019-10-28 19:28:03,431 : INFO : running online (multi-pass) LDA training, 1 topics, 10 passes over the supplied corpus of 94 documents, updating model once every 94 documents, evaluating perplexity every 94 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-28 19:28:03,810 : INFO : -10.162 per-word bound, 1145.3 perplexity estimate based on a held-out corpus of 94 documents with 116082 words
2019-10-28 19:28:03,812 : INFO : PROGRESS: pass 0, at document #94/94
2019-10-28 19:28:03,881 : INFO : topic #0 (1.000): 0.006*"state" + 0.005*"inmate" + 0.003*"staff" + 0.003*"right" + 0.003*"officer" + 0.003*"sentence" + 0.003*"law" + 0.003*"court" + 0.002*"unit" + 0.002*"offender"
2019-10-28 19:28:03,883 : INFO : topic diff=1.054163, rho=1.000000
2019-10-28 19:28:04,243 : INFO : -8.

Top terms: ['0.006*"state" ', ' 0.005*"inmate" ', ' 0.003*"staff" ', ' 0.003*"right" ', ' 0.003*"officer" ', ' 0.003*"sentence" ', ' 0.003*"law" ', ' 0.003*"court" ', ' 0.002*"unit" ', ' 0.002*"offender" ', ' 0.002*"system" ', ' 0.002*"write" ', ' 0.002*"child" ', ' 0.002*"crime" ', ' 0.002*"death" ', ' 0.002*"drug" ', ' 0.002*"issue" ', ' 0.002*"family" ', ' 0.002*"letter" ', ' 0.002*"federal"']


Terms per cluster: {0: ['inmate', 'state', 'system', 'officer', 'write', 'right', 'family', 'help', 'law', 'society', 'think', 'sentence', 'crime', 'staff', 'feel', 'love', 'program', 'criminal', 'correctional', 'proble'], 1: ['society', 'black', 'think', 'experience', 'mind', 'body', 'social', 'write', 'state', 'system', 'criminal', 'consciousness', 'learn', 'appear', 'human', 'behavior', 'process', 'problem', 'power', 'membe'], 2: ['state', 'law', 'court', 'sentence', 'crime', 'system', 'criminal', 'parole', 'offender', 'right', 'justice', 'inmate', 'federal', 'government', 'judge', 'progr

2019-10-28 19:28:08,583 : INFO : built Dictionary(21174 unique tokens: ['accord', 'administration', 'admire', 'admit', 'advocate']...) from 336 documents (total 185152 corpus positions)
2019-10-28 19:28:08,819 : INFO : using symmetric alpha at 1.0
2019-10-28 19:28:08,822 : INFO : using symmetric eta at 1.0
2019-10-28 19:28:08,833 : INFO : using serial LDA version on this node
2019-10-28 19:28:08,840 : INFO : running online (multi-pass) LDA training, 1 topics, 10 passes over the supplied corpus of 336 documents, updating model once every 336 documents, evaluating perplexity every 336 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-28 19:28:09,531 : INFO : -10.248 per-word bound, 1216.1 perplexity estimate based on a held-out corpus of 336 documents with 185152 words
2019-10-28 19:28:09,532 : INFO : PROGRESS: pass 0, at document #336/336
2019-10-28 19:28:09,657 : INFO : topic #0 (1.000): 0.006*"inmate" + 0.003*"write" + 0.003*"love" + 0.003*"think" + 0.003*"help

Top terms: ['0.006*"inmate" ', ' 0.003*"write" ', ' 0.003*"love" ', ' 0.003*"think" ', ' 0.003*"help" ', ' 0.003*"officer" ', ' 0.002*"right" ', ' 0.002*"state" ', ' 0.002*"family" ', ' 0.002*"god" ', ' 0.002*"friend" ', ' 0.002*"feel" ', ' 0.002*"mind" ', ' 0.002*"thought" ', ' 0.002*"word" ', ' 0.002*"live" ', ' 0.002*"care" ', ' 0.002*"find" ', ' 0.002*"death" ', ' 0.002*"learn"']


Terms per cluster: {0: ['inmate', 'state', 'system', 'officer', 'write', 'right', 'family', 'help', 'law', 'society', 'think', 'sentence', 'crime', 'staff', 'feel', 'love', 'program', 'criminal', 'correctional', 'proble'], 1: ['society', 'black', 'think', 'experience', 'mind', 'body', 'social', 'write', 'state', 'system', 'criminal', 'consciousness', 'learn', 'appear', 'human', 'behavior', 'process', 'problem', 'power', 'membe'], 2: ['state', 'law', 'court', 'sentence', 'crime', 'system', 'criminal', 'parole', 'offender', 'right', 'justice', 'inmate', 'federal', 'government', 'judge', 'program', 'child',

2019-10-28 19:28:17,136 : INFO : using symmetric alpha at 1.0
2019-10-28 19:28:17,137 : INFO : using symmetric eta at 1.0
2019-10-28 19:28:17,143 : INFO : using serial LDA version on this node
2019-10-28 19:28:17,146 : INFO : running online (multi-pass) LDA training, 1 topics, 10 passes over the supplied corpus of 80 documents, updating model once every 80 documents, evaluating perplexity every 80 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-28 19:28:17,472 : INFO : -9.924 per-word bound, 971.7 perplexity estimate based on a held-out corpus of 80 documents with 109017 words
2019-10-28 19:28:17,473 : INFO : PROGRESS: pass 0, at document #80/80
2019-10-28 19:28:17,531 : INFO : topic #0 (1.000): 0.003*"guard" + 0.003*"door" + 0.003*"head" + 0.003*"hand" + 0.002*"face" + 0.002*"room" + 0.002*"inmate" + 0.002*"mother" + 0.002*"eye" + 0.002*"little"
2019-10-28 19:28:17,532 : INFO : topic diff=1.086199, rho=1.000000
2019-10-28 19:28:17,859 : INFO : -8.617 per-word

Top terms: ['0.003*"guard" ', ' 0.003*"door" ', ' 0.003*"head" ', ' 0.003*"hand" ', ' 0.002*"face" ', ' 0.002*"room" ', ' 0.002*"inmate" ', ' 0.002*"mother" ', ' 0.002*"eye" ', ' 0.002*"little" ', ' 0.002*"right" ', ' 0.002*"walk" ', ' 0.002*"think" ', ' 0.002*"night" ', ' 0.002*"home" ', ' 0.002*"state" ', ' 0.002*"wall" ', ' 0.002*"turn" ', ' 0.002*"love" ', ' 0.002*"thought"']


Terms per cluster: {0: ['inmate', 'state', 'system', 'officer', 'write', 'right', 'family', 'help', 'law', 'society', 'think', 'sentence', 'crime', 'staff', 'feel', 'love', 'program', 'criminal', 'correctional', 'proble'], 1: ['society', 'black', 'think', 'experience', 'mind', 'body', 'social', 'write', 'state', 'system', 'criminal', 'consciousness', 'learn', 'appear', 'human', 'behavior', 'process', 'problem', 'power', 'membe'], 2: ['state', 'law', 'court', 'sentence', 'crime', 'system', 'criminal', 'parole', 'offender', 'right', 'justice', 'inmate', 'federal', 'government', 'judge', 'program', 'child', 'pu