In [156]:
import collections
from collections import Counter
from earthy.nltk_wrappers import lemmatize_sent
from earthy.nltk_wrappers import porter_stem
import gensim
from gensim import models
from gensim import corpora
from gensim.utils import simple_preprocess, lemmatize
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords 
from nltk.stem.wordnet import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.cluster import KMeans
from sklearn.manifold import MDS
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
import os
import pandas as pd
import pickle
import pyLDAvis.sklearn
import pyLDAvis.gensim


In [157]:
pyLDAvis.enable_notebook()

In [158]:
path = 'essays/'

In [159]:
filenames = os.listdir(path)

data = {}
for filename in filenames:
    with open(path + filename, "r", encoding="ISO-8859-1") as file:
        data[filename] = file.read()

In [160]:
pd.set_option('max_colwidth',150)

data_df = pd.DataFrame(data, index=[1]).transpose()
data_df.columns = ['essay']
data_df = data_df.sort_index()

In [161]:
def clean(doc):
    stop = set(stopwords.words('english'))
    exclude = set(string.punctuation) 
    lemma = WordNetLemmatizer()

    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    normalized = " ".join(lemma.lemmatize(word, 'v') for word in punc_free.split())
    return normalized

data_clean = pd.DataFrame(data_df.essay.apply(clean))
essay_compiled = data_clean.essay.tolist()
essay_clean = [essay.split() for essay in essay_compiled]

In [163]:
# Creating the term dictionary of our courpus, where every unique term is assigned an index. 
dictionary = corpora.Dictionary(essay_clean)
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=10000)

# Converting list of documents (corpus) into Document Term Matrix using dictionary prepared above.
corpus = [dictionary.doc2bow(essay) for essay in essay_clean]
corpus_df = pd.DataFrame(corpus)

#essay_term_df.to_csv('~/Desktop/gensim_vectorized_data.csv')

# Create the TF-IDF model
tfidf = models.TfidfModel(corpus, smartirs='ntc')
corpus_tfidf = tfidf[corpus]

# Show the TF-IDF weights
#for essay in tfidf[essay_term_matrix]:
#    print([[dictionary[id], np.around(freq, decimals=2)] for id, freq in essay])

# Topic modeling: LDA w/o tf-idf
# Ana's:
#lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=15, update_every=1, chunksize=100, random_state=100, passes=1)

# Other:
#lda = gensim.models.LdaMulticore(corpus=corpus, num_topics=15, id2word=dictionary, passes=2, workers=2, chunksize=100, random_state=100,)


# Topic modeling: LDA w/ tf-idf
lda_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)

# Print topics.
#for idx, topic in lda_tfidf.print_topics(-1):
#    print('Topic: {} Word: {}'.format(idx, topic))

In [164]:
def formatTopicSentences(ldamodel=lda_tfidf, corpus=corpus_tfidf, texts=essay_clean):
    # Init output
    sent_topics_df = pd.DataFrame()

    # Get main topic in each document
    for i, row in enumerate(ldamodel[corpus]):
        row = sorted(row, key=lambda x: (x[1]), reverse=True)
        # Get the Dominant topic, Perc Contribution and Keywords for each document
        for j, (topic_num, prop_topic) in enumerate(row):
            if j == 0:  # => dominant topic
                wp = lda.show_topic(topic_num)
                topic_keywords = ", ".join([word for word, prop in wp])
                sent_topics_df = sent_topics_df.append(pd.Series([int(topic_num), round(prop_topic,4), topic_keywords]), ignore_index=True)
            else:
                break
    sent_topics_df.columns = ['Dominant_Topic', 'Perc_Contribution', 'Topic_Keywords']

    # Add original text to the end of the output
    contents = pd.Series(texts)
    sent_topics_df = pd.concat([sent_topics_df, contents], axis=1)
    return(sent_topics_df)


df_topic_sents_keywords = formatTopicSentences(ldamodel=lda_tfidf, corpus=corpus_tfidf, texts=essay_clean)

# Format
df_dominant_topic = df_topic_sents_keywords.reset_index()
df_dominant_topic.columns = ['Document_No', 'Dominant_Topic', 'Topic_Perc_Contrib', 'Keywords', 'Text']

# Show
df_dominant_topic.head(15)

Unnamed: 0,Document_No,Dominant_Topic,Topic_Perc_Contrib,Keywords,Text
0,0,3.0,0.9544,"inmate, officer, cell, im, staff, want, here, guy, work, food","[tne, z, j, hole, experience, rouse, bed, correctional, officer, 9, be, early, me, wednesday, morning, pack, up, transfer, id, wait, list, go, hol..."
1,1,3.0,0.9576,"inmate, officer, cell, im, staff, want, here, guy, work, food","[ok, stephen, whetzel, page, 1, 14, van, slice, dense, morning, fog, like, jet, vapor, d1trk, mood, somber, ten, us, twg, guard, eight, prisoners,..."
2,2,3.0,0.9454,"inmate, officer, cell, im, staff, want, here, guy, work, food","[ï»¿jamaal, freeman, alabama, introspection, topic, thoughts, konvict, the, divine, manifestation, god, prisons, place, criminals, confine, break,..."
3,3,3.0,0.8915,"inmate, officer, cell, im, staff, want, here, guy, work, food","[plan, action, come, america, start, work, construction, industry, work, two, years, san, antonio, area, progress, manual, labor, job, attend, col..."
4,4,3.0,0.9311,"inmate, officer, cell, im, staff, want, here, guy, work, food","[worthy, live, write, list, below, âreasons, dismiss, older, people, live, worthless, uselessâ, tongueâinâcheek, manner, inspire, contempt..."
5,5,3.0,0.8681,"inmate, officer, cell, im, staff, want, here, guy, work, food","[satisfaction, mira, judgment, black, clone, ï¬ebt, ï¬eparted, october, 16, 2013, state, missouri, file, gatisfaction, juï¬gment, motion, cole,..."
6,6,3.0,0.6304,"inmate, officer, cell, im, staff, want, here, guy, work, food","[72, 4, prison, witers, workshop, recently, moveo, exchange, passiveagressive, ole, man, josh, guy, year, younger, myself, also, like, write, weve..."
7,7,3.0,0.8783,"inmate, officer, cell, im, staff, want, here, guy, work, food","[hâgc, 1, h, second, prison, writers, workshop, finish, write, first, story, prison, writers, workshop, twentyâtwo, hundred, word, long, make,..."
8,8,3.0,0.963,"inmate, officer, cell, im, staff, want, here, guy, work, food","[3, xi, nao3, xi3, wash, ânao3, brain, word, root, derive, chinese, mandarin, english, deï¬nition, brainwash, 1, intensive, usu, political, ind..."
9,9,3.0,0.9262,"inmate, officer, cell, im, staff, want, here, guy, work, food","[mock, bird, endanger, effort, awaken, social, conscience, evils, injustice, intolerance, discrimination, vengeance, hate, author, harper, lee, wr..."


In [168]:
full_path = '/Users/inesayara/Desktop/senior_seminar/essays/' 
vectorizer = TfidfVectorizer(encoding="ISO-8859-1", input='filename', stop_words='english')
dtm = vectorizer.fit_transform([full_path + filename for filename in filenames])

In [169]:
# Vectorize w/ CountVectorizer

#vectorizer = CountVectorizer(stop_words='english')
#vectorized_data = vectorizer.fit_transform(data_clean.essay)
#data_dtm = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer.get_feature_names())
#data_dtm.to_csv('~/Desktop/sklearn_vectorized_data.csv')
#data_dtm.index = data_clean.index
#data_dtm

In [170]:
true_k = 2
model = KMeans(n_clusters=true_k, init='k-means++', max_iter=1000, n_init=1)
model.fit(dtm)

order_centroids = model.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()

terms = [lemma.lemmatize(term) for term in terms]
terms = " ".join(w for w in nltk.wordpunct_tokenize(word) if w.lower() in terms or not w.isalpha())

'solidarity'

In [171]:
dtm = dtm.toarray() 
terms = np.array(terms)

In [172]:
n_clusters= 15
estimator = KMeans(n_clusters)
k = estimator.fit(dtm)

In [173]:
#count and output how many files/essays per cluster
labels = estimator.labels_
print (Counter(labels))


Counter({9: 381, 10: 238, 4: 199, 7: 183, 11: 183, 1: 93, 3: 72, 8: 44, 2: 38, 14: 35, 5: 34, 13: 22, 6: 21, 12: 16, 0: 14})


In [174]:
#output top terms per cluster
print("Top terms per cluster:")
order_centroids = estimator.cluster_centers_.argsort()[:, ::-1]
for i in range(n_clusters):
    print ("Cluster %d:" % i),
    for ind in order_centroids[i, :15]:
        print (' %s' % terms[ind]),
    print


Top terms per cluster:
Cluster 0:
 ptsd
 richard
 ip
 brain
 cdcr
 bladder
 stress
 symptom
 plaintiff
 health
 cystectomy
 treatment
 medical
 disorder
 prison
Cluster 1:
 inmate
 prison
 inmate
 staff
 time
 officer
 prison
 correctional
 like
 life
 people
 don
 work
 money
 year
Cluster 2:
 carolina
 african
 north
 party
 american
 vote
 black
 worley
 prison
 voting
 willie
 american
 political
 3rd
 community
Cluster 3:
 god
 tucson
 inmate
 prison
 usp
 officer
 guy
 don
 pause
 huffstuttler
 inmate
 people
 like
 officer
 let
Cluster 4:
 cell
 prison
 time
 like
 told
 said
 just
 day
 unit
 got
 year
 prisoner
 did
 door
 jail
Cluster 5:
 parole
 board
 ohio
 release
 prisoner
 year
 law
 commissioner
 sentence
 old
 offender
 prison
 crime
 hearing
 released
Cluster 6:
 translated
 stull
 loera
 don
 school
 mexico
 family
 violence
 arrived
 year
 prison
 houston
 want
 thing
 going
Cluster 7:
 life
 prison
 time
 ve
 year
 just
 like
 know
 people
 family
 day
 don
 thing
