In [None]:
from __future__ import print_function

import chardet
import csv
import gensim
import logging
import nltk
import os
import pickle
import string

import numpy as np
import pandas as pd

from itertools import cycle
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim import corpora
from gensim.models.ldamodel import LdaModel

from sklearn.feature_extraction.text import TfidfVectorizer


# Upload Essays 

In [None]:
# NOTE: '__file__' is a quoted string here, not the module attribute, so
# realpath() resolves it against the current working directory; `root` is
# therefore the working directory of the running process.
root = os.path.dirname(os.path.realpath('__file__'))
# The trailing slash matters: the loading cell builds file paths by plain
# string concatenation (essay_path + file).
essay_path = root + '/../essays/'


In [None]:
# Sorted so that document order (and therefore the integer tags handed to
# doc2vec further down) does not depend on the arbitrary order of os.listdir().
files = sorted(os.listdir(essay_path))

# essays maps file name -> full decoded text of that file
essays = {}
for file in files:
    full_path = essay_path + file

    # os.listdir() also returns sub-directories (e.g. .ipynb_checkpoints);
    # those cannot be opened as essays, so skip them instead of crashing.
    if not os.path.isfile(full_path):
        continue

    # attempt to confidently guess encoding; otherwise, default to ISO-8859-1
    encoding = "ISO-8859-1"
    with open(full_path, "rb") as raw:  # context manager closes the handle
        guess = chardet.detect(raw.read())
    if guess["confidence"] >= 0.95:
        encoding = guess["encoding"]

    with open(full_path, "r", encoding=encoding) as f:
        essays[file] = f.read()
        

In [None]:
# (phrase, compounded form) pairs. Each phrase is later used as a regular-expression
# pattern and replaced by its single-token form so that multi-word expressions
# survive as one token.
compound_mapping = [ ("broken home", "brokenhome"), ("fit in", "fitin"), ("home boys", "homeboys"),
("crime partners", "crimepartners"), ("road dogs", "roaddogs"), ("step father", "stepfather"),
("old lady", "oldlady"), ("strip search", "stripsearch"), ("pecking order", "peckingorder"),
("solid crime", "solidcrime"), ("dirty crime", "dirtycrime"), ("skin crime", "skincrime"),
("solid prisoner", "solidprisoner"), ("dick sucker", "dicksucker"), ("cock sucker", "cocksucker"),
("shot caller", "shotcaller"), ("butt pirate", "buttpirate"), ("falsely accuse", "falselyaccuse"),
("born again", "bornagain"), ("good guy", "goodguy"), ("habeas corpus", "habeascorpus"),
("time barred", "timebarred"), ("successive petitions", "successivepetitions"), ("hunger strike", "hungerstrike"),
("1983 lawsuits", "1983lawsuits"), ("civil rights complaints", "civilrightscomplaints"), ("tax payers", "taxpayers"),
("private prisons", "privateprisons"), ("prison-industrial complex", "prison-industrialcomplex"), ("make money off of us", "makemoneyoffofus"),
("higher education", "highereducation"), ("correspondence courses", "correspondencecourses"), ("self help", "selfhelp"),
("mental health", "mentalhealth"), ("psych meds", "psychmeds"), ("kehea meds", "keheameds"), ("deliberate indifference", "deliberateindifference") ]


# Tokenize + Preprocess

In [None]:
# Tokenize every essay: deacc=True strips accent marks, and tokens shorter than
# 2 or longer than 20 characters are discarded.
tokenized_essays = {
    title: gensim.utils.simple_preprocess(text, deacc=True, min_len=2, max_len=20)
    for title, text in essays.items()
}


In [None]:
def get_wordnet_pos(word):
    """Return the WordNet POS constant for `word`, for use with lemmatize().

    The word is tagged on its own with nltk.pos_tag; the first letter of the
    resulting tag selects adjective (J), noun (N), verb (V) or adverb (R).
    Any other tag falls back to noun.
    """
    wordnet = nltk.corpus.wordnet
    _token, pos_tag = nltk.pos_tag([word])[0]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised tag are treated as nouns
    return wordnet.NOUN

lemmatizer = nltk.stem.WordNetLemmatizer()

# Each word is POS-tagged in isolation, so its lemma depends only on the word
# itself. Tagging is the expensive step; compute the lemma once per distinct
# word and reuse it for every later occurrence.
_lemma_cache = {}


def _lemmatize_cached(word):
    """Return the lemma of `word`, computing it only the first time it is seen."""
    if word not in _lemma_cache:
        _lemma_cache[word] = lemmatizer.lemmatize(word, get_wordnet_pos(word))
    return _lemma_cache[word]


# Replace every token by its lemma, dropping tokens found in string.punctuation.
tokenized_essays = {
    label: [_lemmatize_cached(w) for w in token_lst if w not in string.punctuation]
    for (label, token_lst) in tokenized_essays.items()
}


In [None]:
english_stopwords = nltk.corpus.stopwords.words('english')
# one custom stop word per line; the context manager closes the file
with open("custom_stopwords.txt", "r") as stopword_file:
    custom_stopwords = stopword_file.read().splitlines()

# A set gives constant-time membership tests; the two lists above are kept
# unchanged in case other cells use them.
_all_stopwords = set(english_stopwords) | set(custom_stopwords)

# Drop every token that is an NLTK English stop word or a custom stop word.
tokenized_essays = {
    label: [w for w in token_lst if w not in _all_stopwords]
    for (label, token_lst) in tokenized_essays.items()
}


In [None]:
# Experimental: merge known multi-word phrases into single tokens.
# The result is only stored in the local list `lst`; tokenized_essays itself is
# not modified, so later cells do not see these replacements.
# NOTE(review): this runs after stop-word removal, so phrases that contain stop
# words may no longer be present in the text — confirm the intended ordering.
# NOTE(review): each phrase is passed to re.sub as an un-escaped pattern without
# word boundaries, so it can also match inside longer words.
import re
lst = list(tokenized_essays.values())
for i in range(len(lst)):
    # re-join the tokens so multi-word phrases can be matched across token borders
    essay = ' '.join(lst[i])
    for k, v in compound_mapping:
        essay = re.sub(k, v, essay, flags=re.IGNORECASE) # replaces spaced words with their compounded versions, ignoring case
    lst[i] = essay.split(' ')
    

# Vectorize w/ doc2vec

In [None]:
# Tag every essay with its position in tokenized_essays, then train doc2vec
# with 100-dimensional document vectors.
documents = [
    TaggedDocument(words=tokens, tags=[position])
    for position, tokens in enumerate(tokenized_essays.values())
]
d2v_model = Doc2Vec(documents=documents, vector_size=100)


In [None]:
# One row per essay, one column per doc2vec dimension.
# gensim 4 exposes the document vectors as `model.dv.vectors`; older releases
# use `model.docvecs.vectors_docs`. Prefer the new name when it exists so the
# notebook runs on both.
if hasattr(d2v_model, "dv"):
    doc_vectors = d2v_model.dv.vectors
else:
    doc_vectors = d2v_model.docvecs.vectors_docs
vectorized_df = pd.DataFrame(doc_vectors)


# Feature scaling through standardization

In [None]:
# Standardize every vector dimension with StandardScaler and wrap the result in
# a fresh DataFrame.
stdsclr = StandardScaler()
scaled_values = stdsclr.fit_transform(vectorized_df.astype(float))
standardized_df = pd.DataFrame(scaled_values)


# Principal component analysis for visualization

In [None]:
# # only need this for visualization of vectors presumably
# pca = PCA(n_components=3)
# reduced_df = pd.DataFrame(pca.fit_transform(standardized_df))
# reduced_df.to_csv('vectors.csv', sep='\t', index=False, header=False)


# Clustering w/ k-means 

In [None]:
# Number of k-means clusters; also used by later cells to iterate over clusters.
num_clusters = 7

# NOTE(review): no random_state is passed, so cluster assignments can differ
# between runs.
km = KMeans(n_clusters=num_clusters, init="k-means++", max_iter=100)

# Fit on the standardized document vectors; %time reports how long the fit takes.
%time km.fit(standardized_df.values)


# Essays per cluster / Theme(s) per cluster

In [None]:
# Work on a copy: `output` gains non-numeric columns below. Aliasing
# standardized_df would add them to that frame too, and re-running the k-means
# cell (which fits on standardized_df.values) would then fail.
output = standardized_df.copy()
output['cluster'] = km.labels_
# Materialise the dict views as lists so pandas receives plain sequences.
# Both follow the same dict order, which is also the order of the doc2vec tags.
output['essay'] = list(tokenized_essays.values())
output['title'] = list(tokenized_essays.keys())

# number of essays assigned to each cluster
output['cluster'].value_counts()


In [None]:
# Maps each theme name to a comma-separated string of keywords that signal it.
# Later cells match extracted top terms against these keyword strings.
# NOTE(review): 'resistence' (under 'Political and Intellectual Labor among IP')
# looks like a misspelling of 'resistance', and 'LGBTQ' is upper-case — confirm
# that the tokens they are compared against can actually match.
theme_index = {
                'Autobiographical, Paths to Prison': 'parent, damage, abuse, gang, alcohol, drug, neighborhood, hood, youth, pressure, fit, broken', 
                'Family': 'family, abandonment, relation, visit, partner, mother, father, sibling, child, wife, husband, abuse, support', 
                'Physical Conditions and Security': 'physical, condition, security, search, censorship, food, cold, hygiene, heat, misfunction, infestation, solitary, strip, search, fear, filth, violence, staff, abuse', 
                'Prison Culture/Community/Society': 'violence, fear, staff, sexual, crime, outcasts, racial, cellmate, gay, LGBTQ, dehumanize, uniform, pecking, order, hierarchy, solid, dirty, skin, chomo',
                'Staff/prison Abuse of IP': 'abuse, sexual, torture, humiliation, racist, assault, antagonism, exacerbation, right, violation, food, hygiene, environment, legal, extraction, search, taunt',
                'Personal/Intern Change/Coping': 'survival, art, reading, writing, peace, faith, prayer, meditation, practice, community, activities, hobbies, cooking, remorse, motivation, education, discipline, coping, adjustment, responsibility, god, redemption, transformation',
                'Judicial Misconduct and Legal Remediation': 'judicial, incompetence, corruption, witness, evidence, excessive, political, jailhouse, lawyer, misconduct, unfair, pretender, plra, plea, grievance',
                'Political and Intellectual Labor among IP': 'activism, resistence, critique, race, class, change, policies, practices, write, organize, strike, solidarity',
                'Prison Industry/Prison as Business': 'labor, slave, condition, safety, health, profit, job, budget, tax, taxpayer, exploitation, corruption, mismanagement, nepotism',
                'Education, Re-entry, Other Programs': 'rehabilitation, entry, education, indifference, college, vocation',
                'Health Care': 'health, care, negligence, hostility, incompetence, indifference, death, injury, treatment, medication',
                'Social Alienation, Indifference, Hostility': 'public, misperception, identity, stigma'
              }


In [None]:
# create column for each topic, assign initial value to zero
# (a float zero: float theme scores are written into these columns later, and an
#  integer column would need an implicit dtype upcast on assignment, which recent
#  pandas versions warn about)
for theme in theme_index:
    output[theme] = 0.0

# show the first row to confirm the new columns exist
print(output.iloc[0])


In [None]:
import random

# For every cluster: print its top TF-IDF terms, the themes those terms point
# to, and a random sample of essay titles.

order_centroids = km.cluster_centers_.argsort()[:, ::-1]

SAMPLE_SIZE = 10   # number of essay titles shown per cluster
MAX_TERMS = 50     # size of the TF-IDF vocabulary kept per cluster

# Split every theme's comma-separated keyword string into a set of whole
# keywords. Testing `term in "<keyword string>"` is a substring test and would
# count e.g. "art" as a hit for "partner"; set membership matches whole
# keywords only. Keywords are lower-cased because the essay tokens are.
theme_terms = {
    theme: {keyword.strip().lower() for keyword in keywords.split(',')}
    for theme, keywords in theme_index.items()
}

# cluster id -> titles of the essays assigned to it
essays_per_cluster = {}
for i in range(num_clusters):
    essays_per_cluster[i] = output[output.cluster == i]['title'].values.tolist()

print("***** Random sample essays per cluster *****")
print()

terms_per_essays = {}
themes_per_essays = {}
for i in range(num_clusters):
    # up to SAMPLE_SIZE distinct titles, chosen at random
    titles = essays_per_cluster[i]
    if len(titles) < SAMPLE_SIZE:
        ten = titles
    else:
        ten = random.sample(titles, SAMPLE_SIZE)

    print("** Cluster %d: **" % i)
    print()
    context = [' '.join(tokens) for tokens in list(output[output.cluster == i].essay)]

    tfidf_vectorizer = TfidfVectorizer(max_features=MAX_TERMS)
    tfidf_vectorizer.fit(context)

    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); use whichever this installation provides.
    get_names = (getattr(tfidf_vectorizer, "get_feature_names_out", None)
                 or tfidf_vectorizer.get_feature_names)
    terms_per_essays[i] = [str(term) for term in get_names()]

    print("Top terms #{} : {}".format(i, terms_per_essays[i]))
    print()

    # count, per theme, how many of the cluster's top terms are theme keywords
    theme_term_count = {}
    for term in terms_per_essays[i]:
        for theme, keywords in theme_terms.items():
            if term in keywords:
                theme_term_count[theme] = theme_term_count.get(theme, 0) + 1

    # a theme is "dominant" once at least two top terms point to it
    themes_per_essays[i] = [theme for theme, count in theme_term_count.items() if count > 1]
    print("Dominant theme(s): {}".format(themes_per_essays[i]))
    print()
    print(sorted(theme_term_count.items(), key=lambda x : x[1])[::-1])

    print()
    print("Randomly selected essays:", ten)
#     files = [i for i in os.listdir(src) if i in ten and path.isfile(path.join(src, i))]
#     for f in files:
#         shutil.copy(path.join(src, f), dst + str(i))
    print()
    print()


# Running LDA on each cluster 

In [None]:
from gensim import corpora, models

TOP_N_TERMS = 20  # number of strongest LDA terms considered per cluster / essay

val = True
terms_per_cluster = {}
themes_per_cluster = {}
essays_per_cluster = {}

# Whole-keyword sets per theme. A plain `term in "<keyword string>"` test is a
# substring test ("art" would hit "partner"); set membership is exact.
# Keywords are lower-cased because the essay tokens are.
theme_terms = {
    theme: {keyword.strip().lower() for keyword in keywords.split(',')}
    for theme, keywords in theme_index.items()
}


def _top_terms(token_lists, topn=TOP_N_TERMS):
    """Fit a single-topic LDA model and return its strongest terms.

    token_lists: list of token lists (one per document).
    Returns a list of (term, weight) pairs as given by LdaModel.show_topic,
    or an empty list when the documents contain no tokens at all.

    show_topic() is used instead of parsing the print_topics() string: the
    string form rounds weights to three decimals, and slicing it cut the last
    character off the final term.
    """
    dictionary = corpora.Dictionary(token_lists)
    if len(dictionary) == 0:
        # LDA cannot be trained on an empty vocabulary
        return []
    bow_corpus = [dictionary.doc2bow(tokens) for tokens in token_lists]
    lda = models.ldamodel.LdaModel(corpus=bow_corpus, id2word=dictionary, num_topics=1, passes=10)
    return [(term, float(weight)) for term, weight in lda.show_topic(0, topn=topn)]


def _score_themes(term_weights, keyword_sets):
    """Sum the weights of the terms that are keywords of each theme.

    Returns {theme: summed weight}; themes without any matching term are absent.
    """
    scores = {}
    for term, weight in term_weights:
        for theme, keywords in keyword_sets.items():
            if term in keywords:
                scores[theme] = scores.get(theme, 0.0) + weight
    return scores


#Apply LDA for each cluster (and for each essay in cluster)
for i in range(num_clusters):
    print("************ Cluster # {} ************".format(i))
    in_cluster = output[output.cluster == i]
    essays_in_cluster = list(in_cluster.essay)

    # Store titles of all essays in a given cluster.
    essays_per_cluster[i] = list(in_cluster.title)

    # cluster-level terms and theme ranking
    cluster_term_weights = _top_terms(essays_in_cluster)
    terms_per_cluster[i] = [term for term, _weight in cluster_term_weights]

    theme_term_score = _score_themes(cluster_term_weights, theme_terms)
    themes_per_cluster[i] = sorted(theme_term_score.items(), key = lambda x : x[1])[::-1]
    print("Cluster themes ranked strongest to weakest: {}".format(themes_per_cluster[i]))
    print()

    # essay-level terms and theme scores (reset for every cluster)
    themes_per_essay = {}
    terms_per_essay = {}
    for title in essays_per_cluster[i]:
        essay = list(output[output.title == title].essay)

        essay_term_weights = _top_terms(essay)
        terms_per_essay[title] = [term for term, _weight in essay_term_weights]

        # Scores are accumulated per theme. The earlier version checked the
        # term->score dict for the theme name, which never matched, so each
        # matching term overwrote the theme's score instead of adding to it.
        essay_theme_term_score = _score_themes(essay_term_weights, theme_terms)
        themes_per_essay[title] = sorted(essay_theme_term_score.items(), key = lambda x : x[1])[::-1]

        #### store initial scores #################################
        for theme, score in essay_theme_term_score.items():
            output.loc[output['title'] == title, theme] = score

    print("Total number of essays in this cluster: {}.".format(len(essays_per_cluster[i])))
    print()

    for title in essays_per_cluster[i][:10]:
        print("Essay ({}) themes: {}".format(title, sorted(themes_per_essay[title], key = lambda x : x[1])))
        print()

In [None]:
# see initial scores: displays the `output` frame, including the per-theme score
# columns filled in by the LDA cell
output


# Save model/Load from pickle

In [None]:
# Persist the fitted k-means model. The context manager guarantees the file is
# flushed and closed even if pickling fails.
filename = 'doc2vec_cluster.pkl'
with open(filename, 'wb') as model_file:
    pickle.dump(km, model_file)


In [None]:
def distance(v1, v2):
    """Return the Euclidean (L2) distance between vectors `v1` and `v2`."""
    difference = v1 - v2
    return np.linalg.norm(difference)


In [None]:
# Re-weight every essay's theme score by two distances:
#   score * dist(essay vector, local centroid) * dist(local centroid, global centroid)
# where the global centroid is the mean vector of all essays carrying the theme
# and the local centroid is the same mean restricted to one cluster.
#
# NOTE: this cell multiplies the scores stored in `output` in place, so running
# it twice applies the weighting twice.
# NOTE: when a theme occurs in exactly one essay of a cluster, that essay is its
# own local centroid, its distance to it is 0 and its score becomes 0.

VECTOR_DIMS = 100  # the first 100 columns of `output` hold the document vector
vector_cols = output.columns[:VECTOR_DIMS]

# calculate global centroids for each topic
theme_globcentroid = {}
for theme in theme_index.keys():

    # vectors of all essays with a non-zero score for this theme
    sub_df = output.loc[output[theme] != 0, vector_cols].astype(float)

    # A theme that no essay carries has no centroid; skipping it avoids the
    # division by zero the mean would otherwise amount to.
    if sub_df.empty:
        continue

    # mean of all vectors is centroid
    theme_globcentroid[theme] = sub_df.mean(axis=0)


# calculate local centroids for each topic
for cluster in range(num_clusters):

    in_cluster = output['cluster'] == cluster

    for theme in theme_index.keys():

        # index labels of the essays in this cluster that carry this theme
        members = output.index[in_cluster & (output[theme] != 0)]

        # most (cluster, theme) pairs may be empty; nothing to rank then
        if len(members) == 0:
            continue

        member_vectors = output.loc[members, vector_cols].astype(float)

        # mean of all vectors is centroid with this theme
        localcentroid = member_vectors.mean(axis=0)

        # find distance between current localcentroid and its corresponding globalcentoid by theme
        d1 = distance(theme_globcentroid[theme], localcentroid)

        # find distance between each vector and its corresponding localcentroid, update rank
        for ident in members:
            # label-based lookup: `ident` is an index label, not a position
            d2 = distance(member_vectors.loc[ident], localcentroid)

            # update rank - lda score * dist from v to localcentroid * dist from localcentroid to globalcentroid
            output.at[ident, theme] = output.at[ident, theme] * d1 * d2


In [None]:
# Keep everything from column position 102 onwards. With 100 vector columns
# followed by 'cluster', 'essay' and 'title', position 102 is 'title', so this
# selects the title plus all per-theme score columns.
all_we_need = output.iloc[:, 102:]
all_we_need