In [1]:
import chardet
import csv
import gensim
import logging
import nltk
import os
import string

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from itertools import cycle
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from gensim.models import Doc2Vec
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Upload Essays 

In [2]:
essay_path = 'essays/'

In [3]:
files = os.listdir(essay_path)

essays = {}
for file in files:
    # attempt to confidently guess encoding; otherwise, default to ISO-8859-1
    encoding = "ISO-8859-1"
    guess = chardet.detect(open(essay_path + file, "rb").read())
    if (guess["confidence"] >= 0.95):
        encoding = guess["encoding"]
    
    with open(essay_path + file, "r", encoding=encoding) as f:
        essays[file] = f.read()

# Tokenize + Preprocess

In [4]:
tokenized_essays = {label: gensim.utils.simple_preprocess(corpus, deacc=True, min_len=2, max_len=15) for (label, corpus) in essays.items()}

In [6]:
english_stopwords = nltk.corpus.stopwords.words('english')

custom_stopwords = [
        "prison",
        "prisoner",
        "also",
        "said",
        "would",
        "could",
        "should",
        "first",
        "like",
        "get",
        "going",
        "thing",
        "something",
        "use",
        "get",
        "go",
        "one"
    ]

tokenized_essays = {label: [w for w in token_lst if w not in english_stopwords and w not in custom_stopwords] for (label, token_lst) in tokenized_essays.items()}

In [8]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

lemmatizer = nltk.stem.WordNetLemmatizer()
tokenized_essays = {label: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in token_lst if w not in string.punctuation] for (label, token_lst) in tokenized_essays.items()}

# Vectorize w/ doc2vec

In [9]:
LabeledSentence1 = gensim.models.doc2vec.TaggedDocument
all_content_train = []
j=0
for essay in tokenized_essays.values():
    all_content_train.append(LabeledSentence1(essay, [j]))
    j+=1
    
print("Number of texts processed: ", j)

Number of texts processed:  1573


In [10]:
d2v_model = Doc2Vec(all_content_train, vector_size = 100, window = 10, min_count = 500, workers=7, dm = 1,alpha=0.025, min_alpha=0.001)
d2v_model.train(all_content_train, total_examples=d2v_model.corpus_count, epochs=10, start_alpha=0.002, end_alpha=-0.016)

In [11]:
essay_vectors = d2v_model.docvecs.vectors_docs
vectorized_df = pd.DataFrame(essay_vectors)
index_ref = vectorized_df.index

# Feature scaling through standardization

In [14]:
stdsclr = StandardScaler()
standardized_df = pd.DataFrame(stdsclr.fit_transform(vectorized_df.astype(float)), index=index_ref)

# Principle component analysis

In [17]:
pca = PCA(n_components=3)
#reduced_df = pd.DataFrame(reduced, index=index_ref, columns = ['title', 'cluster', 'essay'])
reduced_df = pd.DataFrame(pca.fit_transform(standardized_df), index=index_ref)

# Guide for output to visualize effectiveness of vectors

In [18]:
#reduced_df.to_csv('new.csv', sep='\t', index=False, header=False)
#pd.DataFrame(index_ref).to_csv('index.csv', index=False, header=False)

# Clustering w/ k-means 

In [19]:
num_clusters = 12

km = KMeans(n_clusters=num_clusters, init="k-means++", max_iter=100)

%time km.fit(reduced_df.values)

CPU times: user 215 ms, sys: 5.75 ms, total: 220 ms
Wall time: 240 ms


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [21]:
output = reduced_df
output['cluster'] = km.labels_
print(output['cluster'].value_counts())
output['essay'] = tokenized_essays.values()
output['title'] = tokenized_essays.keys()

0     583
1     222
5     200
9     158
2     149
3      57
4      48
6      40
7      36
8      29
11     28
10     23
Name: cluster, dtype: int64


In [45]:
theme_index = {'Autobiographical, Paths to Prison': 'socioeconomic, parent, damage, abuse, gang, alcohol, drug', 
                 'Family': 'family, abandonment, relation, visit, partner, mother, father, sibling', 
                 'Physical Conditions and Security': 'physical, condition, security, search, censorship, food, cold, hygiene, heat, misfunction, infestation, solitary', 
                 'Prison Culture/Community/Society': 'violence, fear, staff, sexual, crime, outcasts, racial, cellmate, gay, LGBTQ, dehumanize, uniform',
                 'Staff/prison Abuse of IP': 'abuse, sexual, torture, humiliation, racist, assault, antagonism, exacerbation, right, violation, food, hygiene, environment, legal',
                 'Personal/Intern Change/Copin': 'survival, art, reading, writing, peace, faith, prayer, meditation, practice, community, activities, hobbies, cooking, remorse, motivation, education, discipline, coping, adjustment',
                 'Judicial Misconduct and Legal Remediation': 'judicial, incompetence, corruption, witness, evidence, excessive, political, jailhouse, lawyer',
                 'Political and Intellectual Labor among IP': 'activism, resistence, critique, race, class, change, policies, practices',
                 'Prison Industry/Prison as Business': 'labor, slave, condition, safety, health',
                 'Education, Re-entry, Other Programs': 'rehabilitation, re-entry, education, indifference',
                 'Health Care': 'health, care, negligence, hostility, incompetence, indifference, death, injury, treatment, medication',
                 'Social Alienation, Indifference, Hostility': 'public, mispercetion, identity, stigma'}

In [58]:
from __future__ import print_function
import random

order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

essays_per_cluster = {}
for i in range(num_clusters):
    essays_per_cluster[i] = []
    #print("Cluster %d titles:" % i, end='')
    for title in output[output.cluster == i]['title'].values.tolist():
        #print(' %s,' % title, end='')
        essays_per_cluster[i].append(title)
        
print("***** 10 random sample essays per cluster *****")
print()

terms_per_essays = {}
themes_per_essays = {}
for i in range(num_clusters):
    ten = [random.choice(essays_per_cluster[i]) for _ in range(20)]
    print("** Cluster %d: **" % i)
    print()
    context = [' '.join(tokens) for tokens in list(output[output.cluster == i].essay)]
    
    tfidf_vectorizer = TfidfVectorizer(max_features=30)
    tfidf_vectorizer.fit(context)
    
    terms_per_essays[i] = [term for term in tfidf_vectorizer.get_feature_names() if term not in english_stopwords + custom_stopwords]
    
    print("Topic #{} : {}".format(i , terms_per_essays[i]))
    print()
    
    themes_per_essays[i] = set()
    for term in terms_per_essays[i]:
        #print(terms_per_essays[i])
        for key, value in theme_index.items():
            if term in value and key not in themes_per_essays[i]:
                #print(term, value)
                themes_per_essays[i].add(key)
    
    print("Theme(s): {}".format(themes_per_essays[i]))
    
    print()
    print(ten)
    print()
    print()


***** 10 random sample essays per cluster *****

** Cluster 0: **

Topic #0 : ['back', 'come', 'crime', 'day', 'even', 'give', 'inmate', 'know', 'law', 'life', 'make', 'many', 'need', 'never', 'people', 'right', 'say', 'see', 'sentence', 'state', 'system', 'take', 'time', 'want', 'way', 'well', 'work', 'write', 'year']

Theme(s): {'Prison Culture/Community/Society', 'Staff/prison Abuse of IP', 'Judicial Misconduct and Legal Remediation'}

['essay_775.txt', 'essay_1329.txt', 'essay_950.txt', 'essay_1121.txt', 'essay_470.txt', 'essay_1365.txt', 'essay_504.txt', 'essay_83.txt', 'essay_1416.txt', 'essay_1154.txt', 'essay_264.txt', 'essay_1083.txt', 'essay_1138.txt', 'essay_1243.txt', 'essay_1239.txt', 'essay_1410.txt', 'essay_718.txt', 'essay_1560.txt', 'essay_1283.txt', 'essay_1422.txt']


** Cluster 1: **

Topic #1 : ['american', 'court', 'crime', 'criminal', 'even', 'incarcerate', 'inmate', 'justice', 'law', 'life', 'make', 'many', 'need', 'parole', 'people', 'program', 'right', 'see', 

# Save model/Load from pickle

In [None]:
joblib.dump(kmeans_model,'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
clusters = kmeans_model.labels_.tolist()

# Top terms per cluster

In [None]:
for i in range(num_clusters) :
    print("\n\n")
    context = [' '.join(tokens) for tokens in list(output[output.cluster == i].essay)]
    
    m1 =TfidfVectorizer(max_features=30)
    m1.fit(context)
    print("Topic #{} : {}".format(i , " , ".join(term for term in m1.get_feature_names() if term not in combined_stopwords)))    


# Visualize cluster (2D)

In [227]:
#import random

#%matplotlib inline
#plt.figure

#cluster_colors = []
#for i in range(num_clusters):
#    r = lambda: random.randint(0,255)
#    cluster_colors.append('#%02X%02X%02X' % (r(),r(),r()))

#color = [i for i in cluster_colors]
#plt.scatter(datapoint[:, 0], datapoint[:, 1])
#centroids = kmeans_model.cluster_centers_
#centroidpoint = pca.transform(centroids)
#plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker="^", s=150, c="#000000")
#plt.show()