In [1]:
import chardet
import csv
import gensim
import logging
import nltk
import os
import string

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from itertools import cycle
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from gensim.models import Doc2Vec
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Upload Essays 

In [2]:
essay_path = 'essays/'

In [3]:
files = os.listdir(essay_path)

essays = {}
for file in files:
    # attempt to confidently guess encoding; otherwise, default to ISO-8859-1
    encoding = "ISO-8859-1"
    guess = chardet.detect(open(essay_path + file, "rb").read())
    if (guess["confidence"] >= 0.95):
        encoding = guess["encoding"]
    
    with open(essay_path + file, "r", encoding=encoding) as f:
        essays[file] = f.read()

# Tokenize + Preprocess

In [4]:
tokenized_essays = {label: gensim.utils.simple_preprocess(corpus, deacc=True, min_len=2, max_len=20) for (label, corpus) in essays.items()}

In [5]:
english_stopwords = nltk.corpus.stopwords.words('english')

custom_stopwords = [
        "prison",
        "jail",
        "prisoner",
        "also",
        "said",
        "would",
        "could",
        "should",
        "first",
        "like",
        "get",
        "going",
        "thing",
        "something",
        "use",
        "get",
        "go",
        "one",
        "mr",
        "many",
        "much",
        "see",
        "take",
        "another",
        "aroud",
        "away",
        "back",
        "even",
        "every",
        "guy",
        "know",
        "let",
        "make",
        "look",
        "person",
        "place",
        "day",
        "year",
        "well",
        "good",
        "bad",
        "with",
        "without",
        "may",
        "new",
        "two",
        "want",
        "people",
        "within",
        "upon",
        "come",
        "tilocblob",
        "yyyyyy",
        "way",
        "around",
        "high",
        "inside",
        "long",
        "men",
        "must",
        "need",
        "never",
        "old",
        "other",
        "others",
        "really",
        "say",
        "seem",
        "still",
        "try",
        "become",
        "allow",
        "give",
        "month",
        "result",
        "always",
        "ask",
        "begin",
        "end",
        "hour",
        "man",
        "woman",
        "put",
        "someone",
        "start",
        "next",
        "act",
        "create",
        "yet",
        "time",
        "case",
        "cell",
        "work",
        "call",
        "world",
        "tell",
        "week",
        "told",
        "lot",
        "change",
        "self",
        "since"
    ]


In [6]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

lemmatizer = nltk.stem.WordNetLemmatizer()
tokenized_essays = {label: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in token_lst if (w not in string.punctuation and w not in english_stopwords and w not in custom_stopwords)] for (label, token_lst) in tokenized_essays.items()}

In [7]:
tokenized_essays = {label: [w for w in token_lst if (w not in english_stopwords and w not in custom_stopwords)] for (label, token_lst) in tokenized_essays.items()}

# Vectorize w/ doc2vec

In [8]:
LabeledSentence1 = gensim.models.doc2vec.TaggedDocument
all_content_train = []
j=0
for essay in tokenized_essays.values():
    all_content_train.append(LabeledSentence1(essay, [j]))
    j+=1
    
print("Number of texts processed: ", j)

Number of texts processed:  1574


In [9]:
d2v_model = Doc2Vec(all_content_train, vector_size = 100, window = 10, min_count = 500, workers=7, dm = 1,alpha=0.025, min_alpha=0.001)
d2v_model.train(all_content_train, total_examples=d2v_model.corpus_count, epochs=10, start_alpha=0.002, end_alpha=-0.016)

In [10]:
essay_vectors = d2v_model.docvecs.vectors_docs
vectorized_df = pd.DataFrame(essay_vectors)
index_ref = vectorized_df.index

# Feature scaling through standardization

In [11]:
stdsclr = StandardScaler()
standardized_df = pd.DataFrame(stdsclr.fit_transform(vectorized_df.astype(float)), index=index_ref)

# Principle component analysis

In [12]:
pca = PCA(n_components=3)
#reduced_df = pd.DataFrame(reduced, index=index_ref, columns = ['title', 'cluster', 'essay'])
reduced_df = pd.DataFrame(pca.fit_transform(standardized_df), index=index_ref)

# Guide for output to visualize effectiveness of vectors

In [23]:
#reduced_df.to_csv('new1.csv', sep='\t', index=False, header=False)
#pd.DataFrame(index_ref).to_csv('index.csv', index=False, header=False)

# Clustering w/ k-means 

In [14]:
num_clusters = 12

km = KMeans(n_clusters=num_clusters, init="k-means++", max_iter=100)

%time km.fit(reduced_df.values)

CPU times: user 315 ms, sys: 66.7 ms, total: 381 ms
Wall time: 175 ms


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

# Essays per cluster / Theme(s) per cluster

In [15]:
output = reduced_df
output['cluster'] = km.labels_
print(output['cluster'].value_counts())
output['essay'] = tokenized_essays.values()
output['title'] = tokenized_essays.keys()

11    517
1     343
7     254
8     146
3     108
2      76
6      32
9      31
10     25
5      17
0      16
4       9
Name: cluster, dtype: int64


In [16]:
theme_index = {'Autobiographical, Paths to Prison': 'parent, damage, abuse, gang, alcohol, drug, neighborhood, hood, youth, pressure, fit, broken', 
                 'Family': 'family, abandonment, relation, visit, partner, mother, father, sibling, child, wife, husband, abuse, support', 
                 'Physical Conditions and Security': 'physical, condition, security, search, censorship, food, cold, hygiene, heat, misfunction, infestation, solitary, strip, search, fear, filth, violence, staff, abuse', 
                 'Prison Culture/Community/Society': 'violence, fear, staff, sexual, crime, outcasts, racial, cellmate, gay, LGBTQ, dehumanize, uniform, pecking, order, hierarchy, solid, dirty, skin, chomo',
                 'Staff/prison Abuse of IP': 'abuse, sexual, torture, humiliation, racist, assault, antagonism, exacerbation, right, violation, food, hygiene, environment, legal, extraction, search, taunt',
                 'Personal/Intern Change/Coping': 'survival, art, reading, writing, peace, faith, prayer, meditation, practice, community, activities, hobbies, cooking, remorse, motivation, education, discipline, coping, adjustment, responsibility, god, redemption, transformation',
                 'Judicial Misconduct and Legal Remediation': 'judicial, incompetence, corruption, witness, evidence, excessive, political, jailhouse, lawyer, misconduct, unfair, pretender, plra, plea, grievance',
                 'Political and Intellectual Labor among IP': 'activism, resistence, critique, race, class, change, policies, practices, write, organize, strike, solidarity',
                 'Prison Industry/Prison as Business': 'labor, slave, condition, safety, health, profit, job, budget, tax, taxpayer, exploitation, corruption, mismanagement, nepotism',
                 'Education, Re-entry, Other Programs': 'rehabilitation, entry, education, indifference, college, vocation',
                 'Health Care': 'health, care, negligence, hostility, incompetence, indifference, death, injury, treatment, medication',
                 'Social Alienation, Indifference, Hostility': 'public, misperception, identity, stigma'}

In [17]:
import os
from os import path
import shutil

src = "/Users/inesayara/Desktop/senior_seminar/essays"
dst = "/Users/inesayara/Desktop/clusters_1_0/cluster_"

#files = [i for i in os.listdir(src) if path.isfile(path.join(src, i))]
#for f in files:
#    shutil.copy(path.join(src, f), dst)

In [22]:
from __future__ import print_function
import random

order_centroids = km.cluster_centers_.argsort()[:, ::-1] 

essays_per_cluster = {}
for i in range(num_clusters):
    essays_per_cluster[i] = []
    #print("Cluster %d titles:" % i, end='')
    for title in output[output.cluster == i]['title'].values.tolist():
        #print(' %s,' % title, end='')
        essays_per_cluster[i].append(title)
        
print("***** Random sample essays per cluster *****")
print()

terms_per_essays = {}
themes_per_essays = {}
for i in range(num_clusters):
    ten = []
    if len(essays_per_cluster[i]) < 10:
        ten = essays_per_cluster[i]
    else:
        while len(ten) < 10:
            essay = random.choice(essays_per_cluster[i])
            if essay not in ten:
                ten.append(essay)

    print("** Cluster %d: **" % i)
    print()
    context = [' '.join(tokens) for tokens in list(output[output.cluster == i].essay)]
    
    tfidf_vectorizer = TfidfVectorizer(max_features=50)
    tfidf_vectorizer.fit(context)
    
    #terms_per_essays[i] = [term for term in tfidf_vectorizer.get_feature_names() if term not in english_stopwords + custom_stopwords]
    terms_per_essays[i] = [term for term in tfidf_vectorizer.get_feature_names()]
    
    print("Top terms #{} : {}".format(i, terms_per_essays[i]))
    print()
    
    themes_per_essays[i] = set()
    theme_term_count = {}
    for term in terms_per_essays[i]:
        for key, value in theme_index.items():
            if term in value:
                if key in theme_term_count:
                    theme_term_count[key] += 1
                else:
                    theme_term_count[key] = 1
                themes_per_essays[i].add(key)
    
    themes_per_essays[i] = [theme for theme, count in theme_term_count.items() if count > 1]
    print("Dominant theme(s): {}".format(themes_per_essays[i]))
    print()
    print(sorted(theme_term_count.items(), key=lambda x : x[1])[::-1])
    
    print()
    print("Randomly selected essays:", ten)
    files = [i for i in os.listdir(src) if i in ten and path.isfile(path.join(src, i))]
    for f in files:
        shutil.copy(path.join(src, f), dst + str(i))
    print()
    print()


***** Random sample essays per cluster *****

** Cluster 0: **

Top terms #0 : ['abuse', 'believe', 'center', 'child', 'civil', 'commit', 'court', 'cp', 'crime', 'dr', 'education', 'fact', 'family', 'federal', 'group', 'help', 'include', 'individual', 'inmate', 'issue', 'justice', 'law', 'learn', 'level', 'life', 'nao', 'offender', 'official', 'parole', 'pay', 'pornography', 'problem', 'program', 'provide', 'release', 'right', 'security', 'sentence', 'sex', 'sexual', 'society', 'staff', 'state', 'system', 'texas', 'think', 'treatment', 'veteran', 'write', 'youth']

Dominant theme(s): ['Autobiographical, Paths to Prison', 'Family', 'Physical Conditions and Security', 'Staff/prison Abuse of IP', 'Prison Culture/Community/Society']

[('Prison Culture/Community/Society', 4), ('Staff/prison Abuse of IP', 4), ('Physical Conditions and Security', 3), ('Family', 3), ('Autobiographical, Paths to Prison', 3), ('Political and Intellectual Labor among IP', 1), ('Health Care', 1), ('Prison Industry

Top terms #7 : ['american', 'believe', 'black', 'board', 'california', 'community', 'condition', 'convict', 'correction', 'court', 'crime', 'criminal', 'death', 'department', 'drug', 'fact', 'family', 'federal', 'force', 'government', 'human', 'incarcerate', 'incarceration', 'individual', 'inmate', 'issue', 'justice', 'law', 'life', 'murder', 'offender', 'officer', 'parole', 'population', 'problem', 'program', 'provide', 'public', 'punishment', 'rehabilitation', 'release', 'right', 'sentence', 'serve', 'society', 'staff', 'state', 'system', 'term', 'united']

Dominant theme(s): ['Physical Conditions and Security', 'Prison Culture/Community/Society']

[('Prison Culture/Community/Society', 3), ('Physical Conditions and Security', 2), ('Staff/prison Abuse of IP', 1), ('Education, Re-entry, Other Programs', 1), ('Social Alienation, Indifference, Hostility', 1), ('Judicial Misconduct and Legal Remediation', 1), ('Family', 1), ('Autobiographical, Paths to Prison', 1), ('Health Care', 1), ('P

# Save model/Load from pickle

In [19]:
joblib.dump(km,'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
clusters = km.labels_.tolist()

# Visualize cluster (2D)

In [20]:
#import random

#%matplotlib inline
#plt.figure

#cluster_colors = []
#for i in range(num_clusters):
#    r = lambda: random.randint(0,255)
#    cluster_colors.append('#%02X%02X%02X' % (r(),r(),r()))

#color = [i for i in cluster_colors]
#plt.scatter(datapoint[:, 0], datapoint[:, 1])
#centroids = kmeans_model.cluster_centers_
#centroidpoint = pca.transform(centroids)
#plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker="^", s=150, c="#000000")
#plt.show()