In [229]:
import chardet
import csv
import gensim
import logging
import nltk
import os
import string

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from itertools import cycle
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from gensim.models import Doc2Vec
from sklearn.externals import joblib
from sklearn.feature_extraction.text import TfidfVectorizer

# Upload Essays 

In [230]:
essay_path = 'essays/'

In [231]:
files = os.listdir(essay_path)

essays = {}
for file in files:
    # attempt to confidently guess encoding; otherwise, default to ISO-8859-1
    encoding = "ISO-8859-1"
    guess = chardet.detect(open(essay_path + file, "rb").read())
    if (guess["confidence"] >= 0.95):
        encoding = guess["encoding"]
    
    with open(essay_path + file, "r", encoding=encoding) as f:
        essays[file] = f.read()

# Tokenize + Preprocess

In [232]:
tokenized_essays = {label: gensim.utils.simple_preprocess(corpus, deacc=True, min_len=2, max_len=15) for (label, corpus) in essays.items()}

In [233]:
english_stopwords = nltk.corpus.stopwords.words('english')

custom_stopwords = [
        "prison",
        "prisoner",
        "also",
        "said",
        "mr",
        "mrs",
        "im",
        "would",
        "could",
        "should",
        "first",
        "like",
        "dont",
        "wont",
        "get",
        "going",
        "thing",
        "something",
        "use",
        "get",
        "go",
        "one"
    ]

combined_stopwords = english_stopwords + custom_stopwords

tokenized_essays = {label: [w for w in token_lst if w not in combined_stopwords] for (label, token_lst) in tokenized_essays.items()}

In [234]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

lemmatizer = nltk.stem.WordNetLemmatizer()
tokenized_essays = {label: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in token_lst if w not in string.punctuation and w not in combined_stopwords] for (label, token_lst) in tokenized_essays.items()}

# Vectorize w/ doc2vec

In [235]:
LabeledSentence1 = gensim.models.doc2vec.TaggedDocument
all_content_train = []
j=0
for essay in tokenized_essays.values():
    all_content_train.append(LabeledSentence1(essay, [j]))
    j+=1
    
print("Number of texts processed: ", j)

Number of texts processed:  1573


In [236]:
d2v_model = Doc2Vec(all_content_train, vector_size = 100, window = 10, min_count = 500, workers=7, dm = 1,alpha=0.025, min_alpha=0.001)
d2v_model.train(all_content_train, total_examples=d2v_model.corpus_count, epochs=10, start_alpha=0.002, end_alpha=-0.016)

In [237]:
essay_vectors = d2v_model.docvecs.vectors_docs
vectorized_df = pd.DataFrame(essay_vectors)
index_ref = vectorized_df.index

# Feature scaling through standardization

In [238]:
stdsclr = StandardScaler()
standardized_df = pd.DataFrame(stdsclr.fit_transform(vectorized_df.astype(float)), index=index_ref)

# Principle component analysis

In [239]:
pca = PCA(n_components=3)
reduced = pca.fit_transform(standardized_df)
reduced_df = pd.DataFrame(reduced, index=index_ref, columns = ['title', 'cluster', 'essay'])

# Guide for output to visualize effectiveness of vectors

In [240]:
#reduced_df.to_csv('new.csv', sep='\t', index=False, header=False)
#pd.DataFrame(index_ref).to_csv('index.csv', index=False, header=False)

# Clustering w/ k-means 

In [241]:
num_clusters = 12

km = KMeans(n_clusters=num_clusters, init="k-means++", max_iter=100)

%time km.fit(reduced_df.values)

CPU times: user 314 ms, sys: 110 ms, total: 424 ms
Wall time: 215 ms


KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=100,
    n_clusters=12, n_init=10, n_jobs=None, precompute_distances='auto',
    random_state=None, tol=0.0001, verbose=0)

In [242]:
output = reduced_df
output['cluster'] = kmeans_model.labels_
print(output['cluster'].value_counts())
output['essay'] = tokenized_essays.values()
output['title'] = tokenized_essays.keys()

0     597
6     245
3     196
9     188
11     78
10     58
4      57
5      50
2      50
7      27
8      19
1       8
Name: cluster, dtype: int64


# Save model/Load from pickle

In [243]:
joblib.dump(kmeans_model,'doc_cluster.pkl')

#km = joblib.load('doc_cluster.pkl')
clusters = kmeans_model.labels_.tolist()

# Top terms per cluster

In [245]:
for i in range(num_clusters) :
    print("\n\n")
    context = [' '.join(tokens) for tokens in list(output[output.cluster == i].essay)]
    
    m1 =TfidfVectorizer(max_features=30)
    m1.fit(context)
    print("Topic #{} : {}".format(i , " , ".join(term for term in m1.get_feature_names() if term not in combined_stopwords)))    





Topic #0 : call , case , cell , come , court , crime , day , even , give , inmate , know , law , life , make , many , need , people , right , see , sentence , state , system , take , time , well , work , write , year



Topic #1 : appear , become , black , body , come , consciousness , even , experience , inside , life , make , male , member , men , mind , move , new , people , reflection , see , seem , social , society , state , system , take , think , time , way , world



Topic #2 : back , come , day , even , give , god , good , know , life , look , love , make , mother , much , never , people , place , say , see , still , take , think , time , want , way , well , year



Topic #3 : back , become , change , come , day , even , family , give , help , know , life , look , love , make , man , many , much , need , never , people , say , see , take , think , time , want , way , well , world , year



Topic #4 : behavior , change , crime , criminal , experience , inmate , law , life , 

# Visualize cluster (2D)

In [227]:
#import random

#%matplotlib inline
#plt.figure

#cluster_colors = []
#for i in range(num_clusters):
#    r = lambda: random.randint(0,255)
#    cluster_colors.append('#%02X%02X%02X' % (r(),r(),r()))

#color = [i for i in cluster_colors]
#plt.scatter(datapoint[:, 0], datapoint[:, 1])
#centroids = kmeans_model.cluster_centers_
#centroidpoint = pca.transform(centroids)
#plt.scatter(centroidpoint[:, 0], centroidpoint[:, 1], marker="^", s=150, c="#000000")
#plt.show()