# APWA - TOPICAL ANALYSIS

## LOGISTICS AND DATA INPUT

### Import necessary libraries and dependencies

In [None]:
import chardet
import csv
import gensim
import logging
import nltk
import os
import pickle
import string

%matplotlib notebook
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from collections import Counter
from itertools import cycle
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from sklearn.feature_extraction.text import TfidfVectorizer


### Specify paths

In [None]:
# NOTE: '__file__' is a quoted string here, not the module variable, so it is
# resolved against the current working directory; `root` is therefore the
# directory the notebook is being run from.
root = os.path.split(os.path.realpath('__file__'))[0]

# Essays are read from the "essays" directory that sits beside `root`.
essay_path = root + '/../essays/'


### Load all essays into hash table

In [None]:
# Read every file in the essay directory into `essays`, keyed by file name.
files = os.listdir(essay_path)

essays = {}
for file in files:
    # Directory entries that are not regular files cannot be read as essays.
    if not os.path.isfile(essay_path + file):
        continue

    # Sniff the encoding from the raw bytes. The context manager closes the
    # handle right away instead of leaving it open until garbage collection.
    with open(essay_path + file, "rb") as raw:
        guess = chardet.detect(raw.read())

    # attempt to confidently guess encoding; otherwise, default to ISO-8859-1
    encoding = "ISO-8859-1"
    if guess["encoding"] and guess["confidence"] >= 0.95:
        encoding = guess["encoding"]

    with open(essay_path + file, "r", encoding=encoding) as f:
        essays[file] = f.read()


### Setup logging for Gensim

In [None]:
# Configure the root logger to show INFO-and-above messages, each prefixed
# with a timestamp and its level.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)


## CLEANING DATA

### Preprocess text into lowercase tokens

In [None]:
# Run each essay through gensim's simple_preprocess (accents stripped via
# deacc=True), keeping only tokens that are 2 to 15 characters long.
tokenized_essays = dict(
    (name, gensim.utils.simple_preprocess(text, deacc=True, min_len=2, max_len=15))
    for name, text in essays.items()
)


### Lemmatize tokens

In [None]:
import functools


@functools.lru_cache(maxsize=None)
def get_wordnet_pos(word):
    """Map a word's POS tag to the WordNet constant lemmatize() accepts.

    The word is tagged on its own, as a one-element sequence, and the first
    letter of the resulting tag selects adjective, noun, verb or adverb.
    Any other tag falls back to noun.

    Results are memoized: the answer depends only on ``word``, and the
    function is called once per token occurrence, so repeated words would
    otherwise be re-tagged every time. The cache is unbounded, but it holds
    at most one entry per distinct word it is asked about.
    """
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

# Replace every token with its WordNet lemma, choosing the lemma form from
# the token's own POS tag; tokens found in string.punctuation are dropped.
lemmatizer = nltk.stem.WordNetLemmatizer()
tokenized_essays = {
    name: [
        lemmatizer.lemmatize(tok, get_wordnet_pos(tok))
        for tok in tokens
        if tok not in string.punctuation
    ]
    for name, tokens in tokenized_essays.items()
}


### Remove stopwords

In [None]:
# NLTK's English stopword list plus corpus-specific words that would
# otherwise dominate every essay.
english_stopwords = nltk.corpus.stopwords.words('english')
custom_stopwords = [
        "prison",
        "prisoner",
        "jail",
        "also",
        "said",
        "would",
        "could",
        "should",
        "first",
        "like",
        "get",
        "going",
        "thing",
        "something",
        "use",
        "go",
        "one"
    ]

# Merge both lists into one set so the per-token membership test is a single
# hash lookup instead of a scan over two lists.
all_stopwords = frozenset(english_stopwords).union(custom_stopwords)
tokenized_essays = {
    label: [w for w in token_lst if w not in all_stopwords]
    for (label, token_lst) in tokenized_essays.items()
}


In [None]:
# tokenized_essays['apw_173.txt']

### Word2Vec

In [None]:
# Train a Word2Vec embedding on the tokenized essays (one essay = one
# "sentence"); `vector_dim` is the size passed for each word vector.
vector_dim = 100
model = gensim.models.Word2Vec(
    sentences=tokenized_essays.values(),
    size=vector_dim,
)


### Guide for saving / loading word embedding spaces

In [None]:
# model.save(root + "/mymodel.space")
# model = gensim.models.Word2Vec.load(root + "/mymodel.space")


### Experiment with most_similar terms

In [None]:
# Query the embedding for the terms most similar to "neglect"; as the last
# expression in the cell, the result is displayed by the notebook.
model.wv.most_similar(positive="neglect")


### Total number of words in our vocabulary

In [None]:
# Number of distinct words held in the trained model's vocabulary.
len(model.wv.vocab)


### Convert tokens to their respective vectors and linearly combine to make single essay:vector representations

In [None]:
def _essay_vector(tokens, keyed_vectors):
    """Return the sum of the word vectors of the in-vocabulary *tokens*.

    When none of the tokens is in the vocabulary, a zero vector of length
    ``vector_dim`` is returned. Summing an empty array would instead yield
    the scalar 0.0, which does not have the same shape as the other essays'
    vectors.
    """
    vectors = [keyed_vectors.word_vec(token) for token in tokens
               if token in keyed_vectors.vocab]
    if not vectors:
        return np.zeros(vector_dim)
    return np.sum(np.array(vectors), axis=0)


# One vector per essay: the sum of the vectors of its known tokens.
vectorized_essays = {label: _essay_vector(token_lst, model.wv)
                     for (label, token_lst) in tokenized_essays.items()}


In [None]:
# Tabulate the essay vectors with one row per essay (row label = dict key),
# and keep the row labels so later frames can reuse the same index.
vectorized_df = pd.DataFrame.from_dict(data=vectorized_essays, orient='index')
index_ref = vectorized_df.index


### Feature scaling through standardization

In [None]:
# Standardize each embedding dimension, then re-attach the essay labels that
# the scaler's plain array output drops.
stdsclr = StandardScaler()
scaled_values = stdsclr.fit_transform(vectorized_df)
standardized_df = pd.DataFrame(data=scaled_values, index=index_ref)


### Principal component analysis

In [None]:
# Project the standardized vectors onto their first three principal
# components, again keeping the essay labels as the index.
pca = PCA(n_components=3)
projected_values = pca.fit_transform(standardized_df)
reduced_df = pd.DataFrame(data=projected_values, index=index_ref)


### Guide for output to visualize effectiveness of vectors

In [None]:
# Write the reduced vectors (tab-separated, no header or row labels) and, in
# the same row order, the essay labels to a second file.
reduced_df.to_csv('new.csv', sep='\t', header=False, index=False)
essay_labels = pd.DataFrame(index_ref)
essay_labels.to_csv('index.csv', header=False, index=False)


### K-Means Clustering

In [None]:
# Number of clusters to partition the essays into.
num_clusters = 12

# NOTE(review): no random_state is passed, so cluster assignments may differ
# between runs; confirm whether reproducibility matters here.
km = KMeans(n_clusters=num_clusters)

# Fit on the PCA-reduced essay vectors; %time is an IPython magic that
# reports how long the call takes.
%time km.fit(reduced_df.values)


### Add cluster value to dataframe of vectors

In [None]:
# Work on a copy: assigning the column to an alias of `reduced_df` would add
# 'cluster' to `reduced_df` itself, so re-running the K-Means cell would then
# fit on the previous labels as if they were a feature.
output = reduced_df.copy()
output['cluster'] = km.labels_


### Tentative themes; need to work with Larson on these

In [None]:
# Maps each tentative theme name to a comma-separated string of keywords.
# Misspelled keywords ('resistence', 'mispercetion') are corrected so they
# can match correctly spelled essay terms.
# NOTE(review): the key 'Personal/Intern Change/Copin' looks truncated; it is
# left unchanged because keys are the theme labels other code may rely on.
theme_index = {
    'Autobiographical, Paths to Prison': 'socioeconomic, parent, damage, abuse, gang, alcohol, drug',

    'Family': 'family, abandonment, relation, visit, partner, mother, father, sibling',

    'Physical Conditions and Security': 'physical, condition, security, search, censorship, food, cold, hygiene, heat, misfunction, infestation, solitary',

    'Prison Culture/Community/Society': 'violence, fear, staff, sexual, crime, outcasts, racial, cellmate, gay, LGBTQ, dehumanize, uniform',

    'Staff/prison Abuse of IP': 'abuse, sexual, torture, humiliation, racist, assault, antagonism, exacerbation, right, violation, food, hygiene, environment, legal',

    'Personal/Intern Change/Copin': 'survival, art, reading, writing, peace, faith, prayer, meditation, practice, community, activities, hobbies, cooking, remorse, motivation, education, discipline, coping, adjustment',

    'Judicial Misconduct and Legal Remediation': 'judicial, incompetence, corruption, witness, evidence, excessive, political, jailhouse, lawyer',

    'Political and Intellectual Labor among IP': 'activism, resistance, critique, race, class, change, policies, practices',

    'Prison Industry/Prison as Business': 'labor, slave, condition, safety, health',

    'Education, Re-entry, Other Programs': 'rehabilitation, re-entry, education, indifference',

    'Health Care': 'health, care, negligence, hostility, incompetence, indifference, death, injury, treatment, medication',

    'Social Alienation, Indifference, Hostility': 'public, misperception, identity, stigma',
}


### Select 10 random essays from each cluster

In [None]:
from __future__ import print_function
import random


def _normalize_keyword(keyword):
    """Lowercase and lemmatize a theme keyword the same way essay tokens are."""
    word = keyword.strip().lower()
    return lemmatizer.lemmatize(word, get_wordnet_pos(word))


def _themes_for_terms(terms, keywords_by_theme):
    """Return the set of theme names sharing at least one keyword with *terms*."""
    wanted = set(terms)
    return {theme for theme, keywords in keywords_by_theme.items()
            if wanted & keywords}


# Split each theme's comma-separated keyword string into a set of whole
# words, once, before the loop. Matching whole words avoids the false hits
# of a substring test on the raw string (e.g. "art" inside "partner").
theme_keywords = {
    theme: {_normalize_keyword(kw) for kw in keyword_str.split(',')}
    for theme, keyword_str in theme_index.items()
}

# Kept from the original cell; nothing below reads it.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

print("***** 10 random sample essays per cluster *****")
print()

terms_per_essays = {}
themes_per_essays = {}
for i in range(num_clusters):

    # Sample without replacement so no essay is listed twice; a cluster with
    # fewer than 10 essays contributes all of them.
    members = list(output[output.cluster == i].index)
    ten = random.sample(members, min(10, len(members)))
    print("** Cluster %d: **\n" % i)

    # Pool the tokens of the sampled essays into one flat list.
    context = []
    for title in ten:
        context += tokenized_essays[title]

    # `context` is a flat token list, so the vectorizer sees each token as
    # its own document; max_features caps the vocabulary at 30 terms.
    tfidf_vectorizer = TfidfVectorizer(max_features=30)
    tfidf_vectorizer.fit(context)

    terms_per_essays[i] = tfidf_vectorizer.get_feature_names()

    print("Topic #{} : {}".format(i , terms_per_essays[i]))
    print()

    themes_per_essays[i] = _themes_for_terms(terms_per_essays[i], theme_keywords)

    print("Theme(s): {}".format(themes_per_essays[i]))

    print()
    print(ten)
    print()
    print("************************************************\n")


### Save model / load from pickle

In [None]:
# Persist the fitted K-Means model. The context manager guarantees the file
# is flushed and closed once the dump finishes.
filename = 'word2vec_cluster.pkl'
with open(filename, 'wb') as fh:
    pickle.dump(km, fh)

# To restore it later (only unpickle files you created yourself):
# with open(filename, 'rb') as fh:
#     km = pickle.load(fh)
