In [1]:
import chardet
import gensim
import logging
import string
import numpy as np
import pandas as pd
import nltk
import re
import os
import codecs
from sklearn import feature_extraction

from collections import Counter
from itertools import cycle
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

### Specify paths

In [2]:
root = os.path.dirname(os.path.realpath('__file__'))
essay_path = root + '/../Larson_Project/essays/'


### Load all essays into hash table

In [3]:
files = os.listdir(essay_path)

essays = {}
count = 0
for file in files:
    # attempt to confidently guess encoding; otherwise, default to ISO-8859-1
    encoding = "ISO-8859-1"
    guess = chardet.detect(open(essay_path + file, "rb").read())
    if (guess["confidence"] >= 0.95):
        encoding = guess["encoding"]
    
    with open(essay_path + file, "r", encoding=encoding) as f:
        essays[file] = f.read()
        
    count += 1


### Setup logging for Gensim

In [4]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)


## CLEANING DATA

### Preprocess text into lowercase tokens

In [5]:
tokenized_essays = {label: gensim.utils.simple_preprocess(corpus, deacc=True, min_len=2, max_len=15) for (label, corpus) in essays.items()}


### Lemmatize tokens

In [7]:
def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": nltk.corpus.wordnet.ADJ,
                "N": nltk.corpus.wordnet.NOUN,
                "V": nltk.corpus.wordnet.VERB,
                "R": nltk.corpus.wordnet.ADV}

    return tag_dict.get(tag, nltk.corpus.wordnet.NOUN)

lemmatizer = nltk.stem.WordNetLemmatizer()
tokenized_essays = {label: [lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in token_lst if w not in string.punctuation] for (label, token_lst) in tokenized_essays.items()}
tokenized_essays_list = list(tokenized_essays.values())

### Remove stopwords

In [None]:
english_stopwords = nltk.corpus.stopwords.words('english')
custom_stopwords = [
        "prison",
        "prisoner",
        "jail",
        "also",
        "said",
        "would",
        "could",
        "should",
        "first",
        "like",
        "get",
        "going",
        "thing",
        "something",
        "use",
        "get",
        "go",
        "one"
    ]
tokenized_essays = {label: [w for w in token_lst if w not in english_stopwords and w not in custom_stopwords] for (label, token_lst) in tokenized_essays.items()}

In [8]:
# Create a dictionary from ‘processed_docs’ 
# containing the number of times a word appears in the training set.
dictionary = gensim.corpora.Dictionary(tokenized_essays_list)


2019-10-21 20:49:51,603 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2019-10-21 20:49:53,336 : INFO : built Dictionary(57330 unique tokens: ['aggressive', 'air', 'anger', 'another', 'anyone']...) from 1573 documents (total 1194020 corpus positions)


In [9]:
count = 0
for k, v in dictionary.iteritems():
    print(k, v)
    count += 1
    if count > 10:
        break


0 aggressive
1 air
2 anger
3 another
4 anyone
5 anything
6 attack
7 august
8 away
9 bad
10 battlefield


In [10]:
# Filter out tokens that appear in
# less than 15 documents (absolute number) or
# more than 0.5 documents (fraction of total corpus size, not absolute number).
# after the above two steps, keep only the first 100000 most frequent tokens.
dictionary.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)


2019-10-21 20:49:53,502 : INFO : discarding 51271 tokens: [('another', 856), ('battlefield', 10), ('bile', 4), ('blight', 11), ('bludgeon', 4), ('caustic', 3), ('cheapen', 1), ('coarseness', 1), ('come', 1071), ('defective', 12)]...
2019-10-21 20:49:53,503 : INFO : keeping 6059 tokens which were in no less than 15 and no more than 786 (=50.0%) documents
2019-10-21 20:49:53,539 : INFO : resulting dictionary: Dictionary(6059 unique tokens: ['aggressive', 'air', 'anger', 'anyone', 'anything']...)


In [11]:
# For each document, create a dictionary reporting how many
# words and how many times those words appear. Save this to ‘bow_corpus’, then check our selected document earlier.
bow_corpus = [dictionary.doc2bow(doc) for doc in tokenized_essays_list]
bow_corpus[4310]



[(6, 1),
 (18, 1),
 (22, 2),
 (24, 3),
 (31, 3),
 (60, 2),
 (65, 1),
 (71, 4),
 (83, 1),
 (85, 2),
 (91, 3),
 (103, 1),
 (107, 2),
 (108, 3),
 (112, 1),
 (119, 1),
 (122, 1),
 (149, 2),
 (152, 1),
 (163, 2),
 (174, 1),
 (186, 1),
 (194, 2),
 (202, 2),
 (210, 1),
 (227, 2),
 (241, 1),
 (247, 1),
 (248, 2),
 (256, 3),
 (257, 1),
 (264, 1),
 (265, 1),
 (266, 1),
 (271, 1),
 (275, 7),
 (277, 1),
 (280, 2),
 (288, 8),
 (289, 1),
 (291, 1),
 (298, 1),
 (307, 5),
 (325, 2),
 (333, 1),
 (334, 1),
 (336, 5),
 (337, 1),
 (338, 1),
 (347, 1),
 (348, 1),
 (351, 1),
 (353, 2),
 (367, 5),
 (371, 1),
 (375, 6),
 (377, 8),
 (383, 1),
 (386, 28),
 (391, 2),
 (409, 1),
 (411, 1),
 (415, 1),
 (424, 1),
 (434, 1),
 (437, 1),
 (439, 8),
 (453, 1),
 (460, 1),
 (464, 2),
 (470, 4),
 (471, 2),
 (493, 1),
 (505, 2),
 (517, 1),
 (530, 1),
 (531, 7),
 (533, 1),
 (544, 2),
 (552, 2),
 (556, 1),
 (561, 1),
 (565, 3),
 (570, 8),
 (583, 1),
 (601, 1),
 (604, 1),
 (613, 1),
 (615, 1),
 (626, 3),
 (631, 1),
 (658, 1),

In [12]:
# Preview Bag Of Words for our sample preprocessed document.
bow_doc_4310 = bow_corpus[4310]

for i in range(len(bow_doc_4310)):
    print("Word {} (\"{}\") appears {} time.".format(bow_doc_4310[i][0], 
                                                     dictionary[bow_doc_4310[i][0]], 
                                                     bow_doc_4310[i][1]))



Word 6 ("august") appears 1 time.
Word 18 ("cannot") appears 1 time.
Word 22 ("citizen") appears 2 time.
Word 24 ("community") appears 3 time.
Word 31 ("criminal") appears 3 time.
Word 60 ("good") appears 2 time.
Word 65 ("help") appears 1 time.
Word 71 ("incarceration") appears 4 time.
Word 83 ("love") appears 1 time.
Word 85 ("mass") appears 2 time.
Word 91 ("must") appears 3 time.
Word 103 ("prison") appears 1 time.
Word 107 ("remain") appears 2 time.
Word 108 ("restore") appears 3 time.
Word 112 ("robert") appears 1 time.
Word 119 ("shock") appears 1 time.
Word 122 ("side") appears 1 time.
Word 149 ("accord") appears 2 time.
Word 152 ("actual") appears 1 time.
Word 163 ("american") appears 2 time.
Word 174 ("bill") appears 1 time.
Word 186 ("change") appears 1 time.
Word 194 ("consider") appears 2 time.
Word 202 ("create") appears 2 time.
Word 210 ("democrat") appears 1 time.
Word 227 ("education") appears 2 time.
Word 241 ("face") appears 1 time.
Word 247 ("finance") appears 1 tim

### TF-IDF

In [13]:
# Create tf-idf model object using models.TfidfModel on ‘bow_corpus’ and save it to ‘tfidf’, 
# then apply transformation to the entire corpus and call it ‘corpus_tfidf’. 
# Finally we preview TF-IDF scores for our first document.
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)


2019-10-21 20:49:54,378 : INFO : collecting document frequencies
2019-10-21 20:49:54,383 : INFO : PROGRESS: processing document #0
2019-10-21 20:49:54,546 : INFO : calculating IDF weights for 1573 documents and 6059 features (508322 matrix non-zeros)


In [14]:
corpus_tfidf = tfidf[bow_corpus]

In [15]:
from pprint import pprint

for doc in corpus_tfidf:
    pprint(doc)
    break


[(0, 0.09379295575241038),
 (1, 0.06161466199713889),
 (2, 0.060857478466811464),
 (3, 0.03628722174097065),
 (4, 0.0349662870830923),
 (5, 0.06384579965159101),
 (6, 0.06989847196027979),
 (7, 0.09899251907681905),
 (8, 0.02314700881373919),
 (9, 0.020260837005040288),
 (10, 0.05444269060802119),
 (11, 0.11959175426347554),
 (12, 0.08910184732085523),
 (13, 0.06792813246358596),
 (14, 0.09570511174292377),
 (15, 0.07862612145798246),
 (16, 0.12581742367884752),
 (17, 0.08990052541608053),
 (18, 0.08056189427860502),
 (19, 0.057467045277238185),
 (20, 0.1337442716118708),
 (21, 0.04969328230319575),
 (22, 0.04473755288291632),
 (23, 0.04226574003633351),
 (24, 0.03828457317206136),
 (25, 0.07549992208583974),
 (26, 0.04791409055319777),
 (27, 0.0592578292891936),
 (28, 0.07862612145798246),
 (29, 0.042344844364168834),
 (30, 0.08472113208481645),
 (31, 0.028783224145832013),
 (32, 0.11446455419041415),
 (33, 0.06485688113206525),
 (34, 0.09072170442088967),
 (35, 0.06384579965159101),


### Running LDA using Bag of Words

In [16]:
# Train our lda model using gensim.models.LdaMulticore and save it to ‘lda_model’
lda_model = gensim.models.LdaMulticore(bow_corpus, num_topics=10, id2word=dictionary, passes=2, workers=2)


2019-10-21 20:49:54,625 : INFO : using symmetric alpha at 0.1
2019-10-21 20:49:54,634 : INFO : using symmetric eta at 0.1
2019-10-21 20:49:54,636 : INFO : using serial LDA version on this node
2019-10-21 20:49:54,646 : INFO : running online LDA training, 10 topics, 2 passes over the supplied corpus of 1573 documents, updating every 4000 documents, evaluating every ~1573 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-21 20:49:54,656 : INFO : training LDA model using 2 processes
2019-10-21 20:49:54,714 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1573/1573, outstanding queue size 1
2019-10-21 20:49:59,124 : INFO : topic #8 (0.100): 0.004*"inmate" + 0.004*"law" + 0.003*"cell" + 0.003*"court" + 0.003*"sentence" + 0.003*"become" + 0.003*"officer" + 0.003*"write" + 0.003*"love" + 0.002*"new"
2019-10-21 20:49:59,125 : INFO : topic #3 (0.100): 0.004*"world" + 0.004*"society" + 0.004*"family" + 0.003*"write" + 0.003*"cell" + 0.003*"sentence" + 0.0

In [17]:
# For each topic, we will explore the words occuring in that topic and its relative weight.
for idx, topic in lda_model.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx, topic))



2019-10-21 20:50:11,524 : INFO : topic #0 (0.100): 0.009*"inmate" + 0.005*"program" + 0.003*"world" + 0.003*"god" + 0.003*"change" + 0.003*"society" + 0.003*"prison" + 0.003*"help" + 0.003*"good" + 0.003*"crime"
2019-10-21 20:50:11,528 : INFO : topic #1 (0.100): 0.008*"inmate" + 0.007*"black" + 0.004*"men" + 0.003*"new" + 0.003*"society" + 0.003*"world" + 0.003*"cell" + 0.003*"sentence" + 0.003*"seem" + 0.003*"law"
2019-10-21 20:50:11,531 : INFO : topic #2 (0.100): 0.009*"inmate" + 0.004*"write" + 0.004*"officer" + 0.004*"law" + 0.003*"staff" + 0.003*"program" + 0.003*"become" + 0.003*"think" + 0.003*"good" + 0.003*"court"
2019-10-21 20:50:11,533 : INFO : topic #3 (0.100): 0.004*"world" + 0.004*"family" + 0.004*"cell" + 0.004*"society" + 0.004*"become" + 0.004*"write" + 0.004*"feel" + 0.003*"think" + 0.003*"mind" + 0.003*"thought"
2019-10-21 20:50:11,536 : INFO : topic #4 (0.100): 0.006*"law" + 0.005*"sentence" + 0.005*"court" + 0.004*"criminal" + 0.003*"case" + 0.003*"crime" + 0.003*"

Topic: 0 
Words: 0.009*"inmate" + 0.005*"program" + 0.003*"world" + 0.003*"god" + 0.003*"change" + 0.003*"society" + 0.003*"prison" + 0.003*"help" + 0.003*"good" + 0.003*"crime"
Topic: 1 
Words: 0.008*"inmate" + 0.007*"black" + 0.004*"men" + 0.003*"new" + 0.003*"society" + 0.003*"world" + 0.003*"cell" + 0.003*"sentence" + 0.003*"seem" + 0.003*"law"
Topic: 2 
Words: 0.009*"inmate" + 0.004*"write" + 0.004*"officer" + 0.004*"law" + 0.003*"staff" + 0.003*"program" + 0.003*"become" + 0.003*"think" + 0.003*"good" + 0.003*"court"
Topic: 3 
Words: 0.004*"world" + 0.004*"family" + 0.004*"cell" + 0.004*"society" + 0.004*"become" + 0.004*"write" + 0.004*"feel" + 0.003*"think" + 0.003*"mind" + 0.003*"thought"
Topic: 4 
Words: 0.006*"law" + 0.005*"sentence" + 0.005*"court" + 0.004*"criminal" + 0.003*"case" + 0.003*"crime" + 0.003*"person" + 0.003*"new" + 0.003*"government" + 0.003*"release"
Topic: 5 
Words: 0.004*"black" + 0.003*"become" + 0.003*"must" + 0.003*"may" + 0.003*"man" + 0.003*"cell" + 0

### Running LDA using TF-IDF

In [18]:
lda_model_tfidf = gensim.models.LdaMulticore(corpus_tfidf, num_topics=10, id2word=dictionary, passes=2, workers=4)


2019-10-21 20:50:11,564 : INFO : using symmetric alpha at 0.1
2019-10-21 20:50:11,567 : INFO : using symmetric eta at 0.1
2019-10-21 20:50:11,571 : INFO : using serial LDA version on this node
2019-10-21 20:50:11,592 : INFO : running online LDA training, 10 topics, 2 passes over the supplied corpus of 1573 documents, updating every 8000 documents, evaluating every ~1573 documents, iterating 50x with a convergence threshold of 0.001000
2019-10-21 20:50:11,604 : INFO : training LDA model using 4 processes
2019-10-21 20:50:14,476 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #1573/1573, outstanding queue size 1
2019-10-21 20:50:19,478 : INFO : topic #7 (0.100): 0.002*"inmate" + 0.001*"death" + 0.001*"parole" + 0.001*"officer" + 0.001*"cell" + 0.001*"love" + 0.001*"prison" + 0.001*"sentence" + 0.001*"guard" + 0.001*"woman"
2019-10-21 20:50:19,480 : INFO : topic #5 (0.100): 0.001*"inmate" + 0.001*"cell" + 0.001*"society" + 0.001*"program" + 0.001*"sentence" + 0.001*"relea

In [19]:
for idx, topic in lda_model_tfidf.print_topics(-1):
    print('Topic: {} Word: {}'.format(idx, topic))


2019-10-21 20:50:33,952 : INFO : topic #0 (0.100): 0.001*"law" + 0.001*"american" + 0.001*"police" + 0.001*"inmate" + 0.001*"crime" + 0.001*"criminal" + 0.001*"political" + 0.001*"carolina" + 0.001*"consciousness" + 0.001*"african"
2019-10-21 20:50:33,958 : INFO : topic #1 (0.100): 0.001*"inmate" + 0.001*"pennsylvania" + 0.001*"god" + 0.001*"violent" + 0.001*"book" + 0.001*"te" + 0.001*"guard" + 0.001*"remember" + 0.001*"page" + 0.001*"john"
2019-10-21 20:50:33,961 : INFO : topic #2 (0.100): 0.002*"offender" + 0.002*"inmate" + 0.001*"sentence" + 0.001*"society" + 0.001*"crime" + 0.001*"program" + 0.001*"parole" + 0.001*"family" + 0.001*"incarcerate" + 0.001*"law"
2019-10-21 20:50:33,964 : INFO : topic #3 (0.100): 0.001*"inmate" + 0.001*"woman" + 0.001*"california" + 0.001*"program" + 0.001*"write" + 0.001*"god" + 0.001*"parole" + 0.001*"help" + 0.001*"guy" + 0.001*"cell"
2019-10-21 20:50:33,968 : INFO : topic #4 (0.100): 0.001*"inmate" + 0.001*"men" + 0.001*"white" + 0.001*"officer" + 

Topic: 0 Word: 0.001*"law" + 0.001*"american" + 0.001*"police" + 0.001*"inmate" + 0.001*"crime" + 0.001*"criminal" + 0.001*"political" + 0.001*"carolina" + 0.001*"consciousness" + 0.001*"african"
Topic: 1 Word: 0.001*"inmate" + 0.001*"pennsylvania" + 0.001*"god" + 0.001*"violent" + 0.001*"book" + 0.001*"te" + 0.001*"guard" + 0.001*"remember" + 0.001*"page" + 0.001*"john"
Topic: 2 Word: 0.002*"offender" + 0.002*"inmate" + 0.001*"sentence" + 0.001*"society" + 0.001*"crime" + 0.001*"program" + 0.001*"parole" + 0.001*"family" + 0.001*"incarcerate" + 0.001*"law"
Topic: 3 Word: 0.001*"inmate" + 0.001*"woman" + 0.001*"california" + 0.001*"program" + 0.001*"write" + 0.001*"god" + 0.001*"parole" + 0.001*"help" + 0.001*"guy" + 0.001*"cell"
Topic: 4 Word: 0.001*"inmate" + 0.001*"men" + 0.001*"white" + 0.001*"officer" + 0.001*"move" + 0.001*"start" + 0.001*"drug" + 0.001*"visit" + 0.001*"eye" + 0.001*"write"
Topic: 5 Word: 0.001*"cell" + 0.001*"society" + 0.001*"release" + 0.001*"inmate" + 0.001*"

#### Classification of the topics

### Performance evaluation by classifying sample document using LDA Bag of Words model

In [20]:
# We will check where our test document would be classified.
tokenized_essays_list[4310]

['non',
 'review',
 'parole',
 'reinstatement',
 'parole',
 'review',
 'commission',
 'conscious',
 'prisoner',
 'conscious',
 'prisoner',
 'evolution',
 'uhuru',
 'non',
 'review',
 'parole',
 'reinstatement',
 'parole',
 'review',
 'commission',
 'ﬁt',
 'monday',
 'mar',
 'post',
 'bobdylan',
 'writing',
 'uhuru',
 'leave',
 'comment',
 'tag',
 'law',
 'abolition',
 'parole',
 'executive',
 'order',
 'governor',
 'terry',
 'mcauliffe',
 'parole',
 'republican',
 'truth',
 'sentence',
 'law',
 'uhuru',
 'rowe',
 'virginia',
 'virginia',
 'parole',
 'review',
 'commision',
 'virginia',
 'sentence',
 'commission',
 'uhuru',
 'rowe',
 'february',
 'commission',
 'speciﬁcally',
 'address',
 'reinstate',
 'discretionary',
 'parole',
 'due',
 'limited',
 'time',
 'commission',
 'interrelate',
 'complex',
 'issue',
 'include',
 'incomplete',
 'data',
 'make',
 'issue',
 'difﬁcult',
 'address',
 'time',
 'period',
 'allow',
 'governor',
 'terry',
 'mcauliffe',
 'reveal',
 'commission',
 'paro

In [21]:
for index, score in sorted(lda_model[bow_corpus[20]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model.print_topic(index, 10)))



Score: 0.9988597631454468	 
Topic: 0.006*"law" + 0.005*"sentence" + 0.005*"court" + 0.004*"criminal" + 0.003*"case" + 0.003*"crime" + 0.003*"person" + 0.003*"new" + 0.003*"government" + 0.003*"release"


### Performance evaluation by classifying sample document using LDA TF-IDF model

In [22]:
for index, score in sorted(lda_model_tfidf[bow_corpus[4310]], key=lambda tup: -1*tup[1]):
    print("\nScore: {}\t \nTopic: {}".format(score, lda_model_tfidf.print_topic(index, 10)))




Score: 0.9076688885688782	 
Topic: 0.003*"inmate" + 0.001*"cell" + 0.001*"officer" + 0.001*"gang" + 0.001*"guard" + 0.001*"program" + 0.001*"parole" + 0.001*"sentence" + 0.001*"death" + 0.001*"staff"

Score: 0.09131407737731934	 
Topic: 0.001*"law" + 0.001*"american" + 0.001*"police" + 0.001*"inmate" + 0.001*"crime" + 0.001*"criminal" + 0.001*"political" + 0.001*"carolina" + 0.001*"consciousness" + 0.001*"african"


### Word2Vec

In [24]:
vector_dim = 100
model = gensim.models.Word2Vec(tokenized_essays.values(), size=vector_dim)


2019-10-21 20:50:34,223 : INFO : collecting all words and their counts
2019-10-21 20:50:34,225 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2019-10-21 20:50:34,600 : INFO : collected 57330 word types from a corpus of 1194020 raw words and 1573 sentences
2019-10-21 20:50:34,601 : INFO : Loading a fresh vocabulary
2019-10-21 20:50:34,824 : INFO : effective_min_count=5 retains 13796 unique words (24% of original 57330, drops 43534)
2019-10-21 20:50:34,825 : INFO : effective_min_count=5 leaves 1132837 word corpus (94% of original 1194020, drops 61183)
2019-10-21 20:50:34,905 : INFO : deleting the raw counts dictionary of 57330 items
2019-10-21 20:50:34,907 : INFO : sample=0.001 downsamples 27 most-common words
2019-10-21 20:50:34,909 : INFO : downsampling leaves estimated 1100373 word corpus (97.1% of prior 1132837)
2019-10-21 20:50:34,962 : INFO : estimated required memory for 13796 words and 100 dimensions: 17934800 bytes
2019-10-21 20:50:34,963 : INFO : res

### Guide for saving / loading word embedding spaces

In [25]:
# model.save(root + "/mymodel.space")
# model = gensim.models.Word2Vec.load(root + "/mymodel.space")


### Experiment with most_similar terms

In [26]:
model.wv.most_similar(positive="neglect")


2019-10-21 20:50:44,731 : INFO : precomputing L2-norms of word weight vectors


[('mistreatment', 0.9414080381393433),
 ('abusive', 0.9273565411567688),
 ('suffers', 0.9173864126205444),
 ('degrade', 0.9134754538536072),
 ('foster', 0.909307599067688),
 ('homelessness', 0.9036213159561157),
 ('diagnosis', 0.8962069749832153),
 ('unnecessary', 0.8932418823242188),
 ('symptom', 0.8928334712982178),
 ('indifference', 0.8922271132469177)]

### Total number of words in our vocabulary

In [27]:
len(model.wv.vocab)


13796

### Convert tokens to their respective vectors and linearly combine to make single essay:vector representations

In [28]:
vectorized_essays = {label: np.sum(np.array([model.wv.word_vec(token) for token in token_lst if token in model.wv.vocab]), axis=0) for (label, token_lst) in tokenized_essays.items()}


In [29]:
# make it a dataframe and create index reference
vectorized_df = pd.DataFrame.from_dict(vectorized_essays, orient='index')
index_ref = vectorized_df.index


### Feature scaling through standardization

In [30]:
stdsclr = StandardScaler()
standardized_df = pd.DataFrame(stdsclr.fit_transform(vectorized_df), index=index_ref)


### Principle component analysis

In [31]:
pca = PCA(n_components=3)
reduced_df = pd.DataFrame(pca.fit_transform(standardized_df), index=index_ref)


### Guide for output to visualize effectiveness of vectors

In [32]:
reduced_df.to_csv('new.csv', sep='\t', index=False, header=False)
pd.DataFrame(index_ref).to_csv('index.csv', index=False, header=False)


In [33]:
# # Import the wordcloud library
# from wordcloud import WordCloud

# # Join the different processed titles together.
# long_string = ','.join(texts)

# # Create a WordCloud object
# wordcloud = WordCloud(background_color="white", max_words=1000, contour_width=3, contour_color='steelblue')

# # Generate a word cloud
# wordcloud.generate(long_string)

# # Visualize the word cloud
# wordcloud.to_image()

In [34]:
# # Load the library with the CountVectorizer method
# from sklearn.feature_extraction.text import CountVectorizer
# import numpy as np

# import matplotlib.pyplot as plt
# import seaborn as sns
# sns.set_style('whitegrid')
# %matplotlib inline

# # Helper function
# def plot_10_most_common_words(count_data, count_vectorizer):
#     import matplotlib.pyplot as plt
#     words = count_vectorizer.get_feature_names()
#     total_counts = np.zeros(len(words))
#     for t in count_data:
#         total_counts+=t.toarray()[0]
    
#     count_dict = (zip(words, total_counts))
#     count_dict = sorted(count_dict, key=lambda x:x[1], reverse=True)[0:10]
#     words = [w[0] for w in count_dict]
#     counts = [w[1] for w in count_dict]
#     x_pos = np.arange(len(words)) 
    
#     plt.figure(2, figsize=(15, 15/1.6180))
#     plt.subplot(title='10 most common words')
#     sns.set_context("notebook", font_scale=1.25, rc={"lines.linewidth": 2.5})
#     sns.barplot(x_pos, counts, palette='husl')
#     plt.xticks(x_pos, words, rotation=90) 
#     plt.xlabel('words')
#     plt.ylabel('counts')
#     plt.show()

# file1 = open("desktop/Larson_Project/APWATranscriptions-WC2.txt","r")

# # Initialise the count vectorizer with the English stop words
# count_vectorizer = CountVectorizer(stop_words='english')

# # Fit and transform the processed titles
# count_data = count_vectorizer.fit_transform(file1)

# # Visualise the 10 most common words
# plot_10_most_common_words(count_data, count_vectorizer)

In [35]:
# import warnings
# warnings.simplefilter("ignore", DeprecationWarning)

# # Load the LDA model from sk-learn
# from sklearn.decomposition import LatentDirichletAllocation as LDA
 
# # Helper function
# def print_topics(model, count_vectorizer, n_top_words):
#     words = count_vectorizer.get_feature_names()
#     for topic_idx, topic in enumerate(model.components_):
#         print("\nTopic #%d:" % topic_idx)
#         print(" ".join([words[i]
#                         for i in topic.argsort()[:-n_top_words - 1:-1]]))
        
# # Tweak the two parameters below (use int values below 15)
# number_topics = 5
# number_words = 10

# # Create and fit the LDA model
# lda = LDA(n_components=number_topics)
# lda.fit(count_data)

# # Print the topics found by the LDA model
# print("Topics found via LDA:")
# print_topics(lda, count_vectorizer, number_words)