In [1]:
import os, codecs
import gensim
from gensim import corpora
from collections import defaultdict
import string
from string import punctuation
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import pandas as pd
import numpy as np

Using TensorFlow backend.


### THESE SHOULD BE ALL THE RELATIVE PATHS

In [2]:
source_dir = '/Users/rachelbrynsvold/dsi/capstone_dir/Capstone/books/clean/'

In [3]:
outputs_dir = '/Users/rachelbrynsvold/dsi/capstone_dir/Capstone/outputs/reduced_corp/'

In [4]:
class IterFile(object):
    '''
    class object to do the iterating on individual book txt documents, including file i/o.
    '''
    
    def __init__(self, filepath):
        self.filepath = filepath
        
    def _open_file(self):
        self.file = codecs.open(self.filepath, 'r', encoding='utf_8')
        
    def _close_file(self):
        self.file.close()
    
    def __iter__(self):
        '''
        overwrite iteration to include file i/o
        '''
        self._open_file()
        
        for line in self.file:
            yield line
        
        self._close_file()
    

In [5]:
def transform_txt_file_v1(fname, root=source_dir):
    '''
    Initial pass at text transformation
    Reimplemented later (v2 etc) as a caller of various subfunctions to do all the transformation
    '''
    fp = root + fname

    book_as_lst = []
    for line in IterFile(fp):
        if line == "\n":
            pass
        else: 
            line_lst= [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
            book_as_lst.extend(line_lst)
            
    #add in stop word removal and frequency threshhold
    return book_as_lst

## EDA Step 0: Test transform function with short book (Fall of the House of Usher)

In [6]:
usher = '932-clean.txt'
print transform_txt_file_v1(usher)

[u'the', u'fall', u'of', u'the', u'house', u'of', u'usher', u'son', u'coeur', u'est', u'un', u'luth', u'suspendu', u'sitot', u"qu'on", u'le', u'touche', u'il', u'resonne', u'de', u'beranger', u'during', u'the', u'whole', u'of', u'a', u'dull', u'dark', u'and', u'soundless', u'day', u'in', u'the', u'autumn', u'of', u'the', u'year', u'when', u'the', u'clouds', u'hung', u'oppressively', u'low', u'in', u'the', u'heavens', u'i', u'had', u'been', u'passing', u'alone', u'on', u'horseback', u'through', u'a', u'singularly', u'dreary', u'tract', u'of', u'country', u'and', u'at', u'length', u'found', u'myself', u'as', u'the', u'shades', u'of', u'the', u'evening', u'drew', u'on', u'within', u'view', u'of', u'the', u'melancholy', u'house', u'of', u'usher', u'i', u'know', u'not', u'how', u'it', u'was--but', u'with', u'the', u'first', u'glimpse', u'of', u'the', u'building', u'a', u'sense', u'of', u'insufferable', u'gloom', u'pervaded', u'my', u'spirit', u'i', u'say', u'insufferable', u'for', u'the', u

Get list of file names by using nltk... this feels a bit like cheating... should probably change this for final code

In [7]:
temp_corp = PlaintextCorpusReader(source_dir, '.*\.txt')
fileid_lst = temp_corp.fileids()[:5]
fileid_lst

['10-clean.txt',
 '100-clean.txt',
 '105-clean.txt',
 '108-clean.txt',
 '1080-clean.txt']

## EDA Step 1: Most basic transformation (tokenization only)

Apply transformation function to all books

In [8]:
all_transf_books_lst = []
for f in fileid_lst:
    all_transf_books_lst.append(transform_txt_file_v1(f))

all_transf_books_lst

[[u'the',
  u'old',
  u'testament',
  u'of',
  u'the',
  u'king',
  u'james',
  u'version',
  u'of',
  u'the',
  u'bible',
  u'the',
  u'first',
  u'book',
  u'of',
  u'moses',
  u'called',
  u'genesis',
  u'1:1',
  u'in',
  u'the',
  u'beginning',
  u'god',
  u'created',
  u'the',
  u'heavens',
  u'and',
  u'the',
  u'earth',
  u'1:2',
  u'and',
  u'the',
  u'earth',
  u'was',
  u'without',
  u'form',
  u'and',
  u'void',
  u'and',
  u'darkness',
  u'was',
  u'upon',
  u'the',
  u'face',
  u'of',
  u'the',
  u'deep',
  u'and',
  u'the',
  u'spirit',
  u'of',
  u'god',
  u'moved',
  u'upon',
  u'the',
  u'face',
  u'of',
  u'the',
  u'waters',
  u'1:3',
  u'and',
  u'god',
  u'said',
  u'let',
  u'there',
  u'be',
  u'light',
  u'and',
  u'there',
  u'was',
  u'light',
  u'1:4',
  u'and',
  u'god',
  u'saw',
  u'the',
  u'light',
  u'that',
  u'it',
  u'was',
  u'good',
  u'and',
  u'god',
  u'divided',
  u'the',
  u'light',
  u'from',
  u'the',
  u'darkness',
  u'1:5',
  u'and',
  u'g

Cool - looks like I've got all the books in there

In [9]:
len(all_transf_books_lst)

5

## EDA Step 1.5: Write code snippets to do all the calculations, print-outs, and obj creation that I'll use for the rest of the EDA 

How long is each book (hom many totoal words?)

In [10]:
book_lengths = []

for tup in zip(fileid_lst, all_transf_books_lst):
    book_lengths.append((tup[0], len(tup[1])))
    
book_lengths

[('10-clean.txt', 821133),
 ('100-clean.txt', 883320),
 ('105-clean.txt', 83286),
 ('108-clean.txt', 112139),
 ('1080-clean.txt', 3410)]

Average number of tokens per book

In [11]:
int(np.mean([len(book) for book in all_transf_books_lst]))

380657

Make a dictionary - a count of the number of times each word appears in each book

In [12]:
dictionary = corpora.Dictionary(all_transf_books_lst)

Confirm: yup, it's got words in it

In [13]:
dictionary[0], dictionary[10], dictionary[1000], dictionary[5000]

(u'', u'mozah', u'tookest', u'89:4')

In [14]:
type(dictionary)

gensim.corpora.dictionary.Dictionary

I'd be interested to see this as pandas dataframe - but this doesn't work

In [15]:
#df = pd.DataFrame(dictionary)
#df.head()

How many words in this dictionary?

In [16]:
len(dictionary)

44167

Do the contents look right?

In [17]:
print(dictionary)

Dictionary(44167 unique tokens: [u'', u'fawn', u'raining', u'nunnery', u'aijalon']...)


Save out the dictionary

In [18]:
filename = 'tokenized.dict'

dictionary.save(outputs_dir + filename)

In [19]:
zip(fileid_lst, [len(dictionary.doc2bow(book)) for book in all_transf_books_lst])

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087)]

Make a corpus!

In [20]:
corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]

Check that it has all the books

In [21]:
len(corpus)

5

View it as a dataframe

In [22]:
df = pd.DataFrame(corpus)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,28909,28910,28911,28912,28913,28914,28915,28916,28917,28918
0,"(0, 3)","(1, 4)","(2, 1)","(3, 1)","(4, 1)","(5, 1)","(6, 1)","(7, 1)","(8, 4)","(9, 328)",...,,,,,,,,,,
1,"(0, 324)","(8, 28)","(9, 148)","(13, 38)","(15, 28)","(18, 28)","(21, 29)","(23, 3059)","(25, 3)","(26, 1)",...,"(39474, 19)","(39475, 3)","(39476, 1)","(39477, 1)","(39478, 5)","(39479, 1)","(39480, 1)","(39481, 1)","(39482, 1)","(39483, 2)"
2,"(0, 1)","(9, 14)","(13, 3)","(18, 5)","(21, 2)","(23, 9)","(32, 1)","(44, 12)","(48, 2)","(57, 3)",...,,,,,,,,,,
3,"(0, 22)","(8, 7)","(9, 15)","(13, 7)","(21, 3)","(23, 41)","(44, 3)","(47, 1)","(48, 2)","(55, 4)",...,,,,,,,,,,
4,"(9, 5)","(23, 1)","(57, 2)","(98, 4)","(132, 1)","(154, 10)","(155, 1)","(205, 5)","(213, 1)","(215, 4)",...,,,,,,,,,,


Why doesn't it have the same number of cols as the length of the dictionary...

In [23]:
#check same process with 1 book - see how dict/corp/df size differ

Number of unique words in each book

In [24]:
unique_toks_num_lst = [len(book) for book in corpus]
unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)

unique_toks_per_fileid

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087)]

Average unique words per book

In [25]:
avg_unique_toks = int(np.mean(unique_toks_num_lst))
avg_unique_toks

12418

Save out the corpus

In [26]:
filename = 'tokenized_corpus.mm'

corpora.MmCorpus.serialize(outputs_dir + filename, corpus)

## EDA and Saving helper funtions

To automate next few EDA rounds

In [27]:
def eda(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Do transformations with updated transformation function and return all the eda items
    '''
    
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]
    
    book_lengths = [(tup[0], len(tup[1])) for tup in zip(fileid_lst, all_transf_books_lst)]
    avg_num_tokens = int(np.mean([len(book) for book in all_transf_books_lst]))
    
    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary_length = len(dictionary)
    
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    
    unique_toks_num_lst = [len(book) for book in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))
    
    
    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus


def save_stuff(distinguishing_str, dictionary, corpus, outputs_dir='/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/outputs/'):
    '''
    Save the outputs of the most recent eda step
    '''
    dictionary.save(outputs_dir + distinguishing_str + '.dict')
    corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)
    

### EDA items
* List of book lengths (total num of tokens for each book)
* Average number of tokens per book
* Number of words in corpus (dictionary length)
    * Dictionary (not viewed)
* Unique tokens per book
* Average number of unique tokens per book
    * Corpus (not viewe)
    
Save everything after eda step


### To summarize the 'simple tokenization' EDA step (#1):

In [28]:
output_v1 = eda(transform_txt_file_v1)

In [29]:
book_lengths1, avg_num_tokens1, dictionary1, dictionary_length1, unique_toks_per_fileid1, \
    avg_unique_toks1, corpus1 = output_v1 

In [31]:
print "Average number of tokens in a book: ", avg_num_tokens1
print "   "
print "Average unique tokens in a book: ", avg_unique_toks1
print "   "
print "Total number of words (dictionary length): ", dictionary_length1

Average number of tokens in a book:  380657
   
Average unique tokens in a book:  12418
   
Total number of words (dictionary length):  44167


In [32]:
##for pres, note the sparcity problem - 9000 vs. 195k == 186k empty

In [33]:
book_lengths1

[('10-clean.txt', 821133),
 ('100-clean.txt', 883320),
 ('105-clean.txt', 83286),
 ('108-clean.txt', 112139),
 ('1080-clean.txt', 3410)]

In [34]:
unique_toks_per_fileid1

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087)]

In [35]:
save_stuff('simple_tok', dictionary1, corpus1)

## EDA Step 2: + stop word removal

In [45]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))
print stop

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelbrynsvold/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u'where', u'few', u'because', u'doing', u'some', u'hasn', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'while', u're', u'does', u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u'were', u'here', u'shouldn', u'hers', u'by', u'on', u'about', u'couldn', u'of', u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u'mightn', u'wasn', u'your', u'from', u'her', u'their', u'aren', u'there', u'been', u'whom', u'too', u'wouldn', u'themselves', u'weren', u'was', u'unt

In [46]:
def transform_txt_file_v2(fname, root=source_dir, stop_words=stop):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Assumes you want to remove empty lines and tokenize (because you do)
    '''
    fp = root + fname
    book_as_lst = []
    for line in IterFile(fp):
        
        if empty_line_check(line) == False:
            line = basic_tokenize(line)
            
            if stop_words !=None:
                line = remove_stop_words(line, stop_words)
        
        book_as_lst.extend(line)
        
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty
    
def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]    
            
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 

In [47]:
output_v2 = eda(transform_txt_file_v2)

In [48]:
book_lengths2, avg_num_tokens2, dictionary2, dictionary_length2, \
    unique_toks_per_fileid2, avg_unique_toks2, corpus2 = output_v2

In [49]:
print "Average number of tokens in a book: ", avg_num_tokens2
print "   "
print "Average unique tokens in a book: ", avg_unique_toks2
print "   "
print "Total number of words (dictionary length): ", dictionary_length2

Average number of tokens in a book:  201597
   
Average unique tokens in a book:  12297
   
Total number of words (dictionary length):  44032


In [115]:
#make graph of reduction

In [50]:
book_lengths2

[('10-clean.txt', 431250),
 ('100-clean.txt', 481608),
 ('105-clean.txt', 39320),
 ('108-clean.txt', 54072),
 ('1080-clean.txt', 1739)]

In [51]:
unique_toks_per_fileid2

[('10-clean.txt', 17208),
 ('100-clean.txt', 28785),
 ('105-clean.txt', 5922),
 ('108-clean.txt', 8578),
 ('1080-clean.txt', 994)]

In [52]:
save_stuff('no_stopwords', dictionary2, corpus2)

## EDA Step 3: + lemmatization

In [53]:
from gensim.utils import lemmatize

In [54]:
def transform_txt_file_v3(fname, root=source_dir, stop_words=stop, \
                         lemma = True):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Assumes you want to remove empty lines and tokenize (because you do)
    '''
    fp = root + fname
    book_as_lst = []
    for line in IterFile(fp):
        
        if empty_line_check(line) == False:
            
            
            if lemma == True:
                line = lemmatize(line)
            
            else:
                line = basic_tokenize(line)
                
                if stop_words !=None:
                    line = remove_stop_words(line, stop_words)
        
            book_as_lst.extend(line)
    
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty

def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
           
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 
    

In [76]:
def transform_txt_file_v3_alt(fname, root=source_dir, stop_words=stop, \
                         lemma = True):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Assumes you want to remove empty lines and tokenize (because you do)
    '''
    fp = root + fname
    book_as_lst = []
    
    if lemma == True:
        for line in IterFile(fp):
            
            if empty_line_check(line) == False:
                if stop_words !=None:
                    line = remove_stop_words(line.split(), stop_words)
                book_as_lst.extend(line)
                
        book_as_lst = lemmatize(" ".join(book_as_lst))
            
    else:
        for line in IterFile(fp):
            if empty_line_check(line) == False:
                line = basic_tokenize(line)
                
            if stop_words !=None:
                line = remove_stop_words(line, stop_words)
        
            book_as_lst.extend(line)
    
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty

def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
           
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 
    

In [74]:
fp = source_dir + usher
book_as_lst = []
    
if True == True:
    for num, line in enumerate(IterFile(fp)):
        print num, ": ", line
        if empty_line_check(line) == False:
            print "line ", num, ": empty line check passed"
            if stop != None:
                line = [tok for tok in line.split() if tok not in stop] 
                print num, ": ", line, " stop words removed"
            book_as_lst.extend(line)
                
    book_as_lst = lemmatize(" ".join(book_as_lst))

0 :  

1 :  

2 :  

3 :  

4 :  

5 :  

6 :  

7 :  

8 :  

9 :  The Fall of the House of Usher

line  9 : empty line check passed
9 :  [u'The', u'Fall', u'House', u'Usher']  stop words removed
10 :  

11 :  

12 :    Son coeur est un luth suspendu;

line  12 : empty line check passed
12 :  [u'Son', u'coeur', u'est', u'un', u'luth', u'suspendu;']  stop words removed
13 :    Sitot qu'on le touche il resonne.

line  13 : empty line check passed
13 :  [u'Sitot', u"qu'on", u'le', u'touche', u'il', u'resonne.']  stop words removed
14 :      DE BERANGER.

line  14 : empty line check passed
14 :  [u'DE', u'BERANGER.']  stop words removed
15 :  

16 :  

17 :  

18 :  During the whole of a dull, dark, and soundless day in the

line  18 : empty line check passed
18 :  [u'During', u'whole', u'dull,', u'dark,', u'soundless', u'day']  stop words removed
19 :  autumn of the year, when the clouds hung oppressively low in the

line  19 : empty line check passed
19 :  [u'autumn', u'year,', u'clouds

In [75]:
book_as_lst

['fall/NN',
 'house/NN',
 'usher/NN',
 'son/NN',
 'coeur/NN',
 'luth/NN',
 'suspendu/NN',
 'sitot/NN',
 'qu/NN',
 'le/NN',
 'touche/NN',
 'il/NN',
 'beranger/JJ',
 'whole/JJ',
 'dull/JJ',
 'dark/NN',
 'soundless/NN',
 'day/NN',
 'autumn/NN',
 'year/NN',
 'cloud/NN',
 'hung/JJ',
 'oppressively/RB',
 'low/JJ',
 'heaven/NN',
 'pass/VB',
 'alone/RB',
 'horseback/NN',
 'singularly/RB',
 'dreary/JJ',
 'tract/NN',
 'country/NN',
 'length/NN',
 'find/VB',
 'shade/NN',
 'evening/NN',
 'draw/VB',
 'view/NN',
 'melancholy/NN',
 'house/NN',
 'usher/NN',
 'know/VB',
 'be/VB',
 'first/JJ',
 'glimpse/NN',
 'building/NN',
 'sense/NN',
 'insufferable/JJ',
 'gloom/NN',
 'pervade/VB',
 'spirit/NN',
 'say/VB',
 'insufferable/JJ',
 'feel/VB',
 'unrelieved/JJ',
 'half/NN',
 'pleasureable/JJ',
 'poetic/JJ',
 'sentiment/NN',
 'mind/NN',
 'usually/RB',
 'receive/VB',
 'even/RB',
 'sternest/VB',
 'natural/JJ',
 'image/NN',
 'desolate/JJ',
 'terrible/JJ',
 'look/VB',
 'scene/NN',
 'mere/JJ',
 'house/NN',
 'simpl

In [77]:
usher_v3_alt = transform_txt_file_v3_alt(usher)
usher_v3_alt

['fall/NN',
 'house/NN',
 'usher/NN',
 'son/NN',
 'coeur/NN',
 'luth/NN',
 'suspendu/NN',
 'sitot/NN',
 'qu/NN',
 'le/NN',
 'touche/NN',
 'il/NN',
 'beranger/JJ',
 'whole/JJ',
 'dull/JJ',
 'dark/NN',
 'soundless/NN',
 'day/NN',
 'autumn/NN',
 'year/NN',
 'cloud/NN',
 'hung/JJ',
 'oppressively/RB',
 'low/JJ',
 'heaven/NN',
 'pass/VB',
 'alone/RB',
 'horseback/NN',
 'singularly/RB',
 'dreary/JJ',
 'tract/NN',
 'country/NN',
 'length/NN',
 'find/VB',
 'shade/NN',
 'evening/NN',
 'draw/VB',
 'view/NN',
 'melancholy/NN',
 'house/NN',
 'usher/NN',
 'know/VB',
 'be/VB',
 'first/JJ',
 'glimpse/NN',
 'building/NN',
 'sense/NN',
 'insufferable/JJ',
 'gloom/NN',
 'pervade/VB',
 'spirit/NN',
 'say/VB',
 'insufferable/JJ',
 'feel/VB',
 'unrelieved/JJ',
 'half/NN',
 'pleasureable/JJ',
 'poetic/JJ',
 'sentiment/NN',
 'mind/NN',
 'usually/RB',
 'receive/VB',
 'even/RB',
 'sternest/VB',
 'natural/JJ',
 'image/NN',
 'desolate/JJ',
 'terrible/JJ',
 'look/VB',
 'scene/NN',
 'mere/JJ',
 'house/NN',
 'simpl

In [63]:
usher_v3 = transform_txt_file_v3(usher)
usher_v3

['fall/NN',
 'house/NN',
 'usher/NN',
 'son/NN',
 'coeur/NN',
 'luth/NN',
 'suspendu/NN',
 'sitot/NN',
 'qu/NN',
 'le/NN',
 'touche/NN',
 'il/NN',
 'resonne/NN',
 'beranger/JJ',
 'whole/NN',
 'dull/JJ',
 'dark/NN',
 'soundless/NN',
 'day/NN',
 'autumn/NN',
 'year/NN',
 'cloud/NN',
 'hung/JJ',
 'oppressively/RB',
 'low/JJ',
 'heaven/NN',
 'have/VB',
 'be/VB',
 'pass/VB',
 'alone/RB',
 'horseback/NN',
 'singularly/RB',
 'dreary/JJ',
 'tract/NN',
 'country/NN',
 'length/NN',
 'find/VB',
 'shade/NN',
 'evening/NN',
 'draw/VB',
 'view/NN',
 'melancholy/NN',
 'house/NN',
 'usher/NN',
 'know/VB',
 'not/RB',
 'be/VB',
 'first/JJ',
 'glimpse/NN',
 'build/VB',
 'sense/NN',
 'insufferable/JJ',
 'gloom/NN',
 'pervade/VB',
 'spirit/NN',
 'say/VB',
 'insufferable/JJ',
 'feeling/NN',
 'be/VB',
 'unrelieved/JJ',
 'half/NN',
 'pleasureable/NN',
 'poetic/JJ',
 'sentiment/NN',
 'mind/NN',
 'usually/RB',
 'receive/VB',
 'even/RB',
 'sternest/NN',
 'natural/JJ',
 'image/NN',
 'desolate/JJ',
 'terrible/JJ',

In [56]:
source_dir

'/Users/rachelbrynsvold/dsi/capstone_dir/Capstone/books/clean/'

In [57]:
lemmatize("I took an extra turn")

['take/VB', 'extra/JJ', 'turn/NN']

In [58]:
lemmatize("Turn around, there is a gator chasing you")

['turn/VB', 'be/VB', 'gator/NN', 'chasing/NN']

In [None]:
output_v3 = eda(transform_txt_file_v3)

In [60]:
book_lengths3, avg_num_tokens3, dictionary3, dictionary_length3, \
    unique_toks_per_fileid3, avg_unique_toks3, corpus3 = output_v3

NameError: name 'output_v3' is not defined

In [None]:
print "Average number of tokens in a book: ", avg_num_tokens3
print "   "
print "Average unique tokens in a book: ", avg_unique_toks3
print "   "
print "Total number of words (dictionary length): ", dictionary_length3

In [None]:
book_lengths3

In [None]:
unique_toks_per_fileid3

In [85]:
save_stuff('lemmatized', dictionary3, corpus3)

## EDA Step 4: + frequency filters

In [None]:
#Think I can do this by pruning existing dictionary

In [105]:
def eda(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Do transformations with updated transformation function and return all the eda items
    '''
    
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]
    
    book_lengths = [(tup[0], len(tup[1])) for tup in zip(fileid_lst, all_transf_books_lst)]
    avg_num_tokens = int(np.mean([len(book) for book in all_transf_books_lst]))
    
    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary_length = len(dictionary)
    
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    
    unique_toks_num_lst = [len(book) for book in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))
    
    
    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus


def save_stuff(distinguishing_str, dictionary, corpus, model, outputs_dir='/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/outputs/'):
    '''
    Save the outputs of the most recent eda step
    '''
    if dictionary != None:
        dictionary.save(outputs_dir + distinguishing_str + '.dict')
        
    if corpus != None:
        corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)
    
    if model != None:
        pass

In [None]:
gensim.utils.prune_vocabctionary3.copy()
gensim.utils.prune_vocab(dictionary3, min_reduce=10)

In [None]:
len(dictionary3)

# First LDA Model!!!

In [109]:
from gensim.models import ldamodel

In [111]:
lda = ldamodel.LdaModel(corpus=corpus,alpha='auto', id2word=dictionary, num_topics=20, update_every=0, passes=20)

In [None]:
name = 'first_lda.model'
lda.save(outpoutputs_dir_dir + name)


In [None]:
print lda

In [121]:
lda.get_document_topics(dictionary.doc2bow(transform_txt_file_v2(usher)))

[(17, 0.99976736796572041)]

In [124]:
lda.get_term_topics(3)

[]

In [127]:
lda.get_topic_terms(1)

[(91660, 0.0010124057909801489),
 (10364, 0.00061426842891210544),
 (91597, 0.00051999185286401111),
 (91667, 0.00049671380979222706),
 (91536, 0.00041714172695244539),
 (91596, 0.0003260759552575187),
 (91657, 0.00029770823944895501),
 (12919, 0.00023973092674403279),
 (2729, 0.00021683942578643836),
 (1443, 0.00021455100956580147)]

In [None]:
#dewey decimal system recreation??
#recreate gutenberg topics?

At the end - bar chart of changes?