In [1]:
import os, codecs
import gensim
from gensim import corpora
from collections import defaultdict
import string
from string import punctuation
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import pandas as pd
import numpy as np

Using TensorFlow backend.


In [2]:
class IterFile(object):
    '''
    class object to do the iterating on individual book txt documents, including file i/o.
    '''
    
    def __init__(self, filepath):
        self.filepath = filepath
        
    def _open_file(self):
        self.file = codecs.open(self.filepath, 'r', encoding='utf_8')
        
    def _close_file(self):
        self.file.close()
    
    def __iter__(self):
        '''
        overwrite iteration to include file i/o
        '''
        self._open_file()
        
        for line in self.file:
            yield line
        
        self._close_file()
    

In [3]:
source_dir = '/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/books/clean/'

In [4]:
def transform_txt_file_v1(fname, root=source_dir):
    '''
    Initial pass at text transformation
    Reimplemented later (v2 etc) as a caller of various subfunctions to do all the transformation
    '''
    fp = root + fname

    book_as_lst = []
    for line in IterFile(fp):
        if line == "\n":
            pass
        else: 
            line_lst= [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
            book_as_lst.extend(line_lst)
            
    #add in stop word removal and frequency threshhold
    return book_as_lst

## EDA Step 0: Test transform function with short book (Fall of the House of Usher)

In [55]:
usher = '932-clean.txt'
print transform_txt_file_v1(usher)

[u'the', u'fall', u'of', u'the', u'house', u'of', u'usher', u'son', u'coeur', u'est', u'un', u'luth', u'suspendu', u'sitot', u"qu'on", u'le', u'touche', u'il', u'resonne', u'de', u'beranger', u'during', u'the', u'whole', u'of', u'a', u'dull', u'dark', u'and', u'soundless', u'day', u'in', u'the', u'autumn', u'of', u'the', u'year', u'when', u'the', u'clouds', u'hung', u'oppressively', u'low', u'in', u'the', u'heavens', u'i', u'had', u'been', u'passing', u'alone', u'on', u'horseback', u'through', u'a', u'singularly', u'dreary', u'tract', u'of', u'country', u'and', u'at', u'length', u'found', u'myself', u'as', u'the', u'shades', u'of', u'the', u'evening', u'drew', u'on', u'within', u'view', u'of', u'the', u'melancholy', u'house', u'of', u'usher', u'i', u'know', u'not', u'how', u'it', u'was--but', u'with', u'the', u'first', u'glimpse', u'of', u'the', u'building', u'a', u'sense', u'of', u'insufferable', u'gloom', u'pervaded', u'my', u'spirit', u'i', u'say', u'insufferable', u'for', u'the', u

Get list of file names by using nltk... this feels a bit like cheating... should probably change this for final code

In [6]:
temp_corp = PlaintextCorpusReader(source_dir, '.*\.txt')
fileid_lst = temp_corp.fileids()
fileid_lst

['10-clean.txt',
 '100-clean.txt',
 '105-clean.txt',
 '108-clean.txt',
 '1080-clean.txt',
 '11-clean.txt',
 '1112-clean.txt',
 '1184-clean.txt',
 '12-clean.txt',
 '120-clean.txt',
 '1232-clean.txt',
 '1260-clean.txt',
 '1322-clean.txt',
 '1342-clean.txt',
 '135-clean.txt',
 '1399-clean.txt',
 '140-clean.txt',
 '1400-clean.txt',
 '1404-clean.txt',
 '14264-clean.txt',
 '147-clean.txt',
 '1497-clean.txt',
 '15399-clean.txt',
 '158-clean.txt',
 '16-clean.txt',
 '160-clean.txt',
 '161-clean.txt',
 '16382-clean.txt',
 '1661-clean.txt',
 '1727-clean.txt',
 '174-clean.txt',
 '1952-yellow_wallpaper-clean.txt',
 '19942-clean.txt',
 '20-clean.txt',
 '20203-clean.txt',
 '203-clean.txt',
 '205-clean.txt',
 '21279-clean.txt',
 '2148-clean.txt',
 '2174-clean.txt',
 '219-clean.txt',
 '224-clean.txt',
 '23-clean.txt',
 '236-clean.txt',
 '2500-clean.txt',
 '25305-clean.txt',
 '2591-clean.txt',
 '2600-clean.txt',
 '2680-clean.txt',
 '2701-moby-clean.txt',
 '28054-clean.txt',
 '2814-clean.txt',
 '2852-cle

## EDA Step 1: Most basic transformation (tokenization only)

Apply transformation function to all books

In [7]:
all_transf_books_lst = []
for f in fileid_lst:
    all_transf_books_lst.append(transform_txt_file_v1(f))

all_transf_books_lst[5]

[u"alice's",
 u'adventures',
 u'in',
 u'wonderland',
 u'lewis',
 u'carroll',
 u'the',
 u'millennium',
 u'fulcrum',
 u'edition',
 u'3.0',
 u'chapter',
 u'i',
 u'down',
 u'the',
 u'rabbit-hole',
 u'alice',
 u'was',
 u'beginning',
 u'to',
 u'get',
 u'very',
 u'tired',
 u'of',
 u'sitting',
 u'by',
 u'her',
 u'sister',
 u'on',
 u'the',
 u'bank',
 u'and',
 u'of',
 u'having',
 u'nothing',
 u'to',
 u'do',
 u'once',
 u'or',
 u'twice',
 u'she',
 u'had',
 u'peeped',
 u'into',
 u'the',
 u'book',
 u'her',
 u'sister',
 u'was',
 u'reading',
 u'but',
 u'it',
 u'had',
 u'no',
 u'pictures',
 u'or',
 u'conversations',
 u'in',
 u'it',
 u'and',
 u'what',
 u'is',
 u'the',
 u'use',
 u'of',
 u'a',
 u'book',
 u'thought',
 u'alice',
 u'without',
 u'pictures',
 u'or',
 u'conversations',
 u'so',
 u'she',
 u'was',
 u'considering',
 u'in',
 u'her',
 u'own',
 u'mind',
 u'as',
 u'well',
 u'as',
 u'she',
 u'could',
 u'for',
 u'the',
 u'hot',
 u'day',
 u'made',
 u'her',
 u'feel',
 u'very',
 u'sleepy',
 u'and',
 u'stupi

Cool - looks like I've got all the books in there

In [8]:
len(all_transf_books_lst)

95

## EDA Step 1.5: Write code snippets to do all the calculations, print-outs, and obj creation that I'll use for the rest of the EDA 

How long is each book (hom many totoal words?)

In [9]:
book_lengths = []

for tup in zip(fileid_lst, all_transf_books_lst):
    book_lengths.append((tup[0], len(tup[1])))
    
book_lengths

[('10-clean.txt', 821133),
 ('100-clean.txt', 883320),
 ('105-clean.txt', 83286),
 ('108-clean.txt', 112139),
 ('1080-clean.txt', 3410),
 ('11-clean.txt', 26449),
 ('1112-clean.txt', 25898),
 ('1184-clean.txt', 461184),
 ('12-clean.txt', 29293),
 ('120-clean.txt', 68589),
 ('1232-clean.txt', 49612),
 ('1260-clean.txt', 185468),
 ('1322-clean.txt', 121712),
 ('1342-clean.txt', 121567),
 ('135-clean.txt', 565761),
 ('1399-clean.txt', 349978),
 ('140-clean.txt', 149124),
 ('1400-clean.txt', 184403),
 ('1404-clean.txt', 192349),
 ('14264-clean.txt', 76656),
 ('147-clean.txt', 21997),
 ('1497-clean.txt', 216243),
 ('15399-clean.txt', 82564),
 ('158-clean.txt', 157441),
 ('16-clean.txt', 47439),
 ('160-clean.txt', 63992),
 ('161-clean.txt', 118580),
 ('16382-clean.txt', 135922),
 ('1661-clean.txt', 104493),
 ('1727-clean.txt', 129410),
 ('174-clean.txt', 78934),
 ('1952-yellow_wallpaper-clean.txt', 6067),
 ('19942-clean.txt', 35861),
 ('20-clean.txt', 80059),
 ('20203-clean.txt', 76167),
 ('

Average number of tokens per book

In [10]:
int(np.mean([len(book) for book in all_transf_books_lst]))

132124

Make a dictionary - a count of the number of times each word appears in each book

In [11]:
dictionary = corpora.Dictionary(all_transf_books_lst)

Confirm: yup, it's got words in it

In [12]:
dictionary[0], dictionary[10], dictionary[1000], dictionary[100000]

(u'', u'mozah', u'tookest', u'shastras')

In [13]:
type(dictionary)

gensim.corpora.dictionary.Dictionary

I'd be interested to see this as pandas dataframe - but this doesn't work

In [14]:
#df = pd.DataFrame(dictionary)
#df.head()

How many words in this dictionary?

In [15]:
len(dictionary)

195104

Do the contents look right?

In [16]:
print(dictionary)

Dictionary(195104 unique tokens: [u'', u'biennials', u'lenitives', u'unsupportable', u'nunnery']...)


Save out the dictionary

In [17]:
outputs_dir = '/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/outputs/'

In [18]:
filename = 'tokenized.dict'

dictionary.save(outputs_dir + filename)

In [19]:
zip(fileid_lst, [len(dictionary.doc2bow(book)) for book in all_transf_books_lst])

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087),
 ('11-clean.txt', 2780),
 ('1112-clean.txt', 3821),
 ('1184-clean.txt', 26118),
 ('12-clean.txt', 3097),
 ('120-clean.txt', 6603),
 ('1232-clean.txt', 5284),
 ('1260-clean.txt', 14991),
 ('1322-clean.txt', 14721),
 ('1342-clean.txt', 6842),
 ('135-clean.txt', 26112),
 ('1399-clean.txt', 14473),
 ('140-clean.txt', 11656),
 ('1400-clean.txt', 12334),
 ('1404-clean.txt', 8954),
 ('14264-clean.txt', 5571),
 ('147-clean.txt', 3595),
 ('1497-clean.txt', 11362),
 ('15399-clean.txt', 7436),
 ('158-clean.txt', 10057),
 ('16-clean.txt', 5026),
 ('160-clean.txt', 7204),
 ('161-clean.txt', 7509),
 ('16382-clean.txt', 11299),
 ('1661-clean.txt', 8459),
 ('1727-clean.txt', 8066),
 ('174-clean.txt', 7324),
 ('1952-yellow_wallpaper-clean.txt', 1270),
 ('19942-clean.txt', 5207),
 ('20-clean.txt', 10237),
 ('20203-clean.txt', 8300),
 ('203-clean.txt', 13909),
 ('205-clean.t

Make a corpus!

In [20]:
corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]

Check that it has all the books

In [21]:
len(corpus)

95

View it as a dataframe

In [22]:
df = pd.DataFrame(corpus)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30757,30758,30759,30760,30761,30762,30763,30764,30765,30766
0,"(0, 3)","(1, 4)","(2, 1)","(3, 1)","(4, 1)","(5, 1)","(6, 1)","(7, 1)","(8, 4)","(9, 328)",...,,,,,,,,,,
1,"(0, 324)","(8, 28)","(9, 148)","(13, 38)","(15, 28)","(18, 28)","(21, 29)","(23, 3059)","(25, 3)","(26, 1)",...,,,,,,,,,,
2,"(0, 1)","(9, 14)","(13, 3)","(18, 5)","(21, 2)","(23, 9)","(32, 1)","(44, 12)","(48, 2)","(57, 3)",...,,,,,,,,,,
3,"(0, 22)","(8, 7)","(9, 15)","(13, 7)","(21, 3)","(23, 41)","(44, 3)","(47, 1)","(48, 2)","(55, 4)",...,,,,,,,,,,
4,"(9, 5)","(23, 1)","(57, 2)","(98, 4)","(132, 1)","(154, 10)","(155, 1)","(205, 5)","(213, 1)","(215, 4)",...,,,,,,,,,,


Why doesn't it have the same number of cols as the lenght of the dictionary...

In [136]:
#check same process with 1 book - see how dict/corp/df size differ

Number of unique words in each book

In [23]:
unique_toks_num_lst = [len(book) for book in corpus]
unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)

unique_toks_per_fileid

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087),
 ('11-clean.txt', 2780),
 ('1112-clean.txt', 3821),
 ('1184-clean.txt', 26118),
 ('12-clean.txt', 3097),
 ('120-clean.txt', 6603),
 ('1232-clean.txt', 5284),
 ('1260-clean.txt', 14991),
 ('1322-clean.txt', 14721),
 ('1342-clean.txt', 6842),
 ('135-clean.txt', 26112),
 ('1399-clean.txt', 14473),
 ('140-clean.txt', 11656),
 ('1400-clean.txt', 12334),
 ('1404-clean.txt', 8954),
 ('14264-clean.txt', 5571),
 ('147-clean.txt', 3595),
 ('1497-clean.txt', 11362),
 ('15399-clean.txt', 7436),
 ('158-clean.txt', 10057),
 ('16-clean.txt', 5026),
 ('160-clean.txt', 7204),
 ('161-clean.txt', 7509),
 ('16382-clean.txt', 11299),
 ('1661-clean.txt', 8459),
 ('1727-clean.txt', 8066),
 ('174-clean.txt', 7324),
 ('1952-yellow_wallpaper-clean.txt', 1270),
 ('19942-clean.txt', 5207),
 ('20-clean.txt', 10237),
 ('20203-clean.txt', 8300),
 ('203-clean.txt', 13909),
 ('205-clean.t

Average unique words per book

In [24]:
avg_unique_toks = int(np.mean(unique_toks_num_lst))
avg_unique_toks

9004

Save out the corpus

In [25]:
filename = 'tokenized_corpus.mm'

corpora.MmCorpus.serialize(outputs_dir + filename, corpus)

## EDA and Saving helper funtions

To automate next few EDA rounds

In [137]:
def eda(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Do transformations with updated transformation function and return all the eda items
    '''
    
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]
    
    book_lengths = [(tup[0], len(tup[1])) for tup in zip(fileid_lst, all_transf_books_lst)]
    avg_num_tokens = int(np.mean([len(book) for book in all_transf_books_lst]))
    
    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary_length = len(dictionary)
    
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    
    unique_toks_num_lst = [len(book) for book in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))
    
    
    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus


def save_stuff(distinguishing_str, dictionary, corpus, outputs_dir='/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/outputs/'):
    '''
    Save the outputs of the most recent eda step
    '''
    dictionary.save(outputs_dir + distinguishing_str + '.dict')
    corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)
    

### EDA items
* List of book lengths (total num of tokens for each book)
* Average number of tokens per book
* Number of words in corpus (dictionary length)
    * Dictionary (not viewed)
* Unique tokens per book
* Average number of unique tokens per book
    * Corpus (not viewe)
    
Save everything after eda step


### To summarize the 'simple tokenization' EDA step (#1):

In [68]:
output_v1 = eda(transform_txt_file_v1)

In [82]:
book_lengths1, avg_num_tokens1, dictionary1, dictionary_length1, unique_toks_per_fileid1, \
    avg_unique_toks1, corpus1 = output_v1 

In [72]:
print "Average number of tokens in a book: ", avg_num_tokens1
print "   "
print "Average unique tokens in a book: ", avg_unique_toks1
print "   "1
print "Total number of words (dictionary length): ", dictionary_length

Average number of tokens in a book:  132124
   
Average unique tokens in a book:  9004
   
Total number of words (dictionary length):  195104


In [114]:
##for pres, note the sparcity problem - 9000 vs. 195k == 186k empty

In [73]:
book_lengths1

[('10-clean.txt', 821133),
 ('100-clean.txt', 883320),
 ('105-clean.txt', 83286),
 ('108-clean.txt', 112139),
 ('1080-clean.txt', 3410),
 ('11-clean.txt', 26449),
 ('1112-clean.txt', 25898),
 ('1184-clean.txt', 461184),
 ('12-clean.txt', 29293),
 ('120-clean.txt', 68589),
 ('1232-clean.txt', 49612),
 ('1260-clean.txt', 185468),
 ('1322-clean.txt', 121712),
 ('1342-clean.txt', 121567),
 ('135-clean.txt', 565761),
 ('1399-clean.txt', 349978),
 ('140-clean.txt', 149124),
 ('1400-clean.txt', 184403),
 ('1404-clean.txt', 192349),
 ('14264-clean.txt', 76656),
 ('147-clean.txt', 21997),
 ('1497-clean.txt', 216243),
 ('15399-clean.txt', 82564),
 ('158-clean.txt', 157441),
 ('16-clean.txt', 47439),
 ('160-clean.txt', 63992),
 ('161-clean.txt', 118580),
 ('16382-clean.txt', 135922),
 ('1661-clean.txt', 104493),
 ('1727-clean.txt', 129410),
 ('174-clean.txt', 78934),
 ('1952-yellow_wallpaper-clean.txt', 6067),
 ('19942-clean.txt', 35861),
 ('20-clean.txt', 80059),
 ('20203-clean.txt', 76167),
 ('

In [74]:
unique_toks_per_fileid1

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087),
 ('11-clean.txt', 2780),
 ('1112-clean.txt', 3821),
 ('1184-clean.txt', 26118),
 ('12-clean.txt', 3097),
 ('120-clean.txt', 6603),
 ('1232-clean.txt', 5284),
 ('1260-clean.txt', 14991),
 ('1322-clean.txt', 14721),
 ('1342-clean.txt', 6842),
 ('135-clean.txt', 26112),
 ('1399-clean.txt', 14473),
 ('140-clean.txt', 11656),
 ('1400-clean.txt', 12334),
 ('1404-clean.txt', 8954),
 ('14264-clean.txt', 5571),
 ('147-clean.txt', 3595),
 ('1497-clean.txt', 11362),
 ('15399-clean.txt', 7436),
 ('158-clean.txt', 10057),
 ('16-clean.txt', 5026),
 ('160-clean.txt', 7204),
 ('161-clean.txt', 7509),
 ('16382-clean.txt', 11299),
 ('1661-clean.txt', 8459),
 ('1727-clean.txt', 8066),
 ('174-clean.txt', 7324),
 ('1952-yellow_wallpaper-clean.txt', 1270),
 ('19942-clean.txt', 5207),
 ('20-clean.txt', 10237),
 ('20203-clean.txt', 8300),
 ('203-clean.txt', 13909),
 ('205-clean.t

In [83]:
save_stuff('simple_tok')

## EDA Step 2: + stop word removal

In [39]:
import nltk
nltk.download('stopwords')
stop = set(stopwords.words('english'))
print stop

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rachelbrynsvold/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u'where', u'few', u'because', u'doing', u'some', u'hasn', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'while', u're', u'does', u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u'were', u'here', u'shouldn', u'hers', u'by', u'on', u'about', u'couldn', u'of', u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u'mightn', u'wasn', u'your', u'from', u'her', u'their', u'aren', u'there', u'been', u'whom', u'too', u'wouldn', u'themselves', u'weren', u'was', u'until', u'm

In [70]:
def transform_txt_file_v2(fname, root=source_dir, stop_words=stop):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Assumes you want to remove empty lines and tokenize (because you do)
    '''
    fp = root + fname
    book_as_lst = []
    for line in IterFile(fp):
        
        if empty_line_check(line) == False:
            line = basic_tokenize(line)
            
            if stop_words !=None:
                line = remove_stop_words(line, stop_words)
        
        book_as_lst.extend(line)
        
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty
    
def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]    
            
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 

In [106]:
output_v2 = eda(transform_txt_file_v2)

In [107]:
book_lengths2, avg_num_tokens2, dictionary2, dictionary_length2, \
    unique_toks_per_fileid2, avg_unique_toks2, corpus2 = output_v2

In [77]:
print "Average number of tokens in a book: ", avg_num_tokens2
print "   "
print "Average unique tokens in a book: ", avg_unique_toks2
print "   "
print "Total number of words (dictionary length): ", dictionary_length2

Average number of tokens in a book:  66904
   
Average unique tokens in a book:  8882
   
Total number of words (dictionary length):  194961


In [115]:
#make graph of reduction

In [78]:
book_lengths2

[('10-clean.txt', 431250),
 ('100-clean.txt', 481608),
 ('105-clean.txt', 39320),
 ('108-clean.txt', 54072),
 ('1080-clean.txt', 1739),
 ('11-clean.txt', 13616),
 ('1112-clean.txt', 15468),
 ('1184-clean.txt', 251348),
 ('12-clean.txt', 15589),
 ('120-clean.txt', 34581),
 ('1232-clean.txt', 23021),
 ('1260-clean.txt', 91946),
 ('1322-clean.txt', 67400),
 ('1342-clean.txt', 57307),
 ('135-clean.txt', 288377),
 ('1399-clean.txt', 173226),
 ('140-clean.txt', 70693),
 ('1400-clean.txt', 87450),
 ('1404-clean.txt', 91284),
 ('14264-clean.txt', 38978),
 ('147-clean.txt', 10640),
 ('1497-clean.txt', 98946),
 ('15399-clean.txt', 38240),
 ('158-clean.txt', 75454),
 ('16-clean.txt', 23919),
 ('160-clean.txt', 32962),
 ('161-clean.txt', 56016),
 ('16382-clean.txt', 71067),
 ('1661-clean.txt', 49411),
 ('1727-clean.txt', 59474),
 ('174-clean.txt', 38744),
 ('1952-yellow_wallpaper-clean.txt', 2978),
 ('19942-clean.txt', 18535),
 ('20-clean.txt', 47100),
 ('20203-clean.txt', 37660),
 ('203-clean.txt

In [79]:
unique_toks_per_fileid2

[('10-clean.txt', 17208),
 ('100-clean.txt', 28785),
 ('105-clean.txt', 5922),
 ('108-clean.txt', 8578),
 ('1080-clean.txt', 994),
 ('11-clean.txt', 2653),
 ('1112-clean.txt', 3701),
 ('1184-clean.txt', 25988),
 ('12-clean.txt', 2977),
 ('120-clean.txt', 6479),
 ('1232-clean.txt', 5159),
 ('1260-clean.txt', 14861),
 ('1322-clean.txt', 14591),
 ('1342-clean.txt', 6716),
 ('135-clean.txt', 25978),
 ('1399-clean.txt', 14342),
 ('140-clean.txt', 11530),
 ('1400-clean.txt', 12205),
 ('1404-clean.txt', 8831),
 ('14264-clean.txt', 5449),
 ('147-clean.txt', 3472),
 ('1497-clean.txt', 11236),
 ('15399-clean.txt', 7306),
 ('158-clean.txt', 9933),
 ('16-clean.txt', 4902),
 ('160-clean.txt', 7080),
 ('161-clean.txt', 7387),
 ('16382-clean.txt', 11174),
 ('1661-clean.txt', 8332),
 ('1727-clean.txt', 7939),
 ('174-clean.txt', 7202),
 ('1952-yellow_wallpaper-clean.txt', 1164),
 ('19942-clean.txt', 5078),
 ('20-clean.txt', 10120),
 ('20203-clean.txt', 8169),
 ('203-clean.txt', 13780),
 ('205-clean.txt

In [85]:
save_stuff('no_stopwords')

## EDA Step 3: + lemmatization

In [89]:
from gensim.utils import lemmatize

In [133]:
def transform_txt_file_v3(fname, root=source_dir, stop_words=stop, \
                         lemma = True):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Assumes you want to remove empty lines and tokenize (because you do)
    '''
    fp = root + fname
    book_as_lst = []
    for line in IterFile(fp):
        
        if empty_line_check(line) == False:
            
            
            if lemma == True:
                line = lemmatize(line)
            
            else:
                line = basic_tokenize(line)
                
                if stop_words !=None:
                    line = remove_stop_words(line, stop_words)
        
            book_as_lst.extend(line)
    
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty

def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
           
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 
    

In [134]:
transform_txt_file_v3(usher)

['fall/NN',
 'house/NN',
 'usher/NN',
 'son/NN',
 'coeur/NN',
 'luth/NN',
 'suspendu/NN',
 'sitot/NN',
 'qu/NN',
 'le/NN',
 'touche/NN',
 'il/NN',
 'resonne/NN',
 'beranger/JJ',
 'whole/NN',
 'dull/JJ',
 'dark/NN',
 'soundless/NN',
 'day/NN',
 'autumn/NN',
 'year/NN',
 'cloud/NN',
 'hung/JJ',
 'oppressively/RB',
 'low/JJ',
 'heaven/NN',
 'have/VB',
 'be/VB',
 'pass/VB',
 'alone/RB',
 'horseback/NN',
 'singularly/RB',
 'dreary/JJ',
 'tract/NN',
 'country/NN',
 'length/NN',
 'find/VB',
 'shade/NN',
 'evening/NN',
 'draw/VB',
 'view/NN',
 'melancholy/NN',
 'house/NN',
 'usher/NN',
 'know/VB',
 'not/RB',
 'be/VB',
 'first/JJ',
 'glimpse/NN',
 'build/VB',
 'sense/NN',
 'insufferable/JJ',
 'gloom/NN',
 'pervade/VB',
 'spirit/NN',
 'say/VB',
 'insufferable/JJ',
 'feeling/NN',
 'be/VB',
 'unrelieved/JJ',
 'half/NN',
 'pleasureable/NN',
 'poetic/JJ',
 'sentiment/NN',
 'mind/NN',
 'usually/RB',
 'receive/VB',
 'even/RB',
 'sternest/NN',
 'natural/JJ',
 'image/NN',
 'desolate/JJ',
 'terrible/JJ',

In [128]:
source_dir

'/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/books/clean/'

In [112]:
lemmatize("I took an extra turn")

['take/VB', 'extra/JJ', 'turn/NN']

In [113]:
lemmatize("Turn around, there is a gator chasing you")

['turn/VB', 'be/VB', 'gator/NN', 'chasing/NN']

In [135]:
output_v3 = eda(transform_txt_file_v3)

KeyboardInterrupt: 

In [84]:
book_lengths3, avg_num_tokens3, dictionary3, dictionary_length3, \
    unique_toks_per_fileid3, avg_unique_toks3, corpus3 = output_v3

In [77]:
print "Average number of tokens in a book: ", avg_num_tokens3
print "   "
print "Average unique tokens in a book: ", avg_unique_toks3
print "   "
print "Total number of words (dictionary length): ", dictionary_length3

Average number of tokens in a book:  66904
   
Average unique tokens in a book:  8882
   
Total number of words (dictionary length):  194961


In [78]:
book_lengths3

[('10-clean.txt', 431250),
 ('100-clean.txt', 481608),
 ('105-clean.txt', 39320),
 ('108-clean.txt', 54072),
 ('1080-clean.txt', 1739),
 ('11-clean.txt', 13616),
 ('1112-clean.txt', 15468),
 ('1184-clean.txt', 251348),
 ('12-clean.txt', 15589),
 ('120-clean.txt', 34581),
 ('1232-clean.txt', 23021),
 ('1260-clean.txt', 91946),
 ('1322-clean.txt', 67400),
 ('1342-clean.txt', 57307),
 ('135-clean.txt', 288377),
 ('1399-clean.txt', 173226),
 ('140-clean.txt', 70693),
 ('1400-clean.txt', 87450),
 ('1404-clean.txt', 91284),
 ('14264-clean.txt', 38978),
 ('147-clean.txt', 10640),
 ('1497-clean.txt', 98946),
 ('15399-clean.txt', 38240),
 ('158-clean.txt', 75454),
 ('16-clean.txt', 23919),
 ('160-clean.txt', 32962),
 ('161-clean.txt', 56016),
 ('16382-clean.txt', 71067),
 ('1661-clean.txt', 49411),
 ('1727-clean.txt', 59474),
 ('174-clean.txt', 38744),
 ('1952-yellow_wallpaper-clean.txt', 2978),
 ('19942-clean.txt', 18535),
 ('20-clean.txt', 47100),
 ('20203-clean.txt', 37660),
 ('203-clean.txt

In [79]:
unique_toks_per_fileid3

[('10-clean.txt', 17208),
 ('100-clean.txt', 28785),
 ('105-clean.txt', 5922),
 ('108-clean.txt', 8578),
 ('1080-clean.txt', 994),
 ('11-clean.txt', 2653),
 ('1112-clean.txt', 3701),
 ('1184-clean.txt', 25988),
 ('12-clean.txt', 2977),
 ('120-clean.txt', 6479),
 ('1232-clean.txt', 5159),
 ('1260-clean.txt', 14861),
 ('1322-clean.txt', 14591),
 ('1342-clean.txt', 6716),
 ('135-clean.txt', 25978),
 ('1399-clean.txt', 14342),
 ('140-clean.txt', 11530),
 ('1400-clean.txt', 12205),
 ('1404-clean.txt', 8831),
 ('14264-clean.txt', 5449),
 ('147-clean.txt', 3472),
 ('1497-clean.txt', 11236),
 ('15399-clean.txt', 7306),
 ('158-clean.txt', 9933),
 ('16-clean.txt', 4902),
 ('160-clean.txt', 7080),
 ('161-clean.txt', 7387),
 ('16382-clean.txt', 11174),
 ('1661-clean.txt', 8332),
 ('1727-clean.txt', 7939),
 ('174-clean.txt', 7202),
 ('1952-yellow_wallpaper-clean.txt', 1164),
 ('19942-clean.txt', 5078),
 ('20-clean.txt', 10120),
 ('20203-clean.txt', 8169),
 ('203-clean.txt', 13780),
 ('205-clean.txt

In [85]:
save_stuff('lemmatized')

## EDA Step 4: + frequency filters

In [None]:
#Think I can do this by pruning existing dictionary

In [105]:
def eda(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Do transformations with updated transformation function and return all the eda items
    '''
    
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]
    
    book_lengths = [(tup[0], len(tup[1])) for tup in zip(fileid_lst, all_transf_books_lst)]
    avg_num_tokens = int(np.mean([len(book) for book in all_transf_books_lst]))
    
    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary_length = len(dictionary)
    
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    
    unique_toks_num_lst = [len(book) for book in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))
    
    
    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus


def save_stuff(distinguishing_str, dictionary, corpus, model, outputs_dir='/Users/rachelbrynsvold/dsi/capstone_dir/top_100_dev_corp/outputs/'):
    '''
    Save the outputs of the most recent eda step
    '''
    if dictionary != None:
        dictionary.save(outputs_dir + distinguishing_str + '.dict')
        
    if corpus != None:
        corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)
    
    if model != None:
        pass

In [None]:
gensim.utils.prune_vocabctionary3.copy()
gensim.utils.prune_vocab(dictionary3, min_reduce=10)

In [None]:
len(dictionary3)

# First LDA Model!!!

In [109]:
from gensim.models import ldamodel

In [111]:
lda = ldamodel.LdaModel(corpus=corpus,alpha='auto', id2word=dictionary, num_topics=20, update_every=0, passes=20)

In [None]:
name = 'first_lda.model'
lda.save(outpoutputs_dir_dir + name)


In [None]:
print lda

In [121]:
lda.get_document_topics(dictionary.doc2bow(transform_txt_file_v2(usher)))

[(17, 0.99976736796572041)]

In [124]:
lda.get_term_topics(3)

[]

In [127]:
lda.get_topic_terms(1)

[(91660, 0.0010124057909801489),
 (10364, 0.00061426842891210544),
 (91597, 0.00051999185286401111),
 (91667, 0.00049671380979222706),
 (91536, 0.00041714172695244539),
 (91596, 0.0003260759552575187),
 (91657, 0.00029770823944895501),
 (12919, 0.00023973092674403279),
 (2729, 0.00021683942578643836),
 (1443, 0.00021455100956580147)]

In [None]:
#dewey decimal system recreation??
#recreate gutenberg topics?

At the end - bar chart of changes?