In [1]:
import os, codecs
import gensim
from gensim import corpora
from collections import defaultdict
import string
from string import punctuation
from nltk.corpus.reader.plaintext import PlaintextCorpusReader
import pandas as pd
import numpy as np

### THESE SHOULD BE ALL THE RELATIVE PATHS

In [6]:
source_dir = '/home/ubuntu/Capstone/books/clean/'

In [7]:
outputs_dir = '/home/ubuntu/Capstone/outputs/ec2/'

In [8]:
class IterFile(object):
    '''
    class object to do the iterating on individual book txt documents, including file i/o.
    '''
    
    def __init__(self, filepath):
        self.filepath = filepath
        
    def _open_file(self):
        self.file = codecs.open(self.filepath, 'r', encoding='utf_8')
        
    def _close_file(self):
        self.file.close()
    
    def __iter__(self):
        '''
        overwrite iteration to include file i/o
        '''
        self._open_file()
        
        for line in self.file:
            yield line
        
        self._close_file()
    

In [9]:
def transform_txt_file_v1(fname, root=source_dir):
    '''
    Initial pass at text transformation
    Reimplemented later (v2 etc) as a caller of various subfunctions to do all the transformation
    '''
    fp = root + fname

    book_as_lst = []
    for line in IterFile(fp):
        if line == "\n":
            pass
        else: 
            line_lst= [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
            book_as_lst.extend(line_lst)
            
    #add in stop word removal and frequency threshhold
    return book_as_lst

## EDA Step 0: Test transform function with short book (Fall of the House of Usher)

In [10]:
usher = '932-clean.txt'
print transform_txt_file_v1(usher)

[u'the', u'fall', u'of', u'the', u'house', u'of', u'usher', u'son', u'coeur', u'est', u'un', u'luth', u'suspendu', u'sitot', u"qu'on", u'le', u'touche', u'il', u'resonne', u'de', u'beranger', u'during', u'the', u'whole', u'of', u'a', u'dull', u'dark', u'and', u'soundless', u'day', u'in', u'the', u'autumn', u'of', u'the', u'year', u'when', u'the', u'clouds', u'hung', u'oppressively', u'low', u'in', u'the', u'heavens', u'i', u'had', u'been', u'passing', u'alone', u'on', u'horseback', u'through', u'a', u'singularly', u'dreary', u'tract', u'of', u'country', u'and', u'at', u'length', u'found', u'myself', u'as', u'the', u'shades', u'of', u'the', u'evening', u'drew', u'on', u'within', u'view', u'of', u'the', u'melancholy', u'house', u'of', u'usher', u'i', u'know', u'not', u'how', u'it', u'was--but', u'with', u'the', u'first', u'glimpse', u'of', u'the', u'building', u'a', u'sense', u'of', u'insufferable', u'gloom', u'pervaded', u'my', u'spirit', u'i', u'say', u'insufferable', u'for', u'the', u

Get list of file names by using nltk... this feels a bit like cheating... should probably change this for final code

In [11]:
temp_corp = PlaintextCorpusReader(source_dir, '.*\.txt')
fileid_lst = temp_corp.fileids()
fileid_lst

['10-clean.txt',
 '100-clean.txt',
 '105-clean.txt',
 '108-clean.txt',
 '1080-clean.txt',
 '11-clean.txt',
 '1112-clean.txt',
 '1184-clean.txt',
 '12-clean.txt',
 '120-clean.txt',
 '1232-clean.txt',
 '1260-clean.txt',
 '1322-clean.txt',
 '1342-clean.txt',
 '135-clean.txt',
 '1399-clean.txt',
 '140-clean.txt',
 '1400-clean.txt',
 '1404-clean.txt',
 '14264-clean.txt',
 '147-clean.txt',
 '1497-clean.txt',
 '15399-clean.txt',
 '158-clean.txt',
 '16-clean.txt',
 '160-clean.txt',
 '161-clean.txt',
 '16382-clean.txt',
 '1661-clean.txt',
 '1727-clean.txt',
 '174-clean.txt',
 '1952-yellow_wallpaper-clean.txt',
 '19942-clean.txt',
 '20-clean.txt',
 '20203-clean.txt',
 '203-clean.txt',
 '205-clean.txt',
 '21279-clean.txt',
 '2148-clean.txt',
 '2174-clean.txt',
 '219-clean.txt',
 '224-clean.txt',
 '23-clean.txt',
 '236-clean.txt',
 '2500-clean.txt',
 '25305-clean.txt',
 '2591-clean.txt',
 '2600-clean.txt',
 '2680-clean.txt',
 '2701-moby-clean.txt',
 '28054-clean.txt',
 '2814-clean.txt',
 '2852-cle

## EDA Step 1: Most basic transformation (tokenization only)

Apply transformation function to all books

In [12]:
all_transf_books_lst = []
for f in fileid_lst:
    all_transf_books_lst.append(transform_txt_file_v1(f))

all_transf_books_lst

[[u'the',
  u'old',
  u'testament',
  u'of',
  u'the',
  u'king',
  u'james',
  u'version',
  u'of',
  u'the',
  u'bible',
  u'the',
  u'first',
  u'book',
  u'of',
  u'moses',
  u'called',
  u'genesis',
  u'1:1',
  u'in',
  u'the',
  u'beginning',
  u'god',
  u'created',
  u'the',
  u'heavens',
  u'and',
  u'the',
  u'earth',
  u'1:2',
  u'and',
  u'the',
  u'earth',
  u'was',
  u'without',
  u'form',
  u'and',
  u'void',
  u'and',
  u'darkness',
  u'was',
  u'upon',
  u'the',
  u'face',
  u'of',
  u'the',
  u'deep',
  u'and',
  u'the',
  u'spirit',
  u'of',
  u'god',
  u'moved',
  u'upon',
  u'the',
  u'face',
  u'of',
  u'the',
  u'waters',
  u'1:3',
  u'and',
  u'god',
  u'said',
  u'let',
  u'there',
  u'be',
  u'light',
  u'and',
  u'there',
  u'was',
  u'light',
  u'1:4',
  u'and',
  u'god',
  u'saw',
  u'the',
  u'light',
  u'that',
  u'it',
  u'was',
  u'good',
  u'and',
  u'god',
  u'divided',
  u'the',
  u'light',
  u'from',
  u'the',
  u'darkness',
  u'1:5',
  u'and',
  u'g

Cool - looks like I've got all the books in there

In [13]:
len(all_transf_books_lst)

95

## EDA Step 1.5: Write code snippets to do all the calculations, print-outs, and obj creation that I'll use for the rest of the EDA 

How long is each book (hom many totoal words?)

In [14]:
book_lengths = []

for tup in zip(fileid_lst, all_transf_books_lst):
    book_lengths.append((tup[0], len(tup[1])))
    
book_lengths

[('10-clean.txt', 821133),
 ('100-clean.txt', 883320),
 ('105-clean.txt', 83286),
 ('108-clean.txt', 112139),
 ('1080-clean.txt', 3410),
 ('11-clean.txt', 26449),
 ('1112-clean.txt', 25898),
 ('1184-clean.txt', 461184),
 ('12-clean.txt', 29293),
 ('120-clean.txt', 68589),
 ('1232-clean.txt', 49612),
 ('1260-clean.txt', 185468),
 ('1322-clean.txt', 121712),
 ('1342-clean.txt', 121567),
 ('135-clean.txt', 565761),
 ('1399-clean.txt', 349978),
 ('140-clean.txt', 149124),
 ('1400-clean.txt', 184403),
 ('1404-clean.txt', 192349),
 ('14264-clean.txt', 76656),
 ('147-clean.txt', 21997),
 ('1497-clean.txt', 216243),
 ('15399-clean.txt', 82564),
 ('158-clean.txt', 157441),
 ('16-clean.txt', 47439),
 ('160-clean.txt', 63992),
 ('161-clean.txt', 118580),
 ('16382-clean.txt', 135922),
 ('1661-clean.txt', 104493),
 ('1727-clean.txt', 129410),
 ('174-clean.txt', 78934),
 ('1952-yellow_wallpaper-clean.txt', 6067),
 ('19942-clean.txt', 35861),
 ('20-clean.txt', 80059),
 ('20203-clean.txt', 76167),
 ('

Average number of tokens per book

In [15]:
int(np.mean([len(book) for book in all_transf_books_lst]))

132124

Make a dictionary - a count of the number of times each word appears in each book

In [16]:
dictionary = corpora.Dictionary(all_transf_books_lst)

Confirm: yup, it's got words in it

In [17]:
dictionary[0], dictionary[10], dictionary[1000], dictionary[5000]

(u'', u'mozah', u'tookest', u'89:4')

In [18]:
type(dictionary)

gensim.corpora.dictionary.Dictionary

I'd be interested to see this as pandas dataframe - but this doesn't work

In [19]:
#df = pd.DataFrame(dictionary)
#df.head()

How many words in this dictionary?

In [20]:
len(dictionary)

195104

Do the contents look right?

In [21]:
print(dictionary)

Dictionary(195104 unique tokens: [u'', u'biennials', u'lenitives', u'unsupportable', u'nunnery']...)


Save out the dictionary

In [22]:
filename = 'tokenized.dict'

dictionary.save(outputs_dir + filename)

In [24]:
zip(fileid_lst, [len(dictionary.doc2bow(book)) for book in all_transf_books_lst])

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087),
 ('11-clean.txt', 2780),
 ('1112-clean.txt', 3821),
 ('1184-clean.txt', 26118),
 ('12-clean.txt', 3097),
 ('120-clean.txt', 6603),
 ('1232-clean.txt', 5284),
 ('1260-clean.txt', 14991),
 ('1322-clean.txt', 14721),
 ('1342-clean.txt', 6842),
 ('135-clean.txt', 26112),
 ('1399-clean.txt', 14473),
 ('140-clean.txt', 11656),
 ('1400-clean.txt', 12334),
 ('1404-clean.txt', 8954),
 ('14264-clean.txt', 5571),
 ('147-clean.txt', 3595),
 ('1497-clean.txt', 11362),
 ('15399-clean.txt', 7436),
 ('158-clean.txt', 10057),
 ('16-clean.txt', 5026),
 ('160-clean.txt', 7204),
 ('161-clean.txt', 7509),
 ('16382-clean.txt', 11299),
 ('1661-clean.txt', 8459),
 ('1727-clean.txt', 8066),
 ('174-clean.txt', 7324),
 ('1952-yellow_wallpaper-clean.txt', 1270),
 ('19942-clean.txt', 5207),
 ('20-clean.txt', 10237),
 ('20203-clean.txt', 8300),
 ('203-clean.txt', 13909),
 ('205-clean.t

Make a corpus!

In [25]:
corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]

Check that it has all the books

In [26]:
len(corpus)

95

View it as a dataframe

In [27]:
df = pd.DataFrame(corpus)
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30757,30758,30759,30760,30761,30762,30763,30764,30765,30766
0,"(0, 3)","(1, 4)","(2, 1)","(3, 1)","(4, 1)","(5, 1)","(6, 1)","(7, 1)","(8, 4)","(9, 328)",...,,,,,,,,,,
1,"(0, 324)","(8, 28)","(9, 148)","(13, 38)","(15, 28)","(18, 28)","(21, 29)","(23, 3059)","(25, 3)","(26, 1)",...,,,,,,,,,,
2,"(0, 1)","(9, 14)","(13, 3)","(18, 5)","(21, 2)","(23, 9)","(32, 1)","(44, 12)","(48, 2)","(57, 3)",...,,,,,,,,,,
3,"(0, 22)","(8, 7)","(9, 15)","(13, 7)","(21, 3)","(23, 41)","(44, 3)","(47, 1)","(48, 2)","(55, 4)",...,,,,,,,,,,
4,"(9, 5)","(23, 1)","(57, 2)","(98, 4)","(132, 1)","(154, 10)","(155, 1)","(205, 5)","(213, 1)","(215, 4)",...,,,,,,,,,,


Why doesn't it have the same number of cols as the length of the dictionary...

In [28]:
#check same process with 1 book - see how dict/corp/df size differ

Number of unique words in each book

In [29]:
unique_toks_num_lst = [len(book) for book in corpus]
unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)

unique_toks_per_fileid

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087),
 ('11-clean.txt', 2780),
 ('1112-clean.txt', 3821),
 ('1184-clean.txt', 26118),
 ('12-clean.txt', 3097),
 ('120-clean.txt', 6603),
 ('1232-clean.txt', 5284),
 ('1260-clean.txt', 14991),
 ('1322-clean.txt', 14721),
 ('1342-clean.txt', 6842),
 ('135-clean.txt', 26112),
 ('1399-clean.txt', 14473),
 ('140-clean.txt', 11656),
 ('1400-clean.txt', 12334),
 ('1404-clean.txt', 8954),
 ('14264-clean.txt', 5571),
 ('147-clean.txt', 3595),
 ('1497-clean.txt', 11362),
 ('15399-clean.txt', 7436),
 ('158-clean.txt', 10057),
 ('16-clean.txt', 5026),
 ('160-clean.txt', 7204),
 ('161-clean.txt', 7509),
 ('16382-clean.txt', 11299),
 ('1661-clean.txt', 8459),
 ('1727-clean.txt', 8066),
 ('174-clean.txt', 7324),
 ('1952-yellow_wallpaper-clean.txt', 1270),
 ('19942-clean.txt', 5207),
 ('20-clean.txt', 10237),
 ('20203-clean.txt', 8300),
 ('203-clean.txt', 13909),
 ('205-clean.t

Average unique words per book

In [30]:
avg_unique_toks = int(np.mean(unique_toks_num_lst))
avg_unique_toks

9004

Save out the corpus

In [31]:
filename = 'tokenized_corpus.mm'

corpora.MmCorpus.serialize(outputs_dir + filename, corpus)

## EDA and Saving helper funtions

To automate next few EDA rounds

In [43]:
def eda(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Do transformations with updated transformation function and return all the eda items
    '''
    
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]
    
    book_lengths = [(tup[0], len(tup[1])) for tup in zip(fileid_lst, all_transf_books_lst)]
    avg_num_tokens = int(np.mean([len(book) for book in all_transf_books_lst]))
    
    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary_length = len(dictionary)
    
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    
    unique_toks_num_lst = [len(book) for book in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))
    
    
    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus


def save_stuff(distinguishing_str, dictionary, corpus, outputs_dir=outputs_dir):
    '''
    Save the outputs of the most recent eda step
    '''
    dictionary.save(outputs_dir + distinguishing_str + '.dict')
    corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)
    

### EDA items
* List of book lengths (total num of tokens for each book)
* Average number of tokens per book
* Number of words in corpus (dictionary length)
    * Dictionary (not viewed)
* Unique tokens per book
* Average number of unique tokens per book
    * Corpus (not viewe)
    
Save everything after eda step


### To summarize the 'simple tokenization' EDA step (#1):

In [33]:
output_v1 = eda(transform_txt_file_v1)

In [34]:
book_lengths1, avg_num_tokens1, dictionary1, dictionary_length1, unique_toks_per_fileid1, \
    avg_unique_toks1, corpus1 = output_v1 

In [35]:
print "Average number of tokens in a book: ", avg_num_tokens1
print "   "
print "Average unique tokens in a book: ", avg_unique_toks1
print "   "
print "Total number of words (dictionary length): ", dictionary_length1

Average number of tokens in a book:  132124
   
Average unique tokens in a book:  9004
   
Total number of words (dictionary length):  195104


In [36]:
##for pres, note the sparcity problem - 9000 vs. 195k == 186k empty

In [37]:
book_lengths1

[('10-clean.txt', 821133),
 ('100-clean.txt', 883320),
 ('105-clean.txt', 83286),
 ('108-clean.txt', 112139),
 ('1080-clean.txt', 3410),
 ('11-clean.txt', 26449),
 ('1112-clean.txt', 25898),
 ('1184-clean.txt', 461184),
 ('12-clean.txt', 29293),
 ('120-clean.txt', 68589),
 ('1232-clean.txt', 49612),
 ('1260-clean.txt', 185468),
 ('1322-clean.txt', 121712),
 ('1342-clean.txt', 121567),
 ('135-clean.txt', 565761),
 ('1399-clean.txt', 349978),
 ('140-clean.txt', 149124),
 ('1400-clean.txt', 184403),
 ('1404-clean.txt', 192349),
 ('14264-clean.txt', 76656),
 ('147-clean.txt', 21997),
 ('1497-clean.txt', 216243),
 ('15399-clean.txt', 82564),
 ('158-clean.txt', 157441),
 ('16-clean.txt', 47439),
 ('160-clean.txt', 63992),
 ('161-clean.txt', 118580),
 ('16382-clean.txt', 135922),
 ('1661-clean.txt', 104493),
 ('1727-clean.txt', 129410),
 ('174-clean.txt', 78934),
 ('1952-yellow_wallpaper-clean.txt', 6067),
 ('19942-clean.txt', 35861),
 ('20-clean.txt', 80059),
 ('20203-clean.txt', 76167),
 ('

In [38]:
unique_toks_per_fileid1

[('10-clean.txt', 17331),
 ('100-clean.txt', 28919),
 ('105-clean.txt', 6046),
 ('108-clean.txt', 8707),
 ('1080-clean.txt', 1087),
 ('11-clean.txt', 2780),
 ('1112-clean.txt', 3821),
 ('1184-clean.txt', 26118),
 ('12-clean.txt', 3097),
 ('120-clean.txt', 6603),
 ('1232-clean.txt', 5284),
 ('1260-clean.txt', 14991),
 ('1322-clean.txt', 14721),
 ('1342-clean.txt', 6842),
 ('135-clean.txt', 26112),
 ('1399-clean.txt', 14473),
 ('140-clean.txt', 11656),
 ('1400-clean.txt', 12334),
 ('1404-clean.txt', 8954),
 ('14264-clean.txt', 5571),
 ('147-clean.txt', 3595),
 ('1497-clean.txt', 11362),
 ('15399-clean.txt', 7436),
 ('158-clean.txt', 10057),
 ('16-clean.txt', 5026),
 ('160-clean.txt', 7204),
 ('161-clean.txt', 7509),
 ('16382-clean.txt', 11299),
 ('1661-clean.txt', 8459),
 ('1727-clean.txt', 8066),
 ('174-clean.txt', 7324),
 ('1952-yellow_wallpaper-clean.txt', 1270),
 ('19942-clean.txt', 5207),
 ('20-clean.txt', 10237),
 ('20203-clean.txt', 8300),
 ('203-clean.txt', 13909),
 ('205-clean.t

In [44]:
save_stuff('simple_tok', dictionary1, corpus1)

## EDA Step 2: + stop word removal

In [45]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop = set(stopwords.words('english'))
print stop

[nltk_data] Downloading package stopwords to /home/ubuntu/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
set([u'all', u'just', u'being', u'over', u'both', u'through', u'yourselves', u'its', u'before', u'o', u'hadn', u'herself', u'll', u'had', u'should', u'to', u'only', u'won', u'under', u'ours', u'has', u'do', u'them', u'his', u'very', u'they', u'not', u'during', u'now', u'him', u'nor', u'd', u'did', u'didn', u'this', u'she', u'each', u'further', u'where', u'few', u'because', u'doing', u'some', u'hasn', u'are', u'our', u'ourselves', u'out', u'what', u'for', u'while', u're', u'does', u'above', u'between', u'mustn', u't', u'be', u'we', u'who', u'were', u'here', u'shouldn', u'hers', u'by', u'on', u'about', u'couldn', u'of', u'against', u's', u'isn', u'or', u'own', u'into', u'yourself', u'down', u'mightn', u'wasn', u'your', u'from', u'her', u'their', u'aren', u'there', u'been', u'whom', u'too', u'wouldn', u'themselves', u'weren', u'was', u'until', u'more', u'himself', u'that',

In [59]:
def transform_txt_file_v2(fname, root=source_dir, stop_words=stop):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Assumes you want to remove empty lines and tokenize (because you do)
    '''
    fp = root + fname
    book_as_lst = []
    for line in IterFile(fp):
        
        if empty_line_check(line) == False:
            line = basic_tokenize(line)
            
            if stop_words !=None:
                line = remove_stop_words(line, stop_words)
        
        book_as_lst.extend(line)
        
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty
    
def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]    
            
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 

In [47]:
output_v2 = eda(transform_txt_file_v2)

In [48]:
book_lengths2, avg_num_tokens2, dictionary2, dictionary_length2, \
    unique_toks_per_fileid2, avg_unique_toks2, corpus2 = output_v2

In [49]:
print "Average number of tokens in a book: ", avg_num_tokens2
print "   "
print "Average unique tokens in a book: ", avg_unique_toks2
print "   "
print "Total number of words (dictionary length): ", dictionary_length2

Average number of tokens in a book:  66904
   
Average unique tokens in a book:  8882
   
Total number of words (dictionary length):  194961


In [50]:
#make graph of reduction

In [51]:
book_lengths2

[('10-clean.txt', 431250),
 ('100-clean.txt', 481608),
 ('105-clean.txt', 39320),
 ('108-clean.txt', 54072),
 ('1080-clean.txt', 1739),
 ('11-clean.txt', 13616),
 ('1112-clean.txt', 15468),
 ('1184-clean.txt', 251348),
 ('12-clean.txt', 15589),
 ('120-clean.txt', 34581),
 ('1232-clean.txt', 23021),
 ('1260-clean.txt', 91946),
 ('1322-clean.txt', 67400),
 ('1342-clean.txt', 57307),
 ('135-clean.txt', 288377),
 ('1399-clean.txt', 173226),
 ('140-clean.txt', 70693),
 ('1400-clean.txt', 87450),
 ('1404-clean.txt', 91284),
 ('14264-clean.txt', 38978),
 ('147-clean.txt', 10640),
 ('1497-clean.txt', 98946),
 ('15399-clean.txt', 38240),
 ('158-clean.txt', 75454),
 ('16-clean.txt', 23919),
 ('160-clean.txt', 32962),
 ('161-clean.txt', 56016),
 ('16382-clean.txt', 71067),
 ('1661-clean.txt', 49411),
 ('1727-clean.txt', 59474),
 ('174-clean.txt', 38744),
 ('1952-yellow_wallpaper-clean.txt', 2978),
 ('19942-clean.txt', 18535),
 ('20-clean.txt', 47100),
 ('20203-clean.txt', 37660),
 ('203-clean.txt

In [52]:
unique_toks_per_fileid2

[('10-clean.txt', 17208),
 ('100-clean.txt', 28785),
 ('105-clean.txt', 5922),
 ('108-clean.txt', 8578),
 ('1080-clean.txt', 994),
 ('11-clean.txt', 2653),
 ('1112-clean.txt', 3701),
 ('1184-clean.txt', 25988),
 ('12-clean.txt', 2977),
 ('120-clean.txt', 6479),
 ('1232-clean.txt', 5159),
 ('1260-clean.txt', 14861),
 ('1322-clean.txt', 14591),
 ('1342-clean.txt', 6716),
 ('135-clean.txt', 25978),
 ('1399-clean.txt', 14342),
 ('140-clean.txt', 11530),
 ('1400-clean.txt', 12205),
 ('1404-clean.txt', 8831),
 ('14264-clean.txt', 5449),
 ('147-clean.txt', 3472),
 ('1497-clean.txt', 11236),
 ('15399-clean.txt', 7306),
 ('158-clean.txt', 9933),
 ('16-clean.txt', 4902),
 ('160-clean.txt', 7080),
 ('161-clean.txt', 7387),
 ('16382-clean.txt', 11174),
 ('1661-clean.txt', 8332),
 ('1727-clean.txt', 7939),
 ('174-clean.txt', 7202),
 ('1952-yellow_wallpaper-clean.txt', 1164),
 ('19942-clean.txt', 5078),
 ('20-clean.txt', 10120),
 ('20203-clean.txt', 8169),
 ('203-clean.txt', 13780),
 ('205-clean.txt

In [53]:
save_stuff('no_stopwords', dictionary2, corpus2)

## EDA Step 3: + lemmatization

In [60]:
from gensim.utils import lemmatize

In [61]:
def transform_txt_file_v3(fname, root=source_dir, stop_words=stop, \
                         lemma = True):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Note that lemmatizing takes care of the tokenize/punct stripping/lowercasing/
        stopword removal
    '''
    fp = root + fname
    book_as_lst = []
    for line in IterFile(fp):
        
        if empty_line_check(line) == False:
            
            
            if lemma == True:
                line = lemmatize(line)
            
            else:
                line = basic_tokenize(line)
                
                if stop_words !=None:
                    line = remove_stop_words(line, stop_words)
        
            book_as_lst.extend(line)
    
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty

def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
           
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 
    

In [62]:
usher_v3 = transform_txt_file_v3(usher)
usher_v3

['fall/NN',
 'house/NN',
 'usher/NN',
 'son/NN',
 'coeur/NN',
 'luth/NN',
 'suspendu/NN',
 'sitot/NN',
 'qu/NN',
 'le/NN',
 'touche/NN',
 'il/NN',
 'resonne/NN',
 'beranger/JJ',
 'whole/NN',
 'dull/JJ',
 'dark/NN',
 'soundless/NN',
 'day/NN',
 'autumn/NN',
 'year/NN',
 'cloud/NN',
 'hung/JJ',
 'oppressively/RB',
 'low/JJ',
 'heaven/NN',
 'have/VB',
 'be/VB',
 'pass/VB',
 'alone/RB',
 'horseback/NN',
 'singularly/RB',
 'dreary/JJ',
 'tract/NN',
 'country/NN',
 'length/NN',
 'find/VB',
 'shade/NN',
 'evening/NN',
 'draw/VB',
 'view/NN',
 'melancholy/NN',
 'house/NN',
 'usher/NN',
 'know/VB',
 'not/RB',
 'be/VB',
 'first/JJ',
 'glimpse/NN',
 'build/VB',
 'sense/NN',
 'insufferable/JJ',
 'gloom/NN',
 'pervade/VB',
 'spirit/NN',
 'say/VB',
 'insufferable/JJ',
 'feeling/NN',
 'be/VB',
 'unrelieved/JJ',
 'half/NN',
 'pleasureable/NN',
 'poetic/JJ',
 'sentiment/NN',
 'mind/NN',
 'usually/RB',
 'receive/VB',
 'even/RB',
 'sternest/NN',
 'natural/JJ',
 'image/NN',
 'desolate/JJ',
 'terrible/JJ',

In [None]:
#still doesn't run, even on ec2
#gave ~15mins

In [None]:
output_v3 = eda(transform_txt_file_v3)

In [82]:
def transform_txt_file_v3_alt(fname, root=source_dir, stop_words=stop, \
                         lemma = True):
    '''
    Top-level function to call all of the subfunctions for text transformation
    Trying to lemmatize each book at once, rather than line-by-line - to attempt
        to fix apparent processing hang-up in v3
    '''
    fp = root + fname
    book_as_lst = []
    
    if lemma == True:
        for line in IterFile(fp):
            
            if empty_line_check(line) == False:
                book_as_lst.extend([word for word in line.split()])
                
        book_as_lst = lemmatize(" ".join(book_as_lst))
            
    else:
        for line in IterFile(fp):
            if empty_line_check(line) == False:
                line = basic_tokenize(line)
                
            if stop_words !=None:
                line = remove_stop_words(line, stop_words)
        
            book_as_lst.extend(line)
    
    return book_as_lst

def empty_line_check(line) :
    '''
    checks for empty line
    '''
    if line == "\n":
        empty = True
    else:
        empty = False
    return empty

def basic_tokenize(line):
    '''
    convert to list
    strip punctuation, lowercase
    '''
    return [tok.lower().strip(punctuation) for tok in line.strip('\n').split()]
           
def remove_stop_words(line, stop_words):
    return [tok for tok in line if tok not in stop_words] 
    

In [83]:
# note: need another layer of punct stripping, for sententce ending in 
#quotations.  ("I did not." => I, did, not.)

In [84]:
usher_v3_alt = transform_txt_file_v3_alt(usher)
usher_v3_alt

['fall/NN',
 'house/NN',
 'usher/NN',
 'son/NN',
 'coeur/NN',
 'luth/NN',
 'suspendu/NN',
 'sitot/NN',
 'qu/NN',
 'le/NN',
 'touche/NN',
 'il/NN',
 'beranger/JJ',
 'whole/NN',
 'dull/JJ',
 'dark/NN',
 'soundless/NN',
 'day/NN',
 'autumn/NN',
 'year/NN',
 'cloud/NN',
 'hung/JJ',
 'oppressively/RB',
 'low/JJ',
 'heaven/NN',
 'have/VB',
 'be/VB',
 'pass/VB',
 'alone/RB',
 'horseback/NN',
 'singularly/RB',
 'dreary/JJ',
 'tract/NN',
 'country/NN',
 'length/NN',
 'find/VB',
 'shade/NN',
 'evening/NN',
 'draw/VB',
 'view/NN',
 'melancholy/NN',
 'house/NN',
 'usher/NN',
 'know/VB',
 'not/RB',
 'be/VB',
 'first/JJ',
 'glimpse/NN',
 'build/VB',
 'sense/NN',
 'insufferable/JJ',
 'gloom/NN',
 'pervade/VB',
 'spirit/NN',
 'say/VB',
 'insufferable/JJ',
 'feeling/NN',
 'be/VB',
 'unrelieved/JJ',
 'half/NN',
 'pleasureable/NN',
 'poetic/JJ',
 'sentiment/NN',
 'mind/NN',
 'usually/RB',
 'receive/VB',
 'even/RB',
 'sternest/JJ',
 'natural/JJ',
 'image/NN',
 'desolate/JJ',
 'terrible/JJ',
 'look/VB',
 '

In [85]:
#Can't get either version of the eda process w lemmatization transformation to run
#Next step - try to run each piece individually and see where the issue is?
#Also, may not lemmatize?
#Use another package's lemmatizer/stemmer?

In [86]:
output_v3_alt = eda(transform_txt_file_v3_alt)

KeyboardInterrupt: 

In [None]:
#didn't run on ec2 after about 20mins
#look into batching (Lemmatizer I think)
#ask Joe

In [80]:
book_lengths3, avg_num_tokens3, dictionary3, dictionary_length3, \
    unique_toks_per_fileid3, avg_unique_toks3, corpus3 = output_v3

NameError: name 'output_v3' is not defined

In [None]:
print "Average number of tokens in a book: ", avg_num_tokens3
print "   "
print "Average unique tokens in a book: ", avg_unique_toks3
print "   "
print "Total number of words (dictionary length): ", dictionary_length3

In [None]:
book_lengths3

In [None]:
unique_toks_per_fileid3

In [85]:
save_stuff('lemmatized', dictionary3, corpus3)

## EDA Step 4: + frequency filters

In [98]:
def eda_w_filter(transform_txt_file, fileid_lst=fileid_lst):
    '''
    Do transformations with updated transformation function and return all the eda items
    '''
    
    all_transf_books_lst = [transform_txt_file(f) for f in fileid_lst]
    
    book_lengths = [(tup[0], len(tup[1])) for tup in zip(fileid_lst, all_transf_books_lst)]
    avg_num_tokens = int(np.mean([len(book) for book in all_transf_books_lst]))
    
    dictionary = corpora.Dictionary(all_transf_books_lst)
    dictionary.filter_extremes(no_below=1)
    dictionary_length = len(dictionary)
    
    corpus = [dictionary.doc2bow(book) for book in all_transf_books_lst]
    
    unique_toks_num_lst = [len(book) for book in corpus]
    unique_toks_per_fileid = zip(fileid_lst, unique_toks_num_lst)
    avg_unique_toks = int(np.mean(unique_toks_num_lst))
    
    
    return book_lengths, avg_num_tokens, dictionary, dictionary_length, unique_toks_per_fileid, avg_unique_toks, corpus


def save_stuff(distinguishing_str, dictionary, corpus, model, outputs_dir=outputs_dir):
    '''
    Save the outputs of the most recent eda step
    '''
    if dictionary != None:
        dictionary.save(outputs_dir + distinguishing_str + '.dict')
        
    if corpus != None:
        corpora.MmCorpus.serialize(outputs_dir + distinguishing_str + '_corpus.mm', corpus)
    
    if model != None:
        pass

In [90]:
outputs4 = eda_w_filter(transform_txt_file_v2)

In [91]:
book_lengths4, avg_num_tokens4, dictionary4, dictionary_length4, \
    unique_toks_per_fileid4, avg_unique_toks4, corpus4 = outputs4

In [92]:
len(dictionary4)

100000

In [93]:
print "Average number of tokens in a book: ", avg_num_tokens4, avg_num_tokens2
print "   "
print "Average unique tokens in a book: ", avg_unique_toks4, avg_unique_toks2
print "   "
print "Total number of words (dictionary length): ", dictionary_length4, dictionary_length2

Average number of tokens in a book:  66904 66904
   
Average unique tokens in a book:  5461 8882
   
Total number of words (dictionary length):  100000 194961


In [94]:
book_lengths4

[('10-clean.txt', 431250),
 ('100-clean.txt', 481608),
 ('105-clean.txt', 39320),
 ('108-clean.txt', 54072),
 ('1080-clean.txt', 1739),
 ('11-clean.txt', 13616),
 ('1112-clean.txt', 15468),
 ('1184-clean.txt', 251348),
 ('12-clean.txt', 15589),
 ('120-clean.txt', 34581),
 ('1232-clean.txt', 23021),
 ('1260-clean.txt', 91946),
 ('1322-clean.txt', 67400),
 ('1342-clean.txt', 57307),
 ('135-clean.txt', 288377),
 ('1399-clean.txt', 173226),
 ('140-clean.txt', 70693),
 ('1400-clean.txt', 87450),
 ('1404-clean.txt', 91284),
 ('14264-clean.txt', 38978),
 ('147-clean.txt', 10640),
 ('1497-clean.txt', 98946),
 ('15399-clean.txt', 38240),
 ('158-clean.txt', 75454),
 ('16-clean.txt', 23919),
 ('160-clean.txt', 32962),
 ('161-clean.txt', 56016),
 ('16382-clean.txt', 71067),
 ('1661-clean.txt', 49411),
 ('1727-clean.txt', 59474),
 ('174-clean.txt', 38744),
 ('1952-yellow_wallpaper-clean.txt', 2978),
 ('19942-clean.txt', 18535),
 ('20-clean.txt', 47100),
 ('20203-clean.txt', 37660),
 ('203-clean.txt

In [95]:
book_lengths2

[('10-clean.txt', 431250),
 ('100-clean.txt', 481608),
 ('105-clean.txt', 39320),
 ('108-clean.txt', 54072),
 ('1080-clean.txt', 1739),
 ('11-clean.txt', 13616),
 ('1112-clean.txt', 15468),
 ('1184-clean.txt', 251348),
 ('12-clean.txt', 15589),
 ('120-clean.txt', 34581),
 ('1232-clean.txt', 23021),
 ('1260-clean.txt', 91946),
 ('1322-clean.txt', 67400),
 ('1342-clean.txt', 57307),
 ('135-clean.txt', 288377),
 ('1399-clean.txt', 173226),
 ('140-clean.txt', 70693),
 ('1400-clean.txt', 87450),
 ('1404-clean.txt', 91284),
 ('14264-clean.txt', 38978),
 ('147-clean.txt', 10640),
 ('1497-clean.txt', 98946),
 ('15399-clean.txt', 38240),
 ('158-clean.txt', 75454),
 ('16-clean.txt', 23919),
 ('160-clean.txt', 32962),
 ('161-clean.txt', 56016),
 ('16382-clean.txt', 71067),
 ('1661-clean.txt', 49411),
 ('1727-clean.txt', 59474),
 ('174-clean.txt', 38744),
 ('1952-yellow_wallpaper-clean.txt', 2978),
 ('19942-clean.txt', 18535),
 ('20-clean.txt', 47100),
 ('20203-clean.txt', 37660),
 ('203-clean.txt

In [96]:
unique_toks_per_fileid4

[('10-clean.txt', 9158),
 ('100-clean.txt', 20137),
 ('105-clean.txt', 3102),
 ('108-clean.txt', 5106),
 ('1080-clean.txt', 357),
 ('11-clean.txt', 1034),
 ('1112-clean.txt', 2145),
 ('1184-clean.txt', 16475),
 ('12-clean.txt', 1210),
 ('120-clean.txt', 3567),
 ('1232-clean.txt', 2779),
 ('1260-clean.txt', 10043),
 ('1322-clean.txt', 10005),
 ('1342-clean.txt', 3764),
 ('135-clean.txt', 18618),
 ('1399-clean.txt', 9753),
 ('140-clean.txt', 7287),
 ('1400-clean.txt', 7923),
 ('1404-clean.txt', 6065),
 ('14264-clean.txt', 3071),
 ('147-clean.txt', 1709),
 ('1497-clean.txt', 7402),
 ('15399-clean.txt', 4184),
 ('158-clean.txt', 5274),
 ('16-clean.txt', 2528),
 ('160-clean.txt', 4051),
 ('161-clean.txt', 3979),
 ('16382-clean.txt', 7184),
 ('1661-clean.txt', 4971),
 ('1727-clean.txt', 4713),
 ('174-clean.txt', 4169),
 ('1952-yellow_wallpaper-clean.txt', 372),
 ('19942-clean.txt', 2695),
 ('20-clean.txt', 6238),
 ('20203-clean.txt', 5035),
 ('203-clean.txt', 8870),
 ('205-clean.txt', 7839),

In [146]:
unique_toks_per_fileid2

[('10-clean.txt', 17208),
 ('100-clean.txt', 28785),
 ('105-clean.txt', 5922),
 ('108-clean.txt', 8578),
 ('1080-clean.txt', 994)]

In [99]:
save_stuff('frequency_filtered', dictionary4, corpus4, model=None)

## EDA Step 5: + phrase modelling

In [None]:
#Do if time

# First LDA Model!!!

In [100]:
from gensim.models import ldamodel

In [109]:
lda = ldamodel.LdaModel(corpus=corpus2,alpha='auto', id2word=dictionary2, num_topics=10, update_every=0, passes=20)

In [110]:
name = 'first_lda.model'
lda.save(outputs_dir + name)


In [111]:
print lda

LdaModel(num_terms=194961, num_topics=10, decay=0.5, chunksize=2000)


See the topics and their most significant terms

In [183]:
lda.show_topics()

[(0,
  u'0.027*"\n" + 0.007*"one" + 0.007*"would" + 0.005*"may" + 0.004*"man" + 0.004*"small" + 0.004*"said" + 0.004*"upon" + 0.004*"letter" + 0.003*"us"'),
 (1,
  u'0.036*"\n" + 0.010*"said" + 0.007*"would" + 0.007*"one" + 0.006*"could" + 0.005*"mr" + 0.004*"know" + 0.004*"time" + 0.004*"little" + 0.004*"see"'),
 (2,
  u'0.038*"\n" + 0.007*"one" + 0.006*"said" + 0.004*"would" + 0.004*"man" + 0.003*"us" + 0.003*"time" + 0.003*"like" + 0.003*"could" + 0.003*"much"'),
 (3,
  u'0.036*"\n" + 0.008*"man" + 0.007*"god" + 0.007*"men" + 0.007*"ich" + 0.006*"power" + 0.006*"one" + 0.005*"law" + 0.004*"also" + 0.004*"therefore"'),
 (4,
  u'0.055*"\n" + 0.022*"shall" + 0.020*"unto" + 0.017*"lord" + 0.013*"thou" + 0.011*"thy" + 0.010*"god" + 0.009*"thee" + 0.009*"ye" + 0.009*"said"'),
 (5,
  u'0.056*"\n" + 0.008*"said" + 0.007*"one" + 0.005*"man" + 0.004*"would" + 0.003*"upon" + 0.003*"like" + 0.003*"two" + 0.003*"could" + 0.003*"little"'),
 (6,
  u'0.038*"\n" + 0.019*"said" + 0.008*"one" + 0.006*

For a given document (in bow format), see most relevant topics

In [112]:
lda.get_document_topics(dictionary2.doc2bow(transform_txt_file_v2('1080-clean.txt')))

[(0, 0.83551263725276792),
 (1, 0.017758247818321121),
 (4, 0.033966879261823679),
 (7, 0.11079299655988328)]

In [186]:
lda.get_term_topics(100, minimum_probability=0.00001)

[(0, 0.00081795748432229377),
 (1, 0.00046665540461913628),
 (2, 0.00039821109443075416),
 (3, 0.00089142134354498184),
 (4, 0.00070689873255245941),
 (5, 0.00075794148467451953),
 (6, 0.00025252416712527251),
 (7, 0.00044098219041034128),
 (8, 0.00014582208768044617),
 (9, 0.0016219715531712738)]

In [114]:
lda.get_topic_terms(1)

[(10364, 0.035504333182389526),
 (8954, 0.0096323354543700022),
 (12919, 0.0073228099915345771),
 (1443, 0.006681396288898224),
 (3530, 0.0063353405287126258),
 (39835, 0.0052772699844260773),
 (16982, 0.0042263928343234903),
 (1031, 0.0036816693497437052),
 (9837, 0.00359973572788477),
 (2690, 0.0035094699408091535)]

In [115]:
for tup in lda.get_topic_terms(0):
    print dictionary2[tup[0]]



one
would
may
man
small
said
upon
letter
us


## Word2Vec

In [143]:
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

In [118]:
usher1 = transform_txt_file_v1(usher)
usher1

[u'the',
 u'fall',
 u'of',
 u'the',
 u'house',
 u'of',
 u'usher',
 u'son',
 u'coeur',
 u'est',
 u'un',
 u'luth',
 u'suspendu',
 u'sitot',
 u"qu'on",
 u'le',
 u'touche',
 u'il',
 u'resonne',
 u'de',
 u'beranger',
 u'during',
 u'the',
 u'whole',
 u'of',
 u'a',
 u'dull',
 u'dark',
 u'and',
 u'soundless',
 u'day',
 u'in',
 u'the',
 u'autumn',
 u'of',
 u'the',
 u'year',
 u'when',
 u'the',
 u'clouds',
 u'hung',
 u'oppressively',
 u'low',
 u'in',
 u'the',
 u'heavens',
 u'i',
 u'had',
 u'been',
 u'passing',
 u'alone',
 u'on',
 u'horseback',
 u'through',
 u'a',
 u'singularly',
 u'dreary',
 u'tract',
 u'of',
 u'country',
 u'and',
 u'at',
 u'length',
 u'found',
 u'myself',
 u'as',
 u'the',
 u'shades',
 u'of',
 u'the',
 u'evening',
 u'drew',
 u'on',
 u'within',
 u'view',
 u'of',
 u'the',
 u'melancholy',
 u'house',
 u'of',
 u'usher',
 u'i',
 u'know',
 u'not',
 u'how',
 u'it',
 u'was--but',
 u'with',
 u'the',
 u'first',
 u'glimpse',
 u'of',
 u'the',
 u'building',
 u'a',
 u'sense',
 u'of',
 u'insuffe

In [144]:
sentences = LineSentence(source_dir + usher)
sentences

<gensim.models.word2vec.LineSentence at 0x7fa3810c6790>

In [182]:
#this does nothing to prepare the text - this was just to get some data
#read in. Look here for iterator method: https://rare-technologies.com/word2vec-tutorial/

In [159]:
w2v_model = Word2Vec(sentences)

In [160]:
##Note: word2vec does word freq pruning

In [161]:
type(w2v_model)

gensim.models.word2vec.Word2Vec

In [162]:
w2v_model.corpus_count

721

In [163]:
t = w2v_model.create_binary_tree()
t
#?? doesn't do anything

In [164]:
w2v_model.raw_vocab

defaultdict(int, {})

In [165]:
w2v_model.wv.index2word[0]

u'the'

In [179]:
[w2v_model.wv.index2word[n] for n in xrange(154)]

[u'the',
 u'of',
 u'and',
 u'I',
 u'a',
 u'in',
 u'to',
 u'which',
 u'his',
 u'that',
 u'had',
 u'was',
 u'with',
 u'as',
 u'my',
 u'upon',
 u'he',
 u'not',
 u'for',
 u'at',
 u'from',
 u'an',
 u'this',
 u'The',
 u'by',
 u'its',
 u'have',
 u'no',
 u'so',
 u'all',
 u'were',
 u'or',
 u'her',
 u'me',
 u'been',
 u'it',
 u'there',
 u'be',
 u'now',
 u'more',
 u'long',
 u'but',
 u'It',
 u'yet',
 u'some',
 u'into',
 u'over',
 u'on',
 u'could',
 u'A',
 u'His',
 u'if',
 u'one',
 u'very',
 u'about',
 u'even',
 u'however,',
 u'certain',
 u'through',
 u'their',
 u'portion',
 u'before',
 u'was,',
 u'than',
 u'many',
 u'him',
 u'within',
 u'having',
 u'our',
 u'And',
 u'wild',
 u'still',
 u'any',
 u'In',
 u'found',
 u'what',
 u'thus',
 u'well',
 u'we',
 u'character',
 u'these',
 u'it,',
 u'whole',
 u'up',
 u'me,',
 u'mind',
 u'shall',
 u'door,',
 u'air',
 u'hung',
 u'low',
 u'did',
 u'and,',
 u'myself',
 u'might',
 u'much',
 u'should',
 u'she',
 u'will',
 u'me.',
 u'would',
 u'ghastly',
 u'Usher',
 u'

In [180]:
w2v_model.wv['Usher']

array([  8.45681783e-03,  -4.39963164e-03,  -4.91341110e-03,
         5.38371969e-03,   6.34652330e-04,   5.14846249e-03,
        -3.66568426e-03,  -8.43488239e-03,   1.08329077e-05,
        -8.17000493e-03,  -2.30804435e-03,  -7.82304816e-03,
        -7.98802357e-03,   5.43261506e-03,  -2.34154589e-03,
         1.23616206e-02,   1.67156872e-03,  -2.01779325e-03,
         4.29236749e-03,  -4.66562016e-03,  -5.38349792e-04,
         1.96097558e-03,   1.03308698e-02,   6.90320227e-03,
        -6.02438522e-05,   3.74780095e-04,  -3.29784438e-04,
        -2.37007462e-03,  -1.33334603e-02,   1.53001456e-03,
        -1.39171313e-02,   1.34639088e-02,  -4.02349886e-03,
        -7.04521826e-03,  -4.18594759e-03,   7.48682255e-03,
        -6.18552440e-04,   3.92296817e-03,  -9.61316400e-04,
        -5.97934355e-04,  -2.50450894e-03,   1.25131372e-03,
        -5.60161285e-03,  -5.99381467e-03,  -2.64261430e-03,
         1.49250822e-03,   3.71399103e-03,   1.07647404e-02,
         4.66987398e-03,

In [181]:
##lots of work to do to understand this model and how to use it for my pres

## Doc2Vec

In [None]:
#dewey decimal system recreation??
#recreate gutenberg topics?

At the end - bar chart of changes?