In [1]:
import os
import codecs
import pandas as pd
from utils.helpers import adding_stanford_nlp_groups_NER_to_stop_words, removing_stanford_nlp_groups_NER_from_stop_words, punct_space_stop, line_review, lemmatized_sentence_corpus

essays = pd.read_csv('./data/prepped_essays_df.csv')

In [2]:
essays = essays[essays['essay_set'] == 1]

In [3]:
len(essays)

1783

In [4]:
essays.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6,prompt,has_source_material,source_text,grade_7,grade_8,grade_10
0,1,1,"Dear local newspaper, I think effects computer...",4.0,4.0,,8.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,,9.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,,7.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,,10.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0
4,5,1,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,,8.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0


In [5]:
essays.dropna(axis=1, how='all', inplace=True)

In [6]:
essays.head(1)

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,prompt,has_source_material,grade_7,grade_8,grade_10
0,1,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,"More and more people use computers, but not ev...",0,0,1,0


In [7]:
essays.iloc[0, 2]

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

In [8]:
intermediate_directory = os.path.join('./data/intermediate')

essay_set1_txt_filepath = os.path.join(intermediate_directory,
                                   'essay_set1_text_all.txt')

In [9]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    
    essay_count = 0

    # create & open a new file in write mode
    with codecs.open(essay_set1_txt_filepath, 'w', encoding='utf_8') as essay_set1_txt_file:

        # loop through all essays in the dataframe
        for row in essays.itertuples():

            # write the essay as a line in the new file and escape newline characters in the original essays
            essay_set1_txt_file.write(row.essay.replace('\n', '\\n') + '\n')
            essay_count += 1

    print('Text from {:,} essays written to the new txt file.'.format(essay_count))
    
else:
    
    with codecs.open(essay_set1_txt_filepath, encoding='utf_8') as essay_set1_txt_file:
        for essay_count, line in enumerate(essay_set1_txt_file):
            pass
        
    print('Text from {:,} essays in the txt file.'.format(essay_count + 1))

Text from 1,783 essays written to the new txt file.
CPU times: user 13.1 ms, sys: 3.37 ms, total: 16.4 ms
Wall time: 45.5 ms


#### Text Preprocessing with spaCy

In [10]:
import spacy
import itertools as it

nlp = spacy.load('en_core_web_md')

In [11]:
test_essay = essays.iloc[0, 2]

In [12]:
test_essay

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

In [13]:
%%time
parsed_essay = nlp(test_essay)

CPU times: user 71.6 ms, sys: 14.9 ms, total: 86.5 ms
Wall time: 149 ms


In [14]:
for num, sentence in enumerate(parsed_essay.sents):
    print('Sentence {}:'.format(num + 1))
    print(sentence)
    print('')

Sentence 1:
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble!

Sentence 2:
Thing about!

Sentence 3:
Dont you think so?

Sentence 4:
How would you feel if your teenager is always on the phone with friends!

Sentence 5:
Do you ever time to chat with your friends or buisness partner about things.

Sentence 6:
Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect.

Sentence 7:
Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it.

Sentence 8:
How did you learn about other countrys

Sentence 9:
/states outside of yours?

Sentence 10:
Well I have by computer/internet

Sentence 11:
, it's a new way to 

In [15]:
for num, entity in enumerate(parsed_essay.ents):
    print('Entity {}:'.format(num + 1), entity, '-', entity.label_)
    print('')

Entity 1: @CAPS1 - ORG

Entity 2: @DATE1 - ORG

Entity 3: all day - DATE

Entity 4: @CAPS2 - ORG



In [16]:
# token_text = [token.orth_ for token in parsed_essay]
# token_pos = [token.pos_ for token in parsed_essay]

# pd.DataFrame(zip(token_text, token_pos),
#              columns=['token_text', 'part_of_speech'])

In [17]:
# token_lemma = [token.lemma_ for token in parsed_essay]
# token_shape = [token.shape_ for token in parsed_essay]

# pd.DataFrame(zip(token_text, token_lemma, token_shape),
#              columns=['token_text', 'token_lemma', 'token_shape'])

In [18]:
# token_entity_type = [token.ent_type_ for token in parsed_essay]
# token_entity_iob = [token.ent_iob_ for token in parsed_essay]

# pd.DataFrame(zip(token_text, token_entity_type, token_entity_iob),
#              columns=['token_text', 'entity_type', 'inside_outside_begin'])

In [19]:
# token_attributes = [(token.orth_,
#                      token.prob,
#                      token.is_stop,
#                      token.is_punct,
#                      token.is_space,
#                      token.like_num,
#                      token.is_oov)
#                     for token in parsed_essay]

# df = pd.DataFrame(token_attributes,
#                   columns=['text',
#                            'log_probability',
#                            'stop?',
#                            'punctuation?',
#                            'whitespace?',
#                            'number?',
#                            'out of vocab.?'])

# df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
#                                        .applymap(lambda x: u'Yes' if x else u''))
                                               
# df

In [20]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [21]:
# removing_stanford_nlp_groups_NER_from_stop_words(nlp)
adding_stanford_nlp_groups_NER_to_stop_words(nlp)

In [22]:
unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all_essays.txt')

In [23]:
essays_set1_all_filepath = os.path.join(intermediate_directory, 'essay_set1_text_all.txt')

In [24]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(essays_set1_all_filepath, codecs, nlp):
            f.write(sentence + '\n')

CPU times: user 55.8 s, sys: 16.5 s, total: 1min 12s
Wall time: 1min 15s


In [25]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [26]:
for unigram_sentence in it.islice(unigram_sentences, 19, 42):
    print(' '.join(unigram_sentence))
    print('')

believe computer benefit way like talk friend website like facebook mysace

computer help find coordibate location able ourselfs million information

computer benefit help job plan house plan type page report job write

let wonder world technology

computer help life talk make friend line

people myspace facebooks aim benefit have conversation

people believe computer bad friend talk

fortunate have computer help school work social life friend

computer help find location coordibate million information online

internet lot know website help location coordinate like

use computer

suppose vacation

million information find internet

question computer

easily draw house plan computer hour hand ugly erazer mark garrentee find job drawing like

apple job worker write long paper like word essay job fit people know like write word non stopp hour

hav

computer

computer need lot adays

hope essay impact descion computer great machine work

day show mom use computer

say great invention sense

In [27]:
type(unigram_sentences)

gensim.models.word2vec.LineSentence

In [28]:
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')

Train a phrase model for word pairs, let's apply it to the sentences data and explore the results

In [29]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 852 ms, sys: 37 ms, total: 889 ms
Wall time: 1.3 s


In [30]:
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_all.txt')

In [31]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = ' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')

CPU times: user 1.78 s, sys: 14 ms, total: 1.79 s
Wall time: 1.93 s


In [32]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [33]:
for bigram_sentence in it.islice(bigram_sentences, 19, 42):
    print(' '.join(bigram_sentence))
    print('')

believe computer benefit way like talk friend website like facebook mysace

computer help find coordibate location able ourselfs million information

computer benefit help job plan house plan type page report job write

let wonder world technology

computer help life talk make friend line

people myspace facebooks aim benefit have conversation

people believe computer bad friend talk

fortunate have computer help school_work social life friend

computer help find location coordibate million information online

internet lot know website help location coordinate like

use computer

suppose vacation

million information find internet

question computer

easily draw house plan computer hour hand ugly erazer mark garrentee find job drawing like

apple job worker write long paper like word essay job fit people know like write word non stopp hour

hav

computer

computer need lot adays

hope essay impact descion computer great machine work

day show mom use computer

say great_invention sense

In [34]:
trigram_model_filepath = os.path.join(intermediate_directory,
                                      'trigram_model_all')

In [35]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:

    trigram_model = Phrases(bigram_sentences)

    trigram_model.save(trigram_model_filepath)
    
# load the finished model from disk
trigram_model = Phrases.load(trigram_model_filepath)

CPU times: user 750 ms, sys: 24.4 ms, total: 775 ms
Wall time: 909 ms


In [36]:
trigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'trigram_sentences_all.txt')

In [37]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for bigram_sentence in bigram_sentences:
            
            trigram_sentence = ' '.join(trigram_model[bigram_sentence])
            
            f.write(trigram_sentence + '\n')

CPU times: user 1.46 s, sys: 7.16 ms, total: 1.47 s
Wall time: 1.48 s


In [38]:
trigram_sentences = LineSentence(trigram_sentences_filepath)

In [39]:
for trigram_sentence in it.islice(trigram_sentences, 205, 245):
    print(' '.join(trigram_sentence))
    print('')

right away

way able communicate family friend computer

computer easy e_mail tell answer house phone

happen think

People need communicate computer lot

work need talk employee co_worker leave desk e_mail information

employee fast_easy e_mail information oppose talk phone person

lot people agree computer life lot easy

computer teach_hand_eye_coordination let communicate people

critical reason computer let people learn_faraway_place people

difference way people feel computer

write local_newspaper

dear_reader dramatic effect human life

change way today

know computer

device allow people buy thing online talk people online provide_entertainment people

good quality everyone life easy

imagine look refrigerator notice

car need grocery shopping store far

computer look food online

ther great_deal

company deliver free

amazing easy way buy food leave_house

food purchase

product sell computer

need new toy kid

new hat friend

maybe curtain room

easy access internet computer 

In [40]:
trigram_essays_all_filepath = os.path.join(intermediate_directory,
                                        'trigram_essays_all.txt')

In [41]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

#     with codecs.open(trigram_essays_all_filepath, 'w', encoding='utf_8') as f:
#         for sentence in lemmatized_sentence_corpus(essays_set1_all_filepath, codecs, nlp):
#             f.write(sentence + '\n')
    
    
    with codecs.open(trigram_essays_all_filepath, 'w', encoding='utf_8') as f:
        
        for parsed_essay in nlp.pipe(line_review(essays_set1_all_filepath, codecs),
                                      batch_size=100, n_threads=4):
            
            # lemmatize the text, removing punctuation and whitespace
            unigram_essays = [token.lemma_ for token in parsed_essay
                              if not punct_space_stop(token)]
            
            # apply the first-order and second-order phrase models
            bigram_essays = bigram_model[unigram_essays]
            trigram_essays = trigram_model[bigram_essays]
            
            # write the transformed review as a line in the new file
            trigram_essays = ' '.join(trigram_essays)
            f.write(trigram_essays + '\n')

CPU times: user 56.7 s, sys: 16.7 s, total: 1min 13s
Wall time: 1min 15s


In [42]:
print('Original:' + '\n')

for essay in it.islice(line_review(essays_set1_all_filepath, codecs), 301, 302):
    print(essay)

print('----' + '\n')
print('Transformed:' + '\n')

with codecs.open(trigram_essays_all_filepath, encoding='utf_8') as f:
    for essay in it.islice(f, 301, 302):
        print(essay)

Original:

Dear @CAPS1 times, @CAPS2 you think computers benefit society? Well I think so! There are countless reasons why computers are both resourceful and helpful. Many citizens in our own community of watertown think computers are a great resource for many things while others disagree with this completely. Computers can benefit society because you can learn many new things on the internet, also you can interact with your friends and family, and lastly there are many applications used for business. On both a computer and the internet there are more than @NUM1 million things you can learn. When you are struggling with homework a computer is a great resource. You can quickly open @CAPS3.com and search any topic at any time. For example, if you did not know a conversion it is easily found on the internet. Another thing you can be taught or informed about is news. There are websites such as nytimes.com and cnn.com that give you daily news. I personally use these websites weekly. On the 

## Topic Modeling with Latent Dirichlet Allocation (_LDA_)

*Topic modeling* is family of techniques that can be used to describe and summarize the documents in a corpus according to a set of latent "topics". For this demo, we'll be using [*Latent Dirichlet Allocation*](http://www.jmlr.org/papers/volume3/blei03a/blei03a.pdf) or LDA, a popular approach to topic modeling.

In many conventional NLP applications, documents are represented a mixture of the individual tokens (words and phrases) they contain. In other words, a document is represented as a *vector* of token counts. There are two layers in this model &mdash; documents and tokens &mdash; and the size or dimensionality of the document vectors is the number of tokens in the corpus vocabulary. This approach has a number of disadvantages:
* Document vectors tend to be large (one dimension for each token $\Rightarrow$ lots of dimensions)
* They also tend to be very sparse. Any given document only contains a small fraction of all tokens in the vocabulary, so most values in the document's token vector are 0.
* The dimensions are fully indepedent from each other &mdash; there's no sense of connection between related tokens, such as _knife_ and _fork_.

LDA injects a third layer into this conceptual model. Documents are represented as a mixture of a pre-defined number of *topics*, and the *topics* are represented as a mixture of the individual tokens in the vocabulary. The number of topics is a model hyperparameter selected by the practitioner. LDA makes a prior assumption that the (document, topic) and (topic, token) mixtures follow [*Dirichlet*](https://en.wikipedia.org/wiki/Dirichlet_distribution) probability distributions. This assumption encourages documents to consist mostly of a handful of topics, and topics to consist mostly of a modest set of the tokens.

In [43]:
from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import pyLDAvis
import pyLDAvis.gensim
import warnings
import pickle

The first step to creating an LDA model is to learn the full vocabulary of the corpus to be modeled. We'll use gensim's [**Dictionary**](https://radimrehurek.com/gensim/corpora/dictionary.html) class for this.

In [44]:
trigram_dictionary_filepath = os.path.join(intermediate_directory,
                                           'trigram_dict_all.dict')

In [45]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to learn the dictionary yourself.
if 1 == 1:

    trigram_essays = LineSentence(trigram_essays_all_filepath)

    # learn the dictionary by iterating over all of the reviews
    trigram_dictionary = Dictionary(trigram_essays)
    
    # filter tokens that are very rare or too common from
    # the dictionary (filter_extremes) and reassign integer ids (compactify)
    trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)
    trigram_dictionary.compactify()

    trigram_dictionary.save(trigram_dictionary_filepath)
    
# load the finished dictionary from disk
trigram_dictionary = Dictionary.load(trigram_dictionary_filepath)

CPU times: user 279 ms, sys: 8.02 ms, total: 287 ms
Wall time: 675 ms


Like many NLP techniques, LDA uses a simplifying assumption known as the [*bag-of-words* model](https://en.wikipedia.org/wiki/Bag-of-words_model). In the bag-of-words model, a document is represented by the counts of distinct terms that occur within it. Additional information, such as word order, is discarded. 

Using the gensim Dictionary we learned to generate a bag-of-words representation for each review. The `trigram_bow_generator` function implements this. We'll save the resulting bag-of-words reviews as a matrix.

In the following code, "bag-of-words" is abbreviated as `bow`.

In [46]:
trigram_bow_filepath = os.path.join(intermediate_directory,
                                    'trigram_bow_corpus_all.mm')

In [47]:
def trigram_bow_generator(filepath):
    """
    generator function to read reviews from a file
    and yield a bag-of-words representation
    """
    
    for essay in LineSentence(filepath):
        yield trigram_dictionary.doc2bow(essay)

In [48]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to build the bag-of-words corpus yourself.
if 1 == 1:

    # generate bag-of-words representations for
    # all reviews and save them as a matrix
    MmCorpus.serialize(trigram_bow_filepath, trigram_bow_generator(trigram_essays_all_filepath))
    
# load the finished bag-of-words corpus from disk
trigram_bow_corpus = MmCorpus(trigram_bow_filepath)

CPU times: user 347 ms, sys: 8.73 ms, total: 356 ms
Wall time: 362 ms


With the bag-of-words corpus, we're finally ready to learn our topic model from the essays. We simply need to pass the bag-of-words matrix and Dictionary from our previous steps to `LdaMulticore` as inputs, along with the number of topics the model should learn. For this demo, we're asking for 5 topics.

In [49]:
lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')

In [50]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the LDA model yourself.
if 1 == 1:

    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        
        # workers => sets the parallelism, and should be
        # set to your number of physical cores minus one
        lda = LdaMulticore(trigram_bow_corpus,
                           num_topics=5,
                           id2word=trigram_dictionary,
                           workers=3)
    
    lda.save(lda_model_filepath)
    
# load the finished LDA model from disk
lda = LdaMulticore.load(lda_model_filepath)

CPU times: user 2.7 s, sys: 268 ms, total: 2.97 s
Wall time: 3.05 s


Our topic model is now trained and ready to use! Since each topic is represented as a mixture of tokens, you can manually inspect which tokens have been grouped together into which topics to try to understand the patterns the model has discovered in the data.

In [51]:
def explore_topic(topic_number, topn=5):
    """
    accept a user-supplied topic number and
    print out a formatted list of the top terms
    """
        
    print('{:20} {}'.format('term', 'frequency') + '\n')

    for term, frequency in lda.show_topic(topic_number, topn=5):
        print('{:20} {:.3f}'.format(term, round(frequency, 3)))

In [56]:
explore_topic(topic_number=4)

term                 frequency

online               0.015
learn                0.013
find                 0.012
look                 0.011
kid                  0.011


In [57]:
topic_names = {0: 'reasons to spend time online',
               1: 'spend time online kids play games',
               2: 'internet helps to learn about the world',
               3: 'want to find/learn but could be bad',
               4: 'internet is a tool to find and look'}

## pickle stops this viz

In [58]:
topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')

with open(topic_names_filepath, 'wb') as f:
    pickle.dump(topic_names, f)

In [59]:
LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')

In [60]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    LDAvis_prepared = pyLDAvis.gensim.prepare(lda, trigram_bow_corpus,
                                              trigram_dictionary, sort=False)

    with open(LDAvis_data_filepath, 'wb') as f:
        pickle.dump(LDAvis_prepared, f)
        
# load the pre-prepared pyLDAvis data from disk
with open(LDAvis_data_filepath, 'rb') as f:
    LDAvis_prepared = pickle.load(f)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


CPU times: user 2.84 s, sys: 182 ms, total: 3.02 s
Wall time: 51.3 s


In [61]:
pyLDAvis.display(LDAvis_prepared)

### Describing text with LDA
Beyond data exploration, one of the key uses for an LDA model is providing a compact, quantitative description of natural language text. Once an LDA model has been trained, it can be used to represent free text as a mixture of the topics the model learned from the original corpus. This mixture can be interpreted as a probability distribution across the topics, so the LDA representation of a paragraph of text might look like 50% _Topic A_, 20% _Topic B_, 20% _Topic C_, and 10% _Topic D_.

To use an LDA model to generate a vector representation of new text, you'll need to apply any text preprocessing steps you used on the model's training corpus to the new text, too. For our model, the preprocessing steps we used include:
1. Using spaCy to remove punctuation and lemmatize the text
1. Applying our first-order phrase model to join word pairs
1. Applying our second-order phrase model to join longer phrases
1. Removing stopwords
1. Creating a bag-of-words representation

Once you've applied these preprocessing steps to the new text, it's ready to pass directly to the model to create an LDA representation. The `lda_description(...)` function will perform all these steps for us, including printing the resulting topical description of the input text.

In [62]:
def get_sample_essay(essay_number):
    """
    retrieve a particular review index
    from the reviews file and return it
    """
    
    return list(it.islice(line_review(essay_set1_txt_filepath, codecs),essay_number, essay_number+1))[0]

In [63]:
def lda_description(essay_text, min_topic_freq=0.05):
    """
    accept the original text of a review and (1) parse it with spaCy,
    (2) apply text pre-proccessing steps, (3) create a bag-of-words
    representation, (4) create an LDA representation, and
    (5) print a sorted list of the top topics in the LDA representation
    """
    
    # parse the essay text with spaCy
    parsed_essay = nlp(essay_text)
    
    # lemmatize the text and remove punctuation and whitespace
    unigram_essay = [token.lemma_ for token in parsed_essay
                      if not punct_space_stop(token)]
    
    # apply the first-order and secord-order phrase models
    bigram_essay = bigram_model[unigram_essay]
    trigram_essay = trigram_model[bigram_essay]
    
    # create a bag-of-words representation
    essay_bow = trigram_dictionary.doc2bow(trigram_essay)
    
    # create an LDA representation
    essay_lda = lda[essay_bow]
    
    # sort with the most highly related topics first
    essay_lda = sorted(essay_lda)
    
    for topic_number, freq in essay_lda:
        if freq < min_topic_freq:
            break
            
        # print the most highly related topic names and frequencies
        print('{:25} {}'.format(topic_names[topic_number],round(freq, 3)))


In [64]:
sample_essay = get_sample_essay(56)
print(sample_essay)

in @DATE1 has become far more advanced. Computers are one of the most futiristic objects. The computer is good for connecting to people around the world. However, computers have bad effects on people's lives. Children are taking time out of their homework to go on the computer, citizens are gaining weight, and children are being kidnapped. School is very important and valuable if you want to suceed in life. However, computers have made it difficult for students to concentrat on their homework. This is a horrible habbit and a bad effect on students every where. Students grades will suffer and will not obtain class credits if this problem is not mentioned. Obesity is a huge problem in @LOCATION1, five out of every ten @CAPS1 are either overweight or obese. This can lead to death. The problem is that people are spending hours on the computer each day. Most of that time can be used to exercise. Exercising can be an important benefit to someone's life. If this problem is not addressed milli

In [65]:
lda_description(sample_essay)

reasons to spend time online 0.800000011920929
internet is a tool to find and look 0.19200000166893005


## Word Vector Embedding with Word2Vec

The goal of *word vector embedding models*, or *word vector models* for short, is to learn dense, numerical vector representations for each term in a corpus vocabulary. If the model is successful, the vectors it learns about each term should encode some information about the *meaning* or *concept* the term represents, and the relationship between it and other terms in the vocabulary. Word vector models are also fully unsupervised &mdash; they learn all of these meanings and relationships solely by analyzing the text of the corpus, without any advance knowledge provided.

Perhaps the best-known word vector model is [word2vec](https://arxiv.org/pdf/1301.3781v3.pdf), originally proposed in 2013. The general idea of word2vec is, for a given *focus word*, to use the *context* of the word &mdash; i.e., the other words immediately before and after it &mdash; to provide hints about what the focus word might mean. To do this, word2vec uses a *sliding window* technique, where it considers snippets of text only a few tokens long at a time.

At the start of the learning process, the model initializes random vectors for all terms in the corpus vocabulary. The model then slides the window across every snippet of text in the corpus, with each word taking turns as the focus word. Each time the model considers a new snippet, it tries to learn some information about the focus word based on the surrouding context, and it "nudges" the words' vector representations accordingly. One complete pass sliding the window across all of the corpus text is known as a training *epoch*. It's common to train a word2vec model for multiple passes/epochs over the corpus. Over time, the model rearranges the terms' vector representations such that terms that frequently appear in similar contexts have vector representations that are *close* to each other in vector space.

For a deeper dive into word2vec's machine learning process, see [here](https://arxiv.org/pdf/1411.2738v4.pdf).

Word2vec has a number of user-defined hyperparameters, including:
- The dimensionality of the vectors. Typical choices include a few dozen to several hundred.
- The width of the sliding window, in tokens. Five is a common default choice, but narrower and wider windows are possible.
- The number of training epochs.

For using word2vec in Python, [gensim](https://rare-technologies.com/deep-learning-with-word2vec-and-gensim/) comes to the rescue again! It offers a [highly-optimized](https://rare-technologies.com/word2vec-in-python-part-two-optimizing/), [parallelized](https://rare-technologies.com/parallelizing-word2vec-in-python/) implementation of the word2vec algorithm with its [Word2Vec](https://radimrehurek.com/gensim/models/word2vec.html) class.

In [66]:
from gensim.models import Word2Vec

trigram_sentences = LineSentence(trigram_sentences_filepath)
word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')

We'll train our word2vec model using the normalized sentences with our phrase models applied. We'll use 100-dimensional vectors, and set up our training process to run for twelve epochs.

In [68]:
# https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

word2vec_filepath = os.path.join(intermediate_directory, 'word2vec_model_all')

In [72]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to train the word2vec model yourself.
if 1 == 1:

    t = time()
    # initiate the model and perform 15 epochs of training
    # workers should be cores - 1
    essay2vec_model = Word2Vec(min_count=20, window=5, size=100, sample=6e-5, alpha=0.03, min_alpha=0.0007, negative=20, workers=4)    
    essay2vec_model.build_vocab(trigram_sentences)
    
    essay2vec_model.train(trigram_sentences, total_examples=essay2vec_model.corpus_count, epochs=15, report_delay=1)
    
    essay2vec_model.save(word2vec_filepath)

        
# load the finished model from disk
essay2vec_model = Word2Vec.load(word2vec_filepath)
essay2vec_model.init_sims()

print('{} training epochs so far.'.format(essay2vec.train_count))

1 training epochs so far.
CPU times: user 6.85 s, sys: 83 ms, total: 6.94 s
Wall time: 5.09 s


In [74]:
print('{:,} terms in the essay2vec vocabulary.'.format(len(essay2vec.wv.vocab)))

1,257 terms in the essay2vec vocabulary.


Let's take a peek at the word vectors our model has learned. We'll create a pandas DataFrame with the terms as the row labels, and the 100 dimensions of the word vector model as the columns.

In [77]:
# build a list of the terms, integer indices,
# and term counts from the food2vec model vocabulary
ordered_vocab = [(term, voc.index, voc.count)
                 for term, voc in essay2vec_model.wv.vocab.items()]

# sort by the term counts, so the most common terms appear first
ordered_vocab = sorted(ordered_vocab)

# unzip the terms, integer indices, and counts into separate lists
ordered_terms, term_indices, term_counts = zip(*ordered_vocab)

# create a DataFrame with the food2vec vectors as data,
# and the terms as row labels
word_vectors = pd.DataFrame(essay2vec_model.wv.syn0norm[term_indices, :], index=ordered_terms)

word_vectors

  


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
@PERSON1,0.030480,0.035113,0.044434,-0.190349,0.148193,0.028287,-0.052105,-0.065568,-0.027474,0.035755,...,-0.052380,-0.133913,-0.021697,0.056268,0.038796,0.067700,-0.137518,0.226192,0.152228,0.085732
@PERSON1_say,0.033969,0.032913,0.044314,-0.191103,0.153822,0.025473,-0.057915,-0.070588,-0.027878,0.032905,...,-0.055568,-0.127464,-0.018683,0.054980,0.039787,0.071721,-0.136831,0.222935,0.155300,0.084624
@PERSON2,0.035231,0.032043,0.041507,-0.194365,0.150678,0.026782,-0.056446,-0.067538,-0.023344,0.034510,...,-0.057637,-0.134358,-0.020192,0.056550,0.040322,0.067763,-0.140066,0.226852,0.156464,0.082387
@PERSON2_say,0.036148,0.034333,0.042328,-0.191491,0.154482,0.024721,-0.053955,-0.068399,-0.023884,0.037533,...,-0.056206,-0.133016,-0.019788,0.062885,0.033918,0.067649,-0.141528,0.226790,0.150178,0.088211
@PERSON3,0.038615,0.034961,0.040289,-0.189257,0.150408,0.024896,-0.051647,-0.074082,-0.024202,0.035583,...,-0.053212,-0.132979,-0.023295,0.060976,0.035748,0.065559,-0.141217,0.228079,0.152102,0.082222
@PERSON4,0.036174,0.036861,0.038132,-0.188017,0.147523,0.027491,-0.054653,-0.070253,-0.026903,0.033629,...,-0.055052,-0.129481,-0.021312,0.061414,0.038601,0.067152,-0.139477,0.225957,0.153509,0.087847
Dear_Local_Newspaper,0.031070,0.043217,0.050563,-0.188614,0.144631,0.028800,-0.051657,-0.071788,-0.026099,0.032384,...,-0.066571,-0.129703,-0.021749,0.057222,0.029597,0.069460,-0.143540,0.217186,0.149730,0.090411
Dear_Newspaper,0.027973,0.047220,0.048904,-0.190008,0.145581,0.031019,-0.053588,-0.064865,-0.028526,0.032300,...,-0.062390,-0.129648,-0.015787,0.055320,0.025584,0.070482,-0.139118,0.220140,0.154037,0.086463
Dr.,0.027154,0.038441,0.045385,-0.189379,0.149567,0.025329,-0.051626,-0.069634,-0.024352,0.032016,...,-0.059168,-0.134579,-0.020480,0.055442,0.039156,0.065558,-0.141085,0.225010,0.149317,0.088061
Facebook,0.036114,0.038485,0.043668,-0.192597,0.155625,0.027446,-0.054392,-0.071754,-0.028514,0.035926,...,-0.057995,-0.132622,-0.020808,0.060782,0.036050,0.064534,-0.142746,0.226364,0.155291,0.085795


Holy wall of numbers! This DataFrame has 1,257 rows &mdash; one for each term in the vocabulary &mdash; and 100 colums. Our model has learned a quantitative vector representation for each term, as expected.

Put another way, our model has "embedded" the terms into a 100-dimensional vector space.

### So... what can we do with all these numbers?
The first thing we can use them for is to simply look up related words and phrases for a given term of interest.

In [79]:
def get_related_terms(token, topn=5):
    """
    look up the topn most similar terms to token
    and print them as a formatted list
    """

    for word, similarity in essay2vec_model.wv.most_similar(positive=[token], topn=topn):

        print('{:20} {}'.format(word, round(similarity, 3)))

### What things are like Facebook?

In [81]:
get_related_terms('facebook')

myspace              1.0
facebook_myspace     1.0
chat_room            0.999
twitter              0.999
family_member        0.999


In [82]:
get_related_terms('society')

conclusion           1.0
technology           0.999
opinion              0.999
positive             0.999
effect               0.999


#### Implementing Word2Vec From Scratch

In [41]:
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

In [42]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [43]:
norm_corpus = normalize_corpus(test_essay)
norm_corpus

array('dear local newspaper think effects computers people great learning skillsaffects give us time chat friendsnew people helps us learn globeastronomy keeps us troble thing dont think would feel teenager always phone friends ever time chat friends buisness partner things well theres new way chat computer plenty sites internet organization organization caps facebook myspace ect think setting meeting boss computer teenager fun phone rushing get cause want use learn countrysstates outside well computerinternet new way learn going time might think child spends lot time computer ask question economy sea floor spreading even dates youll surprise much heshe knows believe computer much interesting class day reading books child home computer local library better friends fresh perpressured something know isnt right might know child caps forbidde hospital bed driveby rather child computer learning chatting playing games safe sound home community place hope reached point understand agree comput

In [40]:
# from string import punctuation

# remove_terms = punctuation + '0123456789'

# norm_corpus = [[word.lower() for word in sent if word not in remove_terms] for sent in test_essay]
# norm_corpus = [' '.join(tok_sent) for tok_sent in norm_corpus]
# norm_corpus = filter(None, normalize_corpus(norm_corpus))
# norm_corpus = [tok_sent for tok_sent in norm_corpus if len(tok_sent.split()) > 2]

# print('Total lines:', len(test_essay))

# norm_corpus
# print('\nSample line:', test_essay[1])
# print('\nProcessed line:', norm_corpus[1])

[]

#### CBoW

In [47]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

tokenizer = text.Tokenizer()
type(norm_corpus)
# tokenizer.fit_on_texts(norm_corpus)
# word2id = tokenizer.word_index

# # build vocabulary of unique words
# word2id['PAD'] = 0
# id2word = {v:k for k, v in word2id.items()}
# wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_corpus]

# vocab_size = len(word2id)
# embed_size = 100
# window_size = 2 # context window size

# print('Vocabulary Size:', vocab_size)
# print('Vocabulary Sample:', list(word2id.items())[:10])

numpy.ndarray