In [1]:
import os
import codecs
import pandas as pd
from utils.helpers import adding_stanford_nlp_groups_NER_to_stop_words, removing_stanford_nlp_groups_NER_from_stop_words, punct_space_stop, line_review, lemmatized_sentence_corpus

essays = pd.read_csv('./data/prepped_essays_df.csv')

In [2]:
essays = essays[essays['essay_set'] == 1]

In [3]:
len(essays)

1783

In [4]:
essays.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,rater3_domain1,domain1_score,rater1_domain2,rater2_domain2,domain2_score,...,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6,prompt,has_source_material,source_text,grade_7,grade_8,grade_10
0,1,1,"Dear local newspaper, I think effects computer...",4.0,4.0,,8.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0
1,2,1,"Dear @CAPS1 @CAPS2, I believe that using compu...",5.0,4.0,,9.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0
2,3,1,"Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...",4.0,3.0,,7.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5.0,5.0,,10.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0
4,5,1,"Dear @LOCATION1, I know having computers has a...",4.0,4.0,,8.0,,,,...,,,,,"More and more people use computers, but not ev...",0,,0,1,0


In [5]:
essays.dropna(axis=1, how='all', inplace=True)

In [6]:
essays.head(1)

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,prompt,has_source_material,grade_7,grade_8,grade_10
0,1,1,"Dear local newspaper, I think effects computer...",4.0,4.0,8.0,"More and more people use computers, but not ev...",0,0,1,0


In [7]:
essays.iloc[0, 2]

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

In [8]:
intermediate_directory = os.path.join('./data/intermediate')

essay_set1_txt_filepath = os.path.join(intermediate_directory,
                                   'essay_set1_text_all.txt')

In [9]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:
    
    essay_count = 0

    # create & open a new file in write mode
    with codecs.open(essay_set1_txt_filepath, 'w', encoding='utf_8') as essay_set1_txt_file:

        # loop through all essays in the dataframe
        for row in essays.itertuples():

            # write the essay as a line in the new file and escape newline characters in the original essays
            essay_set1_txt_file.write(row.essay.replace('\n', '\\n') + '\n')
            essay_count += 1

    print('Text from {:,} essays written to the new txt file.'.format(essay_count))
    
else:
    
    with codecs.open(essay_set1_txt_filepath, encoding='utf_8') as essay_set1_txt_file:
        for essay_count, line in enumerate(essay_set1_txt_file):
            pass
        
    print('Text from {:,} essays in the txt file.'.format(essay_count + 1))

Text from 1,783 essays written to the new txt file.
CPU times: user 11.2 ms, sys: 4.52 ms, total: 15.7 ms
Wall time: 28.9 ms


#### Text Preprocessing with spaCy

In [10]:
import spacy
import itertools as it

nlp = spacy.load('en_core_web_md')

In [11]:
test_essay = essays.iloc[0, 2]

In [12]:
test_essay

"Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble! Thing about! Dont you think so? How would you feel if your teenager is always on the phone with friends! Do you ever time to chat with your friends or buisness partner about things. Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect. Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it. How did you learn about other countrys/states outside of yours? Well I have by computer/internet, it's a new way to learn about what going on in our time! You might think your child spends a lot of time on the computer, but ask them so question about the econom

In [13]:
%%time
parsed_essay = nlp(test_essay)

CPU times: user 68 ms, sys: 11.3 ms, total: 79.3 ms
Wall time: 97.3 ms


In [14]:
for num, sentence in enumerate(parsed_essay.sents):
    print('Sentence {}:'.format(num + 1))
    print(sentence)
    print('')

Sentence 1:
Dear local newspaper, I think effects computers have on people are great learning skills/affects because they give us time to chat with friends/new people, helps us learn about the globe(astronomy) and keeps us out of troble!

Sentence 2:
Thing about!

Sentence 3:
Dont you think so?

Sentence 4:
How would you feel if your teenager is always on the phone with friends!

Sentence 5:
Do you ever time to chat with your friends or buisness partner about things.

Sentence 6:
Well now - there's a new way to chat the computer, theirs plenty of sites on the internet to do so: @ORGANIZATION1, @ORGANIZATION2, @CAPS1, facebook, myspace ect.

Sentence 7:
Just think now while your setting up meeting with your boss on the computer, your teenager is having fun on the phone not rushing to get off cause you want to use it.

Sentence 8:
How did you learn about other countrys

Sentence 9:
/states outside of yours?

Sentence 10:
Well I have by computer/internet

Sentence 11:
, it's a new way to 

In [15]:
for num, entity in enumerate(parsed_essay.ents):
    print('Entity {}:'.format(num + 1), entity, '-', entity.label_)
    print('')

Entity 1: @CAPS1 - ORG

Entity 2: @DATE1 - ORG

Entity 3: all day - DATE

Entity 4: @CAPS2 - ORG



In [16]:
token_text = [token.orth_ for token in parsed_essay]
token_pos = [token.pos_ for token in parsed_essay]

pd.DataFrame(zip(token_text, token_pos),
             columns=['token_text', 'part_of_speech'])

Unnamed: 0,token_text,part_of_speech
0,Dear,ADJ
1,local,ADJ
2,newspaper,NOUN
3,",",PUNCT
4,I,PRON
5,think,VERB
6,effects,NOUN
7,computers,NOUN
8,have,VERB
9,on,ADP


In [18]:
token_lemma = [token.lemma_ for token in parsed_essay]
token_shape = [token.shape_ for token in parsed_essay]

pd.DataFrame(zip(token_text, token_lemma, token_shape),
             columns=['token_text', 'token_lemma', 'token_shape'])

Unnamed: 0,token_text,token_lemma,token_shape
0,Dear,dear,Xxxx
1,local,local,xxxx
2,newspaper,newspaper,xxxx
3,",",",",","
4,I,-PRON-,X
5,think,think,xxxx
6,effects,effect,xxxx
7,computers,computer,xxxx
8,have,have,xxxx
9,on,on,xx


In [19]:
token_entity_type = [token.ent_type_ for token in parsed_essay]
token_entity_iob = [token.ent_iob_ for token in parsed_essay]

pd.DataFrame(zip(token_text, token_entity_type, token_entity_iob),
             columns=['token_text', 'entity_type', 'inside_outside_begin'])

Unnamed: 0,token_text,entity_type,inside_outside_begin
0,Dear,,O
1,local,,O
2,newspaper,,O
3,",",,O
4,I,,O
5,think,,O
6,effects,,O
7,computers,,O
8,have,,O
9,on,,O


In [20]:
token_attributes = [(token.orth_,
                     token.prob,
                     token.is_stop,
                     token.is_punct,
                     token.is_space,
                     token.like_num,
                     token.is_oov)
                    for token in parsed_essay]

df = pd.DataFrame(token_attributes,
                  columns=['text',
                           'log_probability',
                           'stop?',
                           'punctuation?',
                           'whitespace?',
                           'number?',
                           'out of vocab.?'])

df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?']
                                       .applymap(lambda x: u'Yes' if x else u''))
                                               
df

Unnamed: 0,text,log_probability,stop?,punctuation?,whitespace?,number?,out of vocab.?
0,Dear,-11.536821,,,,,
1,local,-9.188298,,,,,
2,newspaper,-11.763153,,,,,
3,",",-3.454960,,Yes,,,
4,I,-3.791565,Yes,,,,
5,think,-6.180925,,,,,
6,effects,-9.838640,,,,,
7,computers,-10.650864,,,,,
8,have,-5.156485,Yes,,,,
9,on,-5.172736,Yes,,,,


In [21]:
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

In [22]:
# removing_stanford_nlp_groups_NER_from_stop_words(nlp)
adding_stanford_nlp_groups_NER_to_stop_words(nlp)

In [23]:
unigram_sentences_filepath = os.path.join(intermediate_directory,
                                          'unigram_sentences_all_essays.txt')

In [24]:
essays_set1_all_filepath = os.path.join(intermediate_directory, 'essay_set1_text_all.txt')

In [None]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        for sentence in lemmatized_sentence_corpus(essays_set1_all_filepath, codecs, nlp):
            f.write(sentence + '\n')

In [24]:
unigram_sentences = LineSentence(unigram_sentences_filepath)

In [25]:
for unigram_sentence in it.islice(unigram_sentences, 0, 18):
    print(' '.join(unigram_sentence))
    print('')

dear local newspaper think effect computer people great learning skill affect time chat friend new people help learn globe(astronomy keep troble

thing

not think

feel teenager phone friend

time chat friend buisness partner thing

new way chat computer -PRON- plenty site internet facebook myspace ect

think set meeting boss computer teenager have fun phone rush cause want use

learn country

state outside

computer internet

new way learn go time

think child spend lot time computer ask question economy sea floor spread surprise know

believe computer interesting class day read book

child home computer local library well friend fresh perpressured know not right

know child forbidde hospital bed drive

child computer learning chat play game safe sound home community place

hope reach point understand agree computer great effect child give time chat friend new people help learn globe believe keep troble

thank listen



In [26]:
type(unigram_sentences)

gensim.models.word2vec.LineSentence

In [36]:
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')

### Needs more sentences to train for bigrams and trigrams

Train a phrase model for word pairs, let's apply it to the sentences data and explore the results

In [37]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute modeling yourself.
if 1 == 1:

    bigram_model = Phrases(unigram_sentences)

    bigram_model.save(bigram_model_filepath)
    
# load the finished model from disk
bigram_model = Phrases.load(bigram_model_filepath)

CPU times: user 3.84 ms, sys: 2.53 ms, total: 6.37 ms
Wall time: 6.24 ms


In [38]:
bigram_sentences_filepath = os.path.join(intermediate_directory,
                                         'bigram_sentences_all.txt')

In [39]:
%%time

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:
        
        for unigram_sentence in unigram_sentences:
            
            bigram_sentence = ' '.join(bigram_model[unigram_sentence])
            
            f.write(bigram_sentence + '\n')

CPU times: user 3.4 ms, sys: 1.89 ms, total: 5.29 ms
Wall time: 3.99 ms


In [40]:
bigram_sentences = LineSentence(bigram_sentences_filepath)

In [41]:
for bigram_sentence in it.islice(bigram_sentences, 0, 18):
    print(' '.join(bigram_sentence))
    print('')

dear local newspaper -PRON- think effect computer have on people be great learning skill affect because -PRON- give -PRON- time to chat with friend new people help -PRON- learn about the globe(astronomy and keep -PRON- out of troble

thing about

do not -PRON- think so

how would -PRON- feel if -PRON- teenager be always on the phone with friend

Do -PRON- ever time to chat with -PRON- friend or buisness partner about thing

well now there be a new way to chat the computer -PRON- plenty of site on the internet to do so @ORGANIZATION1 @ORGANIZATION2 @CAPS1 facebook myspace ect

just think now while -PRON- set up meeting with -PRON- boss on the computer -PRON- teenager be have fun on the phone not rush to get off cause -PRON- want to use -PRON-

how do -PRON- learn about other country

state outside of -PRON-

well -PRON- have by computer internet

-PRON- be a new way to learn about what go on in -PRON- time

-PRON- may think -PRON- child spend a lot of time on the computer but ask -PRON-

#### Implementing Word2Vec From Scratch

In [41]:
import numpy as np
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

In [42]:
def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

In [43]:
norm_corpus = normalize_corpus(test_essay)
norm_corpus

array('dear local newspaper think effects computers people great learning skillsaffects give us time chat friendsnew people helps us learn globeastronomy keeps us troble thing dont think would feel teenager always phone friends ever time chat friends buisness partner things well theres new way chat computer plenty sites internet organization organization caps facebook myspace ect think setting meeting boss computer teenager fun phone rushing get cause want use learn countrysstates outside well computerinternet new way learn going time might think child spends lot time computer ask question economy sea floor spreading even dates youll surprise much heshe knows believe computer much interesting class day reading books child home computer local library better friends fresh perpressured something know isnt right might know child caps forbidde hospital bed driveby rather child computer learning chatting playing games safe sound home community place hope reached point understand agree comput

In [40]:
# from string import punctuation

# remove_terms = punctuation + '0123456789'

# norm_corpus = [[word.lower() for word in sent if word not in remove_terms] for sent in test_essay]
# norm_corpus = [' '.join(tok_sent) for tok_sent in norm_corpus]
# norm_corpus = filter(None, normalize_corpus(norm_corpus))
# norm_corpus = [tok_sent for tok_sent in norm_corpus if len(tok_sent.split()) > 2]

# print('Total lines:', len(test_essay))

# norm_corpus
# print('\nSample line:', test_essay[1])
# print('\nProcessed line:', norm_corpus[1])

[]

#### CBoW

In [47]:
from keras.preprocessing import text
from keras.utils import np_utils
from keras.preprocessing import sequence

tokenizer = text.Tokenizer()
type(norm_corpus)
# tokenizer.fit_on_texts(norm_corpus)
# word2id = tokenizer.word_index

# # build vocabulary of unique words
# word2id['PAD'] = 0
# id2word = {v:k for k, v in word2id.items()}
# wids = [[word2id[w] for w in text.text_to_word_sequence(doc)] for doc in norm_corpus]

# vocab_size = len(word2id)
# embed_size = 100
# window_size = 2 # context window size

# print('Vocabulary Size:', vocab_size)
# print('Vocabulary Sample:', list(word2id.items())[:10])

numpy.ndarray