In [3]:
import os
import codecs
import spacy
import itertools as it
import pandas as pd
from gensim.models import Phrases
from gensim.models.word2vec import LineSentence

# Load the spaCy English pipeline once; the cells below reuse this `nlp` object.
nlp = spacy.load('en_core_web_md')

# Single source of truth for the folder that holds the prepped CSV and every
# intermediate artifact written by this notebook.
intermediate_directory = '../data/intermediate'

essays = pd.read_csv(os.path.join(intermediate_directory, 'prepped_essays_df.csv'))

# ----------- ISOLATE JUST ESSAYS FROM 6th SET ------------ #
# Filter and drop the all-NaN columns in one chained expression: this avoids
# calling an in-place method on a filtered slice (SettingWithCopyWarning) and
# keeps the statement idempotent.
essays = essays[essays['essay_set'] == 6].dropna(axis=1, how='all')

essay_set6_txt_filepath = os.path.join(intermediate_directory, 'essay_set6_text_all.txt')

In [5]:
# ----------- WRITE ALL ESSAYS TO A .TXT FILE ------------ #
# Set to True to force the text file to be regenerated from the dataframe.
# The file is also (re)built automatically when it does not exist yet, so a
# fresh checkout does not crash in the read-only branch.
rewrite_essay_txt = False

if rewrite_essay_txt or not os.path.exists(essay_set6_txt_filepath):
    essay_count = 0

    # create & open a new file in write mode
    with codecs.open(essay_set6_txt_filepath, 'w', encoding='utf_8') as essay_set6_txt_file:

        # loop through all essays in the dataframe
        for row in essays.itertuples():
            # one essay per line: escape newline characters inside the essay
            # so that only the terminating '\n' ends the line
            essay_set6_txt_file.write(row.essay.replace('\n', '\\n') + '\n')
            essay_count += 1

    print('Text from {:,} essays written to the new txt file.'.format(essay_count))

else:

    # newline='\n' makes '\n' the only line terminator, matching how the file
    # is written above; other separator characters inside an essay are not
    # counted as extra lines.
    with open(essay_set6_txt_filepath, encoding='utf_8', newline='\n') as essay_set6_txt_file:
        # sum() yields 0 for an empty file instead of leaving the counter unbound
        essay_count = sum(1 for _ in essay_set6_txt_file)

    print('Text from {:,} essays in the txt file.'.format(essay_count))

Text from 1,800 essays written to the new txt file.


In [14]:
# ----------- USING SPACY ON A SINGLE ESSAY ------------ #
# Pull out one essay to inspect by hand. The exploratory spaCy commands that
# follow are commented out; uncomment them to look more closely at this essay.

# Positional lookup into the set-6 frame: row 1443, column 2.
# NOTE(review): column 2 is presumably the essay text (the cell output is a
# string) - confirm against the CSV's column order.
test_essay = essays.iloc[1443, 2]
# Bare last expression so the notebook displays the selected value.
test_essay
# parsed_essay = nlp(test_essay)

# for num, sentence in enumerate(parsed_essay.sents):
#     print('Sentence {}:'.format(num + 1))
#     print(sentence)
#     print('')

# for num, entity in enumerate(parsed_essay.ents):
#     print('Entity {}:'.format(num + 1), entity, '-', entity.label_)
#     print('')

# token_text = [token.orth_ for token in parsed_essay]
# token_pos = [token.pos_ for token in parsed_essay]

# pd.DataFrame(zip(token_text, token_pos), columns=['token_text', 'part_of_speech'])

# token_lemma = [token.lemma_ for token in parsed_essay]
# token_shape = [token.shape_ for token in parsed_essay]

# pd.DataFrame(zip(token_text, token_lemma, token_shape), columns=['token_text', 'token_lemma', 'token_shape'])

# token_entity_type = [token.ent_type_ for token in parsed_essay]
# token_entity_iob = [token.ent_iob_ for token in parsed_essay]

# pd.DataFrame(zip(token_text, token_entity_type, token_entity_iob), columns=['token_text', 'entity_type', 'inside_outside_begin'])

# token_attributes = [(token.orth_,
#                      token.prob,
#                      token.is_stop,
#                      token.is_punct,
#                      token.is_space,
#                      token.like_num,
#                      token.is_oov)
#                     for token in parsed_essay]

# df = pd.DataFrame(token_attributes,
#                   columns=['text',
#                            'log_probability',
#                            'stop?',
#                            'punctuation?',
#                            'whitespace?',
#                            'number?',
#                            'out of vocab.?'])

# df.loc[:, 'stop?':'out of vocab.?'] = (df.loc[:, 'stop?':'out of vocab.?'].applymap(lambda x: u'Yes' if x else u''))

# df

'There were many obstacles the builders of the Empire State Building faced in attempting to allow dirigibles to dock on the mooring mast. One obstacle they faced was the lack of a suitable landing area. They would have to design a mast for the dirigibles. Another obstacle was the dirigibles was held by a single cable tether and that would add stress to the building\'s frame. Next, New York City would have to spend over sixty thousand dollars\' worth of modifications had to be made to the building\'s framework. Then the greatest reason of safety came upon them. That was, most dirigibles from outside of the United States used hydrogen rather than helium. And hydrogen is highly flammable. On @DATE1 a German dirigible "Hindenburg" was destroyed by fire in Lakehurst, New Jersey. The owners of the Empire state building realized if that happened to New York and how big of a problem that would cause. The final and greatest obstacle they faced to the successful use of the mooring mast was natur

In [15]:
# ----------- ADD OR REMOVE STOP WORDS ------------ #
# removing_stanford_nlp_groups_NER_from_stop_words(nlp)
def adding_stanford_nlp_groups_NER_to_stop_words(vocab=None, max_index=16):
    """
    helper function to add Stanford NLP Group NER placeholder tokens
    (e.g. '@PERSON3', '@DATE1') to spaCy stop words

    vocab: mapping from a token string to an object with a writable
        `is_stop` attribute; defaults to the notebook-level `nlp.vocab`
    max_index: placeholders are flagged for suffixes 0 .. max_index - 1
        (default 16, i.e. a range of 0 - 15)
    """

    if vocab is None:
        vocab = nlp.vocab

    tag_names = (
        '@ORGANIZATION',
        '@PERSON',
        '@CAPS',
        '@LOCATION',
        '@DATE',
        '@TIME',
        '@MONEY',
        '@PERCENT',
        '@MONTH',
        '@EMAIL',
        '@NUM',
        '@DR',
        '@CITY',
        '@STATE',
    )

    # same visiting order as before: every tag for suffix 0, then suffix 1, ...
    for number in range(max_index):
        for tag_name in tag_names:
            vocab[tag_name + str(number)].is_stop = True
        
# Flag the placeholder tokens as stop words on the shared `nlp` vocabulary
# now, before the cells below parse any essays.
adding_stanford_nlp_groups_NER_to_stop_words()

In [16]:
def punct_space_stop(token):
    """
    filter predicate for parsed tokens: truthy when the token
    is punctuation, whitespace or a stop word
    """

    is_punct_or_space = token.is_punct or token.is_space
    return is_punct_or_space or token.is_stop

def line_review(filename):
    """
    generator function to read in essays from the one-essay-per-line file
    and un-escape the original line breaks in the text

    each yielded string still ends with the line's terminating newline
    """

    # newline='\n' makes '\n' the only line terminator. Iterating a
    # codecs.open() reader also splits on characters such as form feed or
    # U+2028, which would cut one essay into several "lines".
    with open(filename, encoding='utf_8', newline='\n') as f:
        for review in f:
            yield review.replace('\\n', '\n')
            
def lemmatized_sentence_corpus(filename):
    """
    generator function to use spaCy to parse essays,
    lemmatize the text, and yield sentences

    each yielded item is one sentence as a single space-joined string of
    lemmas, with punctuation, whitespace and stop-word tokens left out
    """

    # `n_threads` is intentionally not passed: it is a deprecated no-op since
    # spaCy 2.1 and is not an accepted argument of `nlp.pipe` in spaCy 3.
    for parsed_review in nlp.pipe(line_review(filename), batch_size=100):

        for sent in parsed_review.sents:
            yield ' '.join(token.lemma_ for token in sent if not punct_space_stop(token))


# ----------- LOOKING AT UNIGRAMS ------------ #
essays_set6_all_filepath = os.path.join(intermediate_directory, 'essay_set6_text_all.txt')
unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all_essays6.txt')

# Parsing every essay is slow. The guard is left on, so the sentence file is
# rebuilt on each run; switch it off to reuse a file that is already on disk.
if True:

    with codecs.open(unigram_sentences_filepath, 'w', encoding='utf_8') as f:
        # one lemmatized sentence per line
        for sentence in lemmatized_sentence_corpus(essays_set6_all_filepath):
            f.write('{}\n'.format(sentence))

# wrap the sentence file so it can be iterated sentence by sentence
unigram_sentences = LineSentence(unigram_sentences_filepath)

# preview a slice of the sentences, each followed by a blank line
for unigram_sentence in it.islice(unigram_sentences, 19, 42):
    print(' '.join(unigram_sentence) + '\n')

obstacle dirigible hold single cable tether

add stress building frame order frame sturdy modification building framework cost thousand dollar

obstacle builder face dirigible use hydrogen highly flammable

New York densely populated area work

dirigible weight lead weight

order lead weight end dangle high pedestrian street safety issue obstacle builder face law airship fly low urban area

law illegal ship tie building approach area

passage Mooring Mast Marcia Amidon

builder Empire State Building face opstacle building Empire State Building allow dirigible dock

reason Al Smith term governor New York head effort construct Empire State Building year office tower tall

hight building 1,250 ft

lose title world tall tower announce hat tower

reason tower dirigible blimp

people New York New Jersey

Smith want place dirigible dock

happen german dirigible Hindenburg destroy fire Lakehurst New Jersey 6 1937 owner State building realize bad accident New York

1930 idea drop dirigible try 

In [17]:
# ----------- LOOKING AT BIGRAMS ------------ #
bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all6')
bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all6.txt')

# Fitting the phrase model is slow. The guard is left on, so the model is
# refit and saved on each run; switch it off to reuse the saved model.
if True:
    Phrases(unigram_sentences).save(bigram_model_filepath)

# always work from the copy stored on disk
bigram_model = Phrases.load(bigram_model_filepath)

# Rewriting every sentence is slow as well; same always-on guard as above.
if True:

    with codecs.open(bigram_sentences_filepath, 'w', encoding='utf_8') as f:

        # run each unigram sentence through the phrase model, one result per line
        for unigram_sentence in unigram_sentences:
            f.write(' '.join(bigram_model[unigram_sentence]) + '\n')

bigram_sentences = LineSentence(bigram_sentences_filepath)

# preview the same slice of sentences that the unigram cell prints
for bigram_sentence in it.islice(bigram_sentences, 19, 42):
    print(' '.join(bigram_sentence) + '\n')

obstacle dirigible hold_single cable_tether

add_stress building frame order frame sturdy modification building framework cost_thousand dollar

obstacle builder face dirigible use hydrogen_highly flammable

New_York densely_populated area work

dirigible weight_lead weight

order lead_weight end dangle_high pedestrian_street safety_issue obstacle builder face law_airship fly_low urban_area

law illegal_ship tie building approach_area

passage_Mooring Mast_Marcia Amidon

builder Empire_State Building face opstacle building Empire_State Building allow dirigible dock

reason Al_Smith term_governor New_York head effort construct Empire_State Building year office tower tall

hight building 1,250_ft

lose_title world_tall tower announce hat tower

reason tower dirigible blimp

people New_York New_Jersey

Smith_want place dirigible dock

happen german dirigible Hindenburg_destroy fire_Lakehurst New_Jersey 6_1937 owner State building realize_bad accident New_York

1930 idea drop dirigible try 

In [18]:
# ----------- LOOKING AT TRIGRAMS ------------ #
trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all6')
trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all6.txt')

# Fitting the second-order phrase model is slow. The guard is left on, so the
# model is refit and saved on each run; switch it off to reuse the saved model.
if True:
    Phrases(bigram_sentences).save(trigram_model_filepath)

# always work from the copy stored on disk
trigram_model = Phrases.load(trigram_model_filepath)

# Rewriting every sentence is slow as well; same always-on guard as above.
if True:

    with codecs.open(trigram_sentences_filepath, 'w', encoding='utf_8') as f:

        # run each bigram sentence through the phrase model, one result per line
        for bigram_sentence in bigram_sentences:
            f.write(' '.join(trigram_model[bigram_sentence]) + '\n')

trigram_sentences = LineSentence(trigram_sentences_filepath)

# preview a slice of the transformed sentences, each followed by a blank line
for trigram_sentence in it.islice(trigram_sentences, 205, 245):
    print(' '.join(trigram_sentence) + '\n')

quote_show Dirigible need find proper landing_area

able fit

need area space

long_thousand_foot length_block_New_York City

space Dirigible long

show architect come plan make work way fit properly

obstacle builder Empire_State_Building face

lack_suitable area

builder Empire_State_Building face difficult circumstance obstacle task design brand new Empire_State_Building accompany new mean travel

builder mission design Empire_State_Building accommodate mooring blimp thousand_foot air

obstacle design landing_area

difficult architecture need create attempt past

second obstacle have sure Empire_State_Building withstand_stress create blimp

paragraph say stress dirigible load_wind_pressure_transmit way building foundation

difficulty obstacle have refurbish steel_frame Empire_State_Building take lot time work thousand_dollar

great obstacle successful mooring nature

violent_wind current building blimp stable

pedestrian moor dirigible dangerous obstacle

lastly pre exist_law_airshi

In [19]:
# ----------- RUN THE TRIGRAMS PHRASE MODEL ON ALL ESSAYS ------------ #
trigram_essays_all_filepath = os.path.join(intermediate_directory, 'trigram_essays_all6.txt')

# this is a bit time consuming - make the if statement True
# if you want to execute data prep yourself.
if 1 == 1:

    with codecs.open(trigram_essays_all_filepath, 'w', encoding='utf_8') as f:

        # `n_threads` is intentionally not passed: it is a deprecated no-op
        # since spaCy 2.1 and is not an accepted argument of `nlp.pipe` in spaCy 3.
        for parsed_essay in nlp.pipe(line_review(essays_set6_all_filepath), batch_size=100):
            # lemmatize the text, removing punctuation, whitespace and stop words
            unigram_essays = [token.lemma_ for token in parsed_essay if not punct_space_stop(token)]

            # apply the first-order and second-order phrase models
            bigram_essays = bigram_model[unigram_essays]
            trigram_essays = trigram_model[bigram_essays]

            # write the transformed essay as a line in the new file
            trigram_essays = ' '.join(trigram_essays)
            f.write(trigram_essays + '\n')

# show one essay before and after the transformation (line 301 of each file)
print('Original:' + '\n')

for essay in it.islice(line_review(essays_set6_all_filepath), 301, 302):
    print(essay)

print('----' + '\n')
print('Transformed:' + '\n')

with codecs.open(trigram_essays_all_filepath, encoding='utf_8') as f:
    for essay in it.islice(f, 301, 302):
        print(essay)

Original:

In the the excerpt "The Mooring Mast" by Marcia Amidon Lüsted, the builders of the Empire State Building faced many obstacles in the attempt to allow dirigibles to dock there. The architects had to acomidate a lot of thing to allow dirigibles to dock at The mooring mast. For example In @CAPS1 @NUM1 it says "The steelframe of the Empire State Building would have to modify and strengthen to accommodate this new situation". Also in @CAPS1 @NUM2 it says "The greatest obstacle to the successful use of the mooring mast was natur itself. The winds on top of the building were constantly shifting due to violent air currents". These factores and many more where the reason the mooring mast was never poot to use do to the feer of safty and law.

----

Transformed:

excerpt_Mooring_Mast_Marcia Amidon_Lüsted builder Empire_State_Building face obstacle attempt_allow dirigible dock architect acomidate lot thing allow dirigible dock mooring_mast example say steelframe Empire_State_Building m