In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objs as go
import plotly.plotly as py
import cufflinks
import plotly.figure_factory as ff
from plotly.offline import iplot
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline
sns.set_style("darkgrid")

essays = pd.read_csv('../data/intermediate/prepped_essays_df.csv')

'''
Here you can toggle on/off different essay sets to see similar visualizaitons for each set
Take a look at the images folder in this directory to see what kind of images you can produce
''' 

# essays1 = essays[essays['essay_set'] == 1]
# essays2 = essays[essays['essay_set'] == 2]
# essays3 = essays[essays['essay_set'] == 3]
# essays4 = essays[essays['essay_set'] == 4]
# essays5 = essays[essays['essay_set'] == 5]
essays = essays[essays['essay_set'] == 6]
# essays7 = essays[essays['essay_set'] == 7]
# essays8 = essays[essays['essay_set'] == 8]

essays.dropna(axis=1, how='all', inplace=True)
essays.head()

Unnamed: 0,essay_id,essay_set,essay,rater1_domain1,rater2_domain1,domain1_score,prompt,has_source_material,source_text,grade_7,grade_8,grade_10
8886,14834,6,There were many obstacles that the builders fa...,2.0,2.0,2.0,"Based on the excerpt, describe the obstacles t...",1,"When the Empire State Building was conceived, ...",0,0,1
8887,14835,6,"Him from the start, there would have been many...",3.0,3.0,3.0,"Based on the excerpt, describe the obstacles t...",1,"When the Empire State Building was conceived, ...",0,0,1
8888,14836,6,The builders of the Empire State Building face...,3.0,4.0,4.0,"Based on the excerpt, describe the obstacles t...",1,"When the Empire State Building was conceived, ...",0,0,1
8889,14837,6,In the passage The Mooring Mast by Marcia Amid...,1.0,1.0,1.0,"Based on the excerpt, describe the obstacles t...",1,"When the Empire State Building was conceived, ...",0,0,1
8890,14838,6,The builders of the Empire State Building face...,3.0,3.0,3.0,"Based on the excerpt, describe the obstacles t...",1,"When the Empire State Building was conceived, ...",0,0,1


In [3]:
# Histogram of the `domain1_score` column for the selected essay set
score_values = essays['domain1_score']
score_values.iplot(
    kind='hist',
    title='Essay Set 6 Scores Distribution',
    xTitle='score',
    yTitle='count',
    linecolor='black')


Consider using IPython.display.IFrame instead



In [4]:
# Length of each essay in characters. assign() returns a new frame with the
# extra column instead of writing into `essays` in place; `essays` comes from a
# boolean filter of the full table, and setting a column on such a slice
# triggers pandas' SettingWithCopyWarning.
essays = essays.assign(length=essays['essay'].str.len())

# Histogram of essay length (character count), 100 bins
essays['length'].iplot(
    kind='hist',
    bins=100,
    xTitle='essay length',
    linecolor='black',
    yTitle='count',
    title='Essay Set 6 Length Distribution')


Consider using IPython.display.IFrame instead



In [6]:
def get_top_n_words(corpus, stopwords=False, n=None):
    """Return the ``n`` most frequent single words (unigrams) in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose words are counted.
    stopwords : bool, default False
        When truthy, English stop words are *removed* before counting
        (``CountVectorizer(stop_words='english')``). When falsy, every token
        is counted.
    n : int or None, default None
        Number of entries to return; ``None`` returns the whole vocabulary.

    Returns
    -------
    list of (word, count) tuples
        Sorted by total count across all documents, highest first.
    """
    vec = CountVectorizer(stop_words='english' if stopwords else None)

    # fit_transform() learns the vocabulary and builds the document-term matrix
    # in one pass over the corpus, instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: total occurrences of each vocabulary term over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Plotly bar chart of the 20 most frequent words in the essays, stop words kept.
# The frame's 'essay' column holds the word itself.
common_words = get_top_n_words(essays['essay'], stopwords=False, n=20)
df1 = pd.DataFrame(common_words, columns=['essay', 'count'])

essay_word_counts = df1.groupby('essay')['count'].sum()
essay_word_counts.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 words in essays before removing stop words')


Consider using IPython.display.IFrame instead



In [7]:
# Plotly bar chart of the 20 most frequent words in the essays once English
# stop words have been removed. The 'essay' column holds the word itself.
common_words = get_top_n_words(essays['essay'], n=20, stopwords=True)
df2 = pd.DataFrame(common_words, columns=['essay', 'count'])

essay_word_counts_filtered = df2.groupby('essay')['count'].sum()
essay_word_counts_filtered.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 words in essays after removing stop words')


Consider using IPython.display.IFrame instead



In [9]:
def get_top_n_bigram(corpus, stopwords=False, n=None):
    """Return the ``n`` most frequent two-word sequences (bigrams) in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose bigrams are counted.
    stopwords : bool, default False
        When truthy, English stop words are *removed* before the bigrams are
        formed (``CountVectorizer(stop_words='english')``). When falsy, every
        token is kept.
    n : int or None, default None
        Number of entries to return; ``None`` returns every bigram.

    Returns
    -------
    list of (bigram, count) tuples
        Sorted by total count across all documents, highest first.
    """
    vec = CountVectorizer(
        ngram_range=(2, 2),
        stop_words='english' if stopwords else None)

    # fit_transform() learns the vocabulary and builds the document-term matrix
    # in one pass over the corpus, instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: total occurrences of each bigram over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Plotly bar chart of the 20 most frequent bigrams in the essays, stop words kept.
# The frame's 'essay' column holds the bigram text.
common_words = get_top_n_bigram(essays['essay'], stopwords=False, n=20)
df3 = pd.DataFrame(common_words, columns=['essay', 'count'])

essay_bigram_counts = df3.groupby('essay')['count'].sum()
essay_bigram_counts.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 bigrams in essays before removing stop words')


Consider using IPython.display.IFrame instead



In [10]:
# Plotly bar chart of the 20 most frequent bigrams in the essays once English
# stop words have been removed. The 'essay' column holds the bigram text.
common_words = get_top_n_bigram(essays['essay'], n=20, stopwords=True)
df4 = pd.DataFrame(common_words, columns=['essay', 'count'])

essay_bigram_counts_filtered = df4.groupby('essay')['count'].sum()
essay_bigram_counts_filtered.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 bigrams in essays after removing stop words')


Consider using IPython.display.IFrame instead



In [11]:
def get_top_n_trigram(corpus, stopwords=False, n=None):
    """Return the ``n`` most frequent three-word sequences (trigrams) in ``corpus``.

    Parameters
    ----------
    corpus : iterable of str
        Documents whose trigrams are counted.
    stopwords : bool, default False
        When truthy, English stop words are *removed* before the trigrams are
        formed (``CountVectorizer(stop_words='english')``). When falsy, every
        token is kept.
    n : int or None, default None
        Number of entries to return; ``None`` returns every trigram.

    Returns
    -------
    list of (trigram, count) tuples
        Sorted by total count across all documents, highest first.
    """
    vec = CountVectorizer(
        ngram_range=(3, 3),
        stop_words='english' if stopwords else None)

    # fit_transform() learns the vocabulary and builds the document-term matrix
    # in one pass over the corpus, instead of fit() followed by transform().
    bag_of_words = vec.fit_transform(corpus)
    # Column sums: total occurrences of each trigram over all documents.
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq.sort(key=lambda pair: pair[1], reverse=True)
    return words_freq[:n]

# Plotly bar chart of the 20 most frequent trigrams in the essays, stop words kept.
# The frame's 'essay' column holds the trigram text.
common_words = get_top_n_trigram(essays['essay'], stopwords=False, n=20)
df5 = pd.DataFrame(common_words, columns=['essay', 'count'])

essay_trigram_counts = df5.groupby('essay')['count'].sum()
essay_trigram_counts.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 trigrams in essays before removing stop words')


Consider using IPython.display.IFrame instead



In [12]:
# Plotly bar chart of the 20 most frequent trigrams in the essays once English
# stop words have been removed. The 'essay' column holds the trigram text.
common_words = get_top_n_trigram(essays['essay'], n=20, stopwords=True)
df6 = pd.DataFrame(common_words, columns=['essay', 'count'])

essay_trigram_counts_filtered = df6.groupby('essay')['count'].sum()
essay_trigram_counts_filtered.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 trigrams in essays after removing stop words')


Consider using IPython.display.IFrame instead



In [24]:
# One-element corpus holding the source text, so it can be passed to the
# n-gram helpers. Only the first row's value is used; presumably every essay
# in the set shares the same source text (TODO confirm).
# The column is selected by label instead of by position (originally
# `iloc[0, 8]`): positions shift whenever columns are dropped or added
# upstream, and in the head() output of the loading cell position 8 is
# `has_source_material`, not `source_text`.
source_text = [essays['source_text'].iloc[0]]

source_text

['When the Empire State Building was conceived, it was planned as the world’s tallest building, taller even than the new Chrysler Building that was being constructed at Forty-second Street and Lexington Avenue in New York. At seventy-seven stories, it was the tallest building before the Empire State began construction, and Al Smith was determined to outstrip it in height.\nThe architect building the Chrysler Building, however, had a trick up his sleeve. He secretly constructed a 185-foot spire inside the building, and then shocked the public and the media by hoisting it up to the top of the Chrysler Building, bringing it to a height of 1,046 feet, 46 feet taller than the originally announced height of the Empire State Building.\nAl Smith realized that he was close to losing the title of world’s tallest building, and on December 11, 1929, he announced that the Empire State would now reach the height of 1,250 feet. He would add a top or a hat to the building that would be even more disti

In [25]:
# Plotly bar chart of the 20 most frequent words in the source text, stop words kept.
# The frame's 'source_text' column holds the word itself.
common_words = get_top_n_words(source_text, stopwords=False, n=20)
df7 = pd.DataFrame(common_words, columns=['source_text', 'count'])

source_word_counts = df7.groupby('source_text')['count'].sum()
source_word_counts.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 words in source text before removing stop words')


Consider using IPython.display.IFrame instead



In [26]:
# Plotly bar chart of the 20 most frequent words in the source text once English
# stop words have been removed. The 'source_text' column holds the word itself.
common_words = get_top_n_words(source_text, n=20, stopwords=True)
df8 = pd.DataFrame(common_words, columns=['source_text', 'count'])

source_word_counts_filtered = df8.groupby('source_text')['count'].sum()
source_word_counts_filtered.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 words in source text after removing stop words')


Consider using IPython.display.IFrame instead



In [27]:
# Plotly bar chart of the 20 most frequent bigrams in the source text, stop words kept.
# The frame's 'source_text' column holds the bigram text.
common_words = get_top_n_bigram(source_text, stopwords=False, n=20)
df9 = pd.DataFrame(common_words, columns=['source_text', 'count'])

source_bigram_counts = df9.groupby('source_text')['count'].sum()
source_bigram_counts.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 bigrams in source text before removing stop words')


Consider using IPython.display.IFrame instead



In [28]:
# Plotly bar chart of the 20 most frequent bigrams in the source text once English
# stop words have been removed. The 'source_text' column holds the bigram text.
common_words = get_top_n_bigram(source_text, n=20, stopwords=True)
df10 = pd.DataFrame(common_words, columns=['source_text', 'count'])

source_bigram_counts_filtered = df10.groupby('source_text')['count'].sum()
source_bigram_counts_filtered.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 bigrams in source text after removing stop words')


Consider using IPython.display.IFrame instead



In [29]:
# Plotly bar chart of the 20 most frequent trigrams in the source text, stop words kept.
# The frame's 'source_text' column holds the trigram text.
common_words = get_top_n_trigram(source_text, stopwords=False, n=20)
df11 = pd.DataFrame(common_words, columns=['source_text', 'count'])

source_trigram_counts = df11.groupby('source_text')['count'].sum()
source_trigram_counts.sort_values(ascending=False).iplot(
    kind='bar',
    yTitle='Count',
    linecolor='black',
    title='Top 20 trigrams in source text before removing stop words')


Consider using IPython.display.IFrame instead



In [30]:
# Plotly bar chart of the 20 most frequent trigrams in the source text after
# removing stop words (stopwords=True makes the helper drop them).
common_words = get_top_n_trigram(source_text, stopwords=True, n=20)

# Stored as df12: the original cell reused the name df11 and so overwrote the
# frame built for the "before removing stop words" trigram chart.
# The 'source_text' column holds the trigram text.
df12 = pd.DataFrame(common_words, columns = ['source_text' , 'count'])
df12.groupby('source_text').sum()['count'].sort_values(ascending=False).iplot(
    kind='bar', yTitle='Count', linecolor='black', title='Top 20 trigrams in source text after removing stop words')


Consider using IPython.display.IFrame instead

