In [63]:
import pandas as pd
import numpy as np
from utils.helpers import uniqueColumns, printEssaySetStats
from data.Essay_Dicts.essay_dictionaries import essay_prompts, essay_gradeLevels, essay_sourceDependent

# Load the raw training spreadsheet into a DataFrame; the cells below derive
# their per-set views and extra columns from this frame.
training_essay_set = pd.read_excel(io='./data/training_set_rel3.xlsx')

#### Data Description
The file contains 28 columns:

+ essay_id: A unique identifier for each individual student essay
+ essay_set: 1-8, an id for each set of essays
+ essay: The ASCII text of a student's response
+ rater1_domain1: Rater 1's domain 1 score; all essays have this
+ rater2_domain1: Rater 2's domain 1 score; all essays have this
+ rater3_domain1: Rater 3's domain 1 score; only some essays in set 8 have this.
+ domain1_score: Resolved score between the raters; all essays have this
+ rater1_domain2: Rater 1's domain 2 score; only essays in set 2 have this
+ rater2_domain2: Rater 2's domain 2 score; only essays in set 2 have this
+ domain2_score: Resolved score between the raters; only essays in set 2 have this
+ rater1_trait1 score - rater3_trait6 score: trait scores for sets 7-8

#### Anonymization in Essays

We have made an effort to remove personally identifying information from the essays using the Named Entity Recognizer (NER) from the Stanford Natural Language Processing group and a variety of other approaches. The relevant entities are identified in the text and then replaced with a string such as "@PERSON1."

The entities identified by NER are: "PERSON", "ORGANIZATION", "LOCATION", "DATE", "TIME", "MONEY", "PERCENT"

Other replacements made: "MONTH" (any month name not tagged as a date by the NER), "EMAIL" (anything that looks like an e-mail address), "NUM" (word containing digits or non-alphanumeric symbols), "CAPS" (any capitalized word that doesn't begin a sentence, except in essays where more than 20% of the characters are capitalized letters), "DR" (any word following "Dr.", with or without the period and with any capitalization, that doesn't fall into any of the above), and "CITY" and "STATE" (various cities and states).

Here are some hypothetical examples of replacements made:

+ "I attend Springfield School..." --> "...I attend @ORGANIZATION1"
+ "once my family took me on a trip to Springfield." --> "once my family took me on a trip to @LOCATION1"
+ "John Doe is a person, and so is Jane Doe. But if I talk about Mr. Doe, I can't tell that's the same person." --> "...@PERSON1 is a person, and so is @PERSON2. But if I talk about @PERSON3, I can't tell that's the same person."
+ "...my phone number is 555-2106" --> "...my phone number is @NUM1"

Any words appearing in the prompt or source material for the corresponding essay set were white-listed and not anonymized.

In [64]:
# Report how many rows the loaded frame holds (the data description lists
# essay_id as unique per essay, so this is the essay count).
essay_count = training_essay_set.shape[0]
print('Total number of esseays in the dataset: {}'.format(essay_count))

Total number of esseays in the dataset: 12978


In [65]:
# List the frame's columns via the project helper from utils.helpers.
# The recorded output shows one column name per line, 28 in total, matching
# the data description. NOTE(review): the helper's implementation is not
# visible here; confirm it only prints and does not modify the frame.
uniqueColumns(training_essay_set)

essay_id
essay_set
essay
rater1_domain1
rater2_domain1
rater3_domain1
domain1_score
rater1_domain2
rater2_domain2
domain2_score
rater1_trait1
rater1_trait2
rater1_trait3
rater1_trait4
rater1_trait5
rater1_trait6
rater2_trait1
rater2_trait2
rater2_trait3
rater2_trait4
rater2_trait5
rater2_trait6
rater3_trait1
rater3_trait2
rater3_trait3
rater3_trait4
rater3_trait5
rater3_trait6


In [66]:
# Split the full frame into one sub-frame per essay set (ids 1 through 8):
# setN holds exactly the rows whose essay_set value equals N.
set1, set2, set3, set4, set5, set6, set7, set8 = (
    training_essay_set[training_essay_set['essay_set'] == set_id]
    for set_id in range(1, 9)
)

In [67]:
# Summary for essay set 1 via the project helper from utils.helpers.
# The recorded output shows the set number, its row count, and one integer per
# column. NOTE(review): the integers look like per-column missing-value counts
# (they equal the row count for columns this set does not use) — confirm
# against the helper's source.
printEssaySetStats(set1)

Essay Set #1 Length of dataset 1783
essay_id             0
essay_set            0
essay                0
rater1_domain1       0
rater2_domain1       0
rater3_domain1    1783
domain1_score        0
rater1_domain2    1783
rater2_domain2    1783
domain2_score     1783
rater1_trait1     1783
rater1_trait2     1783
rater1_trait3     1783
rater1_trait4     1783
rater1_trait5     1783
rater1_trait6     1783
rater2_trait1     1783
rater2_trait2     1783
rater2_trait3     1783
rater2_trait4     1783
rater2_trait5     1783
rater2_trait6     1783
rater3_trait1     1783
rater3_trait2     1783
rater3_trait3     1783
rater3_trait4     1783
rater3_trait5     1783
rater3_trait6     1783
dtype: int64




In [68]:
# Summary for essay set 2 via the project helper from utils.helpers.
# In the recorded output the three domain2 columns show 0 for this set,
# consistent with the data description saying only set 2 has domain 2 scores.
# NOTE(review): this reading assumes the integers are per-column
# missing-value counts — confirm against the helper's source.
printEssaySetStats(set2)

Essay Set #2 Length of dataset 1800
essay_id             0
essay_set            0
essay                0
rater1_domain1       0
rater2_domain1       0
rater3_domain1    1800
domain1_score        0
rater1_domain2       0
rater2_domain2       0
domain2_score        0
rater1_trait1     1800
rater1_trait2     1800
rater1_trait3     1800
rater1_trait4     1800
rater1_trait5     1800
rater1_trait6     1800
rater2_trait1     1800
rater2_trait2     1800
rater2_trait3     1800
rater2_trait4     1800
rater2_trait5     1800
rater2_trait6     1800
rater3_trait1     1800
rater3_trait2     1800
rater3_trait3     1800
rater3_trait4     1800
rater3_trait5     1800
rater3_trait6     1800
dtype: int64




In [69]:
# Summary for essay set 3 via the project helper from utils.helpers.
# The recorded output shows the set number, its row count, and one integer per
# column. NOTE(review): the integers look like per-column missing-value counts
# — confirm against the helper's source.
printEssaySetStats(set3)

Essay Set #3 Length of dataset 1726
essay_id             0
essay_set            0
essay                0
rater1_domain1       0
rater2_domain1       0
rater3_domain1    1726
domain1_score        0
rater1_domain2    1726
rater2_domain2    1726
domain2_score     1726
rater1_trait1     1726
rater1_trait2     1726
rater1_trait3     1726
rater1_trait4     1726
rater1_trait5     1726
rater1_trait6     1726
rater2_trait1     1726
rater2_trait2     1726
rater2_trait3     1726
rater2_trait4     1726
rater2_trait5     1726
rater2_trait6     1726
rater3_trait1     1726
rater3_trait2     1726
rater3_trait3     1726
rater3_trait4     1726
rater3_trait5     1726
rater3_trait6     1726
dtype: int64




In [70]:
# Summary for essay set 4 via the project helper from utils.helpers.
# In the recorded output rater1_domain1, rater2_domain1 and domain1_score each
# show 1 rather than 0. NOTE(review): if the integers are missing-value counts,
# one essay in this set lacks domain 1 scores, which contradicts the data
# description ("all essays have this") — confirm and decide how to handle it.
printEssaySetStats(set4)

Essay Set #4 Length of dataset 1772
essay_id             0
essay_set            0
essay                0
rater1_domain1       1
rater2_domain1       1
rater3_domain1    1772
domain1_score        1
rater1_domain2    1772
rater2_domain2    1772
domain2_score     1772
rater1_trait1     1772
rater1_trait2     1772
rater1_trait3     1772
rater1_trait4     1772
rater1_trait5     1772
rater1_trait6     1772
rater2_trait1     1772
rater2_trait2     1772
rater2_trait3     1772
rater2_trait4     1772
rater2_trait5     1772
rater2_trait6     1772
rater3_trait1     1772
rater3_trait2     1772
rater3_trait3     1772
rater3_trait4     1772
rater3_trait5     1772
rater3_trait6     1772
dtype: int64




In [71]:
# Summary for essay set 5 via the project helper from utils.helpers.
# The recorded output shows the set number, its row count, and one integer per
# column. NOTE(review): the integers look like per-column missing-value counts
# — confirm against the helper's source.
printEssaySetStats(set5)

Essay Set #5 Length of dataset 1805
essay_id             0
essay_set            0
essay                0
rater1_domain1       0
rater2_domain1       0
rater3_domain1    1805
domain1_score        0
rater1_domain2    1805
rater2_domain2    1805
domain2_score     1805
rater1_trait1     1805
rater1_trait2     1805
rater1_trait3     1805
rater1_trait4     1805
rater1_trait5     1805
rater1_trait6     1805
rater2_trait1     1805
rater2_trait2     1805
rater2_trait3     1805
rater2_trait4     1805
rater2_trait5     1805
rater2_trait6     1805
rater3_trait1     1805
rater3_trait2     1805
rater3_trait3     1805
rater3_trait4     1805
rater3_trait5     1805
rater3_trait6     1805
dtype: int64




In [72]:
# Summary for essay set 6 via the project helper from utils.helpers.
# The recorded output shows the set number, its row count, and one integer per
# column. NOTE(review): the integers look like per-column missing-value counts
# — confirm against the helper's source.
printEssaySetStats(set6)

Essay Set #6 Length of dataset 1800
essay_id             0
essay_set            0
essay                0
rater1_domain1       0
rater2_domain1       0
rater3_domain1    1800
domain1_score        0
rater1_domain2    1800
rater2_domain2    1800
domain2_score     1800
rater1_trait1     1800
rater1_trait2     1800
rater1_trait3     1800
rater1_trait4     1800
rater1_trait5     1800
rater1_trait6     1800
rater2_trait1     1800
rater2_trait2     1800
rater2_trait3     1800
rater2_trait4     1800
rater2_trait5     1800
rater2_trait6     1800
rater3_trait1     1800
rater3_trait2     1800
rater3_trait3     1800
rater3_trait4     1800
rater3_trait5     1800
rater3_trait6     1800
dtype: int64




In [73]:
# Summary for essay set 7 via the project helper from utils.helpers.
# In the recorded output trait1-trait4 for raters 1 and 2 show 0, while
# trait5-trait6 and every rater3 column equal the row count.
# NOTE(review): if the integers are missing-value counts, this set carries
# four trait scores from two raters — confirm against the helper's source.
printEssaySetStats(set7)

Essay Set #7 Length of dataset 1569
essay_id             0
essay_set            0
essay                0
rater1_domain1       0
rater2_domain1       0
rater3_domain1    1569
domain1_score        0
rater1_domain2    1569
rater2_domain2    1569
domain2_score     1569
rater1_trait1        0
rater1_trait2        0
rater1_trait3        0
rater1_trait4        0
rater1_trait5     1569
rater1_trait6     1569
rater2_trait1        0
rater2_trait2        0
rater2_trait3        0
rater2_trait4        0
rater2_trait5     1569
rater2_trait6     1569
rater3_trait1     1569
rater3_trait2     1569
rater3_trait3     1569
rater3_trait4     1569
rater3_trait5     1569
rater3_trait6     1569
dtype: int64




In [74]:
# Summary for essay set 8 via the project helper from utils.helpers.
# In the recorded output all rater1/rater2 trait columns show 0 and every
# rater3 column shows 595 out of 723 rows.
# NOTE(review): if the integers are missing-value counts, only 128 essays in
# this set have a third rater — confirm against the helper's source.
printEssaySetStats(set8)

Essay Set #8 Length of dataset 723
essay_id            0
essay_set           0
essay               0
rater1_domain1      0
rater2_domain1      0
rater3_domain1    595
domain1_score       0
rater1_domain2    723
rater2_domain2    723
domain2_score     723
rater1_trait1       0
rater1_trait2       0
rater1_trait3       0
rater1_trait4       0
rater1_trait5       0
rater1_trait6       0
rater2_trait1       0
rater2_trait2       0
rater2_trait3       0
rater2_trait4       0
rater2_trait5       0
rater2_trait6       0
rater3_trait1     595
rater3_trait2     595
rater3_trait3     595
rater3_trait4     595
rater3_trait5     595
rater3_trait6     595
dtype: int64




In [19]:
# essay_prompts = {
#     1: 'More and more people use computers, but not everyone agrees that this benefits society. Those who support advances in technology believe that computers have a positive effect on people. They teach hand-eye coordination, give people the ability to learn about faraway places and people, and even allow people to talk online with other people. Others have different ideas. Some experts are concerned that people are spending too much time on their computers and less time exercising, enjoying nature, and interacting with family and friends. Write a letter to your local newspaper in which you state your opinion on the effects computers have on people. Persuade the readers to agree with you.',
#     2: 'Censorship in the Libraries "All of us can think of a book that we hope none of our children or any other children have taken off the shelf. But if I have the right to remove that book from the shelf -- that work I abhor -- then you also have exactly the same right and so does everyone else. And then we have no books left on the shelf for any of us." --Katherine Paterson, Author. Write a persuasive essay to a newspaper reflecting your vies on censorship in libraries. Do you believe that certain materials, such as books, music, movies, magazines, etc., should be removed from the shelves if they are found offensive? Support your position with convincing arguments from your own experience, observations, and/or reading.',
#     3: 'Write a response that explains how the features of the setting affect the cyclist. In your response, include examples from the essay that support your conclusion.',
#     4: 'Read the last paragraph of the story. "When they come back, Saeng vowed silently to herself, in the spring, when the snows melt and the geese return and this hibiscus is budding, then I will take that test again." Write a response that explains why the author concludes the story with this paragraph. In your response, include details and examples from the story that support your ideas.',
#     5: 'Describe the mood created by the author in the memoir. Support your answer with relevant and specific information from the memoir.',
#     6: 'Based on the excerpt, describe the obstacles the builders of the Empire State Building faced in attempting to allow dirigibles to dock there. Support your answer with relevant and specific information from the excerpt.',
#     7: 'Write about patience. Being patient means that you are understanding and tolerant. A patient person experience difficulties without complaining. Do only one of the following: write a story about a time when you were patient OR write a story about a time when someone you know was patient OR write a story in your own way about patience.',
#     8: 'We all understand the benefits of laughter. For example, someone once said, “Laughter is the shortest distance between two people.” Many other people believe that laughter is an important part of any relationship. Tell a true story in which laughter was one element or part.'
# }

In [20]:
# Grade level associated with each essay set, keyed by essay_set id (1-8).
# The tuple is positional: its first entry belongs to set 1, its last to set 8.
# Note: this assignment replaces the essay_gradeLevels name imported at the top
# of the notebook.
essay_gradeLevels = dict(enumerate((8, 10, 10, 10, 8, 10, 7, 10), start=1))

In [21]:
# 0/1 flag per essay_set id (1-8): 1 for sets 3 through 6, 0 for the rest.
# It feeds the 'has_source_material' column further down.
# Note: this assignment replaces the essay_sourceDependent name imported at the
# top of the notebook.
essay_sourceDependent = {set_id: int(3 <= set_id <= 6) for set_id in range(1, 9)}

In [22]:
# Attach each essay's prompt text by looking up its essay_set id.
training_essay_set['prompt'] = training_essay_set['essay_set'].map(essay_prompts)

In [23]:
# Attach the grade level for each essay by looking up its essay_set id.
training_essay_set['grade_level'] = training_essay_set['essay_set'].map(essay_gradeLevels)

In [24]:
# Attach the 0/1 source-material flag for each essay by looking up its essay_set id.
training_essay_set['has_source_material'] = training_essay_set['essay_set'].map(essay_sourceDependent)

In [42]:
# Open the source-passage text files for essay sets 3-6, one handle per set.
# This cell only opens the files: nothing here reads or closes them, and each
# handle yields its text once before it is exhausted.
# NOTE(review): no encoding is passed, so the platform default applies —
# confirm the .txt files decode correctly under it.
source3 = open('./data/sourceEssay3.txt')
source4 = open('./data/sourceEssay4.txt')
source5 = open('./data/sourceEssay5.txt')
source6 = open('./data/sourceEssay6.txt')

In [43]:
def _read_then_close(handle):
    """Return the remaining text of an open file object, then close it.

    Closing releases the file descriptor as soon as the text is in memory.
    It also means that re-running this cell without re-opening the files
    raises ValueError (I/O on a closed file) instead of silently storing
    empty strings read from already-exhausted handles.
    """
    with handle:
        return handle.read()


# Source passage for each essay set, keyed by essay_set id (1-8).
# Sets 1, 2, 7 and 8 have no source file, so they map to NaN.
essay_sourceText = {
    1: np.nan,
    2: np.nan,
    3: _read_then_close(source3),
    4: _read_then_close(source4),
    5: _read_then_close(source5),
    6: _read_then_close(source6),
    7: np.nan,
    8: np.nan,
}

In [44]:
# Attach the source passage (or NaN where the set has none) by essay_set id.
training_essay_set['source_text'] = training_essay_set['essay_set'].map(essay_sourceText)

In [51]:
# One-hot encode the grade level; with prefix='grade' the resulting columns
# are named 'grade_<level>' for each distinct grade_level value.
gradeDF = pd.get_dummies(data=training_essay_set.grade_level, prefix='grade')

In [52]:
# Append the one-hot grade columns side by side with the existing columns.
# Re-running this cell appends the same columns again, so run it once per load.
training_essay_set = pd.concat(objs=[training_essay_set, gradeDF], axis='columns')

In [57]:
# Remove the raw grade_level column now that it is one-hot encoded.
#
# The dummy columns are created with prefix='grade', so they are named
# 'grade_7', 'grade_8' and 'grade_10'. Bare 7 / 8 / 10 columns exist only if
# an un-prefixed get_dummies result was concatenated earlier in the same
# kernel session; on a fresh Restart & Run All they are absent, and requiring
# them raised KeyError. They are therefore dropped only when present, while a
# missing 'grade_level' still fails loudly.
#
# Reassignment replaces inplace=True so the step reads as a plain
# transformation of the frame.
training_essay_set = (
    training_essay_set
    .drop(columns=['grade_level'])
    .drop(columns=[7, 8, 10], errors='ignore')
)

In [59]:
# Persist the prepared frame as CSV, omitting the row index.
# The source_text column repeats each full passage on every row of its set,
# so expect a large file.
training_essay_set.to_csv(path_or_buf='./data/prepped_essays_df.csv', index=False)