In [7]:
# !pip install pyspellchecker
# python -m spacy download en_core_web_sm

In [8]:
import numpy as np
import pandas as pd
import re
import warnings
from datetime import datetime

# Suppress numpy warnings
warnings.filterwarnings('ignore')

In [9]:
training_set  = pd.read_csv('./Data/training_set_rel3.tsv', sep='\t', encoding = "ISO-8859-1")\
            .rename(columns={'essay_set': 'topic', 'domain1_score': 'target_score', 'domain2_score': 'topic2_target'})
training_set.sample()

Unnamed: 0,essay_id,topic,essay,rater1_domain1,rater2_domain1,rater3_domain1,target_score,rater1_domain2,rater2_domain2,topic2_target,...,rater2_trait3,rater2_trait4,rater2_trait5,rater2_trait6,rater3_trait1,rater3_trait2,rater3_trait3,rater3_trait4,rater3_trait5,rater3_trait6
3535,4730,2,Need help with Censorship in @ORGANIZATION1 ...,3,4,,3,3.0,3.0,3.0,...,,,,,,,,,,


In [10]:
# Count characters and words for each essay
training_set['word_count'] = training_set['essay'].str.strip().str.split().str.len()

In [11]:
training_set.groupby(['topic'])['target_score'].agg(['min','max','count','nunique'])

Unnamed: 0_level_0,min,max,count,nunique
topic,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,2,12,1783,11
2,1,6,1800,6
3,0,3,1726,4
4,0,3,1770,4
5,0,4,1805,5
6,0,4,1800,5
7,2,24,1569,23
8,10,60,723,34


In [12]:
from spellchecker import SpellChecker
import pandas as pd
from datetime import datetime

# Initialize SpellChecker
spell_checker = SpellChecker()

t0 = datetime.now()

# Apply spelling checking
training_set['matches'] = training_set['essay'].apply(lambda txt: spell_checker.unknown(spell_checker.split_words(txt)))

# Count corrections
training_set['corrections'] = training_set['matches'].apply(len)

# Function to correct spelling using SpellChecker
def apply_correction(txt):
        corrected_text = ' '.join(spell_checker.correction(word) if word in spell_checker else word for word in spell_checker.split_words(txt))
        return corrected_text

# Apply spelling correction
training_set['corrected'] = training_set['essay'].apply(apply_correction)

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

# Save the DataFrame with corrected essays and correction count
training_set.to_pickle('./SavedModels/training_corr.pkl')

Processing time: 0:00:50.344205


# NLP with SpaCy

In [13]:
training_set = pd.read_pickle('./SavedModels/training_corr.pkl')

In [14]:
import spacy
from spacy.lang.en import STOP_WORDS
import string

In [15]:
sents = []
tokens = []
lemma = []
pos = []
ner = []

stop_words = set(STOP_WORDS)
stop_words.update(string.punctuation)  # Add this line

nlp = spacy.load('en_core_web_sm')

t0 = datetime.now()


# Use n_process instead of n_threads
for essay in nlp.pipe(training_set['corrected'], batch_size=100, n_process=3):
    if essay.is_parsed:
        tokens.append([e.text for e in essay])
        sents.append([sent.text.strip() for sent in essay.sents])  # Corrected line
        pos.append([e.pos_ for e in essay])
        ner.append([e.text for e in essay.ents])
        lemma.append([n.lemma_ for n in essay])
    else:
        # We want to make sure that the lists of parsed results have the
        # same number of entries of the original DataFrame, so add some blanks in case the parse fails
        tokens.append(None)
        lemma.append(None)
        pos.append(None)
        sents.append(None)
        ner.append(None)

training_set['tokens'] = tokens
training_set['lemma'] = lemma
training_set['pos'] = pos
training_set['sents'] = sents
training_set['ner'] = ner

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

Processing time: 0:06:58.067337


In [16]:
training_set.to_pickle('./SavedModels/training_spacy.pkl')

In [17]:
training_set = pd.read_pickle('./SavedModels/training_spacy.pkl')

In [18]:
training_set[['lemma', 'pos', 'ner']].sample(3)

Unnamed: 0,lemma,pos,ner
3938,"[the, feature, of, the, setting, affect, the, ...","[DET, NOUN, ADP, DET, NOUN, VERB, DET, NOUN, A...",[Rough Road Ahead Do Not Exceed Posted Speed L...
10413,"[New, York, be, say, to, be, the, city, that, ...","[PROPN, PROPN, AUX, VERB, PART, AUX, DET, NOUN...","[New York, New York, four, Al Smith, the Empir..."
9020,"[allow, dirigible, to, dock, at, the, Empire, ...","[VERB, NOUN, PART, VERB, ADP, DET, PROPN, PROP...","[the Empire State Building, One, PERSON1, the ..."


 Generate vectorized features from processed essays

In [19]:
"""Choose arbitrary essay from highest available target_score for each topic.
all other essays will be compared to these.
The uncorrected essays will be used since the reference essays should have fewer errors.
"""
reference_essays = {1: 161, 2: 3022, 3: 5263, 4: 5341, 5: 7209, 6: 8896, 7: 11796, 8: 12340} # topic: essay_id

references = {}

t0 = datetime.now()

nlp = spacy.load('en_core_web_sm')
stop_words = set(STOP_WORDS)

# generate nlp object for reference essays:
for topic, index in reference_essays.items():
    references[topic] = nlp(training_set.iloc[index]['essay'])

# generate document similarity for each essay compared to topic reference
training_set['similarity'] = training_set.apply(lambda row: nlp(row['essay']).similarity(references[row['topic']]), axis=1)

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

Processing time: 0:11:49.048759


In [20]:
print(training_set.columns)
print(training_set.head())

Index(['essay_id', 'topic', 'essay', 'rater1_domain1', 'rater2_domain1',
       'rater3_domain1', 'target_score', 'rater1_domain2', 'rater2_domain2',
       'topic2_target', 'rater1_trait1', 'rater1_trait2', 'rater1_trait3',
       'rater1_trait4', 'rater1_trait5', 'rater1_trait6', 'rater2_trait1',
       'rater2_trait2', 'rater2_trait3', 'rater2_trait4', 'rater2_trait5',
       'rater2_trait6', 'rater3_trait1', 'rater3_trait2', 'rater3_trait3',
       'rater3_trait4', 'rater3_trait5', 'rater3_trait6', 'word_count',
       'matches', 'corrections', 'corrected', 'tokens', 'lemma', 'pos',
       'sents', 'ner', 'similarity'],
      dtype='object')
   essay_id  topic                                              essay  \
0         1      1  Dear local newspaper, I think effects computer...   
1         2      1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3      1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4      1  Dear Local Newspaper, @CAPS1 I hav

In [21]:
# count various features

t0 = datetime.now()

training_set['token_count'] = training_set.apply(lambda x: len(x['tokens']), axis=1)
training_set['unique_token_count'] = training_set.apply(lambda x: len(set(x['tokens'])), axis=1)
training_set['nostop_count'] = training_set.apply(lambda x: len([token for token in x['tokens'] if token not in stop_words]), axis=1)
training_set['sent_count'] = training_set.apply(lambda x: len(x['sents']), axis=1)
training_set['ner_count'] = training_set.apply(lambda x: len(x['ner']), axis=1)
training_set['comma'] = training_set.apply(lambda x: x['corrected'].count(','), axis=1)
training_set['question'] = training_set.apply(lambda x: x['corrected'].count('?'), axis=1)
training_set['exclamation'] = training_set.apply(lambda x: x['corrected'].count('!'), axis=1)
training_set['quotation'] = training_set.apply(lambda x: x['corrected'].count('"') + x['corrected'].count("'"), axis=1)
training_set['organization'] = training_set.apply(lambda x: x['corrected'].count(r'@ORGANIZATION'), axis=1)
training_set['caps'] = training_set.apply(lambda x: x['corrected'].count(r'@CAPS'), axis=1)
training_set['person'] = training_set.apply(lambda x: x['corrected'].count(r'@PERSON'), axis=1)
training_set['location'] = training_set.apply(lambda x: x['corrected'].count(r'@LOCATION'), axis=1)
training_set['money'] = training_set.apply(lambda x: x['corrected'].count(r'@MONEY'), axis=1)
training_set['time'] = training_set.apply(lambda x: x['corrected'].count(r'@TIME'), axis=1)
training_set['date'] = training_set.apply(lambda x: x['corrected'].count(r'@DATE'), axis=1)
training_set['percent'] = training_set.apply(lambda x: x['corrected'].count(r'@PERCENT'), axis=1)
training_set['noun'] = training_set.apply(lambda x: x['pos'].count('NOUN'), axis=1)
training_set['adj'] = training_set.apply(lambda x: x['pos'].count('ADJ'), axis=1)
training_set['pron'] = training_set.apply(lambda x: x['pos'].count('PRON'), axis=1)
training_set['verb'] = training_set.apply(lambda x: x['pos'].count('VERB'), axis=1)
training_set['noun'] = training_set.apply(lambda x: x['pos'].count('NOUN'), axis=1)
training_set['cconj'] = training_set.apply(lambda x: x['pos'].count('CCONJ'), axis=1)
training_set['adv'] = training_set.apply(lambda x: x['pos'].count('ADV'), axis=1)
training_set['det'] = training_set.apply(lambda x: x['pos'].count('DET'), axis=1)
training_set['propn'] = training_set.apply(lambda x: x['pos'].count('PROPN'), axis=1)
training_set['num'] = training_set.apply(lambda x: x['pos'].count('NUM'), axis=1)
training_set['part'] = training_set.apply(lambda x: x['pos'].count('PART'), axis=1)
training_set['intj'] = training_set.apply(lambda x: x['pos'].count('INTJ'), axis=1)

t1 = datetime.now()
print('Processing time: {}'.format(t1 - t0))

Processing time: 0:00:07.682125


In [22]:
# save to file
training_set.to_pickle('./SavedModels/training_features.pkl')

In [23]:
df = pd.read_pickle('./SavedModels/training_features.pkl')

# Now df contains the data from the pickled file
print(df.head())


   essay_id  topic                                              essay  \
0         1      1  Dear local newspaper, I think effects computer...   
1         2      1  Dear @CAPS1 @CAPS2, I believe that using compu...   
2         3      1  Dear, @CAPS1 @CAPS2 @CAPS3 More and more peopl...   
3         4      1  Dear Local Newspaper, @CAPS1 I have found that...   
4         5      1  Dear @LOCATION1, I know having computers has a...   

   rater1_domain1  rater2_domain1  rater3_domain1  target_score  \
0               4               4             NaN             8   
1               5               4             NaN             9   
2               4               3             NaN             7   
3               5               5             NaN            10   
4               4               4             NaN             8   

   rater1_domain2  rater2_domain2  topic2_target  ...  adj  pron  verb  cconj  \
0             NaN             NaN            NaN  ...   18    46    48     14

In [26]:
df['similarity']


0        0.915258
1        0.921898
2        0.900862
3        0.921350
4        0.932292
           ...   
12971    0.776672
12972    0.881727
12973    0.828828
12974    0.889080
12975    0.829652
Name: similarity, Length: 12976, dtype: float64

In [29]:
df[df['essay_id'] == 4]

Unnamed: 0,essay_id,topic,essay,rater1_domain1,rater2_domain1,rater3_domain1,target_score,rater1_domain2,rater2_domain2,topic2_target,...,adj,pron,verb,cconj,adv,det,propn,num,part,intj
3,4,1,"Dear Local Newspaper, @CAPS1 I have found that...",5,5,,10,,,,...,37,31,72,17,21,41,46,0,23,0
