In [1]:
import numpy as np
from DataReader.XMLReader import get_essays


## 1. Load dataset

In [2]:
from essay_evaluation.lexical_density import LexicalDensityFeatures
from essay_evaluation.lexical_sophistication import LexicalSophisticationFeatures
from essay_evaluation.lexical_variation import LexicalVariationFeatures
# Directory holding the level_<level>.xml files; one file is read per entry in `levels`.
dataset_path = '/home/simon/Downloads/efcamdat/'
levels = ['b1']


from concurrent.futures import ThreadPoolExecutor, as_completed

essays = []
scores = []

# level -> value returned by get_essays for that level's file
result = {}
with ThreadPoolExecutor(max_workers=4) as executor:
    # Submit one parsing job per level and remember which future belongs to which level.
    futures_level = {executor.submit(get_essays, dataset_path + 'level_' + level + '.xml'): level for level in levels}
    for future in as_completed(futures_level):
        level = futures_level[future]
        try:
            result[level] = future.result()
        except Exception as exc:
            # Best effort: report the failure and carry on with the remaining levels.
            print('%r generated an exception: %s' % (level, exc))

# Merge in the order of `levels` (not in completion order) so that essays and
# scores stay aligned and the resulting order is the same on every run.
for level in levels:
    if level not in result:
        # Loading this level failed (reported above). Indexing result[level]
        # here would raise a KeyError that hides the real error, so skip it.
        print('%r skipped: no essays were loaded' % (level,))
        continue
    # Element 0 is appended to the essays, element 1 to the scores.
    essays += result[level][0]
    scores += result[level][1]
scores = np.array(scores)
print("done")
print("Number of essays: " + str(len(essays)))

done
Number of essays: 10000


# 2. Calculate the features
Now we pass all texts through the spaCy pipeline.

In [3]:
from notebook_utils import log_progress
import spacy
from essay_evaluation.lexical_accuracy import SpellChecker, CollocationPreprocessor, CollocationDectector, LexicalAccuracy, CollocationEvaluator
from essay_evaluation.pipeline import FeatureCollector
from essay_evaluation.collocational_aspects import CollocationalAspects

# Load the small English model and drop the pipeline components that are not used here.
nlp = spacy.load('en_core_web_sm')
# NOTE: the tagger stays in the pipeline -- its removal is disabled.
#nlp.remove_pipe('tagger')
for unused_component in ('parser', 'ner'):
    nlp.remove_pipe(unused_component)

# Lexical feature extractors.
lvf = LexicalVariationFeatures()
lsf = LexicalSophisticationFeatures()
ldf = LexicalDensityFeatures()

# Register them on the pipeline in the order: variation, density, sophistication.
for extractor in (lvf, ldf, lsf):
    nlp.add_pipe(extractor)

# --- Disabled components -------------------------------------------------
# NOTE(review): later cells read doc._.spell_errors, doc._.features_la,
# doc._.collocations and doc._.features_ca; presumably those attributes are
# provided by the components below -- confirm before running on a fresh kernel.
#
## add all required components
#spell_checker = SpellChecker()
#nlp.add_pipe(spell_checker, name=spell_checker.name, last=True)
#
#
#col_preproc = CollocationPreprocessor()
#nlp.add_pipe(col_preproc, name=col_preproc.name, last=True)
#
#col_detect = CollocationDectector()
#nlp.add_pipe(col_detect, name=col_detect.name, last=True)
#
#col_evaluator = CollocationEvaluator()
#nlp.add_pipe(col_evaluator, name=col_evaluator.name, last=True)
#
## add the lexical accuracy feature extractor
#la_feature_extractor = LexicalAccuracy()
#nlp.add_pipe(la_feature_extractor, name=la_feature_extractor.name, last=True)
#
## add the collocational aspects feature extractor
#
#ca_feature_extractor = CollocationalAspects()
#nlp.add_pipe(ca_feature_extractor, name=ca_feature_extractor.name, last=True)
#
#
# Append the feature collector as the last component; it is meant to gather
# the extracted features into one feature matrix.
feature_collector = FeatureCollector()
nlp.add_pipe(feature_collector, name=feature_collector.name, last=True)


In [4]:
#feature_names = la_feature_extractor.feature_names + ca_feature_extractor.feature_names
import time

# Push the first ten essays through the pipeline and report the wall-clock
# time spent on each one (processing plus storing the resulting doc).
print("Start pipeline")
docs = []
for essay in essays[:10]:
    started_at = time.time()
    doc = nlp(essay)
    docs.append(doc)
    elapsed = time.time() - started_at
    print("finished one document (" + str(elapsed) + ")")
    

    

Start pipeline
finished one document (225.46627759933472)


KeyboardInterrupt: 

In [19]:
# Sample learner text used to inspect the pipeline output by hand.
# The adjacent literals are concatenated into one string; spacing and
# punctuation of the learner text are kept exactly as written.
text = (
    "Hi Peter: I am planning a party with my yoga club friends in Apple on sunday 22th ,at 11 am. "
    "I am sending out invitations on EMS to 30 of my friends and I hope they are all coming ."
    "Heury and Paul are bringing some juice and beer and Jackie is choosing some rock CD’s for the party ."
    "linda is making a cake for the party . "
    "I hope you can come . june"
)
doc = nlp(text)

In [20]:
# Show the spell errors, the token count, the number of spell errors and the
# lexical-accuracy feature values of the sample doc.
spell_errors = doc._.spell_errors
print(spell_errors)
print(len(doc))
print(len(spell_errors))
print(doc._.features_la)

[sunday, june]
78
2
[0.02564102564102564, 0.8125]


In [23]:
from pprint import pprint

# Spell errors, token count, number of spell errors and the features_la values.
spell_errors = doc._.spell_errors
print(spell_errors)
print(len(doc))
print(len(spell_errors))
print(doc._.features_la)

# collocation_errors and collocations rendered as strings, followed by the
# features_ca values.
pprint(list(map(str, doc._.collocation_errors)))
pprint(list(map(str, doc._.collocations)))
print(doc._.features_ca)

[sunday, june]
78
2
[0.02564102564102564, 0.8125]
['yoga_NOUN<--[ NOUN+NOUN (None) ]--club_NOUN',
 'club_NOUN<--[ NOUN+NOUN (None) ]--friend_NOUN',
 'sunday_NOUN<--[ NOUN+NOUN (None) ]--22th_NOUN',
 'beer_NOUN<--[ NOUN+NOUN (None) ]--juice_NOUN',
 'choose_NOUN<--[ NOUN+NOUN (None) ]--bring_NOUN',
 'rock_NOUN<--[ NOUN+NOUN (None) ]--cd_NOUN',
 'cd_NOUN<--[ NOUN+NOUN (None) ]--’s_NOUN',
 'party_NOUN<--[ VERB+NOUN (None) ]--plan_VERB',
 'invitation_NOUN<--[ VERB+NOUN (None) ]--send_VERB',
 '’s_NOUN<--[ VERB+NOUN (None) ]--choose_VERB',
 'come_VERB<--[ VERB+VERB (None) ]--hope_VERB',
 'can_VERB<--[ VERB+VERB (None) ]--come_VERB',
 'come_VERB<--[ VERB+VERB (None) ]--hope_VERB']
['yoga_NOUN<--[ NOUN+NOUN (None) ]--club_NOUN',
 'club_NOUN<--[ NOUN+NOUN (None) ]--friend_NOUN',
 'sunday_NOUN<--[ NOUN+NOUN (None) ]--22th_NOUN',
 'beer_NOUN<--[ NOUN+NOUN (None) ]--juice_NOUN',
 'choose_NOUN<--[ NOUN+NOUN (None) ]--bring_NOUN',
 'rock_NOUN<--[ NOUN+NOUN (None) ]--cd_NOUN',
 'cd_NOUN<--[ NOUN+NOUN 

In [3]:
# NOTE: `html` is imported here but not used in this cell.
import html

# Print the essay at index 167 to inspect its raw text.
print(essays[167])


      I have many plans for the next five years. Here in Brazil we are not used to take a gap year. It's more common to go to United States and make a work experience and learn English. I graduated in Executive Secretary Course in 2008. In 2009 I made a specialization in public management. Now I work in a Natural Gas Company as a Secretary and I'm studying to get a better job. I'm trying to have a public job at "Ministrio Pblico da Unio". In the next year's I am going to continue studying to get it.
    
