In [2]:
import sys

import numpy as np
from DataReader.XMLReader import get_essays
from time import time
from concurrent.futures import ProcessPoolExecutor
from concurrent.futures import ThreadPoolExecutor, as_completed

# NOTE(review): absolute, machine-specific path - consider making this configurable.
dataset_path = '/home/simon/Downloads/efcamdat/'
levels = ['b1']

essays = []
scores = []

# Parse each level file in its own thread; `result` maps a level name to
# whatever get_essays returned for that level's XML file.
result = {}
with ThreadPoolExecutor(max_workers=4) as executor:
    futures_level = {executor.submit(get_essays, dataset_path + 'level_' + level + '.xml'): level for level in levels}
    for future in as_completed(futures_level):
        level = futures_level[future]
        try:
            level_result = future.result()
            result[level] = level_result
        except Exception as exc:
            print('%r generated an exception: %s' % (level, exc))

# Merge in the order given by `levels`. A level whose load failed was already
# reported above and has no entry in `result`, so skip it instead of raising
# a KeyError here.
for level in levels:
    if level not in result:
        continue
    essays += result[level][0]
    scores += result[level][1]
scores = np.array(scores)
print("done")
print("Number of essays: " + str(len(essays)))
sys.stdout.flush()

done
Number of essays: 10000


In [3]:
from notebook_utils import log_progress
import spacy
from essay_evaluation.lexical_accuracy import SpellChecker, CollocationPreprocessor, CollocationDectector, \
    LexicalAccuracy, CollocationEvaluator
from essay_evaluation.pipeline import FeatureCollector
from essay_evaluation.collocational_aspects import CollocationalAspects


def chunks(l, n):
    """Lazily split ``l`` into consecutive slices holding at most ``n`` items.

    The final slice is shorter when ``len(l)`` is not a multiple of ``n``.
    Adapted from Ned Batchelder - https://stackoverflow.com/a/312464
    """
    offsets = range(0, len(l), n)
    yield from (l[start:start + n] for start in offsets)

def pipe(id, essay_batch):
    """Run one batch of essays through a freshly built spaCy pipeline.

    Intended to run inside a worker process: the pipeline is built locally
    and only the collected feature matrix is returned.

    Args:
        id: Identifier of the worker/batch, used only in progress messages.
            (The name shadows the builtin ``id``; kept so callers do not break.)
        essay_batch: Iterable of essay texts to process.

    Returns:
        Whatever ``FeatureCollector.get_feature_matrix()`` yields after all
        essays of the batch have passed through the pipeline.
    """
    print('process ' + str(id) + ' start!')
    sys.stdout.flush()
    nlp = spacy.load('en_core_web_sm')
    # we only need the tokenizer from spaCy
    nlp.remove_pipe('tagger')
    nlp.remove_pipe('parser')
    nlp.remove_pipe('ner')

    # add all required components
    spell_checker = SpellChecker()
    nlp.add_pipe(spell_checker, name=spell_checker.name, last=True)

    col_preproc = CollocationPreprocessor()
    nlp.add_pipe(col_preproc, name=col_preproc.name, last=True)

    col_detect = CollocationDectector()
    nlp.add_pipe(col_detect, name=col_detect.name, last=True)

    col_evaluator = CollocationEvaluator()
    nlp.add_pipe(col_evaluator, name=col_evaluator.name, last=True)

    # add the lexical accuracy feature extractor
    la_feature_extractor = LexicalAccuracy()
    nlp.add_pipe(la_feature_extractor, name=la_feature_extractor.name, last=True)

    # add the collocational aspects feature extractor
    ca_feature_extractor = CollocationalAspects()
    nlp.add_pipe(ca_feature_extractor, name=ca_feature_extractor.name, last=True)

    # add the feature collector to get a nice feature matrix
    feature_collector = FeatureCollector()
    nlp.add_pipe(feature_collector, name=feature_collector.name, last=True)

    start = time()
    for idx, essay in enumerate(essay_batch):
        # The parsed document is deliberately not kept: only the features
        # gathered by the collector are returned, and holding every Doc
        # would just grow memory for the whole batch.
        nlp(essay)
        if idx % 10 == 0:
            print("process " + str(id) + ' - '  + str(idx + 1) + " docs - elapsed time: " + str(time() - start))
            sys.stdout.flush()
    # We can't return the documents at this point.
    # Returning them from one process to another would mean pickling them,
    # which causes an error (tokens cannot be pickled);
    # https://github.com/allenai/allennlp/issues/1887 (other project with same problem)

    return feature_collector.get_feature_matrix()

In [9]:
# Fresh container for the per-batch outputs, keyed by batch index.
result = dict()
# Split the first 100 essays into batches of 50. `chunks` is a generator
# function, so `chucked` can be iterated only once.
chucked = chunks(essays[0:100], 50)

In [10]:
# Hand every batch to one of two worker processes and gather the feature
# matrices as the workers finish (completion order, not submission order).
with ProcessPoolExecutor(max_workers=2) as executor:
    futures = {
        executor.submit(pipe, idx, batch): idx
        for idx, batch in enumerate(chucked)
    }
    for future in as_completed(futures):
        idx = futures[future]
        try:
            batch_result = future.result()
        except Exception as exc:
            # A failed batch is only reported; it leaves no entry in `result`.
            print('%r generated an exception: %s' % (idx, exc))
        else:
            result[idx] = batch_result

print("writing to disk")
sys.stdout.flush()

process 0 start!
process 1 start!
process 1 - 1 docs - elapsed time: 3.0637433528900146
process 0 - 1 docs - elapsed time: 3.2927629947662354
process 1 - 11 docs - elapsed time: 15.100690841674805
process 0 - 11 docs - elapsed time: 17.2802312374115
process 1 - 21 docs - elapsed time: 29.280436515808105
process 0 - 21 docs - elapsed time: 32.122382402420044
process 1 - 31 docs - elapsed time: 43.597820520401
process 0 - 31 docs - elapsed time: 46.489773988723755
process 1 - 41 docs - elapsed time: 57.23483610153198
process 0 - 41 docs - elapsed time: 61.07015323638916
writing to disk


In [11]:
# Display the per-batch feature matrices collected so far (keyed by batch index).
result

{1: [[3,
   0.026785714285714284,
   0.5,
   0.15384615384615385,
   0.15384615384615385,
   0.6923076923076923,
   2.0,
   2.0,
   9.0],
  [2,
   0.017857142857142856,
   0.5588235294117647,
   0.06666666666666667,
   0.06666666666666667,
   0.8666666666666667,
   1.0,
   1.0,
   13.0],
  [2, 0.017857142857142856, 0.6428571428571429, 0.3, 0.0, 0.7, 3.0, 0.0, 7.0],
  [1, 0.008771929824561403, 0.5652173913043478, 0.3, 0.1, 0.6, 3.0, 1.0, 6.0],
  [1,
   0.008928571428571428,
   0.5428571428571428,
   0.125,
   0.125,
   0.75,
   2.0,
   2.0,
   12.0],
  [6,
   0.05357142857142857,
   0.4074074074074074,
   0.3125,
   0.125,
   0.5625,
   5.0,
   2.0,
   9.0],
  [3,
   0.02586206896551724,
   0.34782608695652173,
   0.3333333333333333,
   0.0,
   0.6666666666666666,
   5.0,
   0.0,
   10.0],
  [1,
   0.008928571428571428,
   0.4,
   0.08333333333333333,
   0.16666666666666666,
   0.75,
   2.0,
   4.0,
   18.0],
  [3,
   0.026785714285714284,
   0.4090909090909091,
   0.07692307692307693,


In [16]:
# Concatenate the per-batch feature rows into one flat list.
# NOTE(review): `chucked` is a generator that the dict comprehension feeding the
# process pool has already consumed, so this loop body never runs when the
# cells are executed in order and `feature_matrix` stays empty here.
feature_matrix = []

for idx, batch in enumerate(chucked):
    feature_matrix += result[idx]



In [20]:
# Rebuild the flat feature matrix from scratch so that re-running this cell
# does not append the same rows twice. Iterating over the sorted keys keeps
# the batches in submission order and tolerates a gap left by a failed batch.
feature_matrix = []
for idx in sorted(result):
    feature_matrix.extend(result[idx])

In [22]:
# Sanity check: one feature row is expected per processed essay (100 essays were batched).
len(feature_matrix)

100

In [None]:
# Inside a Jupyter kernel sys.argv holds the kernel's own launch arguments
# (typically [launcher, '-f', <connection file>]), so sys.argv[2] is not a
# user-supplied output name there, and it is missing altogether when fewer
# than two arguments are given. Only honour it for a plain script invocation.
if len(sys.argv) > 2 and sys.argv[1] != '-f':
    output_prefix = sys.argv[2]
else:
    output_prefix = 'feature_matrix'  # fallback when no usable CLI argument exists
nparr = np.array(feature_matrix)
np.save(output_prefix + '.npy', nparr)