In [None]:
import numpy as np
import json
import time
import argparse
import _pickle as pickle

from os import path
from tqdm import tqdm

import sys
sys.path.append('../..')
from utils import CoreNLP_path
#from stanford_corenlp_pywrapper import CoreNLP
from gensim.models import KeyedVectors
from tokenizer import CoreNLPTokenizer

import multiprocessing
from multiprocessing import Pool
from multiprocessing.util import Finalize
from functools import partial

Using TensorFlow backend.


In [2]:
def word2vec(word2vec_path):
    """Load word vectors and return a function that looks words up in them.

    The vectors are loaded once via ``KeyedVectors.load_word2vec_format`` and
    captured by the returned closure.

    Args:
        word2vec_path: Path handed to ``KeyedVectors.load_word2vec_format``.

    Returns:
        A callable ``get_word_vector(word)`` that returns the stored vector
        for ``word``, or an all-zero array of length ``vector_size`` when the
        lookup raises ``KeyError``.
    """
    embeddings = KeyedVectors.load_word2vec_format(word2vec_path)

    def get_word_vector(word):
        try:
            vector = embeddings[word]
        except KeyError:
            # Out-of-vocabulary word: fall back to a zero vector.
            vector = np.zeros(embeddings.vector_size)
        return vector

    return get_word_vector

In [3]:
# Load the parsed SQuAD training samples into `samples`.
# flush=True makes the progress message visible before the blocking read,
# since the line has no trailing newline to trigger a flush on its own.
print('Reading SQuAD data... ', end='', flush=True)
# JSON is UTF-8 by specification; do not depend on the platform's default
# encoding when decoding the file.
with open('../../data/train_parsed.json', encoding='utf-8') as fd:
    samples = json.load(fd)
print('Done!')

Reading SQuAD data... Done!


In [4]:
# Create the shared CoreNLP tokenizer instance bound to `tokenizer`.
# NOTE(review): the keyword below is spelled `annatators`. If CoreNLPTokenizer
# expects `annotators`, the 'pos, ner, lemma' setting may be silently ignored
# -- confirm against tokenizer.py before relying on those annotations.
# NOTE(review): the classpath is an absolute, machine-specific path, while
# `CoreNLP_path` is imported from utils in the import cell and never used --
# presumably it was meant to supply this value; verify.
print('Initiating CoreNLP service connection... ', end='')
tokenizer = CoreNLPTokenizer(classpath='/home/anatoly/stanford-corenlp-full-2017-06-09/*', annatators='pos, ner, lemma')
print('Done!')

Initiating CoreNLP service connection... Done!


In [5]:
# Number of CPUs reported by multiprocessing; start from an arbitrary default
# and overwrite it when the platform can report a count.
cpus = 2   # arbitrary default
try:
    cpus = multiprocessing.cpu_count()
except NotImplementedError:
    pass  # platform cannot report a count: keep the default

cpus

4

In [6]:
def chunks(l, n):
    """Lazily split ``l`` into consecutive slices of at most ``n`` items.

    Every slice has length ``n`` except possibly the last one, which holds
    whatever remains. ``l`` must support ``len()`` and slicing.
    """
    starts = range(0, len(l), n)
    yield from (l[start:start + n] for start in starts)

In [12]:
class Tokenizer(object):
    """Tokenize a list of texts in parallel with a pool of worker processes.

    The input is split into contiguous chunks, each chunk is tokenized by
    ``worker`` (which builds its own ``CoreNLPTokenizer``), and the per-chunk
    results are flattened back into one list in the original order.
    """

    def __init__(self, cpus, classpath='/home/anatoly/stanford-corenlp-full-2017-06-09/*'):
        """Store the pool configuration.

        Args:
            cpus: Number of worker processes to start in ``tokenize``.
            classpath: Classpath passed to every ``CoreNLPTokenizer`` created
                by ``worker``. Defaults to the previously hard-coded location.
        """
        self.cpus = cpus
        self.classpath = classpath

    def worker(self, arr):
        """Tokenize every sample of one chunk with a freshly built tokenizer.

        Args:
            arr: Iterable of samples to pass to ``CoreNLPTokenizer.tokenize``.

        Returns:
            A list with one tokenization result per sample, in input order.
        """
        t = CoreNLPTokenizer(classpath=self.classpath)
        return [t.tokenize(sample) for sample in arr]

    def tokenize(self, arr):
        """Tokenize all samples in ``arr`` using ``self.cpus`` processes.

        Args:
            arr: Sequence of samples (must support ``len()`` and slicing).

        Returns:
            A flat list of tokenization results, ordered like ``arr``.
        """
        # Nothing to do: avoid starting worker processes for empty input.
        if len(arr) == 0:
            return []

        # Ceil-divide so there are at most `cpus` chunks, and never let the
        # chunk size drop to 0 (round() did for small inputs, which made
        # range() raise ValueError inside chunks()).
        chunk_size = max(1, -(-len(arr) // self.cpus))

        # The context manager shuts the pool down once map() has returned,
        # so worker processes are not leaked on every call.
        with Pool(self.cpus) as p:
            nested_list = p.map(self.worker, chunks(arr, chunk_size))
        return [val for sublist in nested_list for val in sublist]
            
# Try the parallel tokenizer on the contexts of the first ten samples; the
# result of the last expression is displayed as the cell output.
# NOTE(review): the worker count is hard-coded to 4 rather than using the
# `cpus` value computed in an earlier cell -- confirm whether that is intended.
t = Tokenizer(4)
t.tokenize([sample['context'] for sample in samples[0:10]])

[(['Architecturally',
   ',',
   'the',
   'school',
   'has',
   'a',
   'Catholic',
   'character',
   '.',
   'Atop',
   'the',
   'Main',
   'Building',
   "'s",
   'gold',
   'dome',
   'is',
   'a',
   'golden',
   'statue',
   'of',
   'the',
   'Virgin',
   'Mary',
   '.',
   'Immediately',
   'in',
   'front',
   'of',
   'the',
   'Main',
   'Building',
   'and',
   'facing',
   'it',
   ',',
   'is',
   'a',
   'copper',
   'statue',
   'of',
   'Christ',
   'with',
   'arms',
   'upraised',
   'with',
   'the',
   'legend',
   '``',
   'Venite',
   'Ad',
   'Me',
   'Omnes',
   "''",
   '.',
   'Next',
   'to',
   'the',
   'Main',
   'Building',
   'is',
   'the',
   'Basilica',
   'of',
   'the',
   'Sacred',
   'Heart',
   '.',
   'Immediately',
   'behind',
   'the',
   'basilica',
   'is',
   'the',
   'Grotto',
   ',',
   'a',
   'Marian',
   'place',
   'of',
   'prayer',
   'and',
   'reflection',
   '.',
   'It',
   'is',
   'a',
   'replica',
   'of',
   'the',
  

In [None]:
# Tokenize the contexts of the first ten samples with the shared `tokenizer`,
# showing progress with tqdm.
# NOTE(review): the message below mentions a pool of workers, but this cell
# calls the single `tokenizer` instance one sample at a time -- confirm which
# of the two was intended.
print('Tokenizing dataset with CoreNLP using pool of workers')
x = list(map(
    tokenizer.tokenize,
    (sample['context'] for sample in tqdm(samples[0:10])),
))
print('Done!')

In [None]:
# Display the tokenization results collected in `x` as the cell output.
x