In [1]:
from spacy.symbols import ORTH
from concurrent.futures import ProcessPoolExecutor

def parallel(func, arr, max_workers=4):
    if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    if any([o is not None for o in results]): return results

In [26]:
import spacy
from fastprogress import *

In [7]:
from collections import *

In [9]:
class Processor():
    "Basic class for a processor that will be applied to items at the end of the data block API."
    def __init__(self, ds:Collection=None):  self.ref_ds = ds
    def process_one(self, item):         return item
    def process(self, ds:Collection):        ds.items = array([self.process_one(item) for item in ds.items])

In [74]:
UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ = "xxunk xxpad xxbos xxeos xxrep xxwrep xxup xxmaj".split()

def sub_br(t):
    "Replaces the <br /> by \n"
    re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)
    return re_br.sub("\n", t)

def spec_add_spaces(t):
    "Add spaces around / and #"
    return re.sub(r'([/#])', r' \1 ', t)

def rm_useless_spaces(t):
    "Remove multiple spaces"
    return re.sub(' {2,}', ' ', t)

def replace_rep(t):
    "Replace repetitions at the character level: cccc -> TK_REP 4 c"
    def _replace_rep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_REP} {len(cc)+1} {c} '
    re_rep = re.compile(r'(\S)(\1{3,})')
    return re_rep.sub(_replace_rep, t)
    
def replace_wrep(t):
    "Replace word repetitions: word word word -> TK_WREP 3 word"
    def _replace_wrep(m:Collection[str]) -> str:
        c,cc = m.groups()
        return f' {TK_WREP} {len(cc.split())+1} {c} '
    re_wrep = re.compile(r'(\b\w+\W+)(\1{3,})')
    return re_wrep.sub(_replace_wrep, t)

def fixup_text(x):
    "Various messy things we've seen in documents"
    re1 = re.compile(r'  +')
    x = x.replace('#39;', "'").replace('amp;', '&').replace('#146;', "'").replace(
        'nbsp;', ' ').replace('#36;', '$').replace('\\n', "\n").replace('quot;', "'").replace(
        '<br />', "\n").replace('\\"', '"').replace('<unk>',UNK).replace(' @.@ ','.').replace(
        ' @-@ ','-').replace('\\', ' \\ ')
    return re1.sub(' ', html.unescape(x))
    
default_pre_rules = [fixup_text]#, replace_rep, replace_wrep, spec_add_spaces, rm_useless_spaces, sub_br]
default_spec_tok = [UNK, PAD, BOS, EOS, TK_REP, TK_WREP, TK_UP, TK_MAJ]

In [75]:
def replace_all_caps(x):
    "Replace tokens in ALL CAPS by their lower version and add `TK_UP` before."
    res = []
    for t in x:
        if t.isupper() and len(t) > 1: res.append(TK_UP); res.append(t.lower())
        else: res.append(t)
    return res

def deal_caps(x):
    "Replace all Capitalized tokens in by their lower version and add `TK_MAJ` before."
    res = []
    for t in x:
        if t == '': continue
        if t[0].isupper() and len(t) > 1 and t[1:].islower(): res.append(TK_MAJ)
        res.append(t.lower())
    return res

def add_eos_bos(x): return [BOS] + x + [EOS]

default_post_rules = [deal_caps, replace_all_caps, add_eos_bos]

In [76]:
class TokenizeProcessor(Processor):
    def __init__(self, lang="en", chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = spacy.blank(lang).tokenizer
        for w in default_spec_tok:
            self.tokenizer.add_special_case(w, [{ORTH: w}])
        self.pre_rules  = default_pre_rules  if pre_rules  is None else pre_rules
        self.post_rules = default_post_rules if post_rules is None else post_rules

    def proc_chunk(self, args):
        i,chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        print(docs)
        docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items): 
        toks = []
        #if isinstance(items[0], Path): items = [read_file(i) for i in items]
        chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        return sum(toks, [])
    
    def proc1(self, item): return self.proc_chunk([item])[0]
    
    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):    return " ".join(tok)

In [77]:
def listify(o):
    if o is None: return []
    if isinstance(o, list): return o
    if isinstance(o, str): return [o]
    if isinstance(o, Iterable): return list(o)
    return [o]

def compose(x, funcs, *args, order_key='_order', **kwargs):
    key = lambda o: getattr(o, order_key, 0)
    for f in sorted(listify(funcs), key=key): x = f(x, **kwargs)
    return x
import re

In [78]:
tp = TokenizeProcessor()

In [79]:
text = 'Comedian Adam Sandler\'s last theatrical release "I Now Pronounce You Chuck and Larry" served as a loud and proud plea for tolerance of the gay community. The former "Saturday Night Live" funnyman\'s new movie "You Don\'t Mess with the Zohan" (*** out o'

texts = []
texts.append(text)
texts.append(text)
texts.append(text)
texts.append(text)
texts.append(text)
texts.append(text)
texts.append(text)



In [80]:
tp(texts)

[['Comedian', 'Adam', 'Sandler', "'s", 'last', 'theatrical', 'release', '"', 'I', 'Now', 'Pronounce', 'You', 'Chuck', 'and', 'Larry', '"', 'served', 'as', 'a', 'loud', 'and', 'proud', 'plea', 'for', 'tolerance', 'of', 'the', 'gay', 'community', '.', 'The', 'former', '"', 'Saturday', 'Night', 'Live', '"', 'funnyman', "'s", 'new', 'movie', '"', 'You', 'Do', "n't", 'Mess', 'with', 'the', 'Zohan', '"', '(', '*', '*', '*', 'out', 'o'], ['Comedian', 'Adam', 'Sandler', "'s", 'last', 'theatrical', 'release', '"', 'I', 'Now', 'Pronounce', 'You', 'Chuck', 'and', 'Larry', '"', 'served', 'as', 'a', 'loud', 'and', 'proud', 'plea', 'for', 'tolerance', 'of', 'the', 'gay', 'community', '.', 'The', 'former', '"', 'Saturday', 'Night', 'Live', '"', 'funnyman', "'s", 'new', 'movie', '"', 'You', 'Do', "n't", 'Mess', 'with', 'the', 'Zohan', '"', '(', '*', '*', '*', 'out', 'o'], ['Comedian', 'Adam', 'Sandler', "'s", 'last', 'theatrical', 'release', '"', 'I', 'Now', 'Pronounce', 'You', 'Chuck', 'and', 'Larry'

[['xxbos',
  'xxmaj',
  'comedian',
  'xxmaj',
  'adam',
  'xxmaj',
  'sandler',
  "'s",
  'last',
  'theatrical',
  'release',
  '"',
  'i',
  'xxmaj',
  'now',
  'xxmaj',
  'pronounce',
  'xxmaj',
  'you',
  'xxmaj',
  'chuck',
  'and',
  'xxmaj',
  'larry',
  '"',
  'served',
  'as',
  'a',
  'loud',
  'and',
  'proud',
  'plea',
  'for',
  'tolerance',
  'of',
  'the',
  'gay',
  'community',
  '.',
  'xxmaj',
  'the',
  'former',
  '"',
  'xxmaj',
  'saturday',
  'xxmaj',
  'night',
  'xxmaj',
  'live',
  '"',
  'funnyman',
  "'s",
  'new',
  'movie',
  '"',
  'xxmaj',
  'you',
  'xxmaj',
  'do',
  "n't",
  'xxmaj',
  'mess',
  'with',
  'the',
  'xxmaj',
  'zohan',
  '"',
  '(',
  '*',
  '*',
  '*',
  'out',
  'o',
  'xxeos'],
 ['xxbos',
  'xxmaj',
  'comedian',
  'xxmaj',
  'adam',
  'xxmaj',
  'sandler',
  "'s",
  'last',
  'theatrical',
  'release',
  '"',
  'i',
  'xxmaj',
  'now',
  'xxmaj',
  'pronounce',
  'xxmaj',
  'you',
  'xxmaj',
  'chuck',
  'and',
  'xxmaj',
  'larr