In [3]:
import spacy
# NOTE(review): this star import is presumably what brings `progress_bar` into
# scope for the helpers defined later — confirm; an explicit import would make
# that dependency visible.
from fastprogress import *
# Load the small English spaCy model once; the cells visible here use its
# `tokenizer` attribute and its `pipe` method.
nlp = spacy.load('en_core_web_sm')

In [36]:
from tqdm import tqdm_notebook, tqdm

In [22]:
from concurrent.futures import ProcessPoolExecutor
import re

def parallel(func, arr, max_workers=4):
    """Apply `func` to every `(index, item)` pair of `arr` behind a progress bar.

    Args:
        func: Callable taking one `(index, item)` tuple, as yielded by
            `enumerate(arr)`. With `max_workers >= 2` the work is dispatched
            through a `ProcessPoolExecutor`, so `func` and its results must be
            picklable.
        arr: Sized collection of work items; `len(arr)` sizes the progress bar.
        max_workers: Number of worker processes. Values below 2 run serially
            in the current process.

    Returns:
        A list with one result per item, in input order. Both the serial and
        the multi-process paths return a list, including `[]` for empty
        input, so callers can always iterate or flatten the result.
    """
    if max_workers < 2:
        # Serial path: same progress reporting, no process pool.
        serial_results = map(func, enumerate(arr))
        return list(progress_bar(serial_results, total=len(arr)))
    with ProcessPoolExecutor(max_workers=max_workers) as ex:
        return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    
def listify(o):
    """Coerce `o` into a list.

    Args:
        o: Any value.

    Returns:
        `[]` for `None`; `o` itself (not a copy) when it is already a list;
        `[o]` for a string, so it is wrapped rather than split into
        characters; `list(o)` for any other iterable; `[o]` otherwise.
    """
    # Imported locally so the function does not depend on some other cell (or
    # a star import) happening to put `Iterable` into the namespace.
    from collections.abc import Iterable

    if o is None: return []
    if isinstance(o, list): return o
    if isinstance(o, str): return [o]
    if isinstance(o, Iterable): return list(o)
    return [o]

def compose(x, funcs, *args, order_key='_order', **kwargs):
    """Thread `x` through `funcs`, applying them in ascending priority order.

    Args:
        x: Initial value.
        funcs: A callable, a list of callables, or `None` (normalised with
            `listify`).
        *args: Accepted but not forwarded to the callables.
        order_key: Attribute name read from each callable to obtain its sort
            priority; callables without it count as 0.
        **kwargs: Forwarded to every callable.

    Returns:
        The value produced by the last callable, or `x` unchanged when there
        are none.
    """
    def _priority(fn):
        return getattr(fn, order_key, 0)

    pipeline = sorted(listify(funcs), key=_priority)
    for step in pipeline:
        x = step(x, **kwargs)
    return x


In [20]:
class TokenizeProcessor():
    """Tokenize a collection of texts chunk by chunk via `parallel`.

    Each chunk of texts goes through `pre_rules`, then the tokenizer taken
    from `nlp`, then `post_rules`; the result is one list of token strings
    per input text.
    """

    def __init__(self, nlp, chunksize=2000, pre_rules=None, post_rules=None, max_workers=4):
        """Store the tokenizer and processing options.

        Args:
            nlp: Object exposing a `tokenizer` attribute whose `pipe` method
                yields iterables of tokens with a `.text` attribute.
            chunksize: Number of texts handed to a worker at a time.
            pre_rules: Callable(s) applied to each raw text before
                tokenization, or `None`.
            post_rules: Callable(s) applied to each list of token strings
                after tokenization, or `None`.
            max_workers: Passed through to `parallel`.
        """
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = nlp.tokenizer
        self.pre_rules  = pre_rules
        self.post_rules = post_rules

    def proc_chunk(self, args):
        """Tokenize one chunk of texts.

        Args:
            args: An `(index, chunk)` pair; the index is not used.

        Returns:
            A list with one list of token strings per text in the chunk.
        """
        _, chunk = args
        texts = [compose(text, self.pre_rules) for text in chunk]
        token_lists = [[tok.text for tok in doc] for doc in self.tokenizer.pipe(texts)]
        return [compose(toks, self.post_rules) for toks in token_lists]

    def __call__(self, items):
        """Tokenize all `items` and return one token list per item, in order."""
        chunks = [items[i: i+self.chunksize] for i in range(0, len(items), self.chunksize)]
        # `or []` tolerates a helper that reports "nothing to do" as None.
        per_chunk = parallel(self.proc_chunk, chunks, max_workers=self.max_workers) or []
        # Flatten chunk results in linear time (repeated list `+` is quadratic).
        return [toks for chunk_toks in per_chunk for toks in chunk_toks]

    def proc1(self, item):
        """Tokenize a single text and return its list of token strings."""
        # `proc_chunk` expects an (index, chunk) pair, so wrap the item accordingly.
        return self.proc_chunk((0, [item]))[0]

    def deprocess(self, toks):
        """Join every token list in `toks` back into a space-separated string."""
        return [self.deproc1(tok) for tok in toks]

    def deproc1(self, tok):
        """Join one token list into a single space-separated string."""
        return " ".join(tok)

In [50]:
def multi_thread_based_tokenizations(nlp, text_list, n_threads=4):
    """Run `nlp.pipe` over `text_list` and collect each document's token texts.

    Args:
        nlp: Object whose `pipe` method yields documents that iterate over
            tokens with a `.text` attribute.
        text_list: Iterable of strings to process.
        n_threads: Forwarded to `nlp.pipe` when that method accepts it.
            NOTE(review): spaCy's documentation describes `n_threads` as
            deprecated (no effect) from v2.1 and absent from the v3
            signature — verify against the installed version.

    Returns:
        A list with one list of token strings per input text, in order.
    """
    try:
        docs = nlp.pipe(text_list, n_threads=n_threads)
    except TypeError:
        # `pipe` does not take `n_threads`. Argument binding fails at call
        # time, before any text is consumed, so a plain retry is safe.
        docs = nlp.pipe(text_list)

    # Give the progress bar a total whenever the input is sized.
    total = len(text_list) if hasattr(text_list, '__len__') else None
    return [[token.text for token in doc] for doc in tqdm(docs, total=total)]

In [9]:
# Two short sample sentences used to smoke-test both tokenization paths.
texts = [
    "This is a text",
    "These are lots of texts",
]

In [24]:
# Build a processor around the loaded model, keeping the default chunk size and worker count.
tp = TokenizeProcessor(nlp)

In [25]:
# Tokenize the sample sentences; the cell output is one list of token strings per sentence.
tp(texts)

[['This', 'is', 'a', 'text'], ['These', 'are', 'lots', 'of', 'texts']]

In [43]:
# Same sample sentences through the `nlp.pipe`-based helper, for comparison with the output of `tp(texts)`.
word_sequences = multi_thread_based_tokenizations(nlp, texts)
word_sequences

2it [00:00, 101.57it/s]


[['This', 'is', 'a', 'text'], ['These', 'are', 'lots', 'of', 'texts']]

### Example Test

In [26]:
import pandas as pd

# NOTE(review): absolute, machine-specific path — point this at your own copy
# of the file before re-running.
DATA_PATH = '/Users/ankur.kumar/Desktop/Work/projects/client/pantaloon/Pantaloon-MH-processed.csv'
COLUMNS = ['UID', 'Cleaned_Verbatim']

# `usecols` makes pandas parse only the two columns that are needed instead of
# loading every column and discarding the rest; the trailing selection keeps
# the column order exactly as listed in COLUMNS.
df = pd.read_csv(DATA_PATH, usecols=COLUMNS)[COLUMNS]

In [44]:
# Keep a single row per UID (pandas keeps the first occurrence by default).
df = df.drop_duplicates(subset=['UID'])

In [45]:
# Keep only the first 10,000 rows (positional slice) and display the resulting shape.
df = df.iloc[:10000]
df.shape

(10000, 2)

In [46]:
# Build a new frame rather than assigning a column into `df` in place: `df`
# was produced by slicing, and in-place column assignment on a slice is the
# pattern pandas flags with SettingWithCopyWarning.
# NOTE(review): `astype(str)` turns missing values into the literal string
# 'nan' — confirm that is acceptable for the tokenization that follows.
df = df.assign(Cleaned_Verbatim=df['Cleaned_Verbatim'].astype(str))

In [47]:
# Plain Python list of the verbatim texts, in row order.
Verbatims = df['Cleaned_Verbatim'].tolist()

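With multi-threading (`nlp.pipe` with `n_threads=4`). Note that `nlp.pipe` runs the full spaCy pipeline on every text, not just the tokenizer.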

In [64]:
%%time
# Time the `nlp.pipe`-based helper on the full `Verbatims` list with n_threads=4,
# then display how many token sequences came back.
word_sequences = multi_thread_based_tokenizations(nlp, Verbatims, 4)
len(word_sequences)




0it [00:00, ?it/s][A[A[A


1it [00:04,  4.39s/it][A[A[A


1001it [00:08,  3.07s/it][A[A[A


2001it [00:13,  2.15s/it][A[A[A


3001it [00:17,  1.51s/it][A[A[A


4001it [00:22,  1.06s/it][A[A[A


5001it [00:27,  1.35it/s][A[A[A


6001it [00:31,  1.92it/s][A[A[A


7001it [00:35,  2.74it/s][A[A[A


8001it [00:39,  3.89it/s][A[A[A


10000it [00:43, 227.69it/s]A[A[A

CPU times: user 36.9 s, sys: 7 s, total: 43.9 s
Wall time: 43.9 s





10000

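With multi-processing (`TokenizeProcessor`: chunks of texts are spread across worker processes). This path calls only `nlp.tokenizer`, so its timing is not a like-for-like comparison with the full-pipeline run above.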

In [66]:
%%time
# Time the chunked TokenizeProcessor path on the same `Verbatims` list:
# 2000 texts per chunk, up to 4 worker processes.
tp = TokenizeProcessor(nlp, chunksize=2000, max_workers=4)
word_sequences = tp(Verbatims)
len(word_sequences)

CPU times: user 3.93 s, sys: 600 ms, total: 4.53 s
Wall time: 5.47 s


10000

In [68]:
# Inspect the first ten token sequences as a sanity check on the tokenizer output.
word_sequences[0:10]

[['1',
  '.',
  'the',
  'fit',
  '&',
  'price',
  'was',
  'good',
  '.',
  '2',
  '.',
  'i',
  'was',
  'not',
  'looking',
  'for',
  'any',
  'particular',
  'brand',
  'or',
  'product',
  '.',
  '3',
  '.',
  'this',
  'is',
  'was',
  'just',
  'a',
  'casual',
  'shopping',
  '.',
  '4',
  '.',
  'the',
  'staff',
  'did',
  'assist',
  'me',
  'well',
  '.',
  '5',
  '.',
  'apart',
  'form',
  'pantaloons',
  'i',
  'also',
  'shop',
  'west',
  '-',
  'side6',
  '.',
  'recommend',
  'pantaloons',
  'to',
  'friends',
  '&',
  'family',
  '-',
  'yes'],
 ['1',
  '.',
  'i',
  'am',
  'satisfied',
  'with',
  'the',
  'staff',
  'service',
  '.',
  '2',
  '.',
  'the',
  'staff',
  'was',
  'very',
  'helpful',
  'to',
  'me',
  '.',
  '3',
  '.',
  'they',
  'explained',
  'me',
  'about',
  'the',
  'products',
  '.',
  '4',
  '.',
  'recommend',
  'pantaloons',
  'to',
  'friends',
  '&',
  'family',
  '-',
  'yes'],
 ['1',
  '.',
  'i',
  'am',
  'fine',
  'with',
  'th