In [3]:
import spacy
from fastprogress import *
nlp = spacy.load('en_core_web_sm')

In [36]:
from tqdm import tqdm_notebook, tqdm

In [22]:
from concurrent.futures import ProcessPoolExecutor
import re

def parallel(func, arr, max_workers=4):
    if max_workers<2: results = list(progress_bar(map(func, enumerate(arr)), total=len(arr)))
    else:
        with ProcessPoolExecutor(max_workers=max_workers) as ex:
            return list(progress_bar(ex.map(func, enumerate(arr)), total=len(arr)))
    if any([o is not None for o in results]): return results
    
def listify(o):
    if o is None: return []
    if isinstance(o, list): return o
    if isinstance(o, str): return [o]
    if isinstance(o, Iterable): return list(o)
    return [o]

def compose(x, funcs, *args, order_key='_order', **kwargs):
    key = lambda o: getattr(o, order_key, 0)
    for f in sorted(listify(funcs), key=key): x = f(x, **kwargs)
    return x


In [20]:
class TokenizeProcessor():
    def __init__(self, nlp, chunksize=2000, pre_rules=None, post_rules=None, max_workers=4): 
        self.chunksize,self.max_workers = chunksize,max_workers
        self.tokenizer = nlp.tokenizer
        self.pre_rules  = pre_rules 
        self.post_rules = post_rules

    def proc_chunk(self, args):
        i,chunk = args
        chunk = [compose(t, self.pre_rules) for t in chunk]
        docs = [[d.text for d in doc] for doc in self.tokenizer.pipe(chunk)]
        docs = [compose(t, self.post_rules) for t in docs]
        return docs

    def __call__(self, items): 
        toks = []
        chunks = [items[i: i+self.chunksize] for i in (range(0, len(items), self.chunksize))]
        toks = parallel(self.proc_chunk, chunks, max_workers=self.max_workers)
        return sum(toks, [])

    def proc1(self, item): return self.proc_chunk([item])[0]
 
    def deprocess(self, toks): return [self.deproc1(tok) for tok in toks]
    def deproc1(self, tok):    return " ".join(tok)

In [76]:
def multi_thread_based_tokenizations(nlp, text_list, batch_size=1000, n_threads=4, n_process=1):
    docs = nlp.pipe(text_list, batch_size=batch_size, n_threads = n_threads, n_process=n_process)
    word_sequences = []

    for doc in tqdm(docs):
        word_seq = []
        for token in doc:
            word_seq.append(token.text)
        word_sequences.append(word_seq)
    return word_sequences

In [9]:
texts = ["This is a text", "These are lots of texts"]

In [24]:
tp = TokenizeProcessor(nlp)

In [25]:
tp(texts)

[['This', 'is', 'a', 'text'], ['These', 'are', 'lots', 'of', 'texts']]

In [43]:
word_sequences = multi_thread_based_tokenizations(nlp, texts)
word_sequences

2it [00:00, 101.57it/s]


[['This', 'is', 'a', 'text'], ['These', 'are', 'lots', 'of', 'texts']]

### Example Test

In [26]:
import pandas as pd
df = pd.read_csv('/Users/ankur.kumar/Desktop/Work/projects/client/pantaloon/Pantaloon-MH-processed.csv')[['UID', 'Cleaned_Verbatim']]

In [44]:
df = df.drop_duplicates(['UID'])

In [45]:
df = df[0:10000]
df.shape

(10000, 2)

In [46]:
df['Cleaned_Verbatim'] = df['Cleaned_Verbatim'].astype(str)

In [47]:
Verbatims = df.Cleaned_Verbatim.values.tolist()

In [70]:
nlp.pipe?

with multi threading

In [64]:
%%time
word_sequences = multi_thread_based_tokenizations(nlp, Verbatims, 4)
len(word_sequences)




0it [00:00, ?it/s][A[A[A


1it [00:04,  4.39s/it][A[A[A


1001it [00:08,  3.07s/it][A[A[A


2001it [00:13,  2.15s/it][A[A[A


3001it [00:17,  1.51s/it][A[A[A


4001it [00:22,  1.06s/it][A[A[A


5001it [00:27,  1.35it/s][A[A[A


6001it [00:31,  1.92it/s][A[A[A


7001it [00:35,  2.74it/s][A[A[A


8001it [00:39,  3.89it/s][A[A[A


10000it [00:43, 227.69it/s]A[A[A

CPU times: user 36.9 s, sys: 7 s, total: 43.9 s
Wall time: 43.9 s





10000

In [74]:
%%time
word_sequences = multi_thread_based_tokenizations(nlp, Verbatims, 2, 6)
len(word_sequences)




0it [00:00, ?it/s][A[A[A


1it [00:06,  6.53s/it][A[A[A


96it [00:06,  4.57s/it][A[A[A


215it [00:06,  3.20s/it][A[A[A


343it [00:06,  2.24s/it][A[A[A


477it [00:06,  1.57s/it][A[A[A


601it [00:07,  1.10s/it][A[A[A


753it [00:07,  1.30it/s][A[A[A


900it [00:07,  1.86it/s][A[A[A


1027it [00:07,  2.65it/s][A[A[A


1169it [00:07,  3.79it/s][A[A[A


1312it [00:07,  5.40it/s][A[A[A


1476it [00:07,  7.71it/s][A[A[A


1616it [00:07, 10.98it/s][A[A[A


1785it [00:07, 15.64it/s][A[A[A


1938it [00:07, 22.25it/s][A[A[A


2088it [00:08, 31.48it/s][A[A[A


2222it [00:08, 44.51it/s][A[A[A


2356it [00:08, 62.64it/s][A[A[A


2498it [00:08, 87.82it/s][A[A[A


2633it [00:08, 122.04it/s][A[A[A


2776it [00:08, 168.17it/s][A[A[A


2920it [00:08, 228.78it/s][A[A[A


3059it [00:08, 293.70it/s][A[A[A


3183it [00:08, 378.70it/s][A[A[A


3305it [00:09, 471.11it/s][A[A[A


3423it [00:09, 566.53it/s][A[A[A


3538it [0

CPU times: user 7.83 s, sys: 603 ms, total: 8.43 s
Wall time: 15.2 s





10000

with multi processing

In [66]:
%%time
tp = TokenizeProcessor(nlp, chunksize=2000, max_workers=4)
word_sequences = tp(Verbatims)
len(word_sequences)

CPU times: user 3.93 s, sys: 600 ms, total: 4.53 s
Wall time: 5.47 s


10000

In [68]:
word_sequences[0:10]

[['1',
  '.',
  'the',
  'fit',
  '&',
  'price',
  'was',
  'good',
  '.',
  '2',
  '.',
  'i',
  'was',
  'not',
  'looking',
  'for',
  'any',
  'particular',
  'brand',
  'or',
  'product',
  '.',
  '3',
  '.',
  'this',
  'is',
  'was',
  'just',
  'a',
  'casual',
  'shopping',
  '.',
  '4',
  '.',
  'the',
  'staff',
  'did',
  'assist',
  'me',
  'well',
  '.',
  '5',
  '.',
  'apart',
  'form',
  'pantaloons',
  'i',
  'also',
  'shop',
  'west',
  '-',
  'side6',
  '.',
  'recommend',
  'pantaloons',
  'to',
  'friends',
  '&',
  'family',
  '-',
  'yes'],
 ['1',
  '.',
  'i',
  'am',
  'satisfied',
  'with',
  'the',
  'staff',
  'service',
  '.',
  '2',
  '.',
  'the',
  'staff',
  'was',
  'very',
  'helpful',
  'to',
  'me',
  '.',
  '3',
  '.',
  'they',
  'explained',
  'me',
  'about',
  'the',
  'products',
  '.',
  '4',
  '.',
  'recommend',
  'pantaloons',
  'to',
  'friends',
  '&',
  'family',
  '-',
  'yes'],
 ['1',
  '.',
  'i',
  'am',
  'fine',
  'with',
  'th

## Stats

In [83]:
%%time
word_sequences = multi_thread_based_tokenizations(nlp, Verbatims[0:1000], 4)




0it [00:00, ?it/s][A[A[A


9it [00:00, 89.16it/s][A[A[A


25it [00:00, 100.34it/s][A[A[A


41it [00:00, 110.09it/s][A[A[A


57it [00:00, 120.00it/s][A[A[A


73it [00:00, 125.86it/s][A[A[A


89it [00:00, 132.80it/s][A[A[A


102it [00:00, 127.94it/s][A[A[A


115it [00:00, 125.04it/s][A[A[A


129it [00:00, 123.23it/s][A[A[A


145it [00:01, 126.83it/s][A[A[A


161it [00:01, 128.93it/s][A[A[A


177it [00:01, 135.06it/s][A[A[A


193it [00:01, 134.16it/s][A[A[A


209it [00:01, 136.02it/s][A[A[A


223it [00:01, 135.03it/s][A[A[A


237it [00:01, 124.44it/s][A[A[A


253it [00:01, 129.56it/s][A[A[A


269it [00:02, 131.47it/s][A[A[A


285it [00:02, 132.13it/s][A[A[A


301it [00:02, 131.80it/s][A[A[A


317it [00:02, 132.64it/s][A[A[A


333it [00:02, 138.50it/s][A[A[A


349it [00:02, 138.48it/s][A[A[A


365it [00:02, 141.40it/s][A[A[A


381it [00:02, 145.86it/s][A[A[A


397it [00:02, 147.60it/s][A[A[A


413it [00:03,

CPU times: user 6.85 s, sys: 416 ms, total: 7.26 s
Wall time: 7.24 s





In [82]:
import time
start = time.time()
word_sequences = multi_thread_based_tokenizations(nlp, Verbatims[0:1000], 4)
end = time.time()
print(end - start)




0it [00:00, ?it/s][A[A[A


13it [00:00, 111.63it/s][A[A[A


29it [00:00, 122.09it/s][A[A[A


49it [00:00, 130.91it/s][A[A[A


65it [00:00, 135.32it/s][A[A[A


81it [00:00, 140.72it/s][A[A[A


97it [00:00, 136.07it/s][A[A[A


113it [00:00, 132.40it/s][A[A[A


129it [00:00, 135.34it/s][A[A[A


145it [00:01, 138.20it/s][A[A[A


161it [00:01, 138.03it/s][A[A[A


177it [00:01, 143.55it/s][A[A[A


193it [00:01, 142.22it/s][A[A[A


213it [00:01, 150.30it/s][A[A[A


229it [00:01, 138.64it/s][A[A[A


245it [00:01, 138.43it/s][A[A[A


261it [00:01, 141.28it/s][A[A[A


277it [00:01, 142.51it/s][A[A[A


293it [00:02, 143.54it/s][A[A[A


309it [00:02, 143.87it/s][A[A[A


325it [00:02, 140.13it/s][A[A[A


341it [00:02, 139.76it/s][A[A[A


356it [00:02, 141.73it/s][A[A[A


371it [00:02, 142.87it/s][A[A[A


386it [00:02, 143.68it/s][A[A[A


401it [00:02, 142.16it/s][A[A[A


417it [00:02, 140.70it/s][A[A[A


433it [00:0

6.987317085266113





In [None]:
results = pd.DataFrame({''})

In [86]:
def get_stats_for_pipe_multi_threading(nlp, Verbatims, batch_sizes, n_threadss, n_processs):
    result_df = pd.DataFrame({"batch_size":[],'n_threads':[],'n_process':[],'Time':[]})
    for batch_size in batch_sizes:
        for n_threads in n_threadss:
            for n_process in n_processs:
                start = time.time()
                word_sequences = multi_thread_based_tokenizations(nlp, Verbatims, batch_size=batch_size, n_threads=n_threads, n_process=n_process)
                end = time.time()
                t_time = end - start
                result_df = result_df.append(pd.DataFrame({"batch_size":[batch_size],'n_threads':[n_threads],'n_process':[n_process],'Time':[t_time]}))
                print('result shape:', result_df.shape)
    return result_df
                

In [108]:
batch_sizes=[1000, 1500, 2000, 2500, 3000, 3500, 4000]
n_threads=[2]
n_process=[2,4]
threading_result_df = get_stats_for_pipe_multi_threading(nlp, Verbatims, batch_sizes, n_threads, n_process)





0it [00:00, ?it/s][A[A[A[A



1it [00:04,  4.68s/it][A[A[A[A



138it [00:04,  3.28s/it][A[A[A[A



287it [00:04,  2.29s/it][A[A[A[A



437it [00:04,  1.61s/it][A[A[A[A



581it [00:05,  1.12s/it][A[A[A[A



736it [00:05,  1.27it/s][A[A[A[A



898it [00:05,  1.81it/s][A[A[A[A



1033it [00:05,  2.59it/s][A[A[A[A



1167it [00:05,  3.70it/s][A[A[A[A



1305it [00:05,  5.27it/s][A[A[A[A



1457it [00:05,  7.52it/s][A[A[A[A



1594it [00:05, 10.72it/s][A[A[A[A



1764it [00:05, 15.28it/s][A[A[A[A



1923it [00:06, 21.73it/s][A[A[A[A



2073it [00:09, 25.68it/s][A[A[A[A



2197it [00:09, 36.36it/s][A[A[A[A



2328it [00:09, 51.34it/s][A[A[A[A



2477it [00:09, 72.27it/s][A[A[A[A



2604it [00:09, 100.77it/s][A[A[A[A



2743it [00:09, 139.60it/s][A[A[A[A



2896it [00:09, 191.91it/s][A[A[A[A



3033it [00:10, 244.31it/s][A[A[A[A



3162it [00:10, 322.77it/s][A[A[A[A



3301it [00:10, 419.31it

result shape: (1, 4)






1it [00:05,  5.42s/it][A[A[A[A



133it [00:05,  3.80s/it][A[A[A[A



268it [00:05,  2.66s/it][A[A[A[A



420it [00:05,  1.86s/it][A[A[A[A



570it [00:05,  1.30s/it][A[A[A[A



710it [00:05,  1.10it/s][A[A[A[A



861it [00:06,  1.57it/s][A[A[A[A



1001it [00:06,  2.24it/s][A[A[A[A



1136it [00:06,  3.19it/s][A[A[A[A



1263it [00:06,  4.55it/s][A[A[A[A



1388it [00:06,  6.49it/s][A[A[A[A



1541it [00:06,  9.26it/s][A[A[A[A



1679it [00:06, 13.19it/s][A[A[A[A



1843it [00:06, 18.78it/s][A[A[A[A



1986it [00:06, 26.68it/s][A[A[A[A



2129it [00:07, 37.59it/s][A[A[A[A



2262it [00:07, 53.06it/s][A[A[A[A



2389it [00:07, 74.41it/s][A[A[A[A



2515it [00:07, 103.63it/s][A[A[A[A



2640it [00:07, 142.74it/s][A[A[A[A



2774it [00:07, 194.94it/s][A[A[A[A



2924it [00:07, 263.76it/s][A[A[A[A



3058it [00:07, 331.65it/s][A[A[A[A



3180it [00:07, 424.29it/s][A[A[A[A



3300it [00:08, 

result shape: (2, 4)






1it [00:07,  7.41s/it][A[A[A[A



160it [00:07,  5.19s/it][A[A[A[A



328it [00:07,  3.63s/it][A[A[A[A



486it [00:07,  2.54s/it][A[A[A[A



655it [00:07,  1.78s/it][A[A[A[A



815it [00:07,  1.25s/it][A[A[A[A



974it [00:08,  1.15it/s][A[A[A[A



1153it [00:08,  1.64it/s][A[A[A[A



1307it [00:08,  2.34it/s][A[A[A[A



1460it [00:08,  3.34it/s][A[A[A[A



1609it [00:08,  4.76it/s][A[A[A[A



1770it [00:08,  6.79it/s][A[A[A[A



1916it [00:08,  9.68it/s][A[A[A[A



2066it [00:08, 13.79it/s][A[A[A[A



2234it [00:08, 19.63it/s][A[A[A[A



2386it [00:08, 27.87it/s][A[A[A[A



2532it [00:09, 39.48it/s][A[A[A[A



2699it [00:09, 55.84it/s][A[A[A[A



2859it [00:09, 78.58it/s][A[A[A[A



3013it [00:14, 52.63it/s][A[A[A[A



3123it [00:14, 73.68it/s][A[A[A[A



3250it [00:14, 102.68it/s][A[A[A[A



3406it [00:14, 142.66it/s][A[A[A[A



3544it [00:14, 195.12it/s][A[A[A[A



3691it [00:14, 263

result shape: (3, 4)






1it [00:08,  8.80s/it][A[A[A[A



142it [00:08,  6.16s/it][A[A[A[A



285it [00:08,  4.31s/it][A[A[A[A



435it [00:09,  3.02s/it][A[A[A[A



568it [00:09,  2.11s/it][A[A[A[A



718it [00:09,  1.48s/it][A[A[A[A



859it [00:09,  1.04s/it][A[A[A[A



1028it [00:09,  1.38it/s][A[A[A[A



1195it [00:09,  1.97it/s][A[A[A[A



1342it [00:09,  2.81it/s][A[A[A[A



1486it [00:09,  4.01it/s][A[A[A[A



1630it [00:09,  5.72it/s][A[A[A[A



1783it [00:10,  8.16it/s][A[A[A[A



1933it [00:10, 11.63it/s][A[A[A[A



2071it [00:10, 16.55it/s][A[A[A[A



2222it [00:10, 23.53it/s][A[A[A[A



2361it [00:10, 33.33it/s][A[A[A[A



2494it [00:10, 47.10it/s][A[A[A[A



2637it [00:10, 66.34it/s][A[A[A[A



2777it [00:10, 92.88it/s][A[A[A[A



2921it [00:10, 129.10it/s][A[A[A[A



3060it [00:11, 172.69it/s][A[A[A[A



3193it [00:11, 233.63it/s][A[A[A[A



3318it [00:11, 308.74it/s][A[A[A[A



3453it [00:11, 40

result shape: (4, 4)






1it [00:09,  9.67s/it][A[A[A[A



149it [00:09,  6.77s/it][A[A[A[A



316it [00:09,  4.74s/it][A[A[A[A



484it [00:09,  3.32s/it][A[A[A[A



644it [00:10,  2.32s/it][A[A[A[A



804it [00:10,  1.63s/it][A[A[A[A



951it [00:10,  1.14s/it][A[A[A[A



1097it [00:10,  1.25it/s][A[A[A[A



1251it [00:10,  1.79it/s][A[A[A[A



1409it [00:10,  2.56it/s][A[A[A[A



1593it [00:10,  3.65it/s][A[A[A[A



1773it [00:10,  5.21it/s][A[A[A[A



1938it [00:10,  7.44it/s][A[A[A[A



2101it [00:11, 10.58it/s][A[A[A[A



2260it [00:11, 15.08it/s][A[A[A[A



2408it [00:11, 21.44it/s][A[A[A[A



2554it [00:11, 30.42it/s][A[A[A[A



2719it [00:11, 43.11it/s][A[A[A[A



2882it [00:11, 60.90it/s][A[A[A[A



3042it [00:11, 85.59it/s][A[A[A[A



3197it [00:11, 118.61it/s][A[A[A[A



3341it [00:11, 163.60it/s][A[A[A[A



3495it [00:12, 223.50it/s][A[A[A[A



3669it [00:12, 302.63it/s][A[A[A[A



3824it [00:12, 39

result shape: (5, 4)






1it [00:11, 11.03s/it][A[A[A[A



137it [00:11,  7.72s/it][A[A[A[A



275it [00:11,  5.40s/it][A[A[A[A



419it [00:11,  3.78s/it][A[A[A[A



569it [00:11,  2.65s/it][A[A[A[A



726it [00:11,  1.85s/it][A[A[A[A



878it [00:11,  1.30s/it][A[A[A[A



1010it [00:11,  1.10it/s][A[A[A[A



1159it [00:11,  1.57it/s][A[A[A[A



1318it [00:11,  2.24it/s][A[A[A[A



1477it [00:12,  3.20it/s][A[A[A[A



1624it [00:12,  4.57it/s][A[A[A[A



1767it [00:12,  6.52it/s][A[A[A[A



1922it [00:12,  9.30it/s][A[A[A[A



2067it [00:12, 13.20it/s][A[A[A[A



2195it [00:12, 18.78it/s][A[A[A[A



2327it [00:12, 26.66it/s][A[A[A[A



2490it [00:12, 37.82it/s][A[A[A[A



2655it [00:12, 53.51it/s][A[A[A[A



2801it [00:13, 75.25it/s][A[A[A[A



2947it [00:13, 104.91it/s][A[A[A[A



3094it [00:13, 145.40it/s][A[A[A[A



3268it [00:13, 200.50it/s][A[A[A[A



3440it [00:13, 272.78it/s][A[A[A[A



3598it [00:13, 36

result shape: (6, 4)






1it [00:11, 11.82s/it][A[A[A[A



160it [00:11,  8.28s/it][A[A[A[A



320it [00:12,  5.79s/it][A[A[A[A



501it [00:12,  4.06s/it][A[A[A[A



674it [00:12,  2.84s/it][A[A[A[A



854it [00:12,  1.99s/it][A[A[A[A



1036it [00:12,  1.39s/it][A[A[A[A



1192it [00:12,  1.03it/s][A[A[A[A



1348it [00:12,  1.47it/s][A[A[A[A



1530it [00:12,  2.09it/s][A[A[A[A



1714it [00:12,  2.99it/s][A[A[A[A



1893it [00:12,  4.27it/s][A[A[A[A



2064it [00:13,  6.09it/s][A[A[A[A



2239it [00:13,  8.69it/s][A[A[A[A



2409it [00:13, 12.38it/s][A[A[A[A



2576it [00:13, 17.56it/s][A[A[A[A



2733it [00:13, 24.96it/s][A[A[A[A



2897it [00:13, 35.43it/s][A[A[A[A



3070it [00:13, 50.18it/s][A[A[A[A



3227it [00:13, 70.71it/s][A[A[A[A



3384it [00:13, 98.96it/s][A[A[A[A



3538it [00:14, 137.53it/s][A[A[A[A



3704it [00:14, 189.71it/s][A[A[A[A



3861it [00:14, 256.40it/s][A[A[A[A



4016it [00:14, 34

result shape: (7, 4)






1it [00:15, 15.05s/it][A[A[A[A



110it [00:15, 10.53s/it][A[A[A[A



177it [00:15,  7.37s/it][A[A[A[A



280it [00:15,  5.16s/it][A[A[A[A



439it [00:15,  3.61s/it][A[A[A[A



597it [00:15,  2.53s/it][A[A[A[A



755it [00:15,  1.77s/it][A[A[A[A



903it [00:15,  1.24s/it][A[A[A[A



1054it [00:15,  1.15it/s][A[A[A[A



1191it [00:15,  1.64it/s][A[A[A[A



1337it [00:16,  2.35it/s][A[A[A[A



1514it [00:16,  3.35it/s][A[A[A[A



1693it [00:16,  4.79it/s][A[A[A[A



1881it [00:16,  6.83it/s][A[A[A[A



2062it [00:16,  9.74it/s][A[A[A[A



2236it [00:16, 13.88it/s][A[A[A[A



2414it [00:16, 19.77it/s][A[A[A[A



2588it [00:16, 27.92it/s][A[A[A[A



2740it [00:16, 39.57it/s][A[A[A[A



2923it [00:17, 56.01it/s][A[A[A[A



3080it [00:17, 78.71it/s][A[A[A[A



3252it [00:17, 110.27it/s][A[A[A[A



3433it [00:17, 153.52it/s][A[A[A[A



3600it [00:17, 210.97it/s][A[A[A[A



3781it [00:17, 287.

result shape: (8, 4)






1it [00:14, 14.72s/it][A[A[A[A



150it [00:14, 10.30s/it][A[A[A[A



303it [00:14,  7.21s/it][A[A[A[A



472it [00:15,  5.05s/it][A[A[A[A



652it [00:15,  3.53s/it][A[A[A[A



826it [00:15,  2.47s/it][A[A[A[A



1005it [00:15,  1.73s/it][A[A[A[A



1182it [00:15,  1.21s/it][A[A[A[A



1357it [00:15,  1.18it/s][A[A[A[A



1545it [00:15,  1.68it/s][A[A[A[A



1730it [00:15,  2.40it/s][A[A[A[A



1923it [00:15,  3.43it/s][A[A[A[A



2104it [00:15,  4.90it/s][A[A[A[A



2284it [00:16,  6.99it/s][A[A[A[A



2470it [00:16,  9.96it/s][A[A[A[A



2655it [00:16, 14.20it/s][A[A[A[A



2838it [00:16, 20.21it/s][A[A[A[A



3016it [00:16, 28.55it/s][A[A[A[A



3164it [00:16, 40.45it/s][A[A[A[A



3339it [00:16, 57.22it/s][A[A[A[A



3494it [00:16, 80.46it/s][A[A[A[A



3667it [00:16, 112.70it/s][A[A[A[A



3828it [00:17, 156.11it/s][A[A[A[A



3987it [00:17, 213.75it/s][A[A[A[A



4161it [00:17, 28

result shape: (9, 4)






1it [00:15, 15.80s/it][A[A[A[A



149it [00:15, 11.06s/it][A[A[A[A



288it [00:16,  7.74s/it][A[A[A[A



457it [00:16,  5.42s/it][A[A[A[A



637it [00:16,  3.79s/it][A[A[A[A



799it [00:16,  2.66s/it][A[A[A[A



948it [00:16,  1.86s/it][A[A[A[A



1121it [00:16,  1.30s/it][A[A[A[A



1299it [00:16,  1.10it/s][A[A[A[A



1466it [00:16,  1.57it/s][A[A[A[A



1651it [00:16,  2.24it/s][A[A[A[A



1848it [00:16,  3.20it/s][A[A[A[A



2034it [00:17,  4.56it/s][A[A[A[A



2214it [00:17,  6.51it/s][A[A[A[A



2392it [00:17,  9.28it/s][A[A[A[A



2570it [00:17, 13.23it/s][A[A[A[A



2763it [00:17, 18.85it/s][A[A[A[A



2957it [00:17, 26.81it/s][A[A[A[A



3143it [00:17, 37.68it/s][A[A[A[A



3332it [00:17, 53.38it/s][A[A[A[A



3499it [00:17, 75.22it/s][A[A[A[A



3694it [00:18, 105.71it/s][A[A[A[A



3875it [00:18, 147.30it/s][A[A[A[A



4056it [00:18, 203.33it/s][A[A[A[A



4238it [00:18, 277

result shape: (10, 4)






1it [00:16, 17.00s/it][A[A[A[A



134it [00:17, 11.90s/it][A[A[A[A



283it [00:17,  8.33s/it][A[A[A[A



450it [00:17,  5.83s/it][A[A[A[A



592it [00:17,  4.08s/it][A[A[A[A



718it [00:17,  2.86s/it][A[A[A[A



860it [00:17,  2.00s/it][A[A[A[A



1038it [00:17,  1.40s/it][A[A[A[A



1211it [00:17,  1.02it/s][A[A[A[A



1361it [00:17,  1.46it/s][A[A[A[A



1502it [00:18,  2.08it/s][A[A[A[A



1643it [00:18,  2.97it/s][A[A[A[A



1807it [00:18,  4.24it/s][A[A[A[A



1968it [00:18,  6.05it/s][A[A[A[A



2122it [00:18,  8.63it/s][A[A[A[A



2274it [00:18, 12.29it/s][A[A[A[A



2426it [00:18, 17.49it/s][A[A[A[A



2574it [00:18, 24.85it/s][A[A[A[A



2718it [00:18, 35.21it/s][A[A[A[A



2867it [00:18, 49.80it/s][A[A[A[A



3047it [00:19, 70.31it/s][A[A[A[A



3203it [00:19, 98.53it/s][A[A[A[A



3360it [00:19, 137.06it/s][A[A[A[A



3515it [00:19, 175.05it/s][A[A[A[A



3663it [00:19, 238.

result shape: (11, 4)






1it [00:16, 16.77s/it][A[A[A[A



150it [00:16, 11.74s/it][A[A[A[A



310it [00:16,  8.22s/it][A[A[A[A



491it [00:17,  5.75s/it][A[A[A[A



655it [00:17,  4.03s/it][A[A[A[A



819it [00:17,  2.82s/it][A[A[A[A



980it [00:17,  1.97s/it][A[A[A[A



1159it [00:17,  1.38s/it][A[A[A[A



1331it [00:17,  1.03it/s][A[A[A[A



1532it [00:17,  1.48it/s][A[A[A[A



1716it [00:17,  2.11it/s][A[A[A[A



1908it [00:17,  3.01it/s][A[A[A[A



2089it [00:17,  4.30it/s][A[A[A[A



2267it [00:18,  6.13it/s][A[A[A[A



2456it [00:18,  8.75it/s][A[A[A[A



2642it [00:18, 12.48it/s][A[A[A[A



2825it [00:18, 17.77it/s][A[A[A[A



3008it [00:18, 25.28it/s][A[A[A[A



3194it [00:18, 35.91it/s][A[A[A[A



3380it [00:18, 50.87it/s][A[A[A[A



3564it [00:18, 70.32it/s][A[A[A[A



3739it [00:19, 98.75it/s][A[A[A[A



3910it [00:19, 137.65it/s][A[A[A[A



4099it [00:19, 190.68it/s][A[A[A[A



4284it [00:19, 260.

result shape: (12, 4)






1it [00:19, 19.27s/it][A[A[A[A



134it [00:19, 13.49s/it][A[A[A[A



274it [00:19,  9.44s/it][A[A[A[A



412it [00:19,  6.61s/it][A[A[A[A



549it [00:19,  4.63s/it][A[A[A[A



668it [00:19,  3.24s/it][A[A[A[A



819it [00:19,  2.27s/it][A[A[A[A



950it [00:19,  1.59s/it][A[A[A[A



1107it [00:20,  1.11s/it][A[A[A[A



1259it [00:20,  1.29it/s][A[A[A[A



1434it [00:20,  1.84it/s][A[A[A[A



1605it [00:20,  2.62it/s][A[A[A[A



1761it [00:20,  3.74it/s][A[A[A[A



1937it [00:20,  5.34it/s][A[A[A[A



2098it [00:20,  7.62it/s][A[A[A[A



2262it [00:20, 10.86it/s][A[A[A[A



2422it [00:20, 15.46it/s][A[A[A[A



2578it [00:20, 21.99it/s][A[A[A[A



2735it [00:21, 31.23it/s][A[A[A[A



2910it [00:21, 44.27it/s][A[A[A[A



3071it [00:21, 62.40it/s][A[A[A[A



3226it [00:21, 87.60it/s][A[A[A[A



3401it [00:21, 122.50it/s][A[A[A[A



3561it [00:21, 169.24it/s][A[A[A[A



3722it [00:21, 231.3

result shape: (13, 4)






1it [00:20, 20.25s/it][A[A[A[A



189it [00:20, 14.18s/it][A[A[A[A



375it [00:20,  9.92s/it][A[A[A[A



554it [00:20,  6.95s/it][A[A[A[A



710it [00:20,  4.86s/it][A[A[A[A



887it [00:20,  3.40s/it][A[A[A[A



1050it [00:20,  2.38s/it][A[A[A[A



1230it [00:20,  1.67s/it][A[A[A[A



1407it [00:21,  1.17s/it][A[A[A[A



1610it [00:21,  1.22it/s][A[A[A[A



1811it [00:21,  1.75it/s][A[A[A[A



1995it [00:21,  2.49it/s][A[A[A[A



2186it [00:21,  3.56it/s][A[A[A[A



2371it [00:21,  5.08it/s][A[A[A[A



2555it [00:21,  7.25it/s][A[A[A[A



2755it [00:21, 10.34it/s][A[A[A[A



2958it [00:21, 14.75it/s][A[A[A[A



3151it [00:21, 21.00it/s][A[A[A[A



3356it [00:22, 29.86it/s][A[A[A[A



3552it [00:22, 42.36it/s][A[A[A[A



3747it [00:22, 59.96it/s][A[A[A[A



3940it [00:22, 84.21it/s][A[A[A[A



4120it [00:22, 113.75it/s][A[A[A[A



4307it [00:22, 158.36it/s][A[A[A[A



4501it [00:22, 218

result shape: (14, 4)





In [110]:
threading_result_df.sort_values('Time')

Unnamed: 0,batch_size,n_threads,n_process,Time
0,1000.0,2.0,4.0,17.773555
0,1500.0,2.0,4.0,18.986561
0,2000.0,2.0,4.0,20.7908
0,2500.0,2.0,4.0,21.243879
0,3000.0,2.0,4.0,21.84963
0,3500.0,2.0,4.0,22.805901
0,1000.0,2.0,2.0,25.049998
0,4000.0,2.0,4.0,26.027611
0,2500.0,2.0,2.0,26.379638
0,1500.0,2.0,2.0,26.804577


In [92]:
def get_stats_for_concurrent(nlp, Verbatims, batch_sizes, n_processs):
    result_df = pd.DataFrame({"batch_size":[],'n_threads':[],'n_process':[],'Time':[]})
    for batch_size in batch_sizes:
            for n_process in n_processs:
                start = time.time()
                tp = TokenizeProcessor(nlp, chunksize=batch_size, max_workers=n_process)
                word_sequences = tp(Verbatims)
                end = time.time()
                t_time = end - start
                result_df = result_df.append(pd.DataFrame({"batch_size":[batch_size],'n_threads':[n_threads],'n_process':[n_process],'Time':[t_time]}))
                print('result shape:', result_df.shape)
    return result_df
                

In [106]:
batch_sizes=[1000, 1500, 2000, 2500, 3000, 3500, 4000]
n_process=[2,4]
multiprocessing_result_df_2 = get_stats_for_concurrent(nlp, Verbatims, batch_sizes, n_process)

result shape: (1, 4)


result shape: (2, 4)


result shape: (3, 4)


result shape: (4, 4)


result shape: (5, 4)


result shape: (6, 4)


result shape: (7, 4)


result shape: (8, 4)


result shape: (9, 4)


result shape: (10, 4)


result shape: (11, 4)


result shape: (12, 4)


result shape: (13, 4)


result shape: (14, 4)


In [112]:
del multiprocessing_result_df_2['n_threads']

In [113]:
multiprocessing_result_df_2.sort_values('Time')

Unnamed: 0,batch_size,n_process,Time
0,4000.0,2.0,3.468632
0,4000.0,4.0,3.48754
0,3500.0,2.0,4.040751
0,3000.0,2.0,4.352866
0,3000.0,4.0,4.422525
0,2500.0,4.0,4.445214
0,2500.0,2.0,4.512357
0,3500.0,4.0,4.737216
0,2000.0,4.0,5.303331
0,2000.0,2.0,5.378104
