# Data Step 2: Counting up Global Counts for Word List

In the EF processing script, token counts were collected in batches, folded to language x token counts in batches, and saved to HDF5 stores in `/store`. This script folds those batches into a single list, so that each language-token combination has only one count. The previous script was the `map` step; this script is the `reduce`.

In [None]:
import glob
import re
import pandas as pd
import numpy as np
from ipyparallel import Client
import logging
import os
from tqdm import tqdm_notebook
from bokeh.io import output_notebook
import dask.dataframe as dd
from dask.diagnostics import ProgressBar, Profiler, ResourceProfiler, CacheProfiler, visualize
# Configure Bokeh to render its output inline in the notebook.
output_notebook()
# Paths of the input HDF5 stores to merge (every file ending in "h5" in the
# batch2 stores directory).
rawstores = glob.glob("/notebooks/data/batch2/stores/*h5")

## Preparation

Attach engines and initialize logging. *We'll be processing large in-memory chunks, so don't start too many processes.* I'm using a machine with 128GB RAM, and 10 processes hit around 2/3 of the RAM (80GB) for chunksize=1m in Step 1.

In [None]:
def init_log(name=False, logdir="/notebooks/data2/logs"):
    """Attach a file handler to the root logger and set its level to INFO.

    Messages are appended to ``<logdir>/bw-<name>.log``. If the root logger
    already has a file handler for that exact file (for example because the
    notebook cell was re-run), no second handler is added, so log lines are
    not written twice.

    Args:
        name: Label used in the log file name. Any falsy value (the default)
            selects the current process id, giving each process its own file.
        logdir: Directory that holds the log files; it must already exist.
    """
    # Imported locally so the function is self-contained when it is pushed to
    # and executed on an ipyparallel engine.
    import logging
    import os

    if not name:
        name = os.getpid()
    # FileHandler stores an absolute path in ``baseFilename``; normalise ours
    # the same way so the duplicate check below compares like with like.
    logpath = os.path.abspath(os.path.join(logdir, "bw-%s.log" % name))

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    already_attached = any(
        isinstance(existing, logging.FileHandler)
        and existing.baseFilename == logpath
        for existing in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(logpath, 'a')
        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s', "%m/%d-%H:%M:%S")
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    logging.info("Log initialized")

# Start logging for the notebook process itself; with name 'root' the
# messages go to bw-root.log.
init_log('root')

In [None]:
# Connect to the ipyparallel cluster (engines must already be running).
rc = Client()
# View over all engines; used below for pushes and for calls that should run
# on every engine.
dview = rc[:]
# Load-balanced view; used below to distribute the per-store tasks.
v = rc.load_balanced_view()

# Used after the parallel jobs to make the engines release memory.
@dview.remote(block=True)
def force_gc():
    """Run a garbage collection and report the generation-0 counter.

    Returns:
        A ``(before, after)`` pair holding the first element of
        ``gc.get_count()`` sampled just before and just after ``gc.collect()``.
    """
    import gc
    gen0_before = gc.get_count()[0]
    gc.collect()
    gen0_after = gc.get_count()[0]
    return gen0_before, gen0_after

# Send init_log to every engine, then call it there with no name so that each
# engine logs to a file named after its own process id.
dview.push(dict(init_log=init_log))
%px init_log()

## Step 1: Triage and merge small chunks by lang (Parallelized)

Iterate through all the stores, grouping by language and then summing counts by token. These counts are still saved to each engine's own store under `merged1/{language}`, so that the work can be parallelized.

In [None]:
def triage(inputstore):
    """Sum token counts per language for one input store, chunk by chunk.

    Reads the ``/tf/corpus`` table of ``inputstore`` in chunks of one million
    rows. Each chunk is split by the ``language`` index level, counts are
    summed per ``token`` within the chunk, and the result is appended to
    ``merged1/<language>`` in a store named after the current process id.
    Because summing happens per chunk, a token can still appear several times
    in the output table.

    Args:
        inputstore: Path of the HDF5 store to read.

    Returns:
        ``"success"`` when every chunk was processed, otherwise a message with
        the number of failed chunks and the process id. If a required module
        cannot be imported, an ``"import error for ..."`` message is returned
        instead.
    """
    # Local imports: the function is executed on ipyparallel engines.
    try:
        import gc
        import logging
        import os

        import numpy as np
        import pandas as pd
    except ImportError:
        return "import error for " + inputstore

    chunksize = 1000000
    storefolder = 'merged1'  # this is in the h5 hierarchy
    outputstorename = "/notebooks/data2/batch2-redo/merge-%s.h5" % os.getpid()
    max_str_bytes = 50

    errors = 0
    with pd.HDFStore(outputstorename, complevel=9, mode="a", complib='blosc') as outstore:
        with pd.HDFStore(inputstore, complevel=9, mode="r", complib='blosc') as store:
            row_size = store.get_storer('/tf/corpus').nrows
            total_chunks = int(np.ceil(row_size / chunksize))
            storeiter = store.select('/tf/corpus', start=0, chunksize=chunksize)

            for i, chunk in enumerate(storeiter, start=1):
                try:
                    # Group by a scalar level name so that ``lang`` is always a
                    # plain label. Grouping by a one-element list can yield
                    # 1-tuples on newer pandas releases, which would end up
                    # verbatim in the HDF key below.
                    for lang, df in chunk.groupby(level='language'):
                        if df.empty:
                            continue
                        merged = df.groupby(level=['token']).sum()

                        fname = "%s/%s" % (storefolder, lang)
                        outstore.append(fname, merged, data_columns=['count'],
                                        min_itemsize={'index': max_str_bytes})
                    logging.info("Completed %d/%d" % (i, total_chunks))
                except Exception:
                    # Best effort: record the failure and keep going with the
                    # next chunk; the caller sees the error count.
                    errors += 1
                    logging.exception("Error processing batch %d (docs %d-%d) of input store" % (i, (i-1)*chunksize, i*chunksize))
                gc.collect()
    gc.collect()
    if errors == 0:
        return "success"
    else:
        return "%d errors on process %s, check logs" % (errors, os.getpid())
# Send triage to every engine under the name "triage".
dview.push(dict(triage=triage))

<AsyncResult: _push>

In [None]:
# Run triage over every raw store on the load-balanced view and log each
# result as it arrives (ordered=False, so not in submission order).
logging.info("Processing Started")
parallel_job = v.map(triage, rawstores, ordered=False)

i = 0
for i, result in enumerate(tqdm_notebook(parallel_job, smoothing=0), start=1):
    if result != "success":
        # triage returns a descriptive message whenever something went wrong.
        logging.error(result)
        continue
    logging.info("Done processing batch %d" % i)

# Ask the engines to garbage-collect and show the before/after counters.
print(force_gc())

The installed widget Javascript is the wrong version.


CompositeError: one or more exceptions from call to method: triage
[Engine Exception]EngineError: Engine b'c426022b-5b16-44cd-a480-ca5d11c0d96e' died while running task 'd3269715-7885-49c3-806b-99d7af445991'

## Step 2: Folding lang stores by bigger chunks (Parallelized)

Starting with a blank DF, iterate through each store (separately by language), selecting N million rows at once, and merging it into the initially blank DF (i.e. read chunk, concat to full_df, groupby(level='token'), and sum). Save to `/staged/{lang}`.

Note that the engines are handed stores now, rather than saving to their own pid-named store.

**TODO**

Might as well cycle through all of the chunks and collect to an in-memory collector. This will require a large amount of memory per process, given that the vocab can get to ~100m, and the process of append, group, sum will result in an in-memory copy. It's nonetheless much faster than any type of index-based iteration (trust Pandas and Numpy!), and a bit neater than running Step 2 repeatedly.

In [None]:
# Paths of the HDF5 stores handed to the engines in Step 2 (every file ending
# in "h5" in the fold directory).
stores = glob.glob("/notebooks/data3/fold/*h5")

In [None]:
def fold(storefile, args=None):
    """Fold the per-language tables of one store by count band.

    For every table below ``targetfolder`` in ``storefile``, rows are selected
    one ``where`` query at a time and read in chunks. Each chunk has its counts
    summed per ``token`` and is appended to ``<storefolder>/<language>`` in the
    same store. Summing happens per chunk, so a token can still occur more than
    once in the output table.

    Args:
        storefile: Path of the HDF5 store to read from and append to.
        args: Optional dict of settings. Recognised keys:
            ``storefolder``: output folder in the store (default ``/staged``).
            ``targetfolder``: folder whose tables are folded, as in
            "folder/language" (default ``/merged1``).
            ``chunksize``: rows per chunk (default 1000000).
            ``wheres``: list of PyTables query strings that are cycled
            through (default: count bands from ``>= 10**7`` down to ``== 1``).

    Returns:
        ``"success"`` when every chunk was folded, otherwise a message with the
        number of failed chunks and the process id.
    """
    # Local imports: the function is executed on ipyparallel engines. ``os``
    # is needed for the process id in the error message.
    import gc
    import logging
    import os

    import numpy as np
    import pandas as pd

    gc.collect()
    args = args or {}

    storefolder = args.get('storefolder', '/staged')
    # The PyTables folder to fold, as in "folder/language"
    targetfolder = args.get('targetfolder', '/merged1')
    chunksize = args.get('chunksize', 1000000)
    max_str_bytes = 50
    errors = 0

    # Where queries
    # These queries are cycled through, to optimize folding and to set the
    # super-low-occurrence words (count == 1) aside to worry about later.
    # Note that this is only possible if count was saved as a data column earlier.
    if 'wheres' in args:
        wheres = args['wheres']
    else:
        bands = [(10**6, 10**7), (10**5, 10**6), (10**4, 10**5),
                 (1000, 10**4), (100, 1000), (50, 100), (10, 50), (4, 10)]
        wheres = ["count >= %d" % 10**7]
        wheres += ["count>=%d and count <%d" % band for band in bands]
        wheres += ["count == %d" % n for n in range(3, 0, -1)]

    # Match whole folder names only: a plain substring test would also pick up
    # sibling folders that merely start with the same text (for example
    # "/staged5-under3" when "/staged5" is requested).
    folder = targetfolder.strip("/")
    prefix = "/%s/" % folder if folder else "/"

    with pd.HDFStore(storefile, complevel=9, mode="a", complib='blosc') as store:
        keys = [name for name in store.keys() if name.startswith(prefix)]
        for key in keys:
            lang = key.split("/")[-1]
            logging.info("Processing %s in %s of %s" % (lang, storefolder, storefile))
            outfolder = "%s/%s" % (storefolder, lang)
            row_size = store.get_storer(key).nrows
            # Based on the full table size, so it is an upper bound for any
            # single where-query.
            total_chunks = int(np.ceil(row_size / chunksize))

            for where in wheres:
                storeiter = store.select(key, where=where, chunksize=chunksize)
                for i, chunk in enumerate(storeiter, start=1):
                    try:
                        if chunk['count'].dtype != np.int64:
                            chunk['count'] = chunk['count'].astype("int64")
                        folded = chunk.groupby(level='token').sum()
                        store.append(outfolder,
                                     folded,
                                     data_columns=['count'],
                                     min_itemsize={'index': max_str_bytes})
                        logging.info("Completed chunk %d for %s where %s" % (i, lang, where))
                        gc.collect()
                    except Exception:
                        # Best effort: count the failure and continue with the
                        # next chunk.
                        errors += 1
                        logging.exception("Error folding %d/%d for %s" % (i, total_chunks, lang))

    gc.collect()
    if errors == 0:
        return "success"
    else:
        return "%d folding errors on process %s, check logs" % (errors, os.getpid())

In [None]:
# This run folds only the rarest tokens (1 <= count < 3) into their own
# folder, /staged5-under3, reading 500000 rows per chunk.
# The commented-out lines below are the higher-count bands, kept for reference:
#wheres = ["count>=%d and count <%d" % nm for nm in [(10**6, 10**7), (10**5, 10**6), (10**4, 10**5),
#                                                    (1000, 10**4), (100, 1000), (50, 100), (10,50), (4,10)]]
#wheres = ["count >= %d" % 10**7] + wheres
wheres = ["count>=%d and count <%d" % (1, 3)]
args = dict(
    targetfolder='/merged1',
    storefolder='/staged5-under3',
    chunksize=500000,
    wheres=wheres,
)

# Every store gets the same settings dict.
parallel_job = v.map(fold, stores, [args for _ in stores], ordered=False)

i = 0
for i, result in enumerate(tqdm_notebook(parallel_job, smoothing=0), start=1):
    if result != "success":
        # fold returns a descriptive message whenever something went wrong.
        logging.error(result)
        continue
    logging.info("Done folding batch %d" % i)

# Ask the engines to garbage-collect and show the before/after counters.
print(force_gc())

10**5 to 10**6 took 29.56 with chunksize=1m

## Step 3: Final combine (Single process)

Collect each lang's dfs from all the stores and merge. Easy-peasy.

### *Everything below is incomplete.*

In [None]:
#TODO: Sort index as items are combined

def sum_words(storefile, args=None):
    '''
    Fold the staged tables of one store into the final folder.

    For every table below ``targetfolder`` in ``storefile``, rows are read in
    chunks; each chunk has its counts summed per ``token``, is sorted by index
    and appended to ``<storefolder>/<language>`` in the same store.

    NOTE: summing and sorting happen per chunk only, so a token that occurs in
    more than one chunk still has several rows in the output table and the
    table as a whole is not sorted.

    Args:
        storefile: Path of the HDF5 store to read from and append to.
        args: Optional dict of settings. Recognised keys:
            ``storefolder``: output folder in the store (default ``final``).
            ``targetfolder``: folder whose tables are folded (default
            ``staged``).
            ``chunksize``: rows per chunk (default 4000000).

    Returns:
        ``"success"`` when every chunk was folded, otherwise a message with the
        number of failed chunks and the process id.
    '''
    # Local imports keep the function self-contained, matching the other
    # worker functions in this notebook. ``os`` is needed for the process id
    # in the error message.
    import gc
    import logging
    import os

    import numpy as np
    import pandas as pd

    args = args or {}
    storefolder = args.get('storefolder', 'final')
    targetfolder = args.get('targetfolder', 'staged')
    chunksize = args.get('chunksize', 4000000)
    max_str_bytes = 50
    # Must exist before the loop: it is read unconditionally at the end.
    errors = 0

    # Match whole folder names only: a plain substring test would also pick up
    # sibling folders that merely start with the same text (for example
    # "/staged5-under3" when "/staged5" is requested).
    folder = targetfolder.strip("/")
    prefix = "/%s/" % folder if folder else "/"

    with pd.HDFStore(storefile, complevel=9, mode="a", complib='blosc') as store:
        keys = [name for name in store.keys() if name.startswith(prefix)]
        for key in keys:
            lang = key.split("/")[-1]

            outfolder = "%s/%s" % (storefolder, lang)
            logging.info("Collecting final counts in %s of %s" % (outfolder, storefile))

            row_size = store.get_storer(key).nrows
            total_chunks = int(np.ceil(row_size / chunksize))
            storeiter = store.select(key, chunksize=chunksize)
            for i, chunk in enumerate(storeiter, start=1):
                try:
                    folded = chunk.groupby(level='token').sum().sort_index()
                    store.append(outfolder, folded, min_itemsize={'index': max_str_bytes})
                    logging.info("Completed chunk %d/%d for %s" % (i, total_chunks, lang))
                    gc.collect()
                except Exception:
                    # Best effort: count the failure and continue with the
                    # next chunk.
                    errors += 1
                    logging.exception("Error folding %d/%d for %s" % (i, total_chunks, lang))

    gc.collect()
    if errors == 0:
        return "success"
    else:
        return "%d folding errors on process %s, check logs" % (errors, os.getpid())
    
# Fold the /staged5 tables of the first store into /final.
# NOTE(review): `sumstores` is not defined in the visible part of this
# notebook; it has to be set to the list of store paths before this cell runs.
args = dict(targetfolder='/staged5', storefolder='/final')
# Pass the settings explicitly; without them the dict built above is unused.
sum_words(sumstores[0], args)

'/notebooks/data2/batch2-redo/merge-3805.h5'