# Data Step 2: Counting up Global Counts for Word List

In the EF processing script, token counts were collected in batches, folded to language x token counts in batches, and saved to HDF5 stores in `/store`. This script will fold those batches into a single list, so each language-token combination only has one count. The previous script was the `map`; this script is the `reduce`.

In [None]:
import glob
import pandas as pd
import numpy as np
from ipyparallel import Client
import logging
import os
from tqdm import tqdm_notebook
# Paths of the input HDF5 stores handed to Step 1 (presumably the batches written
# by the EF processing script -- confirm the folder before running).
rawstores = glob.glob("/notebooks/data/batch2/stores/*h5")

## Preparation

Attach engines and initialize logging. *We'll be processing large in-memory chunks, so don't start too many processes.* I'm using a machine with 128GB RAM, and 10 processes hit around 2/3 of the RAM (80GB) for chunksize=1m in Step 1. Use many fewer processes for Step 2 (todo: add what *many fewer* means!).

In [None]:
# Connect to the ipyparallel cluster.
rc = Client()
# View over all engines: used below to push functions and to run code on every engine.
dview = rc[:]
# Load-balanced view: used below to map one store at a time onto the engines.
v = rc.load_balanced_view()

# Need this later to force garbage collection
@dview.remote(block=True)
def force_gc():
    """Trigger a garbage collection and report the effect.

    Returns:
        tuple: the first value of ``gc.get_count()`` taken before the
        collection, and the first value taken again afterwards.
    """
    import gc
    count_before = gc.get_count()[0]
    gc.collect()
    count_after = gc.get_count()[0]
    return count_before, count_after

In [None]:
def init_log(name=False, logdir="/notebooks/data2/logs"):
    """Send root-logger output (INFO and above) to ``<logdir>/bw-<name>.log``.

    Safe to call repeatedly: if the root logger already has a file handler for
    the same log file, no second handler is attached. Without that check,
    re-running the notebook cell would make every message appear once per call.

    Args:
        name: Suffix for the log file name. Any falsy value (the default)
            means "use the current process id".
        logdir: Folder that holds the log file. It must already exist.

    Raises:
        OSError: if the log file cannot be opened (e.g. missing folder).
    """
    # Imported locally because this function is pushed to, and run on, the engines.
    import logging
    import os

    if not name:
        name = os.getpid()
    # FileHandler stores an absolute path in baseFilename, so compare like with like.
    logpath = os.path.abspath(os.path.join(logdir, "bw-%s.log" % name))

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    already_attached = any(
        isinstance(h, logging.FileHandler) and h.baseFilename == logpath
        for h in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(logpath, 'a')
        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s', "%m/%d-%H:%M:%S")
        handler.setFormatter(formatter)
        logger.addHandler(handler)

    logging.info("Log initialized")

# Make init_log available on every engine, then run it there with its default
# name, so each engine logs to a file named after its own process id.
dview.push(dict(init_log=init_log))
%px init_log()
# The notebook process itself logs under the name "root".
init_log("root")

## Step 1: Triage and merge small chunks by lang (Parallelized)

Iterate through all the stores, grouping by language and then summing counts by token. These counts are still saved to an engine's own store under `merged1/{language}`, so that the work can be parallelized.

In [None]:
def triage(inputstore):
    """Split one input store by language and pre-sum token counts per chunk.

    Reads the ``/tf/corpus`` table of ``inputstore`` in chunks of 2,000,000
    rows. Within each chunk the rows are grouped by the ``language`` index
    level, summed per ``token``, and appended to ``merged1/<language>`` in a
    per-process output store (``merge-<pid>.h5``). Counts are only merged
    within a chunk, so a token can still occur several times per language in
    the output.

    A failing chunk is logged and counted, not raised; languages that were
    already appended from that chunk stay in the output store.

    Args:
        inputstore: Path of the HDF5 store to read.

    Returns:
        str: ``"success"``, or a message with the number of failed chunks, or
        an import-error message if the required modules cannot be loaded.
    """
    # Imports are local because the function runs on remote engines.
    try:
        import numpy as np
        import pandas as pd
        import logging
        import os
        import gc
    except Exception:
        return "import error for " + inputstore

    chunksize = 2000000
    storefolder = 'merged1' # this is in the h5 hierarchy
    outputstorename = "/notebooks/data2/batch2/merge-%s.h5" % os.getpid()
    max_str_bytes = 50

    errors = 0
    with pd.HDFStore(outputstorename, complevel=9, mode="a", complib='blosc') as outstore:
        with pd.HDFStore(inputstore, complevel=9, mode="r", complib='blosc') as store:
            row_size = store.get_storer('/tf/corpus').nrows
            n_batches = np.ceil(row_size / chunksize)
            storeiter = store.select('/tf/corpus', start=0, chunksize=chunksize)

            for i, chunk in enumerate(storeiter, 1):
                try:
                    # Scalar level (not a one-element list): with a list, pandas 2.x
                    # yields 1-tuple group keys, which would turn the table name
                    # into "merged1/('eng',)".
                    for lang, df in chunk.groupby(level='language'):
                        if df.empty:
                            continue
                        merged = df.groupby(level='token').sum()
                        fname = "%s/%s" % (storefolder, lang)
                        # Reserve a fixed width for the token index so later appends
                        # with longer tokens (up to max_str_bytes) still fit.
                        outstore.append(fname, merged, min_itemsize = {'index': max_str_bytes})
                    logging.info("Completed %d/%d" % (i, n_batches))
                except Exception:
                    errors += 1
                    logging.exception("Error processing batch %d (docs %d-%d) of input store" % (i, (i-1)*chunksize, i*chunksize))
                gc.collect()
    gc.collect()
    if errors == 0:
        return "success"
    else:
        return "%d errors on process %s, check logs" % (errors, os.getpid())
# Make triage available by name on every engine.
dview.push(dict(triage=triage))

<AsyncResult: _push>

In [None]:
logging.info("Processing Started")
parallel_job = v.map(triage, rawstores, ordered=False)

# Results come back in completion order (ordered=False), so `i` counts finished
# stores; it does not identify a particular input store.
i = 0
for i, result in enumerate(tqdm_notebook(parallel_job, smoothing=0), start=1):
    if result != "success":
        logging.error(result)
        continue
    logging.info("Done processing batch %d" % i)

# Collect garbage on the engines and show the before/after counters.
print(force_gc())

The installed widget Javascript is the wrong version.


## Step 2: Folding lang stores by bigger chunks (Parallelized)

Starting with a blank DF, iterate through each store (separately by language), selecting N million rows at once, and merging it into the initially blank DF (i.e. read chunk, concat to full_df, groupby(level='token'), and sum). Save to `/staged/{lang}`.

Note that the engines are handed stores now, rather than saving to their own pid-named store.

**TODO**

Might as well cycle through all of the chunks and collect to an in-memory collector. This will require a large amount of memory per process, given that the vocab can get to ~100m, and the process of append, group, sum will result in an in-memory copy. It's nonetheless much faster than any type of index-based iteration (trust Pandas and Numpy!), and a bit neater than running Step 2 repeatedly.

In [None]:
# Paths of the merged stores that Step 2 folds in place.
# NOTE(review): triage() writes its merge-<pid>.h5 files under
# /notebooks/data2/batch2 -- confirm that batch1 is the intended folder here.
stores = glob.glob("/notebooks/data2/batch1/*h5")

In [None]:
def fold(storefile, args):
    """Re-sum the per-language tables of one store in larger chunks.

    For every table under ``targetfolder`` in ``storefile``, read it in chunks,
    sum the chunk by the ``token`` index level, and append the result to
    ``<storefolder>/<language>`` in the same store. Tokens are only merged
    within a chunk, so the output can still contain repeated tokens; run again
    on the output folder to fold further.

    A failing chunk is logged and counted, not raised.

    Args:
        storefile: Path of the HDF5 store to read from and append to.
        args: dict of options, all optional:
            ``storefolder`` - output folder in the store (default ``'/staged'``);
            ``targetfolder`` - folder to fold, as in "folder/language"
            (default ``'/merged1'``);
            ``chunksize`` - rows per chunk (default 4,000,000).

    Returns:
        str: ``"success"``, or a message with the number of failed chunks.
    """
    # Imports are local because the function runs on remote engines.
    import logging
    import os
    import pandas as pd
    import gc
    import numpy as np
    gc.collect()

    storefolder = args.get('storefolder', '/staged')
    # The PyTables folder to fold, as in "folder/language"
    targetfolder = args.get('targetfolder', '/merged1')
    # Much bigger chunk size, because groupby().sum() will be much more effective now that
    # languages are no longer mixed
    chunksize = args.get('chunksize', 4000000)
    max_str_bytes = 50
    errors = 0

    # Match whole folder names only: a plain substring test would let '/staged'
    # also select tables under '/staged2' or '/staged3'.
    prefix = "/%s/" % targetfolder.strip("/")

    with pd.HDFStore(storefile, complevel=9, mode="a", complib='blosc') as store:
        keys = [name for name in store.keys() if name.startswith(prefix)]
        for key in keys:
            lang = key.split("/")[-1]
            logging.info("Processing %s in %s of %s" % (lang, storefolder, storefile))
            outfolder = "%s/%s" % (storefolder, lang)
            row_size = store.get_storer(key).nrows
            n_chunks = np.ceil(row_size / chunksize)
            storeiter = store.select(key, chunksize=chunksize)
            # Number chunks from 1 up front so the error log names the chunk
            # that actually failed.
            for i, chunk in enumerate(storeiter, 1):
                try:
                    folded = chunk.groupby(level='token').sum()
                    store.append(outfolder, folded, min_itemsize = {'index': max_str_bytes})
                    logging.info("Completed chunk %d/%d for %s" % (i, n_chunks, lang))
                    gc.collect()
                except Exception:
                    errors += 1
                    logging.exception("Error folding %d/%d for %s" % (i, n_chunks, lang))

    gc.collect()
    if errors == 0:
        return "success"
    else:
        return "%d folding errors on process %s, check logs" % (errors, os.getpid())

In [None]:
# Fold /merged1 -> /staged inside each merged store. fold() only touches tables
# under `targetfolder`, and those are written to the merged stores listed in
# `stores`; mapping over the raw input stores would find nothing to fold.
args = dict(targetfolder='/merged1', storefolder='/staged')
parallel_job = v.map(fold, stores, [args]*len(stores), ordered=False)

# Results come back in completion order (ordered=False), so `i` counts finished
# stores; it does not identify a particular store.
i = 0
for i, result in enumerate(tqdm_notebook(parallel_job, smoothing=0), start=1):
    if result == "success":
        logging.info("Done folding batch %d" % i)
    else:
        logging.error(result)

# Collect garbage on the engines and show the before/after counters.
print(force_gc())

In [None]:
# One-off local run (no engines): fold /staged2 into /staged3 for the fourth store.
args = {'targetfolder': '/staged2', 'storefolder': '/staged3'}
fold(storefile=stores[3], args=args)

### Optional: Delete intermediate tables

At risk of running out of disk space? It's fine to delete `/merged1`. Sensibly, the size of this is only a tiny fraction of the per-volume counts: 100GB/1m texts, so it will be 1.5TB if not deleted here.

In [None]:
# Safety switch: flip to True to actually drop the intermediate tables.
# NOTE(review): triage() writes the /merged1 tables into per-process
# merge-<pid>.h5 stores, not into the raw stores -- confirm that `rawstores`
# is the list that should be cleaned here.
delete = False
if delete:
    for storename in rawstores:
        with pd.HDFStore(storename, complevel=9, mode="a", complib='blosc') as store:
            merged_keys = [k for k in store.keys() if "/merged1" in k]
            for merged_key in merged_keys:
                store.remove(merged_key)

### Intermission: Check Table Sizes

See if it's actually folding.

In [None]:
def get_total_size(storefile, args):
    """Count the rows stored under one folder of an HDF5 store.

    Args:
        storefile: Path of the HDF5 store to inspect.
        args: dict with key ``targetfolder``, the folder whose tables are
            counted. A leading slash is optional (``'staged'`` and
            ``'/staged'`` are equivalent).

    Returns:
        int: total ``nrows`` over all tables under ``targetfolder``; 0 if the
        folder has no tables.
    """
    # Imported locally because the function runs on remote engines.
    import pandas as pd
    targetfolder = args['targetfolder']
    # Match whole folder names only: a plain substring test would count
    # '/staged2' and '/staged3' tables as part of 'staged'.
    prefix = "/%s/" % targetfolder.strip("/")
    nrows = 0
    # Read-only: counting rows must not modify (or create) the store.
    with pd.HDFStore(storefile, mode="r") as store:
        for key in store.keys():
            if key.startswith(prefix):
                nrows += store.get_storer(key).nrows
    return nrows
# Make get_total_size available by name on every engine.
dview.push(dict(get_total_size=get_total_size))

# Compare row counts before and after folding, one total per folder.
for name in ['/merged1', '/staged']:
    print()
    args = dict(targetfolder = name)
    per_store = dview.map_sync(get_total_size, stores, [args]*len(stores))
    # Sum the per-store counts (summing `args` would try to add up dict keys).
    print("Total rows in %s: %d" % (name, sum(per_store)))

[35149068, 3458987, 36154650, 35320506, 35570859, 2399990]
[0, 0, 0, 34971494, 0, 0]
[0, 0, 0, 16520678, 0, 0]


## Step 3: Final combine (Single process)

Collect each lang's dfs from all the stores and merge. Easy-peasy.

### *Everything below is incomplete.*

In [None]:
def sum(storefile, args):
    '''
    Merge entire tables into a final store

    For every table under ``targetfolder`` in ``storefile``, read it in chunks,
    sum each chunk by the ``token`` index level, and append the result to
    ``<storefolder>/<language>`` in the same store. A failing chunk is logged
    and counted, not raised.

    NOTE(review): this step is marked incomplete in the notebook. Chunks are
    summed independently, so a token that occurs in several chunks still gets
    several rows in the output. Also, the name shadows the builtin ``sum``
    used by the table-size check cell; consider renaming once callers are known.

    Args:
        storefile: Path of the HDF5 store to read from and append to.
        args: dict of options, all optional:
            ``storefolder`` - output folder (default ``'final'``);
            ``targetfolder`` - folder to merge (default ``'staged'``);
            ``chunksize`` - rows per chunk (default 4,000,000).

    Returns:
        str: ``"success"``, or a message with the number of failed chunks.
    '''
    # Imports are local because the function may run on remote engines.
    import logging
    import os
    import pandas as pd
    import gc
    import numpy as np

    storefolder = args.get('storefolder', 'final')
    targetfolder = args.get('targetfolder', 'staged')
    chunksize = args.get('chunksize', 4000000)
    max_str_bytes = 50
    errors = 0

    # Match whole folder names only: a plain substring test would let 'staged'
    # also select tables under '/staged2' or '/staged3'.
    prefix = "/%s/" % targetfolder.strip("/")

    with pd.HDFStore(storefile, complevel=9, mode="a", complib='blosc') as store:
        keys = [name for name in store.keys() if name.startswith(prefix)]
        for key in keys:
            lang = key.split("/")[-1]

            outfolder = "%s/%s" % (storefolder, lang)
            logging.info("Collecting final counts in %s of %s" % (outfolder, storefile))

            row_size = store.get_storer(key).nrows
            n_chunks = np.ceil(row_size / chunksize)
            storeiter = store.select(key, chunksize=chunksize)
            # Number chunks from 1 up front so the error log names the chunk
            # that actually failed.
            for i, chunk in enumerate(storeiter, 1):
                try:
                    folded = chunk.groupby(level='token').sum()
                    store.append(outfolder, folded, min_itemsize = {'index': max_str_bytes})
                    logging.info("Completed chunk %d/%d for %s" % (i, n_chunks, lang))
                    gc.collect()
                except Exception:
                    errors += 1
                    logging.exception("Error folding %d/%d for %s" % (i, n_chunks, lang))

    gc.collect()
    if errors == 0:
        return "success"
    else:
        return "%d folding errors on process %s, check logs" % (errors, os.getpid())