# Data Step 1: Processing Feature Files for Bookworm

This notebook runs through Extracted Features files, saving:

1. Global token counts (by language) toward the eventual Bookworm Wordlist. 
    These aren't all folded here: rather, they are folded by batch and saved to an HDF5 store.
    Later, they'll all be folded into one big list.

2. "Raw" unigram counts per book. These will eventually be trimmed to only the BW vocabulary and
    labelled by an id. This information first needs the wordlist that #1 above will create, but
    since we're already opening the EF files, might as well do some processing and save this
    intermediate state to a fast IO format (HDF5 store, again).

In [None]:
from htrc_features import FeatureReader, utils
import pandas as pd
from tqdm import tqdm_notebook # Progress bars!
from ipyparallel import Client
import numpy as np
import logging

The ipyparallel engines need to be running before this notebook can attach to them. Start them with

```bash
    ipcluster start -n NUM
```

In [None]:
# Connect to the ipyparallel cluster (the engines must already be running).
rc = Client()
# View over all engines: used below to push definitions and to run code on every engine.
dview = rc[:]
# Load-balanced view: used below to map batches of work across the engines.
v = rc.load_balanced_view()

# Need this later to force garbage collection
@dview.remote(block=True)
def force_gc():
    '''
    Run a full garbage collection where this function executes and report
    the generation-0 collection counter from before and after the collection.

    Returns a 2-tuple: (counter before, counter after).
    '''
    import gc
    gen0_before = gc.get_count()[0]
    gc.collect()
    gen0_after = gc.get_count()[0]
    return gen0_before, gen0_after

Initialize logging. There's no nice way to pass logs between engines, so just give each one its own log.

The timestamp format is designed for easy sort, so you can track all logs with 

```bash
watch "tail -q -n 100 logs/* | sort"
```

In [None]:
def init_log(name=False, logdir="/notebooks/data/logs"):
    '''
    Attach a file handler to the root logger of the current process.

    name:   label used in the log file name (``bw-<name>.log``). When falsy,
            the current process id is used, so each process gets its own file.
    logdir: directory the log file is written to; created if it is missing.

    Calling this again for the same file does not add a second handler, so
    re-running the cell no longer duplicates every log line.
    '''
    # Imports stay inside the function so it is self-contained when shipped to engines.
    import logging, os
    if not name:
        name = os.getpid()
    os.makedirs(logdir, exist_ok=True)
    logfile = os.path.abspath(os.path.join(logdir, "bw-%s.log" % name))

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # FileHandler stores the absolute path in baseFilename, so compare against that.
    already_attached = any(
        isinstance(existing, logging.FileHandler) and existing.baseFilename == logfile
        for existing in logger.handlers
    )
    if not already_attached:
        handler = logging.FileHandler(logfile, 'a')
        # Month/day-time prefix keeps lines from different logs sortable as plain text.
        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s', "%m/%d-%H:%M:%S")
        handler.setFormatter(formatter)
        logger.addHandler(handler)
    logging.info("Log initialized")

# Make init_log available by name in every engine's namespace.
dview.push(dict(init_log=init_log))
# Log file for this (notebook-side) process: bw-root.log.
init_log("root")
# Call init_log() with no name so the log file is named after the calling process's PID.
# NOTE(review): %px presumably targets all engines of the active view — confirm.
%px init_log()

INFO:root:Log initialized


Load paths to feature files. This notebook maintains a list of successfully processed ids, so there are some functions that help us cross-reference all volumes with the volumes that are already done.

In [None]:
# Read the listing of feature files and prefix each entry with the features root.
# The first entry is dropped by the [1:] slice — presumably a header or non-file
# line in the listing; TODO confirm.
with open("/notebooks/features/listing/ucw-to-yul-full.1.txt", "r") as f:
    paths = ["/notebooks/features/"+path.strip() for path in f.readlines()][1:]
    print("Number of texts", len(paths))

# Text file of volume ids, one per line, appended to as batches finish successfully.
successfile = "/notebooks/data/successful-counts.txt"
def get_processed(success_path=None):
    '''
    Get already processed files. Wrapped in func for easy refresh.

    success_path: file with one volume id per line. Defaults to the global
                  `successfile` when not given.

    Returns a numpy array of feature-file paths ("/notebooks/features/" plus
    the rsync-style path for each id). Returns an empty array when the file
    does not exist yet or holds no ids.

    Only a missing/unreadable file is treated as "nothing processed". A
    malformed id now raises instead of silently yielding an empty array,
    which would otherwise cause every volume to be processed again.
    '''
    import numpy as np
    if success_path is None:
        success_path = successfile
    try:
        with open(success_path, "r") as f:
            # Skip blank lines so a stray newline is never treated as an id.
            ids = [line.strip() for line in f if line.strip()]
    except (IOError, OSError):
        return np.array([])
    if not ids:
        return np.array([])
    # Imported here so the function also works where it has been pushed
    # without the notebook's top-level imports (e.g. on an engine).
    from htrc_features import utils
    return np.array(["/notebooks/features/" + utils.id_to_rsync(volid) for volid in ids])

def path_to_id(x):
    ''' Volume id for a feature-file path: the file name without ".json.bz2". '''
    filename = x.split("/")[-1]
    return filename.replace(".json.bz2", "")

Number of texts 1533959


`get_count` is the function that processes a single volume. To improve performance, however, the engines process volumes in larger batches with `get_doc_counts`.

In [None]:
def trim_token(t, max=50):
    '''
    Trim unicode string to max number of bytes (UTF-8 encoded).

    Whole characters are dropped from the end until the encoded form fits, so
    the result is always valid text and never ends in a partial character.
    Strings that already fit are returned unchanged. A non-positive `max`
    yields the empty string.
    '''
    encoded = t.encode('utf-8')
    if len(encoded) <= max:
        return t
    # `max` shadows the builtin here (kept for interface compatibility), so clamp by hand.
    limit = max if max > 0 else 0
    # Cut at the byte limit once; 'ignore' discards a character split by the cut.
    return encoded[:limit].decode('utf-8', 'ignore')

def get_count(path, store=False):
    '''
    Build the token-count table for the single volume stored at `path`.

    Returns a DataFrame with one 'count' column, indexed by
    (language, id, token) and sorted by that index. Tokens are trimmed to 50
    bytes with trim_token. If the volume's token list is empty, it is returned
    as-is, without the extra index levels.

    `store` is accepted but not used by this function.
    '''
    from htrc_features import FeatureReader
    byte_limit = 50
    volume = FeatureReader(path).first()
    counts = volume.tokenlist(pages=False, pos=False)
    if counts.empty:
        return counts

    # Move 'section' out of the index and keep only the counts;
    # presumably this leaves the token as the sole index level — TODO confirm.
    counts = counts.reset_index('section')[['count']]
    counts.index = [trim_token(tok, byte_limit) for tok in counts.index.values]
    counts.index.names = ['token']

    # Label every row with its volume before building the combined index.
    counts['id'] = volume.id
    counts['language'] = volume.language
    counts = counts.reset_index('token')
    counts = counts.set_index(['language', 'id', 'token'])
    return counts.sort_index()

# Send to Engines: make both helpers available by name in every engine's namespace.
dview.push(dict(trim_token=trim_token, get_count=get_count))

# Example: first three rows of the count table for the first listed volume.
get_count(paths[0]).head(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
language,id,token,Unnamed: 3_level_1
heb,uc1.b3571708,!,150
heb,uc1.b3571708,!00וזז־ו,1
heb,uc1.b3571708,!09,1


In [None]:
def get_doc_counts(paths, mincount=False, max_str_bytes = 50):
    '''
    This method lets you process multiple paths at a time on a single engine.
    This means the engine can collect enough texts to do a simple filter (i.e. >X counts in Y texts)
    and can save to its own store.

    paths:         feature-file paths to process as one batch.
    mincount:      falsy for no filtering. Otherwise English rows of the corpus-level
                   table are kept only when their batch count is >= mincount. Passing
                   True keeps the historical cut-off of 3 (formerly hard-coded as `> 2`).
    max_str_bytes: string size reserved for tokens in the HDF5 tables.

    Appends to two tables in this process's store (file named after the PID):
      tf/docs   - counts per (id, token)
      tf/corpus - counts per (language, token), summed over the batch

    Returns the list of paths whose counts were saved, or [] if nothing was saved
    or saving failed.

    NOTE(review): volumes with an empty token list are logged but never returned as
    successful, so they stay in the to-do list on later runs.
    NOTE(review): if tf/docs is appended but tf/corpus then fails, [] is returned and a
    retry of the batch would append the same rows to tf/docs a second time.
    '''
    import logging
    import os
    import gc
    import pandas as pd
    fname = '/notebooks/data/stores/bw_counts_%s.h5' % os.getpid()
    success_log = []
    logging.info("Starting %d volume batch on PID=%s" % (len(paths), os.getpid()))
    try:
        with pd.HDFStore(fname, mode="a", complevel=9, complib='blosc') as store:
            tl_collector = []
            for path in paths:
                try:
                    tl = get_count(path, store=store)
                except Exception:
                    logging.exception("Unable to get count for path %s" % path)
                    continue
                if tl.empty:
                    logging.info("%s is empty" % path)
                    continue
                tl_collector.append(tl)
                success_log.append(path)

            # Nothing usable in this batch: report it plainly instead of letting
            # pd.concat([]) raise and be logged as a saving error.
            if not tl_collector:
                logging.info("No counts to save for this batch of %d paths" % len(paths))
                return []

            # Save a DF combining all the counts from this batch
            try:
                logging.info("Merging and Saving texts for %d paths starting with %s" % (len(paths), paths[0]))
                combineddf = pd.concat(tl_collector)

                # Save tf(doc) with volid but no lang
                # For efficient HDF5 storage, enforcing a 50 byte token limit. Can't use
                # DataFrame.str.slice(stop=50) though, because really we care about bytes and
                # some unicode chars are multiple codepoints.
                # volids are capped at 25chars (the longest PD vol id)
                store.append('tf/docs',
                             combineddf.reset_index('language')[['count']],
                             min_itemsize = {'id': 25, 'token':max_str_bytes})

                ### Save tf(corpus)
                df = combineddf.groupby(level=['language', 'token'])[['count']]\
                               .sum().sort_index()
                # Filtering by corpus total (rather than language total) is too slow, and
                # filtering by lang x token might unfairly punish sparse languages. The
                # workaround is to only ever trim English by mincount: any bias this would
                # have would be in the bottom of the wordlist anyway.
                if mincount:
                    threshold = 3 if mincount is True else mincount
                    df = df[(df.index.get_level_values(0) != 'eng') | (df['count'] >= threshold)]
                store.append('tf/corpus', df, min_itemsize = {'token': max_str_bytes})
                return success_log
            except Exception:
                logging.exception("Saving error for %d paths starting with %s" % (len(paths), paths[0]))
                return []
    finally:
        # Runs on every exit path; previously this call sat after the returns and never executed.
        gc.collect()

In [None]:
import time
# Split paths into N-sized chunks, so engines can iterate on multiple texts at once
chunk_size = 800
# Everything in the listing that is not yet recorded in the success file.
remaining_paths = np.setdiff1d(paths, get_processed())
print("%d paths remaining" % len(remaining_paths))
# `start` and `n` select a window of the remaining paths; with these values the
# window covers all of them.
n = 10000000
start = 0
chunked_paths = [remaining_paths[start+i:start+i+chunk_size] for i in range(0, len(remaining_paths[start:start+n]), chunk_size)]

starttime = time.time()
logging.info("Starting parallel job")
# One task per chunk. NOTE(review): ordered=False presumably yields results as
# they finish rather than in submission order — confirm against ipyparallel docs.
parallel_job = v.map(get_doc_counts, chunked_paths, ordered=False)

# `i` counts received results, so it numbers batches in completion order.
i = 0
for result in tqdm_notebook(parallel_job, smoothing=0):
    i += 1
    if result:
        # Record the finished volume ids right away so an interrupted run can resume.
        with open(successfile, "a+") as f:
            ids = [path_to_id(path) for path in result]
            f.write("\n".join(ids)+"\n")
        logging.info("Done processing batch %d, from %s to %s" % (i, result[0], result[-1]))
    else:
        # An empty result means the batch saved nothing (see get_doc_counts).
        logging.error("Problem with result in batch %d" % i)

# Ask every engine to garbage-collect now that the job is over.
force_gc()
logging.info("Done")
# Elapsed wall-clock time in seconds.
logging.info(time.time()-starttime)

INFO:root:Starting parallel job


1533959 paths remaining


INFO:root:Done processing batch 1, from /notebooks/features/ufl1/pairtree_root/ar/k+/=1/39/60/=t/02/z2/3d/0f/ark+=13960=t02z23d0f/ufl1.ark+=13960=t02z23d0f.json.bz2 to /notebooks/features/ufl1/pairtree_root/ar/k+/=1/39/60/=t/0v/q3/w3/5f/ark+=13960=t0vq3w35f/ufl1.ark+=13960=t0vq3w35f.json.bz2
INFO:root:Done processing batch 2, from /notebooks/features/ufl1/pairtree_root/ar/k+/=1/39/60/=t/0v/q3/wk/1s/ark+=13960=t0vq3wk1s/ufl1.ark+=13960=t0vq3wk1s.json.bz2 to /notebooks/features/ufl1/pairtree_root/ar/k+/=1/39/60/=t/1p/g2/nx/6q/ark+=13960=t1pg2nx6q/ufl1.ark+=13960=t1pg2nx6q.json.bz2
INFO:root:Done processing batch 3, from /notebooks/features/ufl1/pairtree_root/ar/k+/=1/39/60/=t/43/r2/0n/0p/ark+=13960=t43r20n0p/ufl1.ark+=13960=t43r20n0p.json.bz2 to /notebooks/features/ufl1/pairtree_root/ar/k+/=1/39/60/=t/4z/g7/mw/81/ark+=13960=t4zg7mw81/ufl1.ark+=13960=t4zg7mw81.json.bz2
INFO:root:Done processing batch 4, from /notebooks/features/ufl1/pairtree_root/ar/k+/=1/39/60/=t/1p/g2/q1/6w/ark+=13960=t

CompositeError: one or more exceptions from call to method: get_doc_counts
[Engine Exception]EngineError: Engine b'f2d8ec3e-c791-41b2-b504-d2aacfdc2eae' died while running task '0547da2d-ca52-42c1-9a14-f66860442f4d'

## Todo

- Check for duplicates in "successful-counts.txt". I caught one text duplicated due to a bug, good to check that it doesn't happen again.
- Create a table index after storage (e.g. `store.create_table_index('df', optlevel=9, kind='full')`)

## Notes
- Future merges need to be at uint64 or int64, because uint32 is too small. For some reason, PyTables doesn't allow uint64 data columns, so int64 is used solely for that reason.

# Utilities

## Count up unique volume ids from stores

Useful in the case I ran into where the ZMQ connection between the root and the nodes broke, so I wasn't saving the list of successfully processed volumes, but the engines were still happily crunching away.

In [None]:
import glob
from ipyparallel import require
# Every HDF5 store found in the stores directory.
storestocheck = glob.glob("/notebooks/data/stores/*h5")

@require(get_processed)
def check_for_processed(storefile):
    '''
    Find "mdp" volumes that are present in a store but missing from the success list.

    Walks the 'id' column of /tf/docs backwards in slices of `batchsize` rows,
    turning each unique "mdp" id into its feature-file path and keeping those
    that get_processed() does not already report. Stops at the first slice
    that contributes nothing new, or when the start of the table is reached.

    Returns a sorted unique numpy array of paths; [] when the store is skipped,
    unreadable or has nothing new; or the raw list of per-slice arrays if they
    cannot be concatenated.
    '''
    import gc
    import pandas as pd
    import numpy as np
    import logging
    from htrc_features import utils

    all_unique = []
    batchsize = 100000000

    with pd.HDFStore(storefile, mode="r") as store:
        # Rejecting files where the last row was not mdp
        try:
            n = int(store.get_storer("/tf/docs").nrows)
        except Exception:
            logging.exception("Can't get row count for %s, moving on" % storefile)
            return []
        try:
            # Reads from row n-2 onward and inspects the prefix of the first id returned.
            a = store.select_column('/tf/docs', 'id', start=n-2)
            if a.str.split(".")[0][0] != 'mdp':
                logging.info("%s didn't process mdp most recently, skipping." % storefile)
                return []
        except Exception:
            logging.exception("Error with %s" % storefile)
            return []

        logging.info("Figuring out what is already processed.")
        already_processed = get_processed()

        logging.info("Going through file backwards until all the volume ids are in the success list")

        # Bounded by n > 0: once the start of the table has been read there is nothing
        # left to scan, and a slice that keeps failing can no longer loop forever.
        while n > 0:
            startrow = max(n - batchsize, 0)
            try:
                logging.info("Processing %s from %d" % (storefile, n-batchsize))
                unique = store.select_column('/tf/docs', 'id', start=startrow, stop=n).unique()
                # `str` replaces the np.unicode alias, which newer NumPy releases no longer provide.
                uniquemdp = unique[np.char.startswith(unique.astype(str), "mdp")]
                as_paths =  pd.Series(uniquemdp).apply(lambda x: '/notebooks/features/' + utils.id_to_rsync(x)).values

                to_process = np.setdiff1d(as_paths, already_processed)
                if to_process.shape[0] == 0:
                    logging.info("Done at %d" % (n-batchsize))
                    break
                all_unique.append(to_process)
            except Exception:
                logging.exception("Error with %s from %d)" % (storefile, n-batchsize))
            # Step back one slice whether or not this one succeeded.
            n -= batchsize
            gc.collect()
    if len(all_unique) > 0:
        try:
            return np.unique(np.concatenate(all_unique))
        except Exception:
            logging.exception("problem with array concatenatation, returning list")
            return all_unique
    else:
        return []

NameError: name 'get_processed' is not defined

## Quick store check

Grab the last item from each store. This is a good way to check if a store broke for whatever reason.

The ptrepack command on your system seems to repack the non-corrupted part of the file, at least until it hits the error. That will be incomplete, but at least you have something that isn't crashing processes down the line.

In [None]:
import glob
storestocheck = glob.glob("/notebooks/data/stores/*h5")
def get_last(storefile):
    '''
    Return the final 'id' value stored in /tf/docs of `storefile`.

    The result is whatever select_column returns for the last row (a
    one-element Series in the runs shown below this cell).

    The store is opened read-only: this is only a health check, so it must not
    be able to modify a store that may be damaged or in use.
    '''
    import pandas as pd
    with pd.HDFStore(storefile, mode="r") as store:
        n = int(store.get_storer("/tf/docs").nrows)
        return store.select_column('/tf/docs', 'id', start=n-1)

# Collect the final id of every store, printing as we go so that a store which
# fails to open is easy to identify: its name is the last one printed.
last = []
for store in storestocheck:
    print(store)
    last.append(get_last(store))
    print(last[-1])
# Display everything that was collected.
last

/notebooks/data/stores/bw_counts_4796.h5
0    uc1.b2990360
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4798.h5
0    uc1.b2991200
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4815.h5
0    uc1.b2993949
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4880.h5
0    uc1.b2996411
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4862.h5
0    uc1.b2997309
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4800.h5
0    uc1.b2995580
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4802.h5


  values = values[self.cname]


0    uc1.b2998141
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4782.h5
0    uc1.b2999999
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4817.h5
0    uc1.31822022962674
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4889.h5
0    uc1.31822015234818
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4853.h5
0    uc1.b2992047
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4844.h5
0    uc1.b2993105
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4871.h5
0    uc1.31822014309348
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4835.h5
0    uc1.b2936728
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4907.h5
0    uc1.b2939433
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4898.h5
0    uc1.b2994756
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4826.h5
0    uc1.b2935833
Name: id, dtype: object
/notebooks/data/stores/bw_counts_4781.h5
0    uc1.b2938538
Name: id, dtype: object
/notebooks/data/stores/bw_c

[0    uc1.b2990360
 Name: id, dtype: object, 0    uc1.b2991200
 Name: id, dtype: object, 0    uc1.b2993949
 Name: id, dtype: object, 0    uc1.b2996411
 Name: id, dtype: object, 0    uc1.b2997309
 Name: id, dtype: object, 0    uc1.b2995580
 Name: id, dtype: object, 0    uc1.b2998141
 Name: id, dtype: object, 0    uc1.b2999999
 Name: id, dtype: object, 0    uc1.31822022962674
 Name: id, dtype: object, 0    uc1.31822015234818
 Name: id, dtype: object, 0    uc1.b2992047
 Name: id, dtype: object, 0    uc1.b2993105
 Name: id, dtype: object, 0    uc1.31822014309348
 Name: id, dtype: object, 0    uc1.b2936728
 Name: id, dtype: object, 0    uc1.b2939433
 Name: id, dtype: object, 0    uc1.b2994756
 Name: id, dtype: object, 0    uc1.b2935833
 Name: id, dtype: object, 0    uc1.b2938538
 Name: id, dtype: object, 0    uc1.b2999047
 Name: id, dtype: object]

In [None]:
# Give the engines the success-file path and get_processed, which
# check_for_processed calls on the engine side.
dview.push(dict(successfile=successfile, get_processed=get_processed))
# One task per store file.
parallel_job = v.map(check_for_processed, storestocheck, ordered=False)
all_ids = []
# `i` only counts finished tasks for the log message.
i = 0
for ids in tqdm_notebook(parallel_job, smoothing=0):
    all_ids.append(ids)
    i += 1
    logging.info("Batch %d done" % i)

# Merge the per-store results into one sorted array without duplicates.
uniqueids = np.unique(np.concatenate(all_ids))

# Keep a copy on disk (numpy appends ".npy" to the name).
np.save("addtosuccessful2", uniqueids)

CompositeError: one or more exceptions from call to method: check_for_processed
[Engine Exception]EngineError: Engine b'e0665b72-c5bc-4726-9686-211e078b20b5' died while running task '4cbe26e4-5b12-46cd-ab24-535d307a1ffe'

In [None]:
# Recovered paths as a Series, so the string accessor can be used.
a = pd.Series(uniqueids)
# Keep only the entries that mention "mdp".
b = a[a.str.find("mdp") >= 0]
# Paths already recorded in the success file.
c = get_processed()
# Recovered paths that are not recorded yet.
d = np.setdiff1d(b.values, c)
# Back from path to volume id: the file name up to ".json".
e = pd.Series(d).apply(lambda x: x.split("/")[-1].split(".json")[0]).values
# Only append when there is something new. Writing a lone "\n" would leave a
# blank record in the success file, which is later read back as an (empty) id.
if len(e) > 0:
    with open(successfile, "a+") as f:
        f.write("\n".join(e)+"\n")

In [None]:
remaining_paths.shape

(1690746,)

In [None]:
# Recompute what is left to process from the current contents of the success file.
remaining_paths = np.setdiff1d(paths, get_processed())