# Loading/Processing Corpus

In [877]:
from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import tqdm_notebook
# from tqdm.notebook import tqdm
import os
from collections import OrderedDict
import pickle
import warnings
import copy
import spacy
from sentence_transformers import SentenceTransformer
from pathlib import Path

### General Helper Functions

In [606]:
def convert_to_list(item):
    """Return ``item`` as a list so callers can iterate uniformly.

    A list (or list subclass) is returned unchanged; anything else, including
    tuples and ``None``, is wrapped in a new one-element list.
    """
    # isinstance (rather than an exact type check) keeps list subclasses
    # intact instead of nesting them inside another list.
    if isinstance(item, list):
        return item
    return [item]

def file_exists(path):
    """Return True only if every given path exists on the file system.

    ``path`` may be a single path or a list of paths. An empty list
    yields True.
    """
    candidates = convert_to_list(path)
    found = [os.path.exists(candidate) for candidate in candidates]
    return all(found)

def remove_unnamed_cols(df, show_removed=False):
    """Drop, in place, every column whose label contains 'Unnamed'.

    Such columns appear when a dataframe is written to csv together with an
    anonymous index and then read back.

    Parameters:
        df: dataframe to clean; it is modified in place.
        show_removed: if True, also report whether any column was dropped.

    Returns:
        ``df`` itself, or the tuple ``(df, removed)`` when ``show_removed``
        is True.
    """
    # Non-string labels (e.g. integer column names) cannot be 'Unnamed'
    # columns, and a substring test on them would raise TypeError.
    unnamed = [col for col in df.columns if isinstance(col, str) and "Unnamed" in col]
    if unnamed:
        df.drop(columns=unnamed, inplace=True)
    removed = bool(unnamed)
    if show_removed:
        return df, removed
    return df

## File IO Management

In [833]:
class FilePathHandler:
    """Keeps track of where dataset files live and whether they exist yet.

    Paths built by ``get_path`` have the format:

    proj_dir/dataset_dir/corpus_name/file_purpose/[nested_dir/]instance_identifier[_split_identifier]file_type

    Two meta files are kept in the dataset directory:
      * ``file_path_df.csv.gz``: a dataframe with one row per registered path
      * ``corpus_sources.pickle``: a dict mapping corpus name to its source paths
    """
    def __init__(self, proj_dir, dataset_dir="dataset", compression='gzip'):
        """Create the dataset directory if needed and load both meta files."""
        self.proj_dir = proj_dir
        self.dataset_dir = proj_dir + '/' + dataset_dir
        self.create_dir_if_not_exists(self.dataset_dir)
        self.path_df_path = self.dataset_dir + '/' + 'file_path_df.csv.gz'
        self.corpus_sources_pickle_path = self.dataset_dir + '/' + 'corpus_sources.pickle'
        self.compression = compression
        self.file_purposes = ["topics", "corpus", "nuggets", "embeddings", "updates"]
        # column name -> dtype, used to build a correctly typed empty path_df
        self.path_df_cols = {"corpus_name":str, "file_purpose":str, "split_identifier":str, "num_splits":int,
                             "split_step":int, "nested_dir":str, "instance_identifier":str, 
                             "file_type":str, "path":str, "exists":bool}
        # keys every corpus source dict must contain
        self.corpus_sources_keys = ['corpus_name', 'dir_path', 'nuggets_path', 'matches_path', 'topics_path']
        
        # load meta files
        self.load_corpus_sources()
        self.load_path_df()
        
    def load_path_df_slice(self, corpus_name, file_purpose, instance_identifier=None, file_type=None,
                           split_identifier=None, exists=None, nested_dir=None, split_step=None):
        """Return the rows of path_df matching the given column values.

        ``corpus_name`` and ``file_purpose`` are always applied; every other
        filter is only applied when its argument is not None.
        """
        paths = self.path_df
        paths = paths[paths['corpus_name'] == corpus_name]
        paths = paths[paths['file_purpose'] == file_purpose]
        optional_filters = {'instance_identifier': instance_identifier,
                            'split_identifier': split_identifier,
                            'nested_dir': nested_dir,
                            'split_step': split_step,
                            'file_type': file_type,
                            'exists': exists}
        for col_name, value in optional_filters.items():
            if value is not None:
                paths = paths[paths[col_name] == value]
        return paths
        
    def false_exists_in_df(self, remove_false_exists=False, verbose=True):
        """Find paths that path_df marks as existing but are missing on disk.

        Parameters:
            remove_false_exists: if True, drop those rows from path_df and save it
            verbose: print the offending paths (and the number removed)

        Returns:
            List of the paths marked as existing that were not found on disk.
        """
        listed = self.path_df[self.path_df['exists'] == True]
        false_exists = [path for path in listed['path'] if not os.path.exists(path)]
    
        if verbose:
            out = "\n".join(false_exists)
            print("Paths found in path_df but not on system: " + str(out))
        
        # Only attempt the removal when there is something to remove; with
        # nothing to remove the length check below would raise spuriously.
        if remove_false_exists and false_exists:
            prev_len = len(self.path_df)
            # drop rows not in paths
            self.path_df = self.path_df[~self.path_df['path'].isin(false_exists)]
            cur_len = len(self.path_df)
            if prev_len == cur_len:
                raise Exception("Operation did not remove paths from dataframe")
            self.save_path_df()
            if verbose:
                print("Removed " + str(prev_len - cur_len) + " paths from path_df dataframe")
        return false_exists
        
    
    def get_path(self, corpus_name, file_purpose, inst_identifier, file_type, add_path=True, exists=False,
                split_identifier=None, num_splits=1, split_step=0, nested_dir=None, warn=False):
        """Build the path for a dataset file and, optionally, register it in path_df.

        Parameters:
            file_type: file extension including the leading dot (it is appended as is)
            add_path: if True, register the path via ``add_path_to_df``
            warn: passed to ``add_path_to_df``; warn if the path is already registered

        Returns:
            The path as a string.
        """
        # do check here make sure filename compatible, or elsewhere
        path = self.dataset_dir + '/' + corpus_name + '/' + file_purpose + '/'
        if nested_dir is not None:
            path += nested_dir + '/'
        path += str(inst_identifier)
        if split_identifier is not None:
            path += '_' + str(split_identifier)
        path += file_type
        
        if add_path:
            self.add_path_to_df(corpus_name, file_purpose, split_identifier, num_splits, split_step, nested_dir,
                                inst_identifier, file_type, path, exists, save=True, warn=warn)
        return path
            
    def add_path_to_df(self, corpus_name, file_purpose, split_identifier, num_splits, split_step, nested_dir, 
                       inst_identifier, file_type, path, exists, save=True, warn=False):
        """Register ``path`` in path_df unless a row with that path already exists.

        Also creates the corpus/file_purpose (and nested) directories the path
        lives in. When the path is already registered nothing is changed and a
        warning is issued if ``warn`` is True.
        """
        if (self.path_df['path'] == path).any():  # row already present
            if warn:
                warnings.warn("Path already exists in dataframe: " + str(path))
            return

        # create appropriate dir if needed
        file_purp_dir_path = self.dataset_dir + '/' + corpus_name +  '/' + file_purpose
        self.create_dir_if_not_exists(file_purp_dir_path)
        if nested_dir is not None:  # create nested dir if necessary
            nested_dir_path = file_purp_dir_path + '/' + nested_dir
            self.create_dir_if_not_exists(nested_dir_path)
        # add to path_df
        if num_splits is None:
            num_splits = 1
        
        row = pd.DataFrame({'corpus_name': pd.Series([corpus_name], dtype=str),
                              'file_purpose': pd.Series([file_purpose], dtype=str),
                              'split_identifier': pd.Series([split_identifier], dtype=str),
                              'num_splits': pd.Series([num_splits], dtype=int),
                              'split_step': pd.Series([split_step], dtype=int),
                              'nested_dir': pd.Series([nested_dir], dtype=str),
                              'instance_identifier': pd.Series([inst_identifier], dtype=str),
                              'file_type': pd.Series([file_type], dtype=str),
                              'path': pd.Series([path], dtype=str),
                              'exists': pd.Series([exists], dtype=bool)})
        
        # DataFrame.append was removed in pandas 2.0; concat is the replacement
        self.path_df = pd.concat([self.path_df, row], ignore_index=True)
        if save:  # save new path_df
            self.save_path_df()
            
    def update_path_exists(self, path, save=True):
        """Mark ``path`` as existing in path_df and optionally save path_df."""
        self.path_df.loc[self.path_df['path'] == path, 'exists'] = True
        if save:
            self.save_path_df()
        

    def create_dir_if_not_exists(self, dir_path, warn=True):
        """Create ``dir_path`` (including parents) if it is missing.

        Returns:
            True if the directory was created, False if it already existed.
        """
        if os.path.exists(dir_path):
            return False
        os.makedirs(dir_path)
        if warn:
            warnings.warn("Created new directory at " + str(dir_path))
        return True
    
    def search_path_df(self, search_dict, df_slice=None):
        """Filter path_df (or ``df_slice``) by exact column values.

        Parameters:
            search_dict: mapping of column name -> required value
            df_slice: dataframe to search, defaults to the whole path_df

        Returns:
            The rows matching every entry of ``search_dict``.
        """
        if df_slice is None:
            df_slice = self.path_df
        for col_name, value in search_dict.items():
            # compare the column's values, not the column name, with the target
            df_slice = df_slice[df_slice[col_name] == value]
        return df_slice
    
    def source_dict_correct(self, source_dict):
        """Validate a corpus source dict.

        Returns:
            True when all required keys are present and all paths exist.

        Raises:
            Exception: a required key is missing
            FileNotFoundError: one or more of the listed paths do not exist
        """
        # check has all appropriate keys
        for key in self.corpus_sources_keys:
            if key not in source_dict:
                raise Exception(str(key) + " is missing from corpus_source dict")
        false_paths = []
        for path_type, path in source_dict.items():
            if path_type == "corpus_name":  # dict entry not a path, don't check
                continue
            if not file_exists(path):
                false_paths.append(str(path_type) + " does not exist at " + str(path))
        if false_paths:
            error_str = "\n".join(false_paths)
            raise FileNotFoundError(error_str)
        return True
        
    
    def create_corpus_source_dict(self, corpus_name, dir_path, topics_file_path, nuggets_file_path,
                                 matches_file_path):
        """Build and validate a corpus source dict suitable for ``add_corpus_source``."""
        s_dict = {"corpus_name":corpus_name, "dir_path":dir_path, 
                  "topics_path":topics_file_path, "nuggets_path":nuggets_file_path,
                 "matches_path":matches_file_path}
        self.source_dict_correct(s_dict)
        return s_dict
    
    def add_corpus_source(self, corpus_source_dict, overwrite=False):
        """Add a corpus directory to load from and its meta files

        The entry is stored under its corpus name and saved to the pickle
        file. An existing entry is only replaced when ``overwrite`` is True;
        otherwise a warning is issued and nothing changes.
        """
        # check paths exist
        self.source_dict_correct(corpus_source_dict)
        corpus_name = copy.deepcopy(corpus_source_dict["corpus_name"])
        # store new entry
        if corpus_name in self.corpus_sources and overwrite==False:
            warnings.warn(str(corpus_name) + " is already present in corpus source dictionary. \n Proceeding with dict entry")
        else:
            # work on a copy so the caller's dict keeps its corpus_name key
            entry = copy.deepcopy(corpus_source_dict)
            del entry["corpus_name"]  # corpus_name becomes the key instead
            self.corpus_sources[corpus_name] = entry
            # create folder for outputting new source files
            new_corpus_dir = self.dataset_dir + '/' + corpus_name
            self.create_dir_if_not_exists(new_corpus_dir)
            self.save_corpus_sources()
            
    def get_corpus_sources(self, corpus_names=None):
        """Retrieve file paths from corpus_load dicts
        Parameters:
            corpus_names: list of corpus names retrieve, if None then retrieve all
        
        Returns:
            A dictionary where keys are the corpus names and values are target file paths

        Raises:
            KeyError: a requested corpus name has not been added
        """
        if corpus_names is None:
            corpus_names = self.corpus_sources.keys()
        return {name: self.corpus_sources[name] for name in corpus_names}
    
    def save_corpus_sources(self):
        """Pickle the corpus sources dict to the dataset directory."""
        with open(self.corpus_sources_pickle_path, 'wb') as handle:
            pickle.dump(self.corpus_sources, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    def load_corpus_sources(self):
        """Load the corpus sources dict, or start an empty one.

        Returns:
            True if the pickle file existed and was loaded, False otherwise.
        """
        if os.path.exists(self.corpus_sources_pickle_path):
            # NOTE: unpickling can execute arbitrary code; only load pickle
            # files that were written by this class.
            with open(self.corpus_sources_pickle_path, 'rb') as handle:
                self.corpus_sources = pickle.load(handle)
            return True
        self.corpus_sources = {} # create empty dictionary
        return False
        
    def save_path_df(self):
        """Write path_df to its compressed csv file."""
        # The default index carries no information; writing it is what makes
        # an 'Unnamed: 0' column appear when the file is read back.
        self.path_df.to_csv(self.path_df_path, compression=self.compression, index=False)
        
    def load_path_df(self):
        """File containing info about file paths to systematically load files

        Returns:
            True if the file existed and was loaded, False if an empty
            dataframe was created instead.
        """
        if os.path.exists(self.path_df_path):
            self.path_df = pd.read_csv(self.path_df_path, compression=self.compression)

            # files written with an index column still need cleaning
            self.path_df, removed = remove_unnamed_cols(self.path_df, show_removed=True)
            if removed:  # save changes
                self.save_path_df()
            return True
        # ensure columns don't infer wrong typing
        path_df_dict = {col_name: pd.Series([], dtype=data_type)
                        for col_name, data_type in self.path_df_cols.items()}
        self.path_df = pd.DataFrame(path_df_dict)  # create empty dataframe
        return False

## Markup Loading Functions

In [817]:
# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    """Parse a markup file into a BeautifulSoup object.

    Parameters:
        addr: path of the file to read
        gz: if True the file is opened with gzip, otherwise as a plain file
        xml: if True parse with the "xml" parser, otherwise with "lxml"
        verbose: print the path (and a note once a gzip file is opened)

    Returns:
        The parsed BeautifulSoup object.
    """
    if verbose:
        print(addr)

    opener = gzip.open if gz else open
    parser = "xml" if xml else "lxml"  # lxml parser is used for speed

    # the context manager closes the file even when parsing raises
    with opener(addr) as f:
        if gz and verbose:
            print("gz file opened")
        return bs(f, parser)


# parse markup and append one row per entry to entry_list (2D list [entry:tags])
def parse_markup(markup, entry_list, tag_list, find_tag, topic_id=None):
    """Extract tag values from every ``find_tag`` element of ``markup``.

    For each matching element a row is appended to ``entry_list`` holding the
    values for ``tag_list`` in order (None where a tag is absent).

    Parameters:
        markup: parsed document offering ``find_all``
        entry_list: list that the rows are appended to (modified in place)
        tag_list: tag names to collect, matched case-insensitively against
            the names of the element's direct children
        find_tag: name of the elements to treat as entries
        topic_id: if not None, stored under the 'topic_id' key of every row

    Returns:
        None; results are delivered through ``entry_list``.
    """
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for c in e.children:  # children use direct children, descendants uses all
            key = str(c.name).lower()
            if key in entry:
                # store under the lower-cased key that was matched; using the
                # original casing would add a new key and misalign the row
                entry[key] = str(c.string)
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = str(c.string)
        entry_list.append(list(entry.values()))
        
            
# find gz files directly inside a directory (subdirectories are not searched)
def search_dir(path):
    """Return the paths of the entries in ``path`` whose extension is .gz.

    The extension check is case-insensitive and the result keeps the order
    in which ``os.scandir`` yields the entries.
    """
    return [entry.path
            for entry in os.scandir(path)
            if os.path.splitext(entry.path)[-1].lower() == ".gz"]


def list_to_dataframe(markup_list, tags):
    """Build a dataframe from a list of rows, using ``tags`` as column names."""
    frame = pd.DataFrame(data=markup_list, columns=tags)
    return frame

## Dataframe Loading

### Control Functions

In [882]:
def topic_id_as_int(topic_id):
    """Convert a topic id to an int, or return None if that is not possible.

    Values accepted by ``int()`` are converted directly. Strings of the form
    'TS14.<n>' or '14.<n>' (prefix matched case-insensitively) yield ``<n>``.
    Everything else, including None and NaN, yields None.
    """
    try:
        return int(topic_id)
    except (TypeError, ValueError, OverflowError):
        pass  # non-standard topic_id, e.g. 'TS14.18', or not a number at all

    if not isinstance(topic_id, str):
        return None  # e.g. None or NaN: nothing to split

    parts = topic_id.split(".", 2)
    if len(parts) < 2 or parts[0].upper() not in ("TS14", "14"):
        return None  # no discernable topic_id
    try:
        return int(parts[1])  # extract int '18'
    except ValueError:
        return None  # no discernable topic_id

def convert_df_topic_id(df, col_name="query_id", remove_null=True, in_place=False):
    """Convert the topic id column of ``df`` to integers via ``topic_id_as_int``.

    Parameters:
        df: dataframe holding the topic id column
        col_name: name of the topic id column
        remove_null: drop rows whose id could not be converted and cast the
            column to int
        in_place: if True, modify ``df`` itself; if False, ``df`` is left
            untouched and a converted copy is returned

    Returns:
        The converted dataframe (``df`` itself when ``in_place`` is True).
    """
    # work on a copy unless asked otherwise, so the caller can reuse the
    # original frame with its raw ids
    conv_df = df if in_place else df.copy()
    conv_df[col_name] = conv_df[col_name].apply(topic_id_as_int)
    if remove_null:
        if in_place:
            conv_df.dropna(subset=[col_name], inplace=True)
            conv_df[col_name] = conv_df[col_name].astype(int)
        else:
            conv_df = conv_df[conv_df[col_name].notna()]
            conv_df = conv_df.astype({col_name: int})
    return conv_df

def find_duplicates(df):
    """Return the set of 'docid' values that occur more than once in ``df``."""
    first_seen = set()
    repeated = set()
    for docid in df['docid']:
        # the first sighting goes to first_seen, any later one to repeated
        bucket = repeated if docid in first_seen else first_seen
        bucket.add(docid)
    return repeated

def get_file_ext(path):
    """Return the full extension of ``path``, e.g. '.csv.gz' for 'x.csv.gz'.

    All suffixes of the final path component are joined, so multi-part
    extensions are kept whole; a path without a suffix gives ''.
    """
    return "".join(Path(path).suffixes)

def save_df_file_type(df, save_path, verbose=True):
    """Save ``df`` in the format given by the extension of ``save_path``.

    Supported extensions are '.csv.gz' (gzipped csv) and '.hdf' (HDF5 with
    compression level 9). 'Unnamed' columns are removed from ``df`` first.

    Raises:
        ValueError: the extension is not one of the supported file types
    """
    file_type = get_file_ext(save_path)
    df = remove_unnamed_cols(df)
    if file_type == ".csv.gz":
        # An anonymous index comes back as an 'Unnamed: 0' column on read,
        # so only write the index when it has named levels.
        keep_index = any(name is not None for name in df.index.names)
        df.to_csv(save_path, compression='gzip', index=keep_index)
        if verbose:
            print("df saved as gzipped csv at: " + str(save_path))
    elif file_type == ".hdf":
        complevel = 9
        # key is required by to_hdf; only one df is stored per file, so the
        # name itself carries no information
        key = "single_df"
        df.to_hdf(save_path, key=key, complevel=complevel)
        if verbose:
            print("df saved as hdf complevel " + str(complevel) + " at: " + str(save_path))
    else:
        raise ValueError(str(file_type) + " is not a valid file type option")
        
def read_df_file_type(save_path, verbose=True, concat_multiple=True):
    """Read one or more saved dataframes, choosing the reader by extension.

    Parameters:
        save_path: a path or list of paths ending in '.csv.gz' or '.hdf'
        verbose: print which kind of file each path was loaded from
        concat_multiple: if True, concatenate everything into one dataframe

    Returns:
        A single dataframe when ``concat_multiple`` is True, otherwise the
        list of dataframes in the order of ``save_path``.

    Raises:
        ValueError: a path has an unsupported extension
    """
    # convert to list to allow read multiple
    save_path = convert_to_list(save_path)
    dfs = []
    # each path is read exactly once, with the reader matching its own extension
    for path in save_path:
        file_type = get_file_ext(path)
        if file_type == ".csv.gz":
            df = pd.read_csv(path, compression='gzip')
            if verbose:
                print("loaded from .csv.gz file")
        elif file_type == ".hdf":
            df = pd.read_hdf(path)
            if verbose:
                print("loaded from .hdf file")
        else:
            raise ValueError(str(file_type) + " is not a valid file type option")
        dfs.append(remove_unnamed_cols(df))
    if concat_multiple:
        dfs = pd.concat(dfs, ignore_index=True, sort=False)  # combine into one df
    return dfs

def load_df_control(save_path, load_func, save=True, force_reload=False, 
                    name=None, verbose=True, path_handler=None):
    """Load a dataframe from its saved file(s), or build and save it.

    If every path in ``save_path`` exists and ``force_reload`` is False the
    dataframe is read from disk. Otherwise ``load_func`` is called to build
    it and, when ``save`` is True, the result is written to the single save
    path and marked as existing in ``path_handler`` (if given).

    Parameters:
        save_path: path or list of paths of the saved dataframe
        load_func: zero-argument callable that builds the dataframe
        name: label printed when loading starts (only if verbose)

    Returns:
        The loaded or newly built dataframe.

    Raises:
        ValueError: the dataframe has to be built but several save paths
            were given
    """
    save_path = convert_to_list(save_path)  # allows loading groups of saved files same way as singular paths
    if name is not None and verbose:
        print("Loading " + name)
    if not file_exists(save_path) or force_reload:
        if len(save_path) > 1:
            raise ValueError("There should only be one path to save to if no save paths already exist")
        df = load_func()
        df = remove_unnamed_cols(df)
        if verbose:
            print("df created from scratch")
        if save:
            # in case of loading df from original file, should only be one save_path
            save_df_file_type(df, save_path[0], verbose=verbose)
            if path_handler is not None:
                path_handler.update_path_exists(save_path[0])
    else:
        df = read_df_file_type(save_path, verbose=verbose)
    if verbose:
        preview = df[0:4]
        try:
            display(preview)  # rich rendering when running under IPython/Jupyter
        except NameError:  # plain Python session: display() is not defined
            print(preview)
    return df

### Dataframes from Corpus Files

#### Topics

In [883]:
# load topics into dataframe
def __load_topics(path, verbose=True):
    """Parse the topic xml file(s) at ``path`` into a dataframe.

    Every <event> element becomes one row with the module-level
    ``topic_tags`` as columns. The 'id' column is converted to int (rows
    without a usable id are dropped) and duplicate ids are removed, keeping
    the first occurrence.

    Parameters:
        path: a path or list of paths to uncompressed topic xml files
        verbose: print how many duplicate topics were removed
    """
    topics_list = []
    path = convert_to_list(path)
    for p in path:
        parse_markup(open_markup_file(p, gz=False, xml=True), 
                        topics_list, topic_tags, "event")
    df = list_to_dataframe(topics_list, topic_tags)
    
    df = convert_df_topic_id(df, col_name='id', remove_null=True)
    # drop any duplicates found over the files
    prev_size = len(df)
    df = df.drop_duplicates(subset=['id'], keep='first')  # no duplicate documents
    if verbose:
        num_removed = prev_size - len(df)  # rows before minus rows after
        print(str(num_removed) + " duplicate documents removed from topics df")
    return df

def load_topics(save_path, load_path=None, save=True, force_reload=False, verbose=True, path_handler=None):
    """Load the topics dataframe from ``save_path``, building it from ``load_path`` if needed.

    All loading/saving decisions are delegated to ``load_df_control``.
    """
    def build_topics():
        # only invoked when the dataframe has to be created from the source files
        return __load_topics(load_path, verbose=verbose)

    return load_df_control(save_path, build_topics, save=save, force_reload=force_reload,
                           name="topics", verbose=verbose, path_handler=path_handler)

# topics = load_topics()

#### Main Corpus Files

In [884]:
# load all formatted gzipped html files into dataframe

def __load_corpus(corpus_dir, doc_tags=None, topic_ids=None, split_every=None, split_start_doc=None,
                 drop_duplicates=True, verbose=True):
    """Parse the gzipped markup files of the given topics into one dataframe.

    Files are looked up in ``corpus_dir/<topic_id>`` for every topic id, and
    every <doc> element becomes one row.

    Parameters:
        corpus_dir: directory holding one sub-directory per topic id
        doc_tags: columns to extract; defaults to the standard doc fields
        topic_ids: iterable of topic ids to load (required)
        split_every: number of files to load per topic; only used together
            with ``split_start_doc``
        split_start_doc: index of the first file to load per topic
        drop_duplicates: drop rows with a repeated 'streamid', keeping the first
        verbose: print how many duplicate documents were removed

    Returns:
        Dataframe with ``doc_tags`` as columns and a numeric 'epoch' column.

    Raises:
        TypeError: ``topic_ids`` is None
    """
    if doc_tags is None:
        doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
    if topic_ids is None:
        raise TypeError("topic_ids must be an iterable of topic ids, not None")

    topic_dfs = []
    for topic_id in topic_ids:
        print("Loading topic " + str(topic_id) + "...")
        topic_list = []
        topic_path = corpus_dir + '/' + str(topic_id)
        gz_paths = search_dir(topic_path)
        
        if split_every is not None and split_start_doc is not None:
            # slicing clamps an end beyond the list by itself, so the final
            # split keeps the last file
            gz_paths = gz_paths[split_start_doc:split_start_doc + split_every]
        
        for gz_path in tqdm(gz_paths, position=0, leave=True):
            parse_markup(open_markup_file(gz_path, verbose=False),
                             topic_list, doc_tags, "doc", topic_id=topic_id)
        topic_dfs.append(list_to_dataframe(topic_list, doc_tags))

    # concatenate once at the end instead of growing a dataframe per topic
    if topic_dfs:
        df = pd.concat(topic_dfs)
    else:
        df = pd.DataFrame(columns=doc_tags)
    df['epoch'] = pd.to_numeric(df['epoch'])
    if drop_duplicates:
        prev_size = len(df)
        df = df.drop_duplicates(subset=['streamid'], keep='first')  # no duplicate documents
        if verbose:
            num_removed = prev_size - len(df)  # rows before minus rows after
            print(str(num_removed) + " duplicate documents removed from corpus")
    return df

def load_corpus(save_path, corpus_dir=None, doc_tags=None, topic_ids=None, split_every=None, split_start_doc=None,
                save=True, force_reload=False, verbose=True, path_handler=None, drop_duplicates=True):
    """Load the corpus dataframe from ``save_path``, building it from ``corpus_dir`` if needed.

    Loading/saving decisions are delegated to ``load_df_control``; the
    remaining parameters are passed on to ``__load_corpus`` when the
    dataframe has to be built. With ``drop_duplicates`` the result is also
    de-duplicated on 'streamid', which matters when several saved files are
    combined.

    Returns:
        The corpus dataframe.
    """
    corpus = load_df_control(save_path, 
                             
                             lambda: __load_corpus(corpus_dir, doc_tags=doc_tags, 
                                                   topic_ids=topic_ids, split_every=split_every,
                                                   split_start_doc=split_start_doc, 
                                                   drop_duplicates=drop_duplicates, verbose=verbose), 
                             
                             save=save, force_reload=force_reload, name="corpus", verbose=verbose, 
                             path_handler=path_handler)
    # remove duplicate documents from corpus if required
    if drop_duplicates and corpus['streamid'].duplicated().any():
        prev_size = len(corpus)
        corpus = corpus.drop_duplicates(subset=['streamid'], keep='first')  # get rid of them
        if verbose:
            num_removed = prev_size - len(corpus)  # rows before minus rows after
            print(str(num_removed) + " duplicate documents removed from corpus")
    
    if verbose:
        print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
    return corpus

# corpus = load_corpus(doc_tags=doc_tags)

#### Nuggets (Evaluation Technique)

In [821]:
def spacy_sents_string_list(text, nlp):
    """Split ``text`` into sentences with spaCy and return them as strings.

    When ``nlp`` is None the "en_core_web_sm" model is loaded. The model
    that was used is returned alongside the sentences so callers can reuse it.

    Returns:
        Tuple ``(sentences, nlp)``.
    """
    model = spacy.load("en_core_web_sm") if nlp is None else nlp
    parsed = model(text)
    sentences = [str(sentence) for sentence in parsed.sents]
    return sentences, model

def find_nugget_spacy(text, match_start, nlp):
    """Find the spaCy sentence of ``text`` that covers character ``match_start``.

    Sentence lengths are accumulated in order; the first sentence whose end
    (by accumulated length) lies beyond ``match_start`` is selected.

    Returns:
        Tuple ``(sentence, sent_id, nlp)``. If no sentence reaches
        ``match_start`` the sentence is None and ``sent_id`` equals the
        number of sentences.
    """
    sentences, nlp = spacy_sents_string_list(text, nlp)
    consumed = 0  # characters covered by the sentences checked so far
    for position, sentence in enumerate(sentences):
        sentence = str(sentence)  # convert from spacy tokens to string
        if consumed + len(sentence) > match_start:
            return sentence, position, nlp
        consumed += len(sentence)
    return None, len(sentences), nlp

def find_nugget_in_text(text, sent_id, match_start, nlp, spacy_if_not_found=True):
    """Retrieve sentence at index sent_id

    The text is split into lines. If the first line is empty the index is
    shifted by one; if that shifted index is out of bounds the index is
    stepped back by one. When neither index works and ``spacy_if_not_found``
    is True, the sentence is located with spaCy from ``match_start`` instead.

    Returns:
        Tuple ``(sentence, sent_id, technique)`` where ``technique`` is
        "splitlines" or "spacy". The sentence is None when it was not found;
        ``sent_id`` is the index that was last tried.
    """
    technique = "splitlines"  # indicate how sent was found in df
    split = text.splitlines()
    # an empty text has no lines at all, so guard before looking at split[0]
    if split and split[0] == "":
        sent_id += 1  # first entry is empty, adjust offset
    nug = None
    try:
        nug = split[sent_id]
    except IndexError:  # increment has pushed offset out of bounds
        try:
            sent_id -= 1
            nug = split[sent_id]
        except IndexError:  # sent_id does not match text indexing
            if spacy_if_not_found:
                nug, sent_id, nlp = find_nugget_spacy(text, match_start, nlp)
                technique = "spacy"
    return nug, sent_id, technique

In [860]:
def create_nugget_df(corpus_df, nuggets_tsv=None, matches_tsv=None, nuggets_tsv_path=None,
                     matches_tsv_path=None, topic_ids=None, spacy_if_not_found=True, verbose=True):
    """Build a dataframe linking every nugget match to its sentence in the corpus.

    Each row of the matches table names an update id ('<streamid>-<sent_id>')
    and a nugget id. The document is looked up in ``corpus_df`` by streamid,
    the nugget in the nuggets table by nugget id, and the matched sentence is
    retrieved from the document text with ``find_nugget_in_text``.

    Parameters:
        corpus_df: corpus dataframe with 'streamid', 'docid', 'epoch', 'text'
        nuggets_tsv / matches_tsv: already loaded tables (left unmodified)
        nuggets_tsv_path / matches_tsv_path: path or list of paths to load the
            tables from when they are not passed directly
        topic_ids: iterable of integer topic ids to restrict to, None for all
        spacy_if_not_found: fall back to spaCy when the sentence index from
            the update id does not fit the text
        verbose: print summary statistics

    Returns:
        Dataframe with one row per resolved match; 'query_id' is renamed to
        'topic_id'. Matches whose document or nugget cannot be found are
        skipped.

    Raises:
        Exception: a table and its path are both None, or a streamid occurs
            more than once in ``corpus_df``
    """
    def check_load_tsv(tsv, path):
        """Return ``tsv`` if given, otherwise load and concatenate the files at ``path``."""
        if tsv is not None:
            return tsv
        if path is None:
            raise Exception("Must either pass the tsv file or the path to load it")
        # sep must be passed by keyword in recent pandas versions
        frames = [pd.read_csv(p, sep="\t") for p in convert_to_list(path)]
        return pd.concat(frames, ignore_index=True, sort=False)
    
    # perform check if tsvs or their paths have been passed
    nuggets_tsv = check_load_tsv(nuggets_tsv, nuggets_tsv_path)
    matches_tsv = check_load_tsv(matches_tsv, matches_tsv_path)
    
    def parse_update_id(update_id):
        """Separate update_id into component streamid and sent_id"""
        update_id = update_id.split("-")
        sent_id = int(update_id[-1])
        streamid = "-".join(update_id[:-1])
        return streamid, sent_id
    
    
    # NOTE: find_nugget_in_text does not hand a loaded model back, so this
    # stays None and the spacy model is loaded again on every fallback.
    nlp = None  # spacy model, load if needed
    entry_list = []  # list of dicts to build dataframe
    
    # what columns from each dataframe to extract to put into nugget_df
    nug_tsv_cols = ['nugget_id', 'importance', 'nugget_len', 'nugget_text']
    mat_tsv_cols = ['query_id', 'match_start', 'match_end']
    corp_cols = ['docid', 'streamid', 'epoch']
    # reference what columns to convert from string into numerical values
    num_cols = ['query_id', 'importance', 'nugget_len', 'epoch', 'sent_id', 'match_start', 'match_end']
    
    # convert topic_ids to int standard
    # set to new var to allow passing same unchanged nuggets/matches_tsv each time
    nug_tsv = convert_df_topic_id(nuggets_tsv, col_name='query_id', remove_null=True)
    mat_tsv = convert_df_topic_id(matches_tsv, col_name='query_id', remove_null=True)
    
    # target only selected topic_ids if not None
    if topic_ids is not None:
        # keep rows of ANY selected topic; filtering one id after another
        # would leave nothing as soon as two ids are given
        wanted = list(topic_ids)
        nug_tsv = nug_tsv[nug_tsv['query_id'].isin(wanted)]
        mat_tsv = mat_tsv[mat_tsv['query_id'].isin(wanted)]
    
    missed_streamids = []  # store streamids not found for debug purposes
    missed_nuggetids = []  # debug purposes
    missed_sentid_streamids = []  # streamid where sent_id indexing out of bounds
    # wrapping the iterator advances the bar on every row, skipped or not,
    # and closes it when the loop ends
    for index, row in tqdm(mat_tsv.iterrows(), total=len(mat_tsv), position=0, leave=True):
        entry = {}
        
        # get streamid and sentid of nugget occurence
        streamid, sent_id = parse_update_id(row['update_id'])
        
        # find occurence in corpus
        occur = corpus_df[corpus_df['streamid'] == streamid]
        if len(occur) == 0:
            missed_streamids.append(streamid)
            continue
        elif len(occur) > 1:
            if verbose:
                print("Number of entries with streamid: " + str(len(occur)))
                print(occur)
            raise Exception("There should be one entry in corpus with given streamid " + str(streamid))
        occur = occur.iloc[0].to_dict()
        
        # get text of the occurence
        occur_text = occur['text']
        match_start = int(row['match_start'])
        
        # get text of the nugget
        nug_row = nug_tsv[nug_tsv['nugget_id'] == row['nugget_id']]
        if len(nug_row) != 1:
            missed_nuggetids.append(row['nugget_id'])
            continue
        nug_row = nug_row.iloc[0].to_dict()
        
        # add columns from each dataframe
        for col in mat_tsv_cols:
            entry[col] = mat_tsv.at[index, col]
        for col in nug_tsv_cols:
            entry[col] = nug_row[col]
        # adding these columns here to control order of columns in final df
        found_sent, sent_id, technique = find_nugget_in_text(occur_text, sent_id, match_start, nlp,
                                                            spacy_if_not_found=spacy_if_not_found)
        entry['sent_in_text'] = found_sent
        entry['sent_id'] = sent_id
        entry['technique'] = technique
        for col in corp_cols:
            entry[col] = occur[col]
        
        if technique == "spacy":  # sent_id indexing was wrong
            missed_sentid_streamids.append(streamid)
        
        entry_list.append(entry)
        
    if verbose:
        print("Nugget entries were generated for " + str(len(entry_list)) + " nuggets. There were "
             + str(len(missed_streamids)) + " found in matches.tsv but not in corpus")
        print("There were " + str(len(missed_nuggetids)) + " nugget_ids found in matches.tsv but not in nuggets.tsv")
        print(str(len(missed_sentid_streamids)) + " out of " + str(len(entry_list)) + 
              " streamids had out of bounds sent_ids")
        
    nugget_df = pd.DataFrame(entry_list)
    if len(nugget_df) > 0:
        nugget_df[num_cols] = nugget_df[num_cols].apply(pd.to_numeric, errors='coerce', axis=1)  # convert appropriate cols to numerical values
        nugget_df.rename(columns={'query_id':'topic_id'}, inplace=True)  # topic_id matches other dataframes
    
    if verbose:
        print("nugget_df entries: " + str(len(nugget_df)))
    
    return nugget_df

In [891]:
def load_nugget_df(save_path, corpus_df=None, topic_ids=None,path_handler=None, save=True, force_reload=False, 
                   verbose=True, nuggets_tsv=None, matches_tsv=None, nuggets_tsv_path=None, matches_tsv_path=None,
                  spacy_if_not_found=True):
    """Load the nugget dataframe from ``save_path``, building it with ``create_nugget_df`` if needed.

    Loading/saving decisions are delegated to ``load_df_control``; the
    corpus, tsv and topic arguments are only used when the dataframe has to
    be built.
    """
    def build_nugget_df():
        # only invoked when the dataframe has to be created from scratch
        return create_nugget_df(corpus_df, nuggets_tsv=nuggets_tsv, matches_tsv=matches_tsv,
                                nuggets_tsv_path=nuggets_tsv_path,
                                matches_tsv_path=matches_tsv_path, topic_ids=topic_ids,
                                spacy_if_not_found=spacy_if_not_found, verbose=verbose)

    return load_df_control(save_path, build_nugget_df, save=save, force_reload=force_reload,
                           verbose=verbose, path_handler=path_handler)

### Embedding Generation

In [823]:
def create_embedding_df(emb_model, corpus_df, nugget_df, sents_default="splitlines",
                        only_docs_with_nugs=False, nlp=None, verbose=True):
    """
    Pass through corpus and create embedding for each sentence.
    Use nugget_df to identify if sentences were created by splitlines or spacy  (add label on df)
    Use nugget_df sent_id label in resulting embedding dataframe whether sentence is a nugget

    Parameters:
        emb_model: model offering ``encode(sentences, show_progress_bar=...)``
        corpus_df: corpus dataframe with 'streamid', 'text' and 'topic_id'
        nugget_df: nugget dataframe with 'streamid', 'sent_id', 'technique'
            and 'sent_in_text'
        sents_default: sentence splitting technique ("splitlines" or "spacy")
            for documents whose nuggets do not dictate one
        only_docs_with_nugs: skip documents without any nugget
        nlp: spacy model to reuse; loaded on first use when None
        verbose: print the number of embedded sentences

    Returns:
        Dataframe with one row per sentence: topic_id, streamid, sent_id,
        sentence, embedding, is_nugget and technique.

    Raises:
        Exception: one sent_id of a document refers to different sentences
    """
        
    entry_list = []
    docs_multiple_nugs = []

    for index, row in tqdm_notebook(corpus_df.iterrows(), total=len(corpus_df)):
        # look up doc in nugget_df
        streamid = row['streamid']
        nug = nugget_df[nugget_df['streamid'] == streamid]
        nug_sent_ids = []  # sent ids of nuggets in doc
        technique = sents_default  # technique used to split sentences/get nuggets

        if len(nug) == 0:  # doc has no nuggets
            if only_docs_with_nugs:  # skip this document
                continue
        elif len(nug) > 1:  # doc has multiple possible nuggets
            # collect unique sent_ids
            uniq_sent_ids = list(nug['sent_id'].unique())
            # check if technique/way sentences were constructed match
            uniq_techs = list(nug['technique'].unique())
            if len(uniq_techs) > 1:
                # nuggets of this doc were found with inconsistent
                # techniques; the document is skipped
                continue
            # check sents with different ids are different
            for sent_id in uniq_sent_ids:
                nug_sent_id = nug[nug['sent_id'] == sent_id]
                uniq_sents = list(nug_sent_id['sent_in_text'].unique())
                if len(uniq_sents) > 1:  # mismatch sent_ids and sentence its referencing
                    print(nug)
                    print("Unique sentences for " + str(streamid) + " at sent_id " + str(sent_id) + ": ")
                    print(uniq_sents)
                    raise Exception("Streamid with multiple nuggets, has mismatched sent_ids and referred sentences")
            
            docs_multiple_nugs.append(streamid)  # debug/verbose info
            technique = uniq_techs[0] # should only be one in list
            nug_sent_ids.extend(uniq_sent_ids)  # add multiple nug sent ids
            
        else: # doc has a single nugget
            #  get nugget info for df columns
            nug = nug.iloc[0].to_dict()
            # look up the 'technique' key itself (not the current technique
            # value) so the way the nugget sentence was found is honoured
            recorded = nug.get('technique')
            if recorded is not None and pd.notna(recorded):
                technique = recorded
            nug_sent_ids.append(nug['sent_id'])

        # split sentence in accordance with how nugget sentence was found/default method if no nuggets
        sents = []
        if technique == "splitlines":
            sents = row['text'].splitlines()
        elif technique == "spacy":
            sents, nlp = spacy_sents_string_list(str(row['text']), nlp)

        # get contextual sentence embeddings
        emb_sents = emb_model.encode(sents, show_progress_bar=False)

        # create dataframe entries
        topic_id = int(row['topic_id'])
        for i, sentence in enumerate(sents):
            entry_list.append({"topic_id":topic_id, "streamid":streamid, "sent_id":i, "sentence":sentence,
                               "embedding":emb_sents[i], "is_nugget":i in nug_sent_ids,
                               "technique":technique})

    if verbose:
        print("Embeddings generated for " + str(len(entry_list)) + " sentences")
    emb_df = pd.DataFrame(entry_list)
    return emb_df 

In [887]:
def load_embeddings(save_path, emb_model=None, corpus_df=None, nugget_df=None, sents_default="splitlines", 
                    only_docs_with_nugs=False, nlp=None, 
                    force_reload=False, save=True, verbose=True, path_handler=None):
    """Obtain the sentence embedding dataframe associated with save_path.

    The work is delegated to load_df_control, which receives save_path, a
    zero-argument builder wrapping create_embedding_df, and the save /
    force_reload / verbose / path_handler options unchanged.
    NOTE(review): presumably load_df_control loads an existing file and only
    calls the builder when it has to - confirm against its definition.
    """
    def build_emb_df():
        # only evaluated if load_df_control decides to invoke the builder
        return create_embedding_df(
            emb_model,
            corpus_df,
            nugget_df,
            sents_default=sents_default,
            nlp=nlp,
            verbose=verbose,
            only_docs_with_nugs=only_docs_with_nugs,
        )

    return load_df_control(
        save_path,
        build_emb_df,
        save=save,
        force_reload=force_reload,
        name="emb_df",
        verbose=verbose,
        path_handler=path_handler,
    )

#### Update Dataframe (Temporal Information)

In [11]:
# def create_update_df():
#     """Data Frame containing information about docs which have updates/multiple instances in corpus"""
#     def create_entry(row, col_tags):
#         entry = {}
#         for col in col_tags:
#             entry[col] = row[col]
#         return entry
    
#     col_tags = ['docid', 'streamid', 'epoch', 'yyyymmddhh', 'zulu']
#     entry_list = []
#     dups = find_duplicates(corpus)
#     for docid in tqdm(dups, position=0, leave=True):
#         d = corpus[corpus['docid'] == docid]
#         for index, row in d.iterrows():
#             entry = create_entry(row, col_tags)
#             entry_list.append(entry)
             
#     update_df = pd.DataFrame(entry_list)
#     update_df = update_df.set_index(col_tags)
#     return update_df

# def load_update_df(save=True, force_reload=False, verbose=True):
#     update_df = load_df_control(update_csv_path, create_update_df, 
#                                 save=save, force_reload=force_reload, name="update_df", verbose=verbose)
#     return update_df

# update_df = load_update_df()

### Embedding Label Generation

In [825]:
def emb_str_to_float_vector(emb_string):
    """Convert an embedding stored as text (equivalent to str(vector)) back to floats.

    Square brackets are dropped and the remaining text is split on any run of
    whitespace (spaces, tabs, "\\n", "\\r\\n"), so wrapped multi-line output is
    read back as one flat vector.

    Returns a 1-D np.float32 array (empty when the string holds no numbers).
    Raises ValueError if a token cannot be parsed as a float.
    """
    # brackets only delimit the printed array, they carry no numeric content
    no_brackets = emb_string.replace("[", "").replace("]", "")
    # str.split() with no argument discards empty entries and every kind of whitespace
    tokens = no_brackets.split()
    # float32 is the type used in the bert embeddings
    return np.array(tokens, dtype=str).astype(np.float32)

def cosine_similarity(vec_a, vec_b):
    """Return the cosine of the angle between vec_a and vec_b, limited to [-1.0, 1.0]"""
    # do the arithmetic in 64 bit so as little precision as possible is lost
    a64 = vec_a.astype(np.float64)
    b64 = vec_b.astype(np.float64)
    norm_product = np.linalg.norm(a64) * np.linalg.norm(b64)
    cos_sim = np.dot(a64, b64) / norm_product
    # floating point rounding can push the ratio marginally out of bounds
    if cos_sim > 1.0:
        print("cos_sim over 1.0: " + str(cos_sim))
        return np.float64(1.0)
    if cos_sim < -1.0:
        return np.float64(-1.0)
    return cos_sim

In [888]:
class EmbeddingLabelGenerator:
    """Labels stored sentence embeddings with their similarity to an averaged nugget embedding.

    The averaged nugget embedding is cached as a .npy file inside
    <dataset_dir>/<corpus_name>/embeddings/<nested_dir>/.
    """
    def __init__(self, proj_dir):
        self.proj_dir = proj_dir
        self.path_handler = FilePathHandler(proj_dir)
        self.nug_emb_filename = "avg_nug_emb.npy"  # file name of the cached average
        
    @staticmethod
    def _to_vector(emb):
        """Return a single stored embedding as a numeric array, parsing it if it is text"""
        if isinstance(emb, str):
            return emb_str_to_float_vector(emb)  # convert to original float values
        return np.asarray(emb)
        
    def add_cosine_label(self, corpus_name, nested_dir, file_type=".hdf", split_step=None, verbose=True, force_reload=False, 
                         save=True):
        """Add column to embedding dataframes labelling cosine similarity to averaged nugget embedding
        
        Every dataframe path returned by the path handler for corpus_name / nested_dir is 
        loaded, given a 'cosine_similarity' column and (when save is True) written back to 
        the same path. Dataframes that already have the column are skipped unless 
        force_reload is True. Returns None.
        """
        if verbose:
            print("Adding cosine label to " + str(corpus_name) + " | " + str(nested_dir) + " embedding dataframes")
        emb_df_paths = self.path_handler.load_path_df_slice(corpus_name, "embeddings", nested_dir=nested_dir,
                                                   exists=True, split_step=split_step, file_type=file_type)
        # load avg nugget emb
        avg_emb = self.get_avg_nugget_emb(corpus_name, nested_dir, emb_df_paths, verbose=verbose, 
                                          force_reload=force_reload, save=save)
        
        for emb_path in tqdm_notebook(emb_df_paths['path']):
            # check emb_df
            emb_df = load_embeddings(emb_path, verbose=False)
            if 'cosine_similarity' in emb_df.columns and not force_reload:  # already processed
                continue
            # retrieve emb_df embeddings; each entry is checked individually so an empty
            # dataframe or a mix of string/array entries is handled
            embs = [self._to_vector(x) for x in emb_df['embedding']]
            # get cosine similarity for each emb
            sims = [cosine_similarity(avg_emb, x) for x in embs]
            # add to df as a plain array: assignment is then positional, whereas a Series
            # would be aligned on index labels and misplace values on a non-default index
            emb_df['cosine_similarity'] = np.asarray(sims, dtype=np.float64)
            if save:
                save_df_file_type(emb_df, emb_path, verbose=False)
                if verbose:
                    print("saved with cosine column at: " + str(emb_path))
        print("Finished adding cosine labels for " + str(corpus_name) + " " + str(nested_dir) + " embeddings")
            
        
    def get_avg_nugget_emb(self, corpus_name, nested_dir, emb_df_paths, verbose=True, force_reload=False, save=True):
        """Generate or load the model averaged embedding of nuggets
        
        The cached .npy file is loaded when it exists and force_reload is False; otherwise 
        the average is regenerated from emb_df_paths and, when save is True, written to the 
        cache location. Returns the averaged embedding.
        """
        emb_dir = os.path.join(self.path_handler.dataset_dir, corpus_name, "embeddings",
                                      nested_dir)
        nug_emb_path = os.path.join(emb_dir, self.nug_emb_filename)
        
        if os.path.exists(nug_emb_path) and not force_reload:
            avg_emb = np.load(nug_emb_path)
            if verbose:
                print("Avg nugget embedding loaded from: " + str(nug_emb_path))
            return avg_emb
        
        # generate fresh from original files
        avg_emb = self.generate_avg_nugget_emb(emb_df_paths, verbose=verbose)
        if save:
            os.makedirs(emb_dir, exist_ok=True)  # np.save does not create missing directories
            np.save(nug_emb_path, avg_emb)
            if verbose:
                print("Avg nugget embedding saved to: " + str(nug_emb_path))
        return avg_emb
        
    
    def generate_avg_nugget_emb(self, emb_df_paths, verbose=True):
        """Generate a representative model embedding by averaging nugget embeddings
        
        Rows flagged 'is_nugget' are collected from every dataframe listed in 
        emb_df_paths['path'] and averaged element-wise.
        Raises ValueError when no nugget rows are found.
        """
        if verbose:
            print("Generating average nugget embedding from dataframes")
        
        nug_embs = []  # initially a list because undetermined length
        for emb_path in tqdm_notebook(list(emb_df_paths['path'])):
            # load section of embeddings
            emb_df = load_embeddings(emb_path, verbose=False)
            # extract nugget embeddings from this section
            nugs = emb_df[emb_df['is_nugget'] == True]
            # embeddings may be stored as formatted str or as arrays; handle both
            nug_embs.extend(self._to_vector(x) for x in nugs['embedding'])
        if not nug_embs:
            raise ValueError("No nugget embeddings found in the supplied embedding dataframes")
        # convert to matrix
        nug_embs = np.stack(nug_embs, axis=0)
        # get avg embedding
        avg_emb = np.mean(nug_embs, axis=0, dtype=np.float64)  # use f64 dtype for better precision in avg
        return avg_emb

In [None]:
# proj_dir = '/nfs/proj-repo/AAARG-dissertation'
# corpus_name = "original-trects-kba2014-filtered"
# nested_dir = "distilbert-base-nli-stsb-mean-tokens"

# test_emb_gen = EmbeddingLabelGenerator(proj_dir)
# test_emb_gen.add_cosine_label(corpus_name, nested_dir)
# # test_emb_gen.get_model_embedding(corpus_name, nested_dir)

## Filter the Larger Trects Dataset

In [331]:
class TrectsFilter:
    """Copies only the documents whose streamid appears in the update files into a filtered corpus.

    Source files are read from base_dir/<topic_id>/ and the matching documents are
    written under save_dir/<topic_id>/ with the same file name. A pickled history of
    processed source paths is kept in save_dir so an interrupted run can be resumed.
    """
    def __init__(self, base_dir='/nfs/trects-kba2014', updates_root="/nfs/TemporalSummarization",
                 save_dir='/nfs/mine-trects-kba2014-filtered'):
        self.base_dir = base_dir
        self.updates_root = updates_root
        self.updates_dir = updates_root + "/ts14/results"
        self.updates_csv_paths = self.generate_update_paths()
        self.save_dir = save_dir
        self.proc_history_path = self.save_dir + '/' + 'process_history.pickle'
        self.proc_history = None  # dict of topic_id -> set of processed source paths
        self.streamids = set()  # streamids of the documents to keep

    def generate_update_paths(self, verbose=True):
        """Return the update/match tsv paths that exist under the updates root"""
        ts_dirs = ["ts13", "ts14", "ts15"]
        target_files = ['updates_sampled.extended.tsv', 'updates_sampled.tsv',
                                 'matches.tsv']
        
        up_paths = []
        wrong_paths = []
        for ts_dir in ts_dirs:
            for target_file in target_files:
                full_path = self.updates_root + '/' + ts_dir + '/results/' + target_file
                if os.path.exists(full_path):
                    up_paths.append(full_path)
                else:
                    wrong_paths.append(full_path)
        if verbose and wrong_paths:  # nothing to report when every path was found
            print("Attempted to find streamids in these files, but no path exists:")
            print(wrong_paths)
            print("")
        return up_paths
        
        
    def create_filtered_dataset(self, force_reload=False, verbose=True, no_soup=True):
        """ Outline of Process
        1. Find streamids
            1.1 open updates_sampled.tsv file (or updates_sampled.extended.tsv)
            1.2 Scrape update_id column
            1.3 transform into streamid (drop last hyphenated numbers (these are sentenceids))
            1.4 Put streamids into datastructure for comparing (e.g. set)
        2. Create a new directory for each topic folder there is in target dir
        3. Opening up documents
            3.1 Go for each topic folder
            3.2 Open up each document
            3.3 Parse into html tree
            3.4 if streamid matches, store locally in memory buffer
            3.5 store file with same filename with matched streamids in another location
            
        force_reload: process every file again, even those recorded in the process history
        no_soup: scan files line by line (process_file) instead of parsing them as markup
        """
        # get streamids for docs that we will filter for
        for update_csv_path in self.updates_csv_paths:
            self.get_streamids(update_csv_path)
        if verbose:
            print("Number of streamids searching for: " + str(len(self.streamids)))

        # get topicids from folder names
        topic_ids = sorted(int(tid) for tid in os.listdir(self.base_dir) if tid.isdigit())
        
        # create dir to save filtered corpus to
        self.create_dir(self.save_dir)
        
        # load history of files already processed if exists
        self.load_process_history_dict(topic_ids)
        
        for topic_id in tqdm_notebook(topic_ids, position=0, leave=True):
            # create save directory
            topic_save_dir = self.save_dir + '/' + str(topic_id)
            self.create_dir(topic_save_dir)
            
            # get paths for files in target topic dir
            topic_dir = self.base_dir + '/' + str(topic_id)
            gz_paths = search_dir(topic_dir)
            
            # a history pickled by an earlier run may not know about this topic yet
            processed = self.proc_history.setdefault(topic_id, set())
            
            # remove already processed files
            if not force_reload:
                pending = [x for x in gz_paths if x not in processed]
                if verbose:
                    print("Previously processed " + str(len(gz_paths) - len(pending)) + " of " + str(len(gz_paths))
                         + " paths for topic " + str(topic_id))
                gz_paths = pending
            
            if verbose and len(gz_paths) > 0:
                print("Processing topic " + str(topic_id))
            
            # process each file
            for gz_path in tqdm_notebook(gz_paths):
                save_path = self.get_file_save_path(topic_id, gz_path)
                if no_soup:
                    matches = self.process_file(gz_path, verbose=verbose)
                    if verbose:
                        print("len matches: " + str(len(matches)))
                    self.write_docs_to_file(matches, save_path, no_soup=True, verbose=verbose)
                else:
                    # get file markup
                    markup = open_markup_file(gz_path, verbose=verbose)  # 50MB file proving hard for beautifulsoup
                    # get docs in file that are in streamids
                    matches = self.retrieve_matching_docs(markup, verbose=verbose)
                    # write file and save results
                    self.write_docs_to_file(matches, save_path, verbose=verbose)
                # history is saved after every file so an interrupted run can resume
                processed.add(gz_path)
                self.save_process_history_dict(verbose=verbose)
                
        print("Finished filtering corpus")
        
    def process_file(self, filepath, verbose=True):
        """Scan a gzipped file line by line and return the docs whose streamid is wanted
        
        Returns a list of docs, each doc being the list of its lines. Lines are buffered 
        until a closing doc tag is seen; the buffer is kept if the first streamid line 
        inside it named a streamid in self.streamids.
        """
        if verbose:
            print("Processing file at: " + str(filepath))
        matching_docs = []
        doc_buffer = []
        found_sid = False
        save_doc = False
        with gzip.open(filepath, 'rt') as f:
            for line in f:
                doc_buffer.append(line)  # add cur line to buffer
                if len(doc_buffer) == 1:
                    # first line of a buffer (normally the opening doc tag) is only stored,
                    # it is never inspected for a streamid or a closing tag
                    continue
                if not found_sid and self.is_tag(line, tag="streamid"):
                    found_sid = True
                    sid = self.get_inner_tag(line, tag="streamid", remove_whitespace=True)
                    save_doc = sid in self.streamids
                if self.is_tag(line, tag="doc", start_tag=False, end_tag=True):
                    if save_doc:  # if has matching streamid save doc file
                        matching_docs.append(doc_buffer)
                    # reset variables for the next doc
                    doc_buffer = []
                    save_doc = False
                    found_sid = False
        return matching_docs
                
                
    def is_tag(self, line, tag="streamid", start_tag=True, end_tag=True):
        """Check (ignoring case) whether line contains the start and/or end tag
        
        start_tag and end_tag both True: both tags must be present
        start_tag True only: the start tag must be present
        start_tag False: the end tag must be present
        """
        import re  # local import: the module is not among the imports visible at the top of the file
        start, end = self.create_tags(tag)
        # escape so the tag text is matched literally
        has_start = re.search(re.escape(start), line, re.IGNORECASE) is not None
        has_end = re.search(re.escape(end), line, re.IGNORECASE) is not None
        
        if not start_tag:
            return has_end
        if end_tag:
            return has_start and has_end
        return has_start
        
    def create_tags(self, tag):
        """Return the (start, end) markup tags for a tag name"""
        start_tag = "<" + tag + ">"
        end_tag = "</" + tag + ">"
        return start_tag, end_tag
            
    def get_inner_tag(self, line, tag="streamid", remove_whitespace=False):
        """Return line with the start and end tags removed (any letter case)
        
        With remove_whitespace the result is also stripped of surrounding whitespace.
        """
        import re  # local import: the module is not among the imports visible at the top of the file
        start_tag, end_tag = self.create_tags(tag)
        tag_pattern = re.escape(start_tag) + "|" + re.escape(end_tag)
        no_tags = re.sub(tag_pattern, '', line, flags=re.IGNORECASE)
        if remove_whitespace:
            no_tags = no_tags.strip()
        return no_tags
        
    def save_process_history_dict(self, verbose=True):
        """Pickle the process history to self.proc_history_path"""
        with open(self.proc_history_path, 'wb') as handle:
            pickle.dump(self.proc_history, handle, protocol=pickle.HIGHEST_PROTOCOL)
            if verbose:
                print("saved proc_history")
        
    def load_process_history_dict(self, topic_ids):
        """Load the process history if a pickle exists (returns True), else create it (returns False)"""
        if os.path.exists(self.proc_history_path):
            # NOTE: pickle.load can execute arbitrary code; only use history files written by this class
            with open(self.proc_history_path, 'rb') as handle:
                self.proc_history = pickle.load(handle)
            return True
        self.proc_history = self.create_process_history_dict(topic_ids)
        return False
        
    def create_process_history_dict(self, topic_ids):
        """Create a dictionary to keep track of what files have already been searched"""
        # sets have faster membership tests than lists
        return {int(topic_id): set() for topic_id in topic_ids}

                
    def get_file_save_path(self, topic_id, gz_path):
        """Return where the filtered version of gz_path is saved for topic_id"""
        filename = self.get_filename_from_gz_path(gz_path)
        save_path = self.save_dir + '/' + str(topic_id) + '/' + filename
        return save_path
        
                
    def get_filename_from_gz_path(self, gz_path):
        """Return the last '/'-separated component of gz_path, file extension included"""
        return gz_path.rsplit("/", 1)[-1]
                

    def write_docs_to_file(self, doc_list, save_path, no_soup=False, verbose=True):
        """Write docs to a gzipped text file at save_path; nothing is written for an empty doc_list
        
        no_soup True: each doc is a list of lines; docs are wrapped in a single html element
        no_soup False: each doc is converted with str() and docs are joined by newlines
        """
        if len(doc_list) == 0:  # don't write empty files
            return
        # transform docs into string
        if no_soup:
            parts = ["<html>\n"]
            for doc in doc_list:
                parts.append("".join(map(str, doc)))
                parts.append("\n")
            parts.append("</html>")
            out = "".join(parts)
        else:
            out = "\n".join(map(str, doc_list))
        # write
        with gzip.open(save_path, "wt") as f:
            f.write(out)
            if verbose:
                print("File written to: " + str(save_path))
        
            
    def retrieve_matching_docs(self, markup, verbose=False):
        """Retrieve docs with matching streamids from markup"""
        matches = []
        doc_count = 0
        for doc in markup.find_all("doc"):
            doc_count += 1
            sid_tag = doc.find("streamid")
            if sid_tag is None:  # doc without a streamid element cannot match
                continue
            if str(sid_tag.string) in self.streamids:  # matching doc
                matches.append(doc)
        if verbose:
            print("doc count: " + str(doc_count) + "\nmatch_count: " + str(len(matches)))
        return matches
    
    def get_streamids(self, path):
        """Add the streamids found in the 'update_id' column of the tsv at path to self.streamids
        
        Returns the accumulated set self.streamids.
        """
        # read tsv file (sep given by keyword: recent pandas no longer accepts it positionally)
        updates_csv = pd.read_csv(path, sep="\t")
        # take column with streamids
        self.streamids.update(self.parse_streamid(updateid) for updateid in updates_csv['update_id'])
        return self.streamids
        
    def parse_streamid(self, updateid):
        """Convert updateid in format: epoch-docid-sentid into epoch-docid"""
        # everything before the last hyphen; empty string when there is no hyphen
        return updateid.rpartition("-")[0]
    
    def create_dir(self, dir_path):
        """Create dir_path (and missing parents) if it does not exist yet"""
        if not os.path.exists(dir_path):
            os.makedirs(dir_path, exist_ok=True)
            print("Created new directory at " + str(dir_path))

In [129]:
# Build the filter and run it over every topic folder. With force_reload=False,
# source files already recorded in the process history are skipped, so the cell
# can be re-run to resume an interrupted pass.
trectsfilter = TrectsFilter()
trectsfilter.create_filtered_dataset(verbose=True, force_reload=False, no_soup=True)

Attempted to find streamids in these files, but no path exists:
['/nfs/TemporalSummarization/ts13/results/updates_sampled.extended.tsv', '/nfs/TemporalSummarization/ts15/results/updates_sampled.extended.tsv']

Number of streamids searching for: 53141


HBox(children=(IntProgress(value=0, max=45), HTML(value='')))

Previously processed 241 of 241 paths for topic 1


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 2


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 3


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 4


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 5


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 6


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 8


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 9


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 10


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 435 of 435 paths for topic 11


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 649 of 649 paths for topic 12


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 313 of 313 paths for topic 13


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 126 of 126 paths for topic 14


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 15


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 175 of 175 paths for topic 16


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 97 of 97 paths for topic 17


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 505 of 505 paths for topic 18


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 337 of 337 paths for topic 19


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 313 of 313 paths for topic 20


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 241 of 241 paths for topic 21


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 264 of 264 paths for topic 22


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 432 of 432 paths for topic 23


HBox(children=(IntProgress(value=1, bar_style='info', max=1), HTML(value='')))


Previously processed 106 of 288 paths for topic 24
Processing topic 24


HBox(children=(IntProgress(value=0, max=182), HTML(value='')))

Processing file at: /nfs/trects-kba2014/24/2013-02-14-16.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-14-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-16-05.gz
len matches: 60
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-16-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-14-19.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-14-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-18-11.gz
len matches: 33
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-18-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-15-15.gz
len matches: 326
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-15-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-16-19.gz
len matches: 62
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-16-19.gz
saved proc_history
Proc

len matches: 41
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-08-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-16-22.gz
len matches: 59
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-16-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-16-11.gz
len matches: 37
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-16-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-12-00.gz
len matches: 20
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-12-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-14-12.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-14-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-17-12.gz
len matches: 26
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-17-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-13-08.gz
len 

len matches: 36
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-12-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-18-08.gz
len matches: 27
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-18-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-08-23.gz
len matches: 13
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-08-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-14-01.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-14-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-16-16.gz
len matches: 77
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-16-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-18-10.gz
len matches: 31
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-18-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-12-17.gz
len m

len matches: 55
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-13-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-11-06.gz
len matches: 33
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-11-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-18-13.gz
len matches: 40
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-18-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-13-22.gz
len matches: 24
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-13-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-13-10.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-13-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-17-10.gz
len matches: 37
File written to: /nfs/mine-trects-kba2014-filtered/24/2013-02-17-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/24/2013-02-10-23.gz
len m

HBox(children=(IntProgress(value=0, max=248), HTML(value='')))

Processing file at: /nfs/trects-kba2014/25/2013-02-07-19.gz
len matches: 130
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-07-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-12-19.gz
len matches: 20
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-12-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-05-13.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-05-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-07-10.gz
len matches: 18
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-07-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-11-03.gz
len matches: 24
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-11-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-04-05.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-04-05.gz
saved proc_history
Proce

len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-04-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-03-09.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-03-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-05-09.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-05-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-06-01.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-04-23.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-04-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-09-08.gz
len matches: 32
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-09-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-11-13.gz
len matches: 53
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-11

len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-09-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-03-14.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-03-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-12-23.gz
len matches: 55
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-12-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-10-12.gz
len matches: 17
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-10-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-07-02.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-07-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-06-13.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-08-04.gz
len matches: 38
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-

len matches: 42
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-10-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-04-04.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-04-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-11-11.gz
len matches: 23
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-11-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-03-06.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-03-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-09-03.gz
len matches: 21
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-09-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-04-07.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-04-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-10-00.gz
len mat

len matches: 44
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-10-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-12-15.gz
len matches: 13
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-12-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-11-17.gz
len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-11-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-08-02.gz
len matches: 63
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-08-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-10-09.gz
len matches: 46
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-10-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-04-08.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/25/2013-02-04-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/25/2013-02-07-05.gz
len 

HBox(children=(IntProgress(value=0, max=361), HTML(value='')))

Processing file at: /nfs/trects-kba2014/26/2013-01-26-13.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-26-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-31-06.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-21-14.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-21-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-20-14.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-20-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-26-10.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-26-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-26-12.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-26-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-25-19.gz
len matches: 4
File 

len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-29-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-23-13.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-23-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-21-01.gz
len matches: 38
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-21-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-22-00.gz
len matches: 30
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-22-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-18-02.gz
len matches: 176
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-18-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-16-08.gz
len matches: 22
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-16-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-22-18.gz
len m

len matches: 124
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-18-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-27-06.gz
len matches: 33
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-27-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-27-16.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-27-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-21-04.gz
len matches: 30
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-21-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-24-08.gz
len matches: 23
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-24-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-19-17.gz
len matches: 60
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-19-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-30-15.gz
len 

len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-23-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-26-19.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-26-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-24-15.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-24-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-17-23.gz
len matches: 41
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-17-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-19-09.gz
len matches: 33
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-19-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-18-05.gz
len matches: 56
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-18-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-24-02.gz
len ma

len matches: 26
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-27-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-26-02.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-26-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-25-06.gz
len matches: 21
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-25-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-27-01.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-27-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-21-02.gz
len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-21-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-29-00.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-29-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-30-23.gz
len mat

len matches: 29
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-28-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-29-16.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-29-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-26-21.gz
len matches: 13
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-26-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-21-03.gz
len matches: 40
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-21-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-25-08.gz
len matches: 32
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-25-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-21-07.gz
len matches: 45
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-21-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-20-01.gz
len m

len matches: 108
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-18-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-16-19.gz
len matches: 76
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-16-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-30-01.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-28-17.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-28-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-19-18.gz
len matches: 40
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-19-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-21-09.gz
len matches: 17
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-01-21-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/26/2013-01-17-13.gz
len matches: 72
File written to: /nfs/mine-trects-kba2014-filtered/26/2013-0

HBox(children=(IntProgress(value=0, max=145), HTML(value='')))

Processing file at: /nfs/trects-kba2014/27/2012-10-30-04.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-30-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-28-14.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-28-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-31-18.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-31-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-27-13.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-27-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-29-23.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-29-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-27-10.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-27-10.gz
saved proc_history
Processi

len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-29-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-27-03.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-29-05.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-29-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-11-01-11.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-11-01-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-11-01-05.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-11-01-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-30-22.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-30-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-29-08.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-29-0

len matches: 16
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-31-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-31-07.gz
len matches: 25
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-31-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-29-10.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-29-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-29-03.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-10-29-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-11-01-04.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-11-01-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-11-01-12.gz
len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/27/2012-11-01-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/27/2012-10-30-17.gz
len mat

HBox(children=(IntProgress(value=0, max=168), HTML(value='')))

Processing file at: /nfs/trects-kba2014/28/2013-04-30-13.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-30-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-27-15.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-27-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-28-05.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-28-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-24-18.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-24-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-25-01.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-25-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-29-05.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-29-05.gz
saved proc_history
Processing

len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-28-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-25-16.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-25-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-29-18.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-29-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-28-14.gz
len matches: 13
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-28-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-26-07.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-26-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-24-03.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-24-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-25-04.gz
len match

len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-29-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-25-00.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-25-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-30-05.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-30-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-26-23.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-26-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-27-10.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-27-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-26-05.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-24-17.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-24-1

len matches: 13
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-25-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-30-12.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-30-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-30-08.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-30-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-25-05.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-25-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-26-02.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-26-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-30-23.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/28/2013-04-30-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/28/2013-04-29-10.gz
len match

HBox(children=(IntProgress(value=0, max=241), HTML(value='')))

Processing file at: /nfs/trects-kba2014/29/2013-02-25-17.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-25-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-26-21.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-26-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-26-02.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-24-18.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-02-19.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-02-01.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-27-04.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-25-21.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-25-21.gz
saved proc_history
Processing file at: /nfs/trects-kba

len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-02-12.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-02-23.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-01-22.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-22-20.gz
len matches: 22
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-22-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-25-23.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-25-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-03-05.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-25-06.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-28-06.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-26-04.gz
len matches

len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-23-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-26-07.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-22-08.gz
len matches: 33
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-22-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-26-08.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-24-09.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-24-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-28-16.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-28-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-01-18.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-03-01-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/201

len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-24-00.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-24-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-03-00.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-26-19.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-26-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-23-22.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-23-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-03-02-08.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-28-02.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-24-21.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-25-20.gz
len matches: 0
saved proc_history
P

len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/29/2013-02-22-01.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/29/2013-02-22-01.gz
saved proc_history

Created new directory at /nfs/mine-trects-kba2014-filtered/30
Previously processed 0 of 241 paths for topic 30
Processing topic 30


HBox(children=(IntProgress(value=0, max=241), HTML(value='')))

Processing file at: /nfs/trects-kba2014/30/2012-03-07-10.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-07-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-05-06.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-05-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-06-08.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-06-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-14-05.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-12-23.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-09-19.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-09-20.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-05-15.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/30/

len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-09-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-04-21.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-04-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-08-15.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-08-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-06-15.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-06-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-08-16.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-08-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-05-03.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-05-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-06-03.gz
len matche

len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-06-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-08-14.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-08-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-08-21.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-08-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-08-03.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-08-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-08-20.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-13-14.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-13-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-05-18.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-05-

len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-09-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-10-21.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-10-03.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-11-21.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-11-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-11-06.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-13-01.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-13-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-05-19.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-05-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-12-16.gz
len matches: 0
saved proc_history
Processing file at: /nfs

len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-08-02.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-08-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-07-16.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-07-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-07-07.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-13-23.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-13-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-07-01.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-07-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012-03-09-07.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/30/2012-03-09-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/30/2012

HBox(children=(IntProgress(value=0, max=119), HTML(value='')))

Processing file at: /nfs/trects-kba2014/31/2012-08-03-18.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-08-03-02.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-08-01-22.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/31/2012-08-01-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-07-31-16.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-07-31-11.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/31/2012-07-31-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-08-02-18.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-08-01-13.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/31/2012-08-01-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-08-01-18.gz
len matches: 19
File written to: /nfs/mine-trects-kba2014-filtered/31

len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-08-03-11.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-07-30-02.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-07-31-03.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-08-03-10.gz
len matches: 24
File written to: /nfs/mine-trects-kba2014-filtered/31/2012-08-03-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-07-30-21.gz
len matches: 56
File written to: /nfs/mine-trects-kba2014-filtered/31/2012-07-30-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-07-31-01.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-07-31-13.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/31/2012-07-31-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/31/2012-08-01-06.gz
len matches: 1
File written to: /

HBox(children=(IntProgress(value=0, max=457), HTML(value='')))

Processing file at: /nfs/trects-kba2014/32/2012-09-11-08.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-11-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-28-14.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-21-08.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-11-06.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-11-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-22-16.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-15-07.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-15-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-22-11.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-22-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-23-19.g

len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-12-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-14-21.gz
len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-14-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-14-05.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-14-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-28-09.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-28-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-23-08.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-25-18.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-25-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-13-00.gz
len matches: 24
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-13

len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-15-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-16-01.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-16-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-23-03.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-21-01.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-21-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-29-09.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-29-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-20-18.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-20-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-13-10.gz
len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-13-

len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-25-10.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-21-23.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-21-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-18-02.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-22-00.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-19-06.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-19-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-11-12.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-11-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-28-21.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-28-11.gz
len matches: 0
saved proc_history
P

len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-27-20.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-20-07.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-20-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-24-12.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-23-22.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-23-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-11-11.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-11-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-27-01.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-19-04.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-19-04.gz
saved proc_history
Processing file at: /nfs

len matches: 25
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-12-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-20-00.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-12-11.gz
len matches: 25
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-12-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-12-14.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-12-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-28-00.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-28-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-19-13.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-26-12.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-26-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/20

len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-19-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-12-13.gz
len matches: 30
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-12-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-25-21.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-25-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-28-03.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-28-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-21-22.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-26-11.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-26-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-11-19.gz
len matches: 44
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-11

len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-20-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-22-14.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-21-18.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-21-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-11-07.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-11-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-16-12.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-16-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-28-16.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012-09-22-10.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/32/2012-09-22-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/32/2012

HBox(children=(IntProgress(value=0, max=217), HTML(value='')))

Processing file at: /nfs/trects-kba2014/33/2013-01-10-07.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-15-14.gz
len matches: 24
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-15-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-14-20.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-14-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-15-10.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-15-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-11-14.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-11-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-18-07.gz
len matches: 82
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-18-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-17-00.gz
len matches: 103
Fi

len matches: 84
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-18-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-13-08.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-13-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-16-22.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-16-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-11-05.gz
len matches: 15
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-11-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-11-12.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-11-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-10-11.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-10-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-13-03.gz
len mat

len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-16-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-16-09.gz
len matches: 38
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-16-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-13-20.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-13-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-15-04.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-18-21.gz
len matches: 77
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-18-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-16-12.gz
len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-16-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-12-09.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-1

len matches: 29
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-15-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-17-10.gz
len matches: 54
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-17-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-11-11.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-11-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-14-09.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-14-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-12-06.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-10-05.gz
len matches: 88
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-10-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-10-06.gz
len matches: 93
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-

len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-12-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-15-01.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-11-00.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-10-18.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-10-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-12-18.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-12-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-12-05.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-12-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/2013-01-11-07.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/33/2013-01-11-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/33/20

HBox(children=(IntProgress(value=0, max=97), HTML(value='')))

Processing file at: /nfs/trects-kba2014/34/2013-02-19-13.gz
len matches: 27
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-19-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-17-04.gz
len matches: 47
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-17-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-17-05.gz
len matches: 62
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-17-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-19-10.gz
len matches: 21
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-19-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-17-21.gz
len matches: 29
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-17-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-20-00.gz
len matches: 15
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-20-00.gz
saved proc_history
Proc

len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-18-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-16-06.gz
len matches: 56
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-16-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-17-23.gz
len matches: 37
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-17-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-19-06.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-19-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-17-01.gz
len matches: 51
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-17-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-18-14.gz
len matches: 26
File written to: /nfs/mine-trects-kba2014-filtered/34/2013-02-18-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/34/2013-02-17-08.gz
len m

HBox(children=(IntProgress(value=0, max=121), HTML(value='')))

Processing file at: /nfs/trects-kba2014/35/2013-04-17-23.gz
len matches: 58
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-17-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-19-10.gz
len matches: 42
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-19-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-16-01.gz
len matches: 119
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-16-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-18-11.gz
len matches: 34
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-18-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-18-00.gz
len matches: 50
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-18-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-15-13.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-15-13.gz
saved proc_history
Proc

len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-15-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-18-21.gz
len matches: 48
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-18-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-18-18.gz
len matches: 20
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-18-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-18-07.gz
len matches: 62
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-18-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-16-05.gz
len matches: 61
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-16-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-19-13.gz
len matches: 77
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-19-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-15-19.gz
len m

len matches: 73
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-19-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-15-23.gz
len matches: 157
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-15-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-15-01.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-15-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-17-16.gz
len matches: 127
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-17-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-17-13.gz
len matches: 56
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-17-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-17-01.gz
len matches: 67
File written to: /nfs/mine-trects-kba2014-filtered/35/2013-04-17-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/35/2013-04-18-03.gz
len

HBox(children=(IntProgress(value=0, max=121), HTML(value='')))

Processing file at: /nfs/trects-kba2014/36/2013-03-19-00.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-19-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-20-05.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-20-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-20-00.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-20-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-22-12.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-22-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-23-20.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-23-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-20-21.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-20-21.gz
saved proc_history
Processing

len matches: 20
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-19-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-20-17.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-20-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-20-18.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-20-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-22-20.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-22-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-23-15.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-23-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-23-11.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-23-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-19-01.gz
len match

len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-21-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-22-02.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-22-11.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-21-04.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-21-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-21-01.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-21-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-21-07.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-21-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013-03-21-05.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/36/2013-03-21-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/36/2013

HBox(children=(IntProgress(value=0, max=169), HTML(value='')))

Processing file at: /nfs/trects-kba2014/37/2011-12-29-12.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/37/2011-12-29-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-03-10.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-03-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2011-12-29-20.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2011-12-29-10.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/37/2011-12-29-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-01-08.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-01-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2011-12-30-22.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/37/2011-12-30-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2011-12-29-18.gz
len matches: 3
File w

len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/37/2011-12-31-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2011-12-30-17.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/37/2011-12-30-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-01-23.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-01-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-04-07.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-04-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-03-11.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-03-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-04-20.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-04-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-04-18.gz
len matche

len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-03-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-03-08.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-03-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-03-18.gz
len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-03-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-02-01.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-02-08.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-02-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2011-12-31-22.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/37/2011-12-31-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-01-03.gz
len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-0

len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-04-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-03-20.gz
len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-03-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-01-07.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-01-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-02-03.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-02-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2011-12-30-09.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/37/2011-12-30-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2012-01-02-16.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/37/2012-01-02-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/37/2011-12-31-08.gz
len matc

HBox(children=(IntProgress(value=0, max=217), HTML(value='')))

Processing file at: /nfs/trects-kba2014/38/2013-04-06-05.gz
len matches: 16
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-06-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-06-03.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-06-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-06-22.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-06-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-11-22.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-11-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-10-18.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-04-16.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-04-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-08-03.gz
len matches: 0
save

len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-07-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-09-14.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-09-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-12-14.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-12-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-09-07.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-04-05.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-11-04.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-11-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-05-22.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-05-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013

len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-07-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-11-09.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-11-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-07-23.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-07-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-09-00.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-07-17.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-07-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-08-14.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-08-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-08-12.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013

len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-05-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-09-19.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-12-18.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-12-09.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-10-11.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-10-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-05-10.gz
len matches: 13
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-05-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-09-05.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/38/2013-04-08-18.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/38/2013-04-08-18.gz
saved proc_history
Processing file at: /nf

HBox(children=(IntProgress(value=0, max=97), HTML(value='')))

Processing file at: /nfs/trects-kba2014/39/2013-02-01-05.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-02-23.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-02-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-04-05.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-04-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-02-16.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-02-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-03-17.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-03-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-01-15.gz
len matches: 17
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-01-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-04-14.gz
len matches: 8
File 

len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-04-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-03-16.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-03-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-02-17.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-02-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-01-16.gz
len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-01-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-02-22.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-02-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-01-00.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/39/2013-02-04-21.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/39/2013-02-04

HBox(children=(IntProgress(value=0, max=121), HTML(value='')))

Processing file at: /nfs/trects-kba2014/40/2011-12-22-17.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-22-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-25-12.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-25-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-24-05.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-24-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-22-01.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-22-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-26-04.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-26-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-24-22.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-24-22.gz
saved proc_history
Processin

len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-22-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-22-00.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-22-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-22-20.gz
len matches: 21
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-22-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-22-03.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-22-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-24-19.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-24-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-26-02.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-26-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-24-11.gz
len matc

len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-23-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-21-21.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-21-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-22-13.gz
len matches: 16
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-22-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-26-05.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-24-15.gz
len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-24-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-23-20.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-23-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/40/2011-12-26-11.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/40/2011-12-2

HBox(children=(IntProgress(value=0, max=241), HTML(value='')))

Processing file at: /nfs/trects-kba2014/41/2013-01-15-14.gz
len matches: 24
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-15-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-21-14.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-21-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-20-14.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-20-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-23-07.gz
len matches: 31
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-23-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-19-12.gz
len matches: 60
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-19-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-23-05.gz
len matches: 26
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-23-05.gz
saved proc_history
Proce

len matches: 17
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-24-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-18-10.gz
len matches: 68
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-18-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-17-09.gz
len matches: 58
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-17-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-15-17.gz
len matches: 16
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-15-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-24-00.gz
len matches: 36
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-24-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-22-01.gz
len matches: 28
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-22-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-16-15.gz
len 

len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-24-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-17-23.gz
len matches: 41
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-17-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-19-09.gz
len matches: 33
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-19-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-18-05.gz
len matches: 56
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-18-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-24-02.gz
len matches: 30
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-24-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-22-14.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-22-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-18-03.gz
len 

len matches: 79
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-18-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-19-16.gz
len matches: 88
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-19-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-16-13.gz
len matches: 18
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-16-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-20-08.gz
len matches: 22
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-20-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-18-08.gz
len matches: 97
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-18-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-20-11.gz
len matches: 20
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-20-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-19-21.gz
len 

len matches: 41
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-24-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-15-21.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-15-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-20-07.gz
len matches: 28
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-20-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-16-23.gz
len matches: 27
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-16-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-21-00.gz
len matches: 27
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-21-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-22-17.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/41/2013-01-22-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/41/2013-01-18-12.gz
len m

HBox(children=(IntProgress(value=0, max=121), HTML(value='')))

Processing file at: /nfs/trects-kba2014/42/2013-02-12-19.gz
len matches: 20
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-12-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-13-13.gz
len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-13-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-11-03.gz
len matches: 24
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-11-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-12-22.gz
len matches: 66
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-12-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-10-14.gz
len matches: 59
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-10-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-13-16.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-13-16.gz
saved proc_history
Proce

len matches: 17
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-10-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-10-07.gz
len matches: 17
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-10-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-10-04.gz
len matches: 41
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-10-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-10-02.gz
len matches: 18
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-10-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-11-20.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-11-20.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-11-01.gz
len matches: 34
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-11-01.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-14-03.gz
len m

len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-11-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-13-09.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-13-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-10-09.gz
len matches: 46
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-10-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-13-19.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-13-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-13-02.gz
len matches: 31
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-13-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-12-09.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/42/2013-02-12-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/42/2013-02-11-14.gz
len ma

HBox(children=(IntProgress(value=0, max=121), HTML(value='')))

Processing file at: /nfs/trects-kba2014/43/2013-01-21-14.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-21-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-20-14.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-20-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-19-12.gz
len matches: 60
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-19-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-19-14.gz
len matches: 89
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-19-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-19-04.gz
len matches: 34
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-19-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-21-13.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-21-13.gz
saved proc_history
Proce

len matches: 33
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-19-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-18-05.gz
len matches: 56
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-18-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-18-03.gz
len matches: 38
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-18-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-18-04.gz
len matches: 42
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-18-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-18-00.gz
len matches: 80
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-18-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-18-23.gz
len matches: 52
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-18-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-19-11.gz
len 

len matches: 55
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-17-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-20-09.gz
len matches: 33
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-20-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-18-06.gz
len matches: 73
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-18-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-20-07.gz
len matches: 28
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-20-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-21-00.gz
len matches: 27
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-21-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-18-12.gz
len matches: 108
File written to: /nfs/mine-trects-kba2014-filtered/43/2013-01-18-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/43/2013-01-19-18.gz
len

HBox(children=(IntProgress(value=0, max=121), HTML(value='')))

Processing file at: /nfs/trects-kba2014/44/2012-04-13-00.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-13-00.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-12-14.gz
len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-12-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-14-13.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-14-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-15-18.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-15-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-13-17.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-13-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-15-03.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-15-03.gz
saved proc_history
Processin

len matches: 27
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-11-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-14-17.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-12-04.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-12-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-12-22.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-12-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-13-09.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-13-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-12-06.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-12-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-15-09.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-15-

len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-14-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-11-17.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-11-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-15-04.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-15-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-13-10.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-13-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-15-21.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-15-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-11-10.gz
len matches: 40
File written to: /nfs/mine-trects-kba2014-filtered/44/2012-04-11-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/44/2012-04-14-09.gz
len matc

HBox(children=(IntProgress(value=0, max=241), HTML(value='')))

Processing file at: /nfs/trects-kba2014/45/2012-11-06-21.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-06-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-30-04.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-30-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-28-14.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-28-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-31-18.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-31-18.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-04-18.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-29-23.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-29-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-07-03.gz
len matches: 0
save

len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-30-10.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-31-17.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-31-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-05-15.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-05-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-28-19.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-28-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-04-09.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-03-13.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-03-13.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-30-07.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-30-0

len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-02-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-03-15.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-28-22.gz
len matches: 13
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-28-22.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-28-15.gz
len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-28-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-29-14.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-29-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-04-13.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-02-07.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-28-23.gz
len matches: 6
File written to: /nfs/mine-trects-kba2014

len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-04-03.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-01-12.gz
len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-01-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-30-17.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-30-17.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-29-16.gz
len matches: 41
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-29-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-30-03.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-30-03.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-01-21.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-01-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/20

len matches: 10
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-30-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-02-23.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-02-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-10-30-02.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-10-30-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-03-23.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-03-23.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-01-16.gz
len matches: 27
File written to: /nfs/mine-trects-kba2014-filtered/45/2012-11-01-16.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-06-15.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/2012-11-04-11.gz
len matches: 0
saved proc_history
Processing file at: /nfs/trects-kba2014/45/20

HBox(children=(IntProgress(value=0, max=121), HTML(value='')))

Processing file at: /nfs/trects-kba2014/46/2012-09-11-08.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-11-08.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-11-06.gz
len matches: 1
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-11-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-15-07.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-15-07.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-15-05.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-15-05.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-15-15.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-15-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-12-10.gz
len matches: 9
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-12-10.gz
saved proc_history
Processin

len matches: 11
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-15-21.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-12-09.gz
len matches: 14
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-12-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-14-04.gz
len matches: 3
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-14-04.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-15-06.gz
len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-15-06.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-14-11.gz
len matches: 23
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-14-11.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-11-15.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-11-15.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-13-16.gz
len mat

len matches: 8
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-15-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-14-14.gz
len matches: 4
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-14-14.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-15-09.gz
len matches: 12
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-15-09.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-12-19.gz
len matches: 7
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-12-19.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-15-12.gz
len matches: 2
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-15-12.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-11-02.gz
len matches: 5
File written to: /nfs/mine-trects-kba2014-filtered/46/2012-09-11-02.gz
saved proc_history
Processing file at: /nfs/trects-kba2014/46/2012-09-12-08.gz
len match

## Controlling Corpus Generation and Loading

In [889]:
class CorpusGenerator:
    def __init__(self, proj_dir, corpus_split_step=200, embedding_split_step=50, load_default_emb_model=True):
        """Set up path handling, split sizes and (optionally) the default embedding model.

        Parameters:
            proj_dir: project directory; stored on self.proj_dir and handed to FilePathHandler

            corpus_split_step: split step used when generating/looking up "corpus" files

            embedding_split_step: number of corpus_df rows covered by each "embeddings" file

            load_default_emb_model: if True, load the 'distilbert-base-nli-stsb-mean-tokens'
                                    SentenceTransformer as the default embedding model. If False,
                                    self.emb_model_dict is left as None and a model must be supplied
                                    later (e.g. through generate(emb_model_dict=...))
        """
        self.proj_dir = proj_dir
        self.path_handler = FilePathHandler(proj_dir)
        self.file_purposes = ["topics", "corpus", "nuggets", "embeddings", "labels"]
        self.corpus_split_step = corpus_split_step
        self.embedding_split_step = embedding_split_step
        self.topic_dfs = {}  # dict of topic dfs per corpus_name
        self.force_reload_options = ["topics", "corpus", "nuggets", "embeddings", "labels"]
        self.file_type_options = [".csv.gz", ".hdf"]
        # Always define the attribute so that later reads fail in a controlled way
        # instead of raising AttributeError when no default model was loaded.
        self.emb_model_dict = None
        if load_default_emb_model:
            default_model_name = 'distilbert-base-nli-stsb-mean-tokens'
            self.emb_model_dict = {"model": SentenceTransformer(default_model_name),
                                   "name": default_model_name}
        
    def generate(self, file_type=".hdf", selection=None, corpus_names=None, new_corpuses=None, 
            force_reload=False, save=True, verbose=True, emb_model_dict=None):
        """
        Parameters:
            file_type: output file type for dataframes, can be either "csv" or "hdf"
            
            force_reload: force generate new dataframes if files already exist
                options: True, False or list of selection i.e. ["topics", "corpus", "nuggets", "embeddings", "labels"]
            
            emb_model_dict: a dictionary containing a model to generate sentence embeddings with .encode(),
                            and a name to act as an identifier in a filename. If None, use default defined
                            in __init__() method
        """
        # interpret force_reload input
        if type(force_reload) is not list:
            if type(force_reload) is bool:
                if force_reload == True:
                    force_reload = self.force_reload_options
                else:
                    force_reload = []  # empty list, no chosen selection
        else:
            for select in force_reload:
                if select not in self.force_reload_options:
                    raise ValueError(str(select) + " is not a force_reload option")
        # interpret file_type input
        if file_type not in self.file_type_options:
            raise Exception(str(file_type) + " is not a valid file_type option to save dataframes")
        
        if verbose:
            if len(force_reload) != 0:
                print("force reloading the following selection: " + str(force_reload))
        
        if emb_model_dict is not None:  # replace default emb_model
            self.emb_model_dict = emb_model_dict
        
        # add new corpuses to load
        if new_corpuses is not None:
            for new_corpus in new_corpuses:
                self.path_handler.add_corpus_source(new_corpus, overwrite=True)
        # get corpus paths to load from (if corpus_names is None loads all)
        self.corpus_sources = self.path_handler.get_corpus_sources(corpus_names=corpus_names)
        corpus_names = self.corpus_sources.keys()
        
        if selection is None:  # if none do all
            selection = self.file_purposes
        
        for corpus_name in corpus_names:
            print("corpus_name generate loop: " + str(corpus_name))
            if "topics" in selection:
                # create topic_df for corpus_name
                print("Generating topic_df")
                topic_reload = self.check_force_reload(corpus_name, "topics", file_type, force_reload, change_paths=True,
                                                      verbose=verbose)
                self.load_topic_df_control(corpus_name, file_type=file_type, save=save, force_reload=topic_reload, 
                                        verbose=verbose, add_path=True)
                
            if "corpus" in selection:
                # create corpus df csvs
                print("Generating corpus_dfs")
                corp_reload = self.check_force_reload(corpus_name, "corpus", file_type, force_reload, change_paths=True,
                                                      split_step=self.corpus_split_step, verbose=verbose)
                self.corpus_splitter(corpus_name, file_type, force_reload=corp_reload, verbose=False)
                
            if "nuggets" in selection:
                print("Generating nugget files")
                nug_reload = self.check_force_reload(corpus_name, "nuggets", file_type, force_reload, change_paths=True,
                                                    verbose=verbose)
                self.nuggets_generator(corpus_name, file_type, force_reload=nug_reload, verbose=verbose)
                
            if "embeddings" in selection:
                print("Generating embedding files")
                nested_dir = self.emb_model_dict['name']
                embed_reload = self.check_force_reload(corpus_name, "embeddings", file_type, force_reload, change_paths=True,
                                                       nested_dir=nested_dir, split_step=self.embedding_split_step,
                                                       verbose=verbose)
                self.embedding_generator(corpus_name, file_type, force_reload=force_reload, verbose=verbose)
            if "labels" in selection:
                label_gen = EmbeddingLabelGenerator(self.proj_dir)
                # get each nested_dir/embedding type to load labels for
                nested_dirs = self.path_handler.load_path_df_slice(corpus_name, "embeddings", exists=True,
                                                                  file_type=file_type)
                nested_dirs = nested_dirs['nested_dir'].unique()
                
                for nested_dir in nested_dirs:
                    label_gen.add_cosine_label(corpus_name, nested_dir, verbose=verbose, force_reload=force_reload,
                                              save=save, split_step=self.embedding_split_step, file_type=file_type)
                
            
        print("Finished generating files")
        
    def check_force_reload(self, corpus_name, select, file_type, force_reload, change_paths=True, 
                           inst_identifier=None, nested_dir=None, split_step=None, verbose=True):
        if select in force_reload:
            if change_paths:
                self.change_force_reload_paths(corpus_name, select, file_type, inst_identifier=inst_identifier, 
                                              nested_dir=nested_dir, split_step=split_step)
                if verbose:
                    print("Changed paths for " + str(select) + " in " + str(corpus_name))
            return True
        else:
            return False
        
    def change_force_reload_paths(self, corpus_name, select, file_type, inst_identifier=None, nested_dir=None,
                                 split_step=None):
        # change path df paths to not exists if force_reload
        targ = self.path_handler.path_df
        targ = targ[targ['corpus_name'] == corpus_name]
        targ = targ[targ['file_purpose'] == select]
        targ = targ[targ['file_type'] == file_type]
        if inst_identifier is not None:
            targ = targ[targ['instance_identifier'] == inst_identifier]
        if nested_dir is not None:
            targ = targ[targ['nested_dir'] == nested_dir]
        if split_step is not None:
            targ = targ[targ['split_step'] == split_step]
        targ['exists'] = False  # set value for all items
        self.path_handler.save_path_df()
        
    def embedding_generator(self, corpus_name, file_type, force_reload=False, verbose=True):
        # if not exists load topics
        self.load_topic_df_control(corpus_name, force_reload=False, verbose=False, file_type=file_type)
        
        # create nested dir of description of embeddings
        nested_dir = self.emb_model_dict['name']
        
        # model to create sentence embeddings
        emb_model = self.emb_model_dict['model']
        
        topic_df = self.topic_dfs[corpus_name]
        topic_ids = list(topic_df['id'].unique())
        # create embedding dfs per topic
        for topic_id in tqdm_notebook(topic_ids, position=0, leave=True):
            # paths for corpus_df and nugget_df for topic
            corp_paths = self.load_corpus_paths_control(corpus_name, topic_id, file_type=file_type)
            nug_path = self.path_handler.load_path_df_slice(corpus_name, "nuggets", file_type=file_type,
                                                            split_identifier=topic_id, exists=True)
            if len(nug_path) != 1:
                raise Exception("There are " + str(len(nug_path)) + " nugget paths for topic " + str(topic_id))
            nug_path = list(nug_path['path'])[0]
            
            # load corpus and nuggets
            corpus_df = load_corpus(corp_paths, verbose=False)
            nugget_df = load_nugget_df(nug_path, verbose=False)

            
            # get emb paths for this topic
            emb_paths = self.path_handler.load_path_df_slice(corpus_name, "embeddings", exists=True, 
                                                instance_identifier=str(topic_id), nested_dir=nested_dir,
                                               split_step=self.embedding_split_step, file_type=file_type)
            
            # first step computed will be after this step
            # adding emb_split_step to this number will give us our start point  (e.g. 0)
            prev_split = 0 - self.embedding_split_step
            num_splits = 0
            
            if len(emb_paths) == 0 or force_reload:
                # prev_split unchanged, start from initial value
                if verbose:
                    print("Processing topic " + str(topic_id) + " from beginning")
            else:
                # check what has already been processed
                num_splits = list(emb_paths['num_splits'])[0]
                if len(emb_paths) == num_splits:  # no missing paths, already processed
                    continue
                else:  # missing paths, continue where left off
                    prev_split = max(list(map(int, list(emb_paths['split_identifier']))))
            
            
            # split every self.embedding_split_step
            split_indexes = []
            temp = prev_split + self.embedding_split_step
            while temp < len(corpus_df):
                split_indexes.append(temp)
                temp = split_indexes[-1] + self.embedding_split_step
                
            if num_splits == 0:  # no previous splits processed
                num_splits = len(split_indexes)
                
            for split_index in split_indexes:
                # make sure final index doesn't go over corpus length
                end_index = split_index + self.embedding_split_step
                if end_index > len(corpus_df):
                    end_index = len(corpus_df)
                # get desired section of corpus_df to pass through
                split_corpus_df = corpus_df[split_index:end_index]
                
                # create save path for resultant file
                save_path = self.path_handler.get_path(corpus_name, "embeddings", str(topic_id), file_type,
                                        split_identifier=split_index, num_splits=num_splits, 
                                        split_step=self.embedding_split_step, nested_dir=nested_dir, add_path=True)
                if verbose:
                    print("Loading topic " + str(topic_id) + " documents. " + str(split_index) 
                          + " - " + str(end_index) + " (Total: " + str(len(corpus_df)) + ")")
                
                # create embeddings for this section of corpus_df for this topic
                load_embeddings(save_path, emb_model=emb_model, corpus_df=split_corpus_df, nugget_df=nugget_df,
                               sents_default="splitlines", nlp=None, force_reload=force_reload, save=True,
                               verbose=verbose, path_handler=self.path_handler, only_docs_with_nugs=False)
            
            
    def nuggets_generator(self, corpus_name, file_type, force_reload=False, verbose=True):
        """Create one nugget file per topic of `corpus_name`.

        The topic_df is loaded if needed; then, for every unique topic id, the topic's
        corpus is loaded, a "nuggets" save path is requested from the path handler and
        load_nugget_df() is called with the corpus' nuggets/matches tsv paths to
        generate and save the nugget dataframe.

        Parameters:
            corpus_name: corpus to generate nugget files for
            file_type: file type used for looking up and saving dataframes
            force_reload: forwarded to load_nugget_df()
            verbose: forwarded to load_nugget_df()
        """
        # make sure the topic dataframe for this corpus is available
        self.load_topic_df_control(corpus_name, force_reload=False, verbose=False, file_type=file_type)

        # instance identifier that ends up in the saved file's name
        instance_name = "nuggets"

        unique_topic_ids = list(self.topic_dfs[corpus_name]['id'].unique())
        total_topics = len(unique_topic_ids)

        # one nuggets_df per topic
        for current_topic in tqdm_notebook(unique_topic_ids):
            print("nuggets_generator topic_id: " + str(current_topic))

            # corpus file paths for this topic, then the corpus itself
            topic_corpus_paths = self.load_corpus_paths_control(corpus_name, current_topic,
                                                                file_type=file_type)
            topic_corpus_df = load_corpus(topic_corpus_paths, save=False, force_reload=False,
                                          verbose=False, path_handler=self.path_handler)

            # where the nugget dataframe will be written
            destination = self.path_handler.get_path(corpus_name, "nuggets", instance_name, file_type,
                                                     split_identifier=str(current_topic),
                                                     num_splits=total_topics, add_path=True)

            source_info = self.corpus_sources[corpus_name]
            nuggets_tsv = source_info['nuggets_path']
            matches_tsv = source_info['matches_path']

            # generate nugget file
            load_nugget_df(destination, corpus_df=topic_corpus_df, topic_ids=[current_topic],
                           matches_tsv_path=matches_tsv, nuggets_tsv_path=nuggets_tsv, save=True,
                           force_reload=force_reload, verbose=verbose, path_handler=self.path_handler,
                           spacy_if_not_found=True)
            
            
                    
    def corpus_splitter(self, corpus_name, file_type, force_reload=False, verbose=True):
        """Create the split corpus_df files for every topic of ``corpus_name``.

        The raw files of a topic live in ``<dir_path>/<topic_id>``. They are processed in
        chunks of ``self.corpus_split_step`` files; each chunk is handed to ``load_corpus``
        with its own save path. A topic whose registered splits are all present is skipped,
        and a partially processed topic resumes from its highest registered split. Topics
        whose directory is missing or contains no files are removed from the topic_df
        (both the saved file and the cached copy in ``self.topic_dfs``).

        Args:
            corpus_name: key into ``self.topic_dfs`` and ``self.corpus_sources``.
            file_type: file type of the corpus files (and of the topics file that is
                rewritten when a topic is removed).
            force_reload: when truthy, every topic is processed from split 0 and the
                flag is forwarded to ``load_corpus``.
            verbose: print progress information and forward the flag to the helpers.
        """
        # if not exists load topics
        self.load_topic_df_control(corpus_name, force_reload=False, verbose=verbose, file_type=file_type)
        topic_df = self.topic_dfs[corpus_name]

        corpus_dir = self.corpus_sources[corpus_name]["dir_path"]
        if verbose:
            print("corpus_dir:" + str(corpus_dir))

        for topic_id in tqdm_notebook(topic_df['id'].unique()):
            # confirm dir exists
            t_dir = corpus_dir + '/' + str(topic_id)
            if not file_exists(t_dir):
                warnings.warn("Corpus loading path at " + t_dir + " does not exist. Removing from topic_df")
                topic_df = self.remove_topic_and_save(corpus_name, topic_df, topic_id,
                                                      file_type=file_type, verbose=verbose)
                # keep the cached topic_df in step with the file that was just saved
                self.topic_dfs[corpus_name] = topic_df
                continue

            # which splits of this topic are already registered as existing?
            t_df_paths = self.path_handler.load_path_df_slice(corpus_name, "corpus",
                                                              instance_identifier=str(topic_id), exists=True,
                                                              file_type=file_type,
                                                              split_step=self.corpus_split_step)
            start_split = 0
            num_splits = 0
            if len(t_df_paths) > 0 and not force_reload:
                # reuse the recorded num_splits so the same value goes into path_df
                num_splits = list(t_df_paths['num_splits'])[0]
                if len(t_df_paths) == num_splits:  # already fully processed
                    continue
                # resume from the highest split registered so far
                start_split = max(int(s) for s in t_df_paths['split_identifier'])

            num_files = len(search_dir(t_dir))
            if num_files == 0:
                warnings.warn("No files found in directory " + str(t_dir) + ". Removing " + str(topic_id)
                              + " from topic_df")
                # pass file_type so the topics file of the type in use is rewritten
                topic_df = self.remove_topic_and_save(corpus_name, topic_df, topic_id,
                                                      file_type=file_type, verbose=verbose)
                self.topic_dfs[corpus_name] = topic_df
                # nothing to split for a topic that was just removed
                continue

            # create split indexes to feed to load_corpus
            splits = [start_split]
            add = splits[-1] + self.corpus_split_step
            while add < num_files:
                splits.append(int(add))
                add = splits[-1] + self.corpus_split_step

            if start_split == 0:
                num_splits = int(len(splits))  # for inputting into path_df

            if verbose:
                print("creating corpus df for topic " + str(topic_id) + " starting at file no. "
                      + str(start_split) + " of " + str(num_files) + " splitting every "
                      + str(self.corpus_split_step) + " files")
            # create corpus_df files
            for split_num in splits:
                # get save path
                save_path = self.path_handler.get_path(corpus_name, "corpus", str(topic_id), file_type,
                                                       split_identifier=str(split_num), num_splits=num_splits,
                                                       split_step=self.corpus_split_step, add_path=True)

                load_corpus(save_path, corpus_dir=corpus_dir, topic_ids=[topic_id],
                            split_every=self.corpus_split_step, split_start_doc=split_num,
                            save=True, force_reload=force_reload,
                            verbose=verbose, path_handler=self.path_handler)
                
    def load_corpus_paths_control(self, corpus_name, topic_id, file_type=".csv.gz", verbose=False):
        """Return the saved corpus file paths of one topic, requiring every split to be present.

        Args:
            corpus_name: corpus whose path records are queried.
            topic_id: topic whose corpus files are wanted (used as instance identifier).
            file_type: preferred file type; when no record of that type exists, the
                first file type found among the records is used instead.
            verbose: print the selected path records before raising on an incomplete set.

        Returns:
            list of the ``path`` values of the selected records.

        Raises:
            Exception: if no corpus file is registered for the topic, or the number of
                registered files differs from the ``num_splits`` recorded for them.
        """
        corp_paths = self.path_handler.load_path_df_slice(corpus_name, "corpus", instance_identifier=str(topic_id),
                                                          exists=True, split_step=self.corpus_split_step)
        not_loaded_msg = "Corpus files for topic " + str(topic_id) + " have not been fully loaded"

        # try loading selected file type, otherwise any file type
        corp_file_types = corp_paths['file_type'].unique()
        if len(corp_file_types) == 0:
            raise Exception(not_loaded_msg)
        load_type = file_type if file_type in corp_file_types else corp_file_types[0]
        corp_paths = corp_paths[corp_paths['file_type'] == load_type]

        # check that all paths are loaded: one record per expected split
        expected_splits = int(corp_paths.iloc[0]['num_splits'])
        if len(corp_paths) != expected_splits:
            if verbose:
                print(corp_paths)
            raise Exception(not_loaded_msg)

        # return the list of path addresses
        return list(corp_paths['path'])
    
    def remove_topic_and_save(self, corpus_name, topic_df, topic_id, file_type=".csv.gz", verbose=True):
        """Drop ``topic_id`` from ``topic_df`` and overwrite the registered topics file.

        The save location is the first ``path`` entry in the path handler's ``path_df``
        whose corpus name, file purpose ("topics") and file type match.

        Args:
            corpus_name: corpus the topic_df belongs to.
            topic_df: dataframe with an ``id`` column.
            topic_id: id value whose rows are removed.
            file_type: file type of the topics file to overwrite.
            verbose: forwarded to ``save_df_file_type``; also prints a confirmation.

        Returns:
            the filtered topic_df.
        """
        remaining = topic_df[topic_df['id'] != topic_id]

        registry = self.path_handler.path_df
        is_topics_file = ((registry['corpus_name'] == corpus_name)
                          & (registry['file_purpose'] == "topics")
                          & (registry['file_type'] == file_type))
        # first matching record is the save target
        topics_path = list(registry[is_topics_file]['path'])[0]

        save_df_file_type(remaining, topics_path, verbose=verbose)
        if verbose:
            print(str(topic_id) + " removed from topic_df and saved to " + str(topics_path))
        return remaining
                
    def load_topic_df_control(self, corpus_name, save=True, force_reload=False, verbose=True, add_path=False,
                              file_type=".csv.gz"):
        """Ensure ``self.topic_dfs[corpus_name]`` is populated, loading it at most once.

        ``self.topic_dfs`` is created as an empty dict when it is still ``None``. If the
        corpus is already cached nothing happens; otherwise all keyword arguments are
        passed through to ``self.load_topic_df`` and its result is stored.
        """
        if self.topic_dfs is None:
            self.topic_dfs = {}
        if corpus_name in self.topic_dfs:
            # already cached, nothing to load
            return
        loaded = self.load_topic_df(corpus_name, save=save, force_reload=force_reload,
                                    verbose=verbose, add_path=add_path, file_type=file_type)
        self.topic_dfs[corpus_name] = loaded
                

    def load_topic_df(self, corpus_name, save=True, force_reload=False, verbose=True, add_path=False,
                      file_type=".csv.gz"):
        """Load the topic_df of ``corpus_name`` via ``load_topics`` and return it.

        The source is the ``topics_path`` registered in ``self.corpus_sources``; the save
        path is obtained from the path handler for purpose "topics" with the instance
        identifier "topics_df".
        """
        topics_source = self.corpus_sources[corpus_name]["topics_path"]
        topics_save_path = self.path_handler.get_path(corpus_name, "topics", "topics_df", file_type,
                                                      add_path=add_path)
        return load_topics(topics_save_path, load_path=topics_source, save=save,
                           force_reload=force_reload, verbose=verbose, path_handler=self.path_handler)

In [892]:
# Driver cell: register the corpus sources, run the generation pipeline for the
# selected corpora, then show the first rows of the resulting path_df.
proj_dir = '/nfs/proj-repo/AAARG-dissertation'

# Source locations of the "original" corpus (single topics/nuggets/matches file each).
orig_tr14_filtered_dict = { "corpus_name":"original-trects-kba2014-filtered",
                        "dir_path":"/nfs/original-trects-kba2014-filtered", 
                      "topics_path":"/nfs/original-trects-kba2014-filtered/test-topics.xml", 
                      "nuggets_path":"/nfs/TemporalSummarization/ts13/results/nuggets.tsv",
                        "matches_path":"/nfs/TemporalSummarization/ts13/results/matches.tsv"}

# Source locations of the "mine" corpus (lists of topics/nuggets/matches files).
mine_tr14_filtered_dict = {"corpus_name":"mine-trects-kba2014-filtered",
                          "dir_path":"/nfs/mine-trects-kba2014-filtered",
                          "topics_path":["/nfs/TemporalSummarization/ts13/test-topics.xml",
                                        "/nfs/TemporalSummarization/ts14/trec2014-ts-topics-test.xml",
                                        "/nfs/TemporalSummarization/ts15/trec2015-ts-topics-test.xml"],
                          "nuggets_path":["/nfs/TemporalSummarization/ts13/results/nuggets.tsv",
                                         "/nfs/TemporalSummarization/ts14/results/nuggets.tsv",
                                         "/nfs/TemporalSummarization/ts15/results/nuggets.tsv"],
                          "matches_path":["/nfs/TemporalSummarization/ts13/results/matches.tsv",
                                         "/nfs/TemporalSummarization/ts14/results/matches.tsv",
                                         "/nfs/TemporalSummarization/ts15/results/matches.tsv"]}


# corp_gen = CorpusGenerator(proj_dir)
corp_gen = CorpusGenerator(proj_dir, embedding_split_step=200)

# Run configuration; the commented lines are alternative settings kept as toggles.
force_reload = False
# force_reload = ["embeddings"]
# selection = ["labels"]
selection = None
# corpus_names = ["original-trects-kba2014-filtered", "mine-trects-kba2014-filtered"]
# corpus_names = ["mine-trects-kba2014-filtered", "original-trects-kba2014-filtered"]
corpus_names = ["original-trects-kba2014-filtered"]
# corpus_names = ["mine-trects-kba2014-filtered"]
file_type = ".hdf"
# file_type = ".csv.gz"

corp_gen.generate(new_corpuses=[orig_tr14_filtered_dict, mine_tr14_filtered_dict], corpus_names=corpus_names, 
                  force_reload=force_reload, verbose=True, selection=selection, file_type=file_type)
p_han = FilePathHandler(proj_dir)
# display() renders the frame itself and returns None, so wrapping it in print()
# only added a stray "None" line to the output
display(p_han.path_df[0:10])

I0126 13:41:01.659325 140036280194880 SentenceTransformer.py:39] Load pretrained SentenceTransformer: distilbert-base-nli-stsb-mean-tokens
I0126 13:41:01.660037 140036280194880 SentenceTransformer.py:43] Did not find folder distilbert-base-nli-stsb-mean-tokens
I0126 13:41:01.660331 140036280194880 SentenceTransformer.py:49] Try to download model from server: https://sbert.net/models/distilbert-base-nli-stsb-mean-tokens.zip
I0126 13:41:01.660820 140036280194880 SentenceTransformer.py:100] Load SentenceTransformer from folder: /root/.cache/torch/sentence_transformers/sbert.net_models_distilbert-base-nli-stsb-mean-tokens
I0126 13:41:02.505300 140036280194880 SentenceTransformer.py:124] Use pytorch device: cuda


corpus_name generate loop: original-trects-kba2014-filtered
Generating topic_df
Loading topics
loaded from .hdf file


Unnamed: 0,id,title,description,start,end,query,type
0,1,2012 Buenos Aires Rail Disaster,http://en.wikipedia.org/wiki/2012_Buenos_Aires...,1329910380,1330774380,buenos aires train crash,accident
1,2,2012 Pakistan garment factory fires,http://en.wikipedia.org/wiki/2012_Pakistan_gar...,1347368400,1348232400,pakistan factory fire,accident
2,3,2012 Aurora shooting,http://en.wikipedia.org/wiki/2012_Aurora_shooting,1342766280,1343630280,colorado shooting,shooting
3,4,Wisconsin Sikh temple shooting,http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...,1344180300,1345044300,sikh temple shooting,shooting


None
Generating corpus_dfs


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))


Generating nugget files


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

nuggets_generator topic_id: 1
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,1,35,53,VMTS13.01.077,1,2,676+ injuries,"Buenos Aires Train Crash Kills 49, Injuries ov...",1,splitlines,a361ee100c9b058a0f0f4355cec64047,1330003205-a361ee100c9b058a0f0f4355cec64047,1330003205
1,1,50,74,VMTS13.01.050,3,6,"train accident in Buenos Aires, Argentina.","""The train was full and the impact was 49 dead...",3,splitlines,06251bb5df849f4e5efe4245acb1c342,1329996913-06251bb5df849f4e5efe4245acb1c342,1329996913
2,1,50,101,VMTS13.01.050,3,6,"train accident in Buenos Aires, Argentina.",Firemen rescue wounded passengers from a commu...,2,splitlines,a8bb0847959520d7f32b2b12d486e33e,1329996320-a8bb0847959520d7f32b2b12d486e33e,1329996320
3,1,0,8,VMTS13.01.078,1,3,49 confirmed deaths,"49 dead, hundreds injured in Buenos Aires trai...",1,splitlines,a8bb0847959520d7f32b2b12d486e33e,1329996320-a8bb0847959520d7f32b2b12d486e33e,1329996320


None
nuggets_generator topic_id: 2
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,2,30,72,VMTS13.02.054,3,7,Pakistan garment factory fires 11 September 2012,The fire in Karachi was one of two deadly blaz...,3,splitlines,176fb48ae343811f53922a9312d34f55,1347480009-176fb48ae343811f53922a9312d34f55,1347480009
1,2,0,20,VMTS13.02.056,3,1,Karachi,The fire in Karachi was one of two deadly blaz...,3,splitlines,176fb48ae343811f53922a9312d34f55,1347480009-176fb48ae343811f53922a9312d34f55,1347480009
2,2,0,45,VMTS13.02.067,3,3,Total killed 315,Deaths in factory fires in Pakistan up to 314 ...,33,splitlines,0474a30204e831c3dcc81ba93b7d8193,1347489318-0474a30204e831c3dcc81ba93b7d8193,1347489318
3,2,55,109,VMTS13.02.067,3,3,Total killed 315,Pakistan factory fires death toll hits 2 The n...,31,splitlines,d7774dc37afe6c977309ed94977599f3,1347483745-d7774dc37afe6c977309ed94977599f3,1347483745


None
nuggets_generator topic_id: 3
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,3,71,190,VMTS13.04.165,1,36,"The victims include: Veronica Moser (6), Matt ...","The Aurora Sentinel is updating as they can, b...",10,splitlines,9a1a165564271233b3acd7e53119dd2c,1342974777-9a1a165564271233b3acd7e53119dd2c,1342974777
1,3,0,43,VMTS13.04.114,1,20,"Aurora Chief of Police, Daniel Oates, updated ...","FirstDude • Greenwood , Indiana • 2 mins 51 se...",41,splitlines,d6d75cc5f684df23d9ba5cbe932f6b81,1342998315-d6d75cc5f684df23d9ba5cbe932f6b81,1342998315
2,3,71,174,VMTS13.04.165,1,36,"The victims include: Veronica Moser (6), Matt ...",College students who moved to Colorado to blaz...,4,splitlines,5dfefea0666b6c9a325c562a62c8eb73,1343009373-5dfefea0666b6c9a325c562a62c8eb73,1343009373
3,3,0,1,VMTS13.04.175,1,22,"Among foreigner victims, three Indonesians are...",College students who moved to Colorado to blaz...,4,splitlines,5dfefea0666b6c9a325c562a62c8eb73,1343009373-5dfefea0666b6c9a325c562a62c8eb73,1343009373


None
nuggets_generator topic_id: 4
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,4,154,272,VMTS13.06.060,1,2,Suspect's motivation-unclear.,Adam Harrington | CBS 2 Chicago WBBM -TV: Wate...,51,splitlines,19f9754f77b78dcb183a76e003ac7488,1344276214-19f9754f77b78dcb183a76e003ac7488,1344276214
1,4,0,52,VMTS13.06.010,3,5,Seven fatalities including shooter,"At least seven people are dead, including the ...",62,splitlines,19f9754f77b78dcb183a76e003ac7488,1344276214-19f9754f77b78dcb183a76e003ac7488,1344276214
2,4,35,88,VMTS13.06.001,3,4,'Wisconsin Sikh Temple shooting,Scared Monkeys | The Tipping Point Mass Shooti...,76,splitlines,19f9754f77b78dcb183a76e003ac7488,1344276214-19f9754f77b78dcb183a76e003ac7488,1344276214
3,4,35,88,VMTS13.06.009,1,6,"Shooting occurs in Oak Creek, Wisconsin",Scared Monkeys | The Tipping Point Mass Shooti...,76,splitlines,19f9754f77b78dcb183a76e003ac7488,1344276214-19f9754f77b78dcb183a76e003ac7488,1344276214


None
nuggets_generator topic_id: 5
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,5,26,85,VMTS13.08.090,3,6,Hurricane Isaac - catagory one hurricane,National Hurricane Center predicted Isaac woul...,7,splitlines,753845f8ad64dd620e5019ff39e9310c,1346181303-753845f8ad64dd620e5019ff39e9310c,1346181303
1,5,59473,59526,VMTS13.08.104,2,13,The president signed an emergency declaration ...,ISAAC ON SAME PATH AS KATRINA. 7th ANNIVERSARY...,142,splitlines,ecec234e3610a929c59042e225140664,1346174526-ecec234e3610a929c59042e225140664,1346174526
2,5,59644,59716,VMTS13.08.098,2,11,President Obama ordered federal aid to Louisia...,ISAAC ON SAME PATH AS KATRINA. 7th ANNIVERSARY...,142,splitlines,ecec234e3610a929c59042e225140664,1346174526-ecec234e3610a929c59042e225140664,1346174526
3,5,55920,55995,VMTS13.08.094,2,20,Approx 78% of the Gulf's crude oil production ...,ISAAC ON SAME PATH AS KATRINA. 7th ANNIVERSARY...,142,splitlines,ecec234e3610a929c59042e225140664,1346174526-ecec234e3610a929c59042e225140664,1346174526


None
nuggets_generator topic_id: 6
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,6,130,161,VMTS13.09.041,2,5,3 killed in Haiti,"An estimated 15,000 lose their lives. Oct. 25,...",11,splitlines,fbee3f4f14962fbfd4448a4f6bf14f8e,1351805764-fbee3f4f14962fbfd4448a4f6bf14f8e,1351805764
1,6,53,124,VMTS13.09.024,2,4,One killed in Jamaica,"An estimated 15,000 lose their lives. Oct. 25,...",11,splitlines,fbee3f4f14962fbfd4448a4f6bf14f8e,1351805764-fbee3f4f14962fbfd4448a4f6bf14f8e,1351805764
2,6,0,51,VMTS13.09.095,2,7,"Haiti, at least 51 people have died",Hurricane Sandy killed at least 52 people in H...,15,splitlines,bd3ded1f9fe6ae3a18c100b94472cc24,1351800696-bd3ded1f9fe6ae3a18c100b94472cc24,1351800696
3,6,0,78,VMTS13.09.093,2,6,"Haiti estimated 200,000 are left homeless",Hurricane Sandy killed at least 52 people in H...,15,splitlines,bd3ded1f9fe6ae3a18c100b94472cc24,1351800696-bd3ded1f9fe6ae3a18c100b94472cc24,1351800696


None
nuggets_generator topic_id: 8
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,8,0,40,VMTS13.12.095,3,19,"On 8 December, forecasters said the storm had ...",Typhoon Bopha returns to the Philippines .,2,splitlines,22ed6799f7b89147d4711dd3b4626099,1355025957-22ed6799f7b89147d4711dd3b4626099,1355025957
1,8,434,498,VMTS13.12.101,3,16,"As of December 10, the storm had caused crop d...",Weakened typhoon set to make second landfall i...,1,splitlines,188094a5273b5086a65e11f3a6189ff8,1355032068-188094a5273b5086a65e11f3a6189ff8,1355032068
2,8,844,874,VMTS13.12.107,3,25,The death toll from the typhoon reached 902 an...,Weakened typhoon set to make second landfall i...,1,splitlines,188094a5273b5086a65e11f3a6189ff8,1355032068-188094a5273b5086a65e11f3a6189ff8,1355032068
3,8,0,82,VMTS13.12.107,3,25,The death toll from the typhoon reached 902 an...,Authorities feared the number of fatalities fr...,2,splitlines,b18b53383cd7a55a6c61929f2bdca0b0,1355033632-b18b53383cd7a55a6c61929f2bdca0b0,1355033632


None
nuggets_generator topic_id: 9
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,9,173,224,VMTS13.14.004,1,7,"epicenter was located in the Pacific Ocean,",CBS has this report: “The quake shook building...,4,splitlines,216e533c16cbb355156925963edc3c65,1352355817-216e533c16cbb355156925963edc3c65,1352355817
1,9,88,114,VMTS13.14.018,1,5,telephone and internet services interrupted,"PST , November 7, 2012 MEXICO CITY — A deadly ...",5,splitlines,94ce35166d6a1cac7897a69ad891658c,1352353329-94ce35166d6a1cac7897a69ad891658c,1352353329
2,9,184,201,VMTS13.14.014,2,6,many are missing after quake,"PST , November 7, 2012 MEXICO CITY — A deadly ...",5,splitlines,94ce35166d6a1cac7897a69ad891658c,1352353329-94ce35166d6a1cac7897a69ad891658c,1352353329
3,9,151,177,VMTS13.14.012,3,5,39 casualties reported in Guatamala,"PST , November 7, 2012 MEXICO CITY — A deadly ...",5,splitlines,94ce35166d6a1cac7897a69ad891658c,1352353329-94ce35166d6a1cac7897a69ad891658c,1352353329


None
nuggets_generator topic_id: 10
loaded from .hdf file


Unnamed: 0,topic_id,match_start,match_end,nugget_id,importance,nugget_len,nugget_text,sent_in_text,sent_id,technique,docid,streamid,epoch
0,10,953,984,VMTS13.16.001,3,4,Tel Aviv bus bombing,Israel arrests suspects in Tel Aviv bus bombin...,1,splitlines,0c46b448c2b5a925980a36ad05703040,1353628046-0c46b448c2b5a925980a36ad05703040,1353628046
1,10,1395,1467,VMTS13.16.020,2,13,The White House called the bombing a terrorist...,Israel arrests suspects in Tel Aviv bus bombin...,1,splitlines,0c46b448c2b5a925980a36ad05703040,1353628046-0c46b448c2b5a925980a36ad05703040,1353628046
2,10,1009,1072,VMTS13.16.022,1,9,21 wounded in terror attack on Tel Aviv bus,Israel arrests suspects in Tel Aviv bus bombin...,1,splitlines,0c46b448c2b5a925980a36ad05703040,1353628046-0c46b448c2b5a925980a36ad05703040,1353628046
3,10,168,215,VMTS13.16.028,3,3,Terror suspects arrested,Israel arrests suspects in Tel Aviv bus bombin...,1,splitlines,0c46b448c2b5a925980a36ad05703040,1353628046-0c46b448c2b5a925980a36ad05703040,1353628046


None

Generating embedding files


HBox(children=(IntProgress(value=0, max=9), HTML(value='')))

Processing topic 4 from beginning
Loading topic 4 documents. 0 - 200 (Total: 1297)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 45258 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/4_0.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,4,1344207553-11d96befc369e806bd0f93d7cbc1f327,0,,"[-0.33302265, 0.40712184, 0.46675143, -0.49605...",False,splitlines
1,4,1344207553-11d96befc369e806bd0f93d7cbc1f327,1,Obama Reacts To Wisconsin Shooting HuffPost's ...,"[-0.24373458, 0.23686735, -0.7895195, -0.60246...",False,splitlines
2,4,1344207553-11d96befc369e806bd0f93d7cbc1f327,2,CA Canada Québec FR France ES Spain US United ...,"[0.9797202, 0.546941, 0.46263462, -0.47200182,...",False,splitlines
3,4,1344207553-11d96befc369e806bd0f93d7cbc1f327,3,186 Inside The Weird World Of Fake 'Newsroom' ...,"[0.5551974, 0.58452404, 0.9484223, -0.66795886...",False,splitlines


None
Loading topic 4 documents. 200 - 400 (Total: 1297)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 24410 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/4_200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,4,1344227223-04765f5242a98ad0367968511b322275,0,,"[-0.33302307, 0.40712148, 0.4667511, -0.496054...",False,splitlines
1,4,1344227223-04765f5242a98ad0367968511b322275,1,Police call shootings at Wisconsin Sikh temple...,"[1.680115, 0.70679045, 0.4224718, -0.23211344,...",False,splitlines
2,4,1344227223-04765f5242a98ad0367968511b322275,2,The suspect was killed outside the temple in a...,"[0.9671155, -0.33465046, 0.64971113, -0.169180...",False,splitlines
3,4,1344227223-04765f5242a98ad0367968511b322275,3,Police called the attack an act of domestic te...,"[0.5012264, 0.02554158, -0.43623593, 0.0319882...",False,splitlines


None
Loading topic 4 documents. 400 - 600 (Total: 1297)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 24988 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/4_400.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,4,1344315413-a01c2bbb46ebc849a485ff45f74b659f,0,,"[-0.333023, 0.4071219, 0.4667511, -0.496054, -...",False,splitlines
1,4,1344315413-a01c2bbb46ebc849a485ff45f74b659f,1,Sikh temple gunman was ex-soldier linked to ra...,"[1.3355914, 0.28989205, 0.2606842, -0.03877188...",False,splitlines
2,4,1344315413-a01c2bbb46ebc849a485ff45f74b659f,2,Army veteran and authorities said they were in...,"[0.54225206, -0.23444273, -0.29778406, -0.4680...",False,splitlines
3,4,1344315413-a01c2bbb46ebc849a485ff45f74b659f,3,"The assailant, shot dead by police at the scen...","[-0.069050916, 0.20301527, 0.28429148, -0.9408...",False,splitlines


None
Loading topic 4 documents. 600 - 800 (Total: 1297)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 35091 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/4_600.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,4,1344787875-eae423b325ec0072def0fed76af8ac7b,0,,"[-0.333023, 0.4071219, 0.4667511, -0.496054, -...",False,splitlines
1,4,1344787875-eae423b325ec0072def0fed76af8ac7b,1,Sikh temple holds 1st Sunday service since att...,"[1.2484345, 0.94498557, 0.47398266, -0.1501192...",False,splitlines
2,4,1344787875-eae423b325ec0072def0fed76af8ac7b,2,The prayer services will be open to the public.,"[0.8377231, 0.577873, 0.98234916, 0.055838335,...",False,splitlines
3,4,1344787875-eae423b325ec0072def0fed76af8ac7b,3,Temple officials say they may have additional ...,"[1.504247, 0.60088575, 1.0099192, -0.2773417, ...",False,splitlines


None
Loading topic 4 documents. 800 - 1000 (Total: 1297)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 27933 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/4_800.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,4,1344292971-04a0316168f003ccc3dec7c783019b9d,0,,"[-0.33302265, 0.40712178, 0.46675128, -0.49605...",False,splitlines
1,4,1344292971-04a0316168f003ccc3dec7c783019b9d,1,Comments on: The Sikh Temple Shooting &amp ; H...,"[0.8845634, 0.6511442, 0.37856126, 0.26014584,...",False,splitlines
2,4,1344343904-096429a9c50b8e33c60d1ee1ebc1db0d,0,,"[-0.33302245, 0.40712228, 0.46675122, -0.49605...",False,splitlines
3,4,1344343904-096429a9c50b8e33c60d1ee1ebc1db0d,1,Accused Sikh temple gunman talked of 'racial h...,"[0.7685413, 0.65215284, 0.6206713, 0.06385146,...",False,splitlines


None
Loading topic 4 documents. 1000 - 1200 (Total: 1297)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 51978 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/4_1000.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,4,1344194362-4f9958bc2d2c219eeceefd15b7f72f4f,0,,"[-0.3330226, 0.40712166, 0.46675116, -0.496054...",False,splitlines
1,4,1344194362-4f9958bc2d2c219eeceefd15b7f72f4f,1,At least seven killed in shooting at Sikh temp...,"[1.1341217, 0.5317594, 0.7397838, -0.6878537, ...",False,splitlines
2,4,1344194362-4f9958bc2d2c219eeceefd15b7f72f4f,2,News Skip to search.,"[-0.6939468, -0.010414826, -0.0067932033, 0.35...",False,splitlines
3,4,1344194362-4f9958bc2d2c219eeceefd15b7f72f4f,3,New User ?,"[-0.938673, 0.07045213, -0.39731646, -1.537582...",False,splitlines


None
Loading topic 4 documents. 1200 - 1297 (Total: 1297)
Loading emb_df


HBox(children=(IntProgress(value=0, max=97), HTML(value='')))


Embeddings generated for 6159 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/4_1200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,4,1344323614-1d2b0a3d001644900a74d749be41dae4,0,,"[-0.33302265, 0.40712184, 0.46675143, -0.49605...",False,splitlines
1,4,1344323614-1d2b0a3d001644900a74d749be41dae4,1,Sikh temple shooting: Wisconsin gunman had a r...,"[1.3554187, 0.51786673, 0.34319228, -0.2569952...",False,splitlines
2,4,1344323614-1d2b0a3d001644900a74d749be41dae4,2,"Nevertheless , Wade Michael Page , 40, was abl...","[0.33283406, 0.8741371, -0.3527412, -0.4347593...",False,splitlines
3,4,1344323614-1d2b0a3d001644900a74d749be41dae4,3,""" He bought it legally; he was not an ineligib...","[0.41005284, 0.74084234, 0.088782825, -0.13569...",False,splitlines


None
Processing topic 5 from beginning
Loading topic 5 documents. 0 - 200 (Total: 681)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 57727 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/5_0.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,5,1346281181-30a8a79e3b8d24b0453ec1e83fb7850f,0,,"[-0.3330226, 0.40712166, 0.46675116, -0.496054...",False,splitlines
1,5,1346281181-30a8a79e3b8d24b0453ec1e83fb7850f,1,Isaac downgraded to a tropical storm as Gulf r...,"[-0.29566693, 0.7855836, 1.021102, -1.1341419,...",False,splitlines
2,5,1346281181-30a8a79e3b8d24b0453ec1e83fb7850f,2,Photograph: Mario Tama/Getty Images Heavy rain...,"[-0.96379864, -0.17257084, 0.8271428, -1.00182...",False,splitlines
3,5,1346281181-30a8a79e3b8d24b0453ec1e83fb7850f,3,Rescuers picked up dozens of residents who had...,"[0.2970266, -0.9734244, 0.56213456, -0.4251472...",False,splitlines


None
Loading topic 5 documents. 200 - 400 (Total: 681)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 32011 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/5_200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,5,1346270311-b39ed2c7a226f68a216b66d9eb2fcd82,0,,"[-0.33302295, 0.40712166, 0.46675116, -0.49605...",False,splitlines
1,5,1346270311-b39ed2c7a226f68a216b66d9eb2fcd82,1,NEW ORLEANS - UPDATE: Isaac weakens to a tropi...,"[-0.2737407, -0.10929393, 0.6888096, -0.696476...",False,splitlines
2,5,1346270311-b39ed2c7a226f68a216b66d9eb2fcd82,2,"Meanwhile, the first death blamed on the storm...","[-0.058071785, -1.1852225, -0.3990218, -0.7420...",False,splitlines
3,5,1346270311-b39ed2c7a226f68a216b66d9eb2fcd82,3,"Facts FEW LOCAL IMPACTS In the Florida Keys, M...","[0.25027448, -0.43146375, 0.011363889, 0.00753...",False,splitlines


None
Loading topic 5 documents. 400 - 600 (Total: 681)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 41302 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/5_400.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,5,1346170743-71fa95a38de4eb4fd6f6e51e166e32ca,0,,"[-0.333023, 0.4071219, 0.4667511, -0.496054, -...",False,splitlines
1,5,1346170743-71fa95a38de4eb4fd6f6e51e166e32ca,1,Tracking Isaac : The latest on the storm's pat...,"[-0.013227824, 0.13563502, 0.7916082, -0.60960...",False,splitlines
2,5,1346170743-71fa95a38de4eb4fd6f6e51e166e32ca,2,The center of the storm that was about 80 mile...,"[0.12832403, -0.69834596, 0.9224482, -1.271584...",False,splitlines
3,5,1346170743-71fa95a38de4eb4fd6f6e51e166e32ca,3,It could become the first hurricane to hit the...,"[0.32555598, -1.003842, 0.99972355, -0.9916231...",False,splitlines


None
Loading topic 5 documents. 600 - 681 (Total: 681)
Loading emb_df


HBox(children=(IntProgress(value=0, max=81), HTML(value='')))


Embeddings generated for 11176 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/5_600.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,5,1346185273-606ab829b9088b0a4c2dd075e33a4b66,0,,"[-0.33302295, 0.40712175, 0.4667512, -0.496053...",False,splitlines
1,5,1346185273-606ab829b9088b0a4c2dd075e33a4b66,1,Isaac makes hurricane status on path to Gulf C...,"[0.19648139, 0.13288149, 0.80646557, -0.606525...",False,splitlines
2,5,1346185273-606ab829b9088b0a4c2dd075e33a4b66,2,HALL - McClatchy Newspapers By MELISSA SCALLAN...,"[-0.23427899, -0.052299023, -0.4197834, -0.163...",False,splitlines
3,5,1346185273-606ab829b9088b0a4c2dd075e33a4b66,3,"HALL McClatchy Newspapers GULFPORT , Miss. -- ...","[-0.42889982, -0.10095204, 0.0911535, -0.57481...",False,splitlines


None
Processing topic 6 from beginning
Loading topic 6 documents. 0 - 200 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 13720 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_0.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351238065-bf5778ccccf28961dc6dad7dbd866e19,0,,"[-0.33302265, 0.40712184, 0.46675143, -0.49605...",False,splitlines
1,6,1351238065-bf5778ccccf28961dc6dad7dbd866e19,1,Hurricane Sandy: Forecasters warn pre- Hallowe...,"[0.08700484, 0.42302787, 0.81708133, -1.476569...",False,splitlines
2,6,1351238065-bf5778ccccf28961dc6dad7dbd866e19,2,Home News Sport U.S.,"[0.13756534, 0.12281159, -0.45132425, -0.04914...",False,splitlines
3,6,1351238065-bf5778ccccf28961dc6dad7dbd866e19,3,Showbiz Femail Health Science Money RightMinds...,"[0.65385294, 0.30314258, 0.56234145, -0.096857...",False,splitlines


None
Loading topic 6 documents. 200 - 400 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 11963 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351409212-6f0c6e17b0f9f90feda4157d0c084723,0,,"[-0.33302283, 0.40712255, 0.46675134, -0.49605...",False,splitlines
1,6,1351409212-6f0c6e17b0f9f90feda4157d0c084723,1,Hurricane Sandy: Storm threat to key US electi...,"[0.8879293, 0.06757587, 0.6765903, -0.940817, ...",False,splitlines
2,6,1351409212-6f0c6e17b0f9f90feda4157d0c084723,2,President Barack Obama has held a conference c...,"[0.44899583, -0.10533219, 0.5928075, -1.131043...",False,splitlines
3,6,1351409212-6f0c6e17b0f9f90feda4157d0c084723,3,Its sustained winds of 75mph (120km/h) are set...,"[1.2116995, -0.57344395, 1.542781, 0.3745076, ...",False,splitlines


None
Loading topic 6 documents. 400 - 600 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 12180 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_400.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351942709-539bd1fa953a4af7f3f93f63a23c32d4,0,,"[-0.3330225, 0.40712184, 0.46675122, -0.496053...",False,splitlines
1,6,1351942709-539bd1fa953a4af7f3f93f63a23c32d4,1,The slow pace of recovery from Hurricane Sandy...,"[0.4000737, -0.21129063, 0.33663103, -0.872805...",False,splitlines
2,6,1351942709-539bd1fa953a4af7f3f93f63a23c32d4,2,"In Staten Island , where 19 people have died a...","[0.41011077, -0.5623359, -0.12017883, -0.15570...",False,splitlines
3,6,1351942709-539bd1fa953a4af7f3f93f63a23c32d4,3,"""I don't see the Corps of Engineers ,"" Jim Bre...","[0.46453437, 0.55884826, -0.23225264, -0.75903...",False,splitlines


None
Loading topic 6 documents. 600 - 800 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 12789 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_600.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351379326-401fbc858e32a25e11bba7eba2d17ab8,0,,"[-0.33302283, 0.40712208, 0.4667508, -0.496053...",False,splitlines
1,6,1351379326-401fbc858e32a25e11bba7eba2d17ab8,1,Here Comes Hurricane Sandy: East Coast Scrambl...,"[-0.14303769, 0.3669812, 0.5256424, -0.4276732...",False,splitlines
2,6,1351379326-401fbc858e32a25e11bba7eba2d17ab8,2,"Eliza Shapiro on what to expect: evacuations, ...","[-0.22793935, -0.22846733, -0.0026604987, -0.4...",False,splitlines
3,6,1351379326-401fbc858e32a25e11bba7eba2d17ab8,3,Print Email Comments Anxious hurricane-watcher...,"[-0.1588611, -0.0073131397, 0.22478361, -0.087...",False,splitlines


None
Loading topic 6 documents. 800 - 1000 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 13458 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_800.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351812802-c87469dde0815e53a2eec960ef6934aa,0,,"[-0.33302295, 0.40712157, 0.46675092, -0.49605...",False,splitlines
1,6,1351812802-c87469dde0815e53a2eec960ef6934aa,1,US man sentenced to 17 years in terror plot En...,"[0.07669225, 0.608831, 0.0072883507, -0.895598...",False,splitlines
2,6,1351812802-c87469dde0815e53a2eec960ef6934aa,2,"Rezwan Ferdaus , 27, of Massachusetts pleaded ...","[-0.24524145, 0.8763582, -0.7207456, -0.646111...",False,splitlines
3,6,1351812802-c87469dde0815e53a2eec960ef6934aa,3,"Ferdaus delivered a long, soft-spoken statemen...","[0.28179318, 0.39287162, 0.57751554, 0.2655598...",False,splitlines


None
Loading topic 6 documents. 1000 - 1200 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 8584 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_1000.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351537373-0c3d103550c5a2a53955c79b712cb841,0,,"[-0.33302283, 0.40712255, 0.46675134, -0.49605...",False,splitlines
1,6,1351537373-0c3d103550c5a2a53955c79b712cb841,1,Out to Crunch : U.S.,"[-0.9900629, 0.25218493, -0.042902235, -0.7902...",False,splitlines
2,6,1351537373-0c3d103550c5a2a53955c79b712cb841,2,Energy Department Unleashes Its Titan Supercom...,"[-0.08701151, 0.62686396, 0.4740625, 0.0531381...",False,splitlines
3,6,1351537373-0c3d103550c5a2a53955c79b712cb841,3,Energy Department Unleashes Its Titan Supercom...,"[-0.12549123, 0.35867193, 0.2493185, -0.348122...",False,splitlines


None
Loading topic 6 documents. 1200 - 1400 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 10886 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_1200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351669236-33368c2cc9daa6181f95ff993430a209,0,,"[-0.33302265, 0.40712184, 0.46675143, -0.49605...",False,splitlines
1,6,1351669236-33368c2cc9daa6181f95ff993430a209,1,Southern India braces for cyclone - Worldnews....,"[0.6714264, -0.007937696, 1.072666, -0.7360272...",False,splitlines
2,6,1351669236-33368c2cc9daa6181f95ff993430a209,2,The India Meteorological Department says the c...,"[-0.10163505, -0.2691756, 0.89027625, -0.67741...",False,splitlines
3,6,1351669236-33368c2cc9daa6181f95ff993430a209,3,It could cause a tide surge of up to 1.5 meter...,"[-0.27225187, 0.0145585835, 0.29589966, 0.4293...",False,splitlines


None
Loading topic 6 documents. 1400 - 1600 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 17064 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_1400.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351696064-eafd0dbf4aa69931fcc732f3a10fb781,0,,"[-0.33302283, 0.40712255, 0.46675134, -0.49605...",False,splitlines
1,6,1351696064-eafd0dbf4aa69931fcc732f3a10fb781,1,It's official!,"[0.16534038, -0.025500894, -0.6137089, -0.4853...",False,splitlines
2,6,1351696064-eafd0dbf4aa69931fcc732f3a10fb781,2,Nico Hulkenberg to leave Force India to join c...,"[-0.6726029, 0.63314474, 0.7723575, -0.1614497...",False,splitlines
3,6,1351696064-eafd0dbf4aa69931fcc732f3a10fb781,3,Nico Hulkenberg to leave Force India to join c...,"[-0.66647935, 0.7162416, 0.82640916, -0.105772...",False,splitlines


None
Loading topic 6 documents. 1600 - 1689 (Total: 1689)
Loading emb_df


HBox(children=(IntProgress(value=0, max=89), HTML(value='')))


Embeddings generated for 12243 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/6_1600.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,6,1351429531-185024a120cafb0e43269115df803d29,0,,"[-0.33302265, 0.40712124, 0.46675152, -0.49605...",False,splitlines
1,6,1351429531-185024a120cafb0e43269115df803d29,1,Magnitude 7.7 quake strikes off Canadian coast...,"[0.28505737, -0.11939955, 0.66360575, -0.29425...",False,splitlines
2,6,1351429531-185024a120cafb0e43269115df803d29,2,News Skip to search.,"[-0.69394654, -0.010415524, -0.006793216, 0.35...",False,splitlines
3,6,1351429531-185024a120cafb0e43269115df803d29,3,New User ?,"[-0.938673, 0.070451885, -0.39731687, -1.53758...",False,splitlines


None
Processing topic 8 from beginning
Loading topic 8 documents. 0 - 200 (Total: 394)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 19436 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/8_0.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,8,1354287989-29cdf889a9705e26e632a71bed936305,0,,"[-0.33302262, 0.4071223, 0.46675098, -0.496053...",False,splitlines
1,8,1354287989-29cdf889a9705e26e632a71bed936305,1,UN rights chief says Mursi decree breaks human...,"[-0.08097291, 0.9138727, -0.4016327, -0.610149...",False,splitlines
2,8,1354287989-29cdf889a9705e26e632a71bed936305,2,News Singapore Skip to search.,"[-1.136646, -0.00896359, -0.25427124, 0.163912...",False,splitlines
3,8,1354287989-29cdf889a9705e26e632a71bed936305,3,New User ?,"[-0.93867284, 0.07045223, -0.39731672, -1.5375...",False,splitlines


None
Loading topic 8 documents. 200 - 394 (Total: 394)
Loading emb_df


HBox(children=(IntProgress(value=0, max=194), HTML(value='')))


Embeddings generated for 13740 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/8_200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,8,1355023501-fab7fa097b451b5fb5802b28f83cb899,0,,"[-0.33302283, 0.40712255, 0.46675134, -0.49605...",False,splitlines
1,8,1355023501-fab7fa097b451b5fb5802b28f83cb899,1,Dramatic twist: Typhoon Bopha to strike Philip...,"[-0.18680917, -0.19486551, 0.37377334, 0.18438...",False,splitlines
2,8,1355023501-fab7fa097b451b5fb5802b28f83cb899,2,Crosstalk Prime Time Russia News Spotlight Cap...,"[-0.3214137, 0.5715263, 0.11705506, -0.3455063...",False,splitlines
3,8,1355023501-fab7fa097b451b5fb5802b28f83cb899,3,"More than 56,000 people were displaced by the ...","[0.98909813, -0.31196478, 1.2935507, 0.1127343...",False,splitlines


None
Processing topic 9 from beginning
Loading topic 9 documents. 0 - 200 (Total: 305)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 12201 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/9_0.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,9,1352566975-da4262fddc1eebbb4e88013b479f5e3d,0,,"[-0.33302265, 0.40712184, 0.46675143, -0.49605...",False,splitlines
1,9,1352566975-da4262fddc1eebbb4e88013b479f5e3d,1,Zeker tien mensen omgekomen bij aardbeving Gua...,"[-0.34653872, 0.69584656, 0.7919942, -0.411774...",False,splitlines
2,9,1352566975-da4262fddc1eebbb4e88013b479f5e3d,2,Het aantal gewonden is nog niet duidelijk .,"[-0.8817611, 0.14159356, -0.4172308, -0.322891...",False,splitlines
3,9,1352566975-da4262fddc1eebbb4e88013b479f5e3d,3,Het epicentrum van de ... more » Read full art...,"[0.2616497, 0.2964059, 0.5361657, -0.13756487,...",False,splitlines


None
Loading topic 9 documents. 200 - 305 (Total: 305)
Loading emb_df


HBox(children=(IntProgress(value=0, max=105), HTML(value='')))


Embeddings generated for 4299 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/9_200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,9,1352375987-9338d70a3df519198ee7488ebc14d1fe,0,,"[-0.33302283, 0.40712208, 0.4667508, -0.496053...",False,splitlines
1,9,1352375987-9338d70a3df519198ee7488ebc14d1fe,1,48 dead after earthquake rocks Guatemala - CNN...,"[1.1609291, -0.3469089, 0.12622389, -0.0448230...",False,splitlines
2,9,1352375987-9338d70a3df519198ee7488ebc14d1fe,2,INTERNATIONAL MÉXICO ARABIC TV : CNN CNNi CNN ...,"[0.036686487, 0.2648979, 0.28203386, -0.336241...",False,splitlines
3,9,1352375987-9338d70a3df519198ee7488ebc14d1fe,3,World Politics Justice Entertainment Tech Heal...,"[1.1412991, -0.07915233, 0.8227004, -0.1958817...",False,splitlines


None
Processing topic 10 from beginning
Loading topic 10 documents. 0 - 200 (Total: 1418)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 15947 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_0.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,10,1354113657-a4417f055ea5ae84207a4edb4dad881b,0,,"[-0.33302262, 0.4071223, 0.46675098, -0.496053...",False,splitlines
1,10,1354113657-a4417f055ea5ae84207a4edb4dad881b,1,Morning Briefing: Support grows for bid by Pal...,"[0.19336243, 0.9147884, 0.14385256, -0.3363837...",False,splitlines
2,10,1354113657-a4417f055ea5ae84207a4edb4dad881b,2,Abbas will visit New York this week as the Pal...,"[-0.8052915, 0.67320836, -0.18652903, -1.44647...",False,splitlines
3,10,1354113657-a4417f055ea5ae84207a4edb4dad881b,3,( MARKO DJURICA /REUTERS ) Palestinians hold p...,"[0.06238022, 0.81158215, 0.10491646, -0.355189...",False,splitlines


None
Loading topic 10 documents. 200 - 400 (Total: 1418)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 9973 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,10,1354141193-f1f3bcb0efa64195f00184e5e18ed5e1,0,,"[-0.3330226, 0.4071221, 0.46675104, -0.4960537...",False,splitlines
1,10,1354141193-f1f3bcb0efa64195f00184e5e18ed5e1,1,Egypt to vote on draft constitution | Global I...,"[0.3434992, -0.06101521, -0.0038727028, -0.526...",False,splitlines
2,10,1354141193-f1f3bcb0efa64195f00184e5e18ed5e1,2,President Mohamed Morsi had just last week giv...,"[0.793847, 0.18954468, 0.20799111, -0.7157669,...",False,splitlines
3,10,1354141193-f1f3bcb0efa64195f00184e5e18ed5e1,3,But as protests mounted over his decision to g...,"[0.6769016, -0.80873144, 1.2564847, -0.5762605...",False,splitlines


None
Loading topic 10 documents. 400 - 600 (Total: 1418)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 13930 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_400.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,10,1354217501-04e33b2ebe298f92cb7da57139c05060,0,,"[-0.33302292, 0.4071219, 0.46675158, -0.496054...",False,splitlines
1,10,1354217501-04e33b2ebe298f92cb7da57139c05060,1,LBN E-Lert 1) Home 2) About & gt; About LBN E ...,"[0.14566371, 0.14039704, 0.427319, -0.5549245,...",False,splitlines
2,10,1354217501-04e33b2ebe298f92cb7da57139c05060,2,Lohan was arrested on a third-degree misdemean...,"[-0.09647485, -0.62641263, 0.16334225, -0.3101...",False,splitlines
3,10,1354217501-04e33b2ebe298f92cb7da57139c05060,3,New York police Sergeant John Buthorn told Reu...,"[-0.48973525, -0.48439333, -0.28608778, -0.041...",False,splitlines


None
Loading topic 10 documents. 600 - 800 (Total: 1418)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 12533 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_600.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,10,1354011372-c5daf8b8172c3a11df5f2df671749dbe,0,,"[-0.33302265, 0.40712184, 0.46675143, -0.49605...",False,splitlines
1,10,1354011372-c5daf8b8172c3a11df5f2df671749dbe,1,Egypt 's president stands by his decrees - Wor...,"[0.63497484, 0.13587412, 0.6132797, -0.1464467...",False,splitlines
2,10,1354011372-c5daf8b8172c3a11df5f2df671749dbe,2,The uncompromising stance came during a meetin...,"[0.64408916, 0.40625927, 0.7680882, -0.1786005...",False,splitlines
3,10,1354011372-c5daf8b8172c3a11df5f2df671749dbe,3,"Morsi issued a decree on Thursday, giving hims...","[0.4290005, -0.01730176, 0.62848943, -0.697999...",False,splitlines


None
Loading topic 10 documents. 800 - 1000 (Total: 1418)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 52549 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_800.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,10,1354022290-d4ed970c07377f94488953730113f4c0,0,,"[-0.33302262, 0.4071223, 0.46675098, -0.496053...",False,splitlines
1,10,1354022290-d4ed970c07377f94488953730113f4c0,1,Egypt 's Morsi faces nationwide protests - Yahoo!,"[0.38279662, 0.33276257, -0.1751454, -0.176446...",False,splitlines
2,10,1354022290-d4ed970c07377f94488953730113f4c0,2,News UK Skip to search.,"[-0.7550583, 0.2167367, 0.18062939, -0.333915,...",False,splitlines
3,10,1354022290-d4ed970c07377f94488953730113f4c0,3,New User ?,"[-0.93867284, 0.07045223, -0.39731672, -1.5375...",False,splitlines


None
Loading topic 10 documents. 1000 - 1200 (Total: 1418)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 16487 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_1000.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,10,1353624820-6a875819da28acb24a62a358e7251278,0,,"[-0.33302245, 0.40712228, 0.46675122, -0.49605...",False,splitlines
1,10,1353624820-6a875819da28acb24a62a358e7251278,1,Israel makes arrests in Tel Aviv bus bombing: ...,"[0.6465916, 0.43511724, 0.08469462, -0.6789561...",False,splitlines
2,10,1353624820-6a875819da28acb24a62a358e7251278,2,Insight Opinion Forum Letters All Videos Top S...,"[0.56854606, 0.3487405, 0.77729857, 0.25957906...",False,splitlines
3,10,1353624820-6a875819da28acb24a62a358e7251278,3,Communities Blogs Forum Letters Polls Photos /...,"[0.37798482, 0.40453362, 0.053449843, -0.37136...",False,splitlines


None
Loading topic 10 documents. 1200 - 1400 (Total: 1418)
Loading emb_df


HBox(children=(IntProgress(value=0, max=200), HTML(value='')))


Embeddings generated for 12509 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_1200.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,10,1354058479-60aadd9d88006017ff91692fd9d741d2,0,,"[-0.33302262, 0.4071223, 0.46675098, -0.496053...",False,splitlines
1,10,1354058479-60aadd9d88006017ff91692fd9d741d2,1,Egypt Clashes: Morsi Protests Gather Pace - Ya...,"[0.28856036, 0.188023, 0.065916166, -0.1293971...",False,splitlines
2,10,1354058479-60aadd9d88006017ff91692fd9d741d2,2,News UK Skip to search.,"[-0.7550583, 0.2167367, 0.18062939, -0.333915,...",False,splitlines
3,10,1354058479-60aadd9d88006017ff91692fd9d741d2,3,New User ?,"[-0.93867284, 0.07045223, -0.39731672, -1.5375...",False,splitlines


None
Loading topic 10 documents. 1400 - 1418 (Total: 1418)
Loading emb_df


HBox(children=(IntProgress(value=0, max=18), HTML(value='')))


Embeddings generated for 580 sentences
df created from scratch
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_1400.hdf


Unnamed: 0,topic_id,streamid,sent_id,sentence,embedding,is_nugget,technique
0,10,1353693068-5903bdd96a276bba31cbae704bd52af4,0,,"[-0.3330225, 0.40712184, 0.46675122, -0.496053...",False,splitlines
1,10,1353693068-5903bdd96a276bba31cbae704bd52af4,1,Saudi women 'monitored by SMS' | UnFox News Lo...,"[0.2667716, 0.20312668, 0.48571187, 0.1428197,...",True,splitlines
2,10,1353693068-bb3f1f02416e49828186153315caaaa0,0,,"[-0.3330228, 0.40712214, 0.46675128, -0.496053...",False,splitlines
3,10,1353693068-bb3f1f02416e49828186153315caaaa0,1,Oil prices climb on new Gaza - Israel fears | ...,"[0.5532839, 0.67118406, 0.05363112, 0.04725129...",False,splitlines


None

Adding cosine label to original-trects-kba2014-filtered | distilbert-base-nli-stsb-mean-tokens embedding dataframes
Avg nugget embedding loaded from: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/avg_nug_emb.npy


HBox(children=(IntProgress(value=0, max=56), HTML(value='')))

saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_0.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_200.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_400.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/2_0.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/2_200.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/2_400.hdf
saved with cosine column at: /nf

saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_600.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_800.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_1000.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_1200.hdf
saved with cosine column at: /nfs/proj-repo/AAARG-dissertation/dataset/original-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/10_1400.hdf

Finished adding cosine labels for original-trects-kba2014-filtered distilbert-base-nli-stsb-mean-tokens embeddings
Finished generating files


Unnamed: 0,corpus_name,exists,file_purpose,file_type,instance_identifier,nested_dir,num_splits,path,split_identifier,split_step
0,original-trects-kba2014-filtered,True,topics,.csv.gz,topics_df,,1,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,,
1,original-trects-kba2014-filtered,True,corpus,.csv.gz,1,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,0.0,200.0
2,original-trects-kba2014-filtered,True,corpus,.csv.gz,1,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,200.0,200.0
3,original-trects-kba2014-filtered,True,corpus,.csv.gz,2,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,0.0,200.0
4,original-trects-kba2014-filtered,True,corpus,.csv.gz,2,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,200.0,200.0
5,original-trects-kba2014-filtered,True,corpus,.csv.gz,3,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,0.0,200.0
6,original-trects-kba2014-filtered,True,corpus,.csv.gz,3,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,200.0,200.0
7,original-trects-kba2014-filtered,True,corpus,.csv.gz,4,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,0.0,200.0
8,original-trects-kba2014-filtered,True,corpus,.csv.gz,4,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,200.0,200.0
9,original-trects-kba2014-filtered,True,corpus,.csv.gz,5,,2,/nfs/proj-repo/AAARG-dissertation/dataset/orig...,0.0,200.0


None


## Loading Generated Corpus Files

In [929]:
class PathRetriever:
    """Maybe put this functionality in FilePathHandler?"""
    def __init__(self, proj_repo):
        """Create a retriever for the project rooted at ``proj_repo``.

        proj_repo: project directory; stored and passed unchanged to
            ``FilePathHandler``, which this class queries for recorded paths.
        """
        self.proj_repo = proj_repo
        self.path_handler = FilePathHandler(proj_repo)
        # file types this retriever knows about
        self.file_type_options = [".hdf", ".csv.gz"]
        
    def get_corpus_names(self):
        corpus_names = self.path_handler.corpus_sources.keys()
        return corpus_names
    
    def get_nested_dirs(self, corpus_name, file_purpose):
        paths = self.get_base_paths(corpus_name, file_purpose, verbose=False, exists=True)
        nested_dirs = paths['nested_dir'].unique()
        nested_dirs = [x for x in nested_dirs if (x != np.nan) or (x != None) or (x != "")]
        return nested_dirs
    
    def get_topic_path(self, corpus_name, file_type=".hdf", use_any=True, verbose=True, exists=True):
        """Return the path of the topics file recorded for ``corpus_name``.

        file_type: preferred file type; how a missing preference is handled
            is delegated to ``resolve_path_attr`` (technique "first").
        use_any: forwarded to ``resolve_path_attr``.
        verbose: forwarded to ``get_base_paths`` and ``resolve_path_attr``.
        exists: forwarded to ``get_base_paths``.

        Returns the ``path`` value of the first remaining row.
        """
        paths = self.get_base_paths(corpus_name, "topics", verbose=verbose, exists=exists)
        paths = self.resolve_path_attr(paths, "file_type", file_type, technique="first",
                                       use_any=use_any, verbose=verbose)
        return paths['path'].iloc[0]
    
    def get_corpus_paths(self, corpus_name, file_type=".hdf", topic_ids=None, split_step=None, use_any=True,
                      verbose=True, exists=True):
        """Return the path rows of the "corpus" files recorded for ``corpus_name``.

        Rows are narrowed by ``split_step`` (technique "max") and then by
        ``file_type`` (technique "first") through ``resolve_path_attr``. When
        ``topic_ids`` is given, the result of ``resolve_multi_target_vals`` on
        the ``instance_identifier`` column is returned instead.
        """
        candidates = self.get_base_paths(corpus_name, "corpus", verbose=verbose, exists=exists)
        # order matters: split_step is resolved before file_type
        narrowing_steps = (
            ("split_step", split_step, "max"),
            ("file_type", file_type, "first"),
        )
        for column, wanted, fallback in narrowing_steps:
            candidates = self.resolve_path_attr(candidates, column, wanted, technique=fallback,
                                                use_any=use_any, verbose=verbose)
        if topic_ids is None:
            return candidates
        # restrict to the requested topic ids
        return self.resolve_multi_target_vals(candidates, "instance_identifier", topic_ids, verbose=verbose)
    
    def get_nugget_paths(self, corpus_name, file_type=".hdf", topic_ids=None, use_any=True, verbose=True,
                        exists=True):
        """Return the path rows of the "nuggets" files recorded for ``corpus_name``.

        Rows are narrowed by ``file_type`` (technique "first") through
        ``resolve_path_attr``. When ``topic_ids`` is given, the result of
        ``resolve_multi_target_vals`` on the ``split_identifier`` column is
        returned instead.
        """
        candidates = self.get_base_paths(corpus_name, "nuggets", verbose=verbose, exists=exists)
        candidates = self.resolve_path_attr(candidates, "file_type", file_type, technique="first",
                                            use_any=use_any, verbose=verbose)
        if topic_ids is None:
            return candidates
        # NOTE(review): topic ids are matched on split_identifier here, not on
        # instance_identifier -- confirm this is how nugget files are keyed
        return self.resolve_multi_target_vals(candidates, "split_identifier", topic_ids, verbose=verbose)
    
    def get_embedding_paths(self, corpus_name, nested_dir, file_type=".hdf", topic_ids=None, split_step=None, 
                            exists=True, use_any=True, verbose=True):
        """Return the path rows of the "embeddings" files recorded for ``corpus_name``.

        Rows are narrowed, in order, by ``nested_dir`` (technique "first"),
        ``split_step`` (technique "max") and ``file_type`` (technique "first")
        through ``resolve_path_attr``. When ``topic_ids`` is given, the result
        is further restricted on the ``instance_identifier`` column via
        ``resolve_multi_target_vals``.

        exists: forwarded to ``get_base_paths``.
        use_any: forwarded to every ``resolve_path_attr`` call.
        verbose: forwarded to the helpers, except for the ``nested_dir``
            resolution, which always runs verbosely.
        """
        paths = self.get_base_paths(corpus_name, "embeddings", verbose=verbose, exists=exists)
        paths = self.resolve_path_attr(paths, "nested_dir", nested_dir, technique="first", use_any=use_any,
                                      verbose=True)  # always explicitly tell if using diff embedding
        # prioritise matching split_step, then file_type
        paths = self.resolve_path_attr(paths, "split_step", split_step, technique="max", use_any=use_any,
                                      verbose=verbose)
        paths = self.resolve_path_attr(paths, "file_type", file_type, technique="first", use_any=use_any,
                                      verbose=verbose)
        if topic_ids is not None:  # match specific topic_ids
            paths = self.resolve_multi_target_vals(paths, "instance_identifier", topic_ids, verbose=verbose)
        return paths
        
    def get_base_paths(self, corpus_name, file_purpose, exists=True, verbose=True):
        print_name = str(file_purpose) + " df"
        if verbose:
            print("Loading " + print_name + " for " + str(corpus_name))
        paths = self.path_handler.load_path_df_slice(corpus_name, file_purpose)
        if exists:
            paths = paths[paths['exists'] == True]
        if len(paths) == 0:
            raise Exception(print_name + " not processed for " + str(corpus_name))
        return paths
    
    def resolve_multi_target_vals(self, paths, attr_name, target_vals, full_match=True, verbose=True):
        """Find where path attribute/df column inclusively matches target_vals (e.g. topic_ids)

        paths: DataFrame of path rows.
        attr_name: column compared against each target value.
        target_vals: a single value or a list of values to keep.
        full_match: when truthy, every target value must match at least one row.
        verbose: accepted for signature parity with the other resolvers; unused.

        Returns the matching rows, grouped in the order of ``target_vals``
        (an empty frame with the same columns when ``target_vals`` is empty).
        Raises Exception when ``full_match`` is set and a target value has no
        matching row.
        """
        target_vals = convert_to_list(target_vals)  # case only one is passed
        paths_list = []
        for target_val in target_vals:
            match = paths[paths[attr_name] == target_val]
            if len(match) == 0 and full_match:  # can have no missing
                raise Exception("There are no paths with " + str(attr_name) + " that have a value of "
                               + str(target_val))
            paths_list.append(match)
        if not paths_list:
            # pd.concat refuses an empty list; nothing requested means no rows
            return paths.iloc[0:0]
        return pd.concat(paths_list)
            
    
    def resolve_path_attr(self, paths, attr_name, target_val, technique="first", use_any=True, verbose=True):
        unique = paths[attribute].unique()
        exact_match = True
        if target_val in unique():
            paths = paths[paths[attr_name] == target_val]  # use paths that match target vals
        else:  # resolve to find other matches
            exact_match = False
            if use_any:
                if technique == "first":  # get row(s) that have the first identified unique val
                    paths = paths[paths[attr_name] == unique[0]]
                elif technique == "max":  # get row(s) with max of attr column
                    paths = paths[paths[attr_name]==paths[attr_name].max()]
                elif technique == "min":  # get row(s) with min of attr col
                    paths = paths[paths[attr_name]==paths[attr_name].min()]
            else:
                raise Exception("file with attribute " + str(attr_name) + " and value " + str(target_val) 
                                + " not found")
        if verbose and not exact_match and target_val is not None:  # unable to find specified target_val
            print(str(attr_name) + " not found with " + str(target_val) + ", technique "
                 + str(technique) + " used to find match instead")
        return paths