# Loading/Processing Corpus

In [1]:
# !pip install numba

In [2]:
import os
num_threads = 32
# os.environ['NUMEXPR_MAX_THREADS'] = str(num_threads)  # faster ckdtree indexing
from multiprocessing import Pool  # multithreading for cos_sim_nearest_nug comparisons

from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import tqdm_notebook
# from tqdm.notebook import tqdm
import os
from collections import OrderedDict
import pickle
import warnings

import copy
import spacy
from sentence_transformers import SentenceTransformer
from pathlib import Path
from collections import defaultdict
warnings.simplefilter(action='ignore', category=DeprecationWarning)

### General Helper Functions

In [3]:
def convert_to_list(item):
    """Return ``item`` unchanged if it is a ``list``, otherwise wrap it in one.

    Only exact ``list`` instances pass through untouched; every other value
    (tuples, strings, ``None``, list subclasses, ...) becomes ``[item]``.
    """
    return item if type(item) is list else [item]

def file_exists(path):
    """Return True only if every given path exists on the file system.

    ``path`` may be a single path or a list of paths. An empty list yields
    True because there is no path that fails the check.
    """
    candidates = convert_to_list(path)
    return all([os.path.exists(candidate) for candidate in candidates])

def remove_unnamed_cols(df, show_removed=False):
    """Delete, in place, every column whose label contains 'Unnamed'.

    Such columns appear when a DataFrame is written to csv together with its
    index and then read back.

    Parameters:
        df: DataFrame to clean. It is modified in place.
        show_removed: when True, also report whether anything was deleted.

    Returns:
        ``df`` itself, or the tuple ``(df, removed)`` when ``show_removed`` is
        True, where ``removed`` is a bool.
    """
    # Non-string labels (e.g. integer column names) can never be 'Unnamed'
    # columns; testing only strings avoids a TypeError on them. dict.fromkeys
    # de-duplicates labels so a repeated label is not deleted twice.
    unnamed = list(dict.fromkeys(
        col for col in df.columns if isinstance(col, str) and "Unnamed" in col))
    for col in unnamed:
        del df[col]
    removed = len(unnamed) > 0
    if show_removed:
        return df, removed
    return df

## File IO Management

In [4]:
class FilePathHandler:
    """Registry of generated dataset file paths and of raw corpus source locations.

    Generated paths will be in the format:

    proj_dir/dataset_dir/corpus_name/file_purpose/(nested_dir)/instance_identifier(_split_identifier)file_type

    Two metadata files are kept directly inside the dataset directory:
    ``file_path_df.csv.gz`` (one row per registered path, columns listed in
    ``path_df_cols``) and ``corpus_sources.pickle`` (corpus name -> dict of
    source paths).
    """
    def __init__(self, proj_dir, dataset_dir="dataset", compression='gzip'):
        """Create the dataset directory if needed and load both metadata files.

        Parameters:
            proj_dir: project root directory.
            dataset_dir: name of the dataset directory inside ``proj_dir``.
            compression: compression used when reading/writing the path csv.
        """
        self.proj_dir = proj_dir
        self.dataset_dir = proj_dir + '/' + dataset_dir
        self.create_dir_if_not_exists(self.dataset_dir)
        self.path_df_path = self.dataset_dir + '/' + 'file_path_df.csv.gz'
        self.corpus_sources_pickle_path = self.dataset_dir + '/' + 'corpus_sources.pickle'
        self.compression = compression
        self.file_purposes = ["topics", "corpus", "nuggets", "embeddings", "updates"]
        self.path_df_cols = {"corpus_name":str, "file_purpose":str, "split_identifier":str, "num_splits":int,
                             "split_step":int, "nested_dir":str, "instance_identifier":str,
                             "file_type":str, "path":str, "exists":bool}
        self.corpus_sources_keys = ['corpus_name', 'dir_path', 'nuggets_path', 'matches_path', 'topics_path']

        # load meta files
        self.load_corpus_sources()
        self.load_path_df()

    def load_path_df_slice(self, corpus_name, file_purpose, instance_identifier=None, file_type=None,
                           split_identifier=None, exists=None, nested_dir=None, split_step=None):
        """Return the rows of ``path_df`` matching the given attributes.

        ``corpus_name`` and ``file_purpose`` are always matched; every other
        argument is matched by equality only when it is not None.
        """
        paths = self.path_df
        paths = paths[paths['corpus_name'] == corpus_name]
        paths = paths[paths['file_purpose'] == file_purpose]
        optional_filters = {'instance_identifier': instance_identifier,
                            'split_identifier': split_identifier,
                            'nested_dir': nested_dir,
                            'split_step': split_step,
                            'file_type': file_type,
                            'exists': exists}
        for col_name, wanted in optional_filters.items():
            if wanted is not None:
                paths = paths[paths[col_name] == wanted]
        return paths

    def false_exists_in_df(self, remove_false_exists=False, verbose=True):
        """Report paths that are flagged as existing but are missing on disk.

        Parameters:
            remove_false_exists: when True, drop those rows from ``path_df``
                and save it.
            verbose: print the missing paths and the number of removed rows.

        Raises:
            Exception: if missing paths were found but no row could be dropped.
        """
        listed = self.path_df[self.path_df['exists'] == True]
        false_exists = [path for path in listed['path'] if not os.path.exists(path)]

        if verbose:
            out = "\n".join(false_exists)
            print("Paths found in path_df but not on system: " + str(out))

        # Nothing to remove is a normal outcome, not an error: only attempt
        # (and verify) the removal when stale paths were actually found.
        if remove_false_exists and false_exists:
            prev_len = len(self.path_df)
            # drop rows whose path is missing on disk
            self.path_df = self.path_df[~self.path_df['path'].isin(false_exists)]
            cur_len = len(self.path_df)
            if prev_len == cur_len:
                raise Exception("Operation did not remove paths from dataframe")
            self.save_path_df()
            if verbose:
                print("Removed " + str(prev_len - cur_len) + " paths from path_df dataframe")

    def get_path(self, corpus_name, file_purpose, inst_identifier, file_type, add_path=True, exists=False,
                split_identifier=None, num_splits=1, split_step=0, nested_dir=None, warn=False):
        """Build the path for a generated file and optionally register it.

        The path is
        ``dataset_dir/corpus_name/file_purpose/[nested_dir/]inst_identifier[_split_identifier]file_type``;
        ``file_type`` is appended verbatim, so it should include the leading dot.
        When ``add_path`` is True the path is recorded in ``path_df`` (and the
        directories are created) via ``add_path_to_df``.

        Returns:
            The path as a string.
        """
        # do check here make sure filename compatible, or elsewhere
        path = self.dataset_dir + '/' + corpus_name + '/' + file_purpose + '/'
        if nested_dir is not None:
            path += nested_dir + '/'
        path += str(inst_identifier)
        if split_identifier is not None:
            path += '_' + str(split_identifier)
        path += file_type

        if add_path:
            self.add_path_to_df(corpus_name, file_purpose, split_identifier, num_splits, split_step, nested_dir,
                                inst_identifier, file_type, path, exists, save=True, warn=warn)
        return path

    def add_path_to_df(self, corpus_name, file_purpose, split_identifier, num_splits, split_step, nested_dir,
                       inst_identifier, file_type, path, exists, save=True, warn=False):
        """Register ``path`` in ``path_df`` unless it is already listed.

        Creates the ``file_purpose`` directory (and ``nested_dir`` when given)
        for new paths. When the path is already listed nothing changes; a
        warning is emitted if ``warn`` is True.
        """
        if (self.path_df['path'] == path).any():  # row already exists
            if warn:
                warnings.warn("Path already exists in dataframe: " + str(path))
            return

        # create appropriate dir if needed
        file_purp_dir_path = self.dataset_dir + '/' + corpus_name +  '/' + file_purpose
        self.create_dir_if_not_exists(file_purp_dir_path)
        if nested_dir is not None:  # create nested dir if necessary
            nested_dir_path = file_purp_dir_path + '/' + nested_dir
            self.create_dir_if_not_exists(nested_dir_path)
        # add to path_df
        if num_splits is None:
            num_splits = 1

        row = pd.DataFrame({'corpus_name': pd.Series([corpus_name], dtype=str),
                              'file_purpose': pd.Series([file_purpose], dtype=str),
                              'split_identifier': pd.Series([split_identifier], dtype=str),
                              'num_splits': pd.Series([num_splits], dtype=int),
                              'split_step': pd.Series([split_step], dtype=int),
                              'nested_dir': pd.Series([nested_dir], dtype=str),
                              'instance_identifier': pd.Series([inst_identifier], dtype=str),
                              'file_type': pd.Series([file_type], dtype=str),
                              'path': pd.Series([path], dtype=str),
                              'exists': pd.Series([exists], dtype=bool)})

        # DataFrame.append was removed in pandas 2.0; concat is the equivalent
        self.path_df = pd.concat([self.path_df, row], ignore_index=True)
        if save:  # save new path_df
            self.save_path_df()

    def update_path_exists(self, path, save=True):
        """Flag ``path`` as existing in ``path_df`` and optionally save."""
        self.path_df.loc[self.path_df['path'] == path, 'exists'] = True
        if save:
            self.save_path_df()

    def create_dir_if_not_exists(self, dir_path, warn=True):
        """Create ``dir_path`` (including parents) when it is missing.

        Parameters:
            dir_path: directory to create.
            warn: emit a warning when a directory is created.

        Returns:
            True if the directory was created, False if it already existed.
        """
        if os.path.exists(dir_path):
            return False
        # exist_ok guards against another process creating it in between
        os.makedirs(dir_path, exist_ok=True)
        if warn:
            warnings.warn("Created new directory at " + str(dir_path))
        return True

    def search_path_df(self, search_dict, df_slice=None):
        """Filter ``df_slice`` (default: the whole ``path_df``) by column equality.

        Parameters:
            search_dict: mapping of column name -> required value.
            df_slice: frame to filter instead of ``path_df``.

        Returns:
            The rows where every listed column equals its required value.
        """
        if df_slice is None:
            df_slice = self.path_df
        for col_name, value in search_dict.items():
            df_slice = df_slice[df_slice[col_name] == value]
        return df_slice

    def source_dict_correct(self, source_dict):
        """Validate a corpus source dict.

        Returns:
            True when all keys of ``corpus_sources_keys`` are present and every
            path entry exists.

        Raises:
            Exception: if a required key is missing.
            FileNotFoundError: if any listed path does not exist.
        """
        # check has all appropriate keys
        for key in self.corpus_sources_keys:
            if key not in source_dict:
                raise Exception(str(key) + " is missing from corpus_source dict")
        false_paths = []
        for path_type, path in source_dict.items():
            if path_type == "corpus_name":  # dict entry not a path, don't check
                continue
            if not file_exists(path):
                false_paths.append(str(path_type) + " does not exist at " + str(path))
        if len(false_paths) > 0:
            error_str = "\n".join(false_paths)
            raise FileNotFoundError(error_str)
        return True

    def create_corpus_source_dict(self, corpus_name, dir_path, topics_file_path, nuggets_file_path,
                                 matches_file_path):
        """Build and validate a corpus source dict ready for ``add_corpus_source``."""
        s_dict = {"corpus_name":corpus_name, "dir_path":dir_path,
                  "topics_path":topics_file_path, "nuggets_path":nuggets_file_path,
                 "matches_path":matches_file_path}
        self.source_dict_correct(s_dict)
        return s_dict

    def add_corpus_source(self, corpus_source_dict, overwrite=False):
        """Add a corpus directory to load from and its meta files

        The entry is stored under its ``corpus_name`` (which is left out of the
        stored dict), a directory for the corpus is created inside the dataset
        directory, and the sources are saved. An already-present corpus is only
        replaced when ``overwrite`` is True; otherwise a warning is emitted.
        The passed dict is not modified.
        """
        # check paths exist
        self.source_dict_correct(corpus_source_dict)
        corpus_name = corpus_source_dict["corpus_name"]
        # store new entry
        if corpus_name in self.corpus_sources and overwrite==False:
            warnings.warn(str(corpus_name) + " is already present in corpus source dictionary. \n Proceeding with dict entry")
        else:
            # corpus_name becomes the key, so it is left out of the stored value;
            # a new dict is built so the caller's dict stays reusable
            sources = {key: value for key, value in corpus_source_dict.items()
                       if key != "corpus_name"}
            self.corpus_sources[corpus_name] = sources
            # create folder for outputting new source files
            new_corpus_dir = self.dataset_dir + '/' + corpus_name
            self.create_dir_if_not_exists(new_corpus_dir)
            self.save_corpus_sources()

    def get_corpus_sources(self, corpus_names=None):
        """Retrieve file paths from corpus_load dicts
        Parameters:
            corpus_names: list of corpus names to retrieve, if None then retrieve all

        Returns:
            A dictionary where keys are the corpus names and values are target file paths
        """
        if corpus_names is None:
            corpus_names = self.corpus_sources.keys()
        return {name: self.corpus_sources[name] for name in corpus_names}

    def save_corpus_sources(self):
        """Pickle ``corpus_sources`` to ``corpus_sources_pickle_path``."""
        with open(self.corpus_sources_pickle_path, 'wb') as handle:
            pickle.dump(self.corpus_sources, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def load_corpus_sources(self):
        """Load ``corpus_sources`` from its pickle, or start with an empty dict.

        Returns:
            True if the pickle file existed, False otherwise.
        """
        if os.path.exists(self.corpus_sources_pickle_path):
            # NOTE(security): pickle.load can execute arbitrary code; only load
            # a corpus_sources.pickle that this project wrote itself.
            with open(self.corpus_sources_pickle_path, 'rb') as handle:
                self.corpus_sources = pickle.load(handle)
            return True
        self.corpus_sources = {}  # create empty dictionary
        return False

    def save_path_df(self):
        """Write ``path_df`` to ``path_df_path``."""
        # index=False: the row index carries no information and would come
        # back as an 'Unnamed: 0' column on the next load
        self.path_df.to_csv(self.path_df_path, compression=self.compression, index=False)

    def load_path_df(self):
        """File containing info about file paths to systematically load files

        Returns:
            True if the file existed and was loaded, False if an empty frame
            with the ``path_df_cols`` columns was created instead.
        """
        if os.path.exists(self.path_df_path):
            self.path_df = pd.read_csv(self.path_df_path, compression=self.compression)

            self.path_df, removed = remove_unnamed_cols(self.path_df, show_removed=True)
            if removed:  # save changes
                self.save_path_df()
            return True
        path_df_dict = {}
        for col_name, data_type in self.path_df_cols.items():  # ensure columns don't infer wrong typing
            path_df_dict[col_name] = pd.Series([], dtype=data_type)
        self.path_df = pd.DataFrame(path_df_dict)  # create empty dataframe
        return False

### Retrieving Generated Corpus Files

In [5]:
class PathRetriever:
    """Helper class to select necessary paths

    Wraps a ``FilePathHandler`` and narrows its path table down to the rows
    for a corpus / file purpose, resolving preferred attributes (file type,
    split step, nested directory) with a fallback when the preferred value is
    not available.
    """
    def __init__(self, proj_repo):
        self.proj_repo = proj_repo
        self.path_handler = FilePathHandler(proj_repo)
        self.file_type_options = [".hdf", ".csv.gz"]

    def get_corpus_names(self):
        """Return the names (dict keys) of all registered corpus sources."""
        return self.path_handler.corpus_sources.keys()

    def get_nested_dirs(self, corpus_name, file_purpose):
        """List the distinct nested directory names of existing paths.

        Missing (None/NaN) and empty-string entries are left out.
        """
        paths = self.get_base_paths(corpus_name, file_purpose, verbose=False, exists=True)
        # NaN never compares equal to anything, so missing values have to be
        # detected with pd.notna rather than with != comparisons
        return [x for x in paths['nested_dir'].unique() if pd.notna(x) and x != ""]

    def get_file_types(self, corpus_name, file_purpose, nested_dir=None):
        """Return the distinct file types registered for the given selection."""
        paths = self.path_handler.load_path_df_slice(corpus_name, file_purpose, nested_dir=nested_dir)
        return paths['file_type'].unique()

    def get_topic_path(self, corpus_name, file_type=".hdf", use_any=True, verbose=True):
        """Return the path of the existing topics file for ``corpus_name``.

        ``file_type`` is preferred; when it is unavailable and ``use_any`` is
        True the first registered file type is used instead.
        """
        paths = self.get_base_paths(corpus_name, "topics", verbose=verbose, exists=True)
        paths = self.resolve_path_attr(paths, "file_type", file_type, use_any=use_any, technique="first",
                                      verbose=verbose)
        return paths['path'].iloc[0]

    def get_corpus_paths(self, corpus_name, file_type=".hdf", topic_ids=None, split_step=None, use_any=True,
                      verbose=True, exists=True):
        """Return the path rows of the corpus files for ``corpus_name``.

        ``split_step`` is resolved first (fallback: maximum), then
        ``file_type`` (fallback: first). ``topic_ids`` optionally restricts the
        rows by ``instance_identifier``.
        """
        paths = self.get_base_paths(corpus_name, "corpus", verbose=verbose, exists=exists)
        # prioritise matching split_step, then file_type
        paths = self.resolve_path_attr(paths, "split_step", split_step, technique="max", use_any=use_any,
                                      verbose=verbose)
        paths = self.resolve_path_attr(paths, "file_type", file_type, technique="first", use_any=use_any,
                                      verbose=verbose)
        if topic_ids is not None:  # match specific topic_ids
            paths = self.resolve_multi_target_vals(paths, "instance_identifier", topic_ids, verbose=verbose)
        return paths

    def get_nugget_paths(self, corpus_name, file_type=".hdf", topic_ids=None, use_any=True, verbose=True,
                        exists=True):
        """Return the path rows of the nugget files for ``corpus_name``.

        ``topic_ids`` optionally restricts the rows by ``split_identifier``.
        """
        paths = self.get_base_paths(corpus_name, "nuggets", verbose=verbose, exists=exists)
        paths = self.resolve_path_attr(paths, "file_type", file_type, technique="first", use_any=use_any,
                                      verbose=verbose)
        if topic_ids is not None:
            paths = self.resolve_multi_target_vals(paths, "split_identifier", topic_ids, verbose=verbose)
        return paths

    def get_embedding_paths(self, corpus_name, nested_dir, file_type=".hdf", topic_ids=None, split_step=None,
                            exists=True, use_any=True, verbose=True, return_dir_path=False):
        """Return the path rows of the embedding files stored under ``nested_dir``.

        Returns:
            The matching rows, or ``(rows, embedding_dir_path)`` when
            ``return_dir_path`` is True.
        """
        paths = self.get_base_paths(corpus_name, "embeddings", verbose=verbose, exists=exists)
        paths = self.resolve_path_attr(paths, "nested_dir", nested_dir, technique="first", use_any=use_any,
                                      verbose=True)  # always explicitly tell if using diff embedding
        # prioritise matching split_step, then file_type
        paths = self.resolve_path_attr(paths, "split_step", split_step, technique="max", use_any=use_any,
                                      verbose=verbose)
        paths = self.resolve_path_attr(paths, "file_type", file_type, technique="first", use_any=use_any,
                                      verbose=verbose)
        if topic_ids is not None:  # match specific topic_ids
            paths = self.resolve_multi_target_vals(paths, "instance_identifier", topic_ids, verbose=verbose)
        if return_dir_path:
            emb_dir = self.get_nested_dir_path(corpus_name, "embeddings", nested_dir)
            return paths, emb_dir
        return paths

    def get_base_paths(self, corpus_name, file_purpose, exists=True, verbose=True):
        """Return all path rows for a corpus / file purpose.

        Parameters:
            exists: when truthy, keep only rows flagged as existing.

        Raises:
            Exception: if no row is left.
        """
        print_name = str(file_purpose) + " paths"
        if verbose:
            print("Loading " + print_name + " for " + str(corpus_name))
        paths = self.path_handler.load_path_df_slice(corpus_name, file_purpose)
        if exists:
            paths = paths[paths['exists'] == True]
        if len(paths) == 0:
            raise Exception(print_name + " not processed for " + str(corpus_name))
        return paths

    def resolve_multi_target_vals(self, paths, attr_name, target_vals, full_match=True, verbose=True):
        """Find where path attribute/df column inclusively matches target_vals (e.g. topic_ids)

        Each target value is compared as given and, if nothing matches, again
        as a string (the column may have been loaded with a different dtype).

        Raises:
            Exception: if ``full_match`` is True and a target value matches no row.
        """
        target_vals = convert_to_list(target_vals)  # case only one is passed
        paths_list = []
        for target_val in target_vals:
            match = paths[paths[attr_name] == target_val]
            if len(match) == 0:
                match = paths[paths[attr_name] == str(target_val)]  # see if wrong datatype
                if len(match) == 0 and full_match:
                    raise Exception("There are no paths with " + str(attr_name) + " that have a value of "
                               + str(target_val))
            paths_list.append(match)
        if not paths_list:  # no targets given: pd.concat([]) would raise
            return paths.iloc[0:0]
        return pd.concat(paths_list)

    def resolve_path_attr(self, paths, attr_name, target_val, technique="first", use_any=True, verbose=True):
        """Keep the rows whose ``attr_name`` equals ``target_val``, with a fallback.

        When ``target_val`` is None or not present and ``use_any`` is True, the
        rows are chosen by ``technique``: "first" (first distinct value found),
        "max" or "min" of the column. Any other technique leaves the rows
        unfiltered.

        Raises:
            Exception: if there is no exact match and ``use_any`` is False.
        """
        if len(paths) == 0:  # nothing to resolve; avoids indexing an empty array
            return paths
        unique = paths[attr_name].unique()
        # None means "no preference"; comparing a column to None never matches
        exact_match = target_val is not None and target_val in unique
        if exact_match:
            paths = paths[paths[attr_name] == target_val]  # use paths that match target vals
        elif use_any:  # resolve to find other matches
            if technique == "first":  # get row(s) that have the first identified unique val
                first_val = unique[0]
                if pd.isna(first_val):  # NaN != NaN, so select missing values explicitly
                    paths = paths[paths[attr_name].isna()]
                else:
                    paths = paths[paths[attr_name] == first_val]
            elif technique == "max":  # get row(s) with max of attr column
                paths = paths[paths[attr_name]==paths[attr_name].max()]
            elif technique == "min":  # get row(s) with min of attr col
                paths = paths[paths[attr_name]==paths[attr_name].min()]
        else:
            raise Exception("file with attribute " + str(attr_name) + " and value " + str(target_val)
                            + " not found")
        if verbose and not exact_match and target_val is not None:  # unable to find specified target_val
            print(str(attr_name) + " not found with " + str(target_val) + ", technique "
                 + str(technique) + " used to find match instead")
        return paths

    def get_nested_dir_path(self, corpus_name, file_purpose, nested_dir):
        """Return ``dataset_dir/corpus_name/file_purpose/nested_dir`` as a path."""
        return os.path.join(self.path_handler.dataset_dir, corpus_name, file_purpose, nested_dir)

## Markup Loading Functions

In [6]:
# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    """Parse a markup file into a BeautifulSoup object.

    Parameters:
        addr: path of the file to parse.
        gz: open the file with ``gzip.open`` instead of the builtin ``open``.
        xml: parse with the "xml" parser; when False the "lxml" parser is used.
        verbose: print the path (and a note once a gzip file has been opened).

    Returns:
        The parsed BeautifulSoup object.
    """
    if verbose:
        print(addr)

    opener = gzip.open if gz else open
    # the context manager closes the handle even if parsing raises
    with opener(addr) as f:
        if gz and verbose:
            print("gz file opened")
        parser = "lxml" if xml == False else "xml"  # lxml parser for speed
        markup = bs(f, parser)
    return markup


# parse markup and append one row (list of tag values, ordered as tag_list) per matching element to entry_list
def parse_markup(markup, entry_list, tag_list, find_tag, topic_id=None):
    """Append one row of values to ``entry_list`` per ``find_tag`` element.

    Parameters:
        markup: parsed document providing ``find_all``.
        entry_list: list that receives the rows; it is modified in place.
        tag_list: field names; every row holds the values in this order.
        find_tag: name of the elements to turn into rows.
        topic_id: when not None, stored in the row's 'topic_id' field.

    Direct children whose lower-cased tag name is in ``tag_list`` fill the
    field of that name. Untagged children (bare text) other than a single
    newline fill the 'text' field. Fields that are never filled stay None.
    """
    for element in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for child in element.children:  # children use direct children, descendants uses all
            tag_name = str(child.name).lower()
            if tag_name in entry:
                # store under the lower-cased key that was matched; using the
                # original-case name would add a new key and lengthen the row
                entry[tag_name] = str(child.string)
            elif child.name is None and child.string != '\n':  # inner body of <doc> tag
                entry['text'] = str(child.string)
        entry_list.append(list(entry.values()))
        
            
# find gzipped files (by .gz extension) directly inside a directory (subdirectories are not searched)
def search_dir(path):
    """Return the paths of the entries directly inside ``path`` ending in '.gz'.

    The extension check is case-insensitive. Subdirectories are not descended
    into, and the order is whatever ``os.scandir`` yields.
    """
    return [entry.path for entry in os.scandir(path)
            if os.path.splitext(entry.path)[-1].lower() == ".gz"]


def list_to_dataframe(markup_list, tags):
    """Build a DataFrame from a list of rows, using ``tags`` as column names."""
    frame = pd.DataFrame(data=markup_list, columns=tags)
    return frame

## Dataframe Loading

### Control Functions

In [7]:
def topic_id_as_int(topic_id):
    """Convert a topic id to an int.

    Plain integers (or integer strings) are converted directly. The
    non-standard dotted form, e.g. 'TS14.18' or '14.18', yields the integer
    after the first dot (18).

    Returns:
        The topic id as an int, or None when no id can be discerned (missing
        value, unknown prefix, no number after the prefix, ...).
    """
    try:
        return int(topic_id)
    except (TypeError, ValueError, OverflowError):
        # not directly convertible (None, NaN, 'TS14.18', ...): inspect below
        pass
    if not isinstance(topic_id, str):
        return None  # missing or non-textual value, nothing to parse
    parts = topic_id.split(".", 2)
    # a bare prefix such as 'TS14' has no id part after the dot
    if len(parts) < 2 or parts[0].upper() not in ("TS14", "14"):
        return None
    try:
        return int(parts[1])  # extract int '18'
    except ValueError:
        return None  # no discernable topic_id

def convert_df_topic_id(df, col_name="query_id", remove_null=True, in_place=False):
    """Convert the topic id column of ``df`` to integers via ``topic_id_as_int``.

    Parameters:
        df: DataFrame holding the topic id column.
        col_name: name of the topic id column.
        remove_null: drop rows whose id could not be converted and cast the
            column to int.
        in_place: when True the converted column is written into ``df``
            itself; when False (default) ``df`` is left untouched and a copy
            is converted. Row removal always produces a new frame.

    Returns:
        The converted DataFrame.
    """
    # work on a copy unless asked otherwise, so the same raw frame can be
    # passed in repeatedly without being altered
    conv_df = df if in_place else df.copy()
    conv_df[col_name] = conv_df[col_name].apply(topic_id_as_int)
    if remove_null:
        conv_df = conv_df[conv_df[col_name].notna()]
        conv_df = conv_df.astype({col_name:int})
    return conv_df

def find_duplicates(df):
    """Return the set of 'docid' values that occur more than once in ``df``."""
    first_sightings = set()
    repeats = set()
    for doc_id in df['docid']:
        if doc_id in first_sightings:
            repeats.add(doc_id)
        else:
            first_sightings.add(doc_id)
    return repeats

def get_file_ext(path):
    """Return the full extension of ``path``, keeping stacked suffixes together.

    For example 'x.csv.gz' gives '.csv.gz'; a path without a suffix gives ''.
    """
    return "".join(Path(path).suffixes)

def save_df_file_type(df, save_path, verbose=True):
    """Save ``df`` in the format given by the extension of ``save_path``.

    Supported extensions are '.csv.gz' (gzipped csv) and '.hdf' (HDF5 with
    compression level 9). 'Unnamed' columns are removed from ``df`` (in place)
    before saving.

    Raises:
        ValueError: if the extension is not one of the supported types.
    """
    file_type = get_file_ext(save_path)
    df = remove_unnamed_cols(df)
    if file_type == ".csv.gz":
        df.to_csv(save_path, compression='gzip')
        if verbose:
            print("df saved as gzipped csv at: " + str(save_path))
    elif file_type == ".hdf":
        complevel = 9
        # a key is required by to_hdf; only one df is stored per file, so the
        # name carries no information. It is passed by keyword because
        # positional use is deprecated in recent pandas.
        key = "single_df"
        df.to_hdf(save_path, key=key, complevel=complevel)
        if verbose:
            print("df saved as hdf complevel " + str(complevel) + " at: " + str(save_path))
    else:
        raise ValueError(str(file_type) + " is not a valid file type option")
        
def read_df_file_type(save_path, verbose=True, concat_multiple=True):
    """Read one or more saved DataFrames, choosing the reader by file extension.

    Parameters:
        save_path: a path or a list of paths ('.csv.gz' or '.hdf' files).
        verbose: print which kind of file each frame was loaded from.
        concat_multiple: combine all frames into one DataFrame (with a fresh
            index); when False a list of DataFrames is returned.

    Raises:
        ValueError: if a path has an unsupported extension.
    """
    # convert to list to allow read multiple
    save_path = convert_to_list(save_path)
    dfs = []
    for path in save_path:  # every file is read exactly once
        file_type = get_file_ext(path)
        if file_type == ".csv.gz":
            df = pd.read_csv(path, compression='gzip')
        elif file_type == ".hdf":
            df = pd.read_hdf(path)
        else:
            raise ValueError(str(file_type) + " is not a valid file type option")
        if verbose:
            print("loaded from " + file_type + " file")
        dfs.append(remove_unnamed_cols(df))
    if concat_multiple:
        dfs = pd.concat(dfs, ignore_index=True, sort=False)  # combine into one df
    return dfs

def load_df_control(save_path, load_func, save=True, force_reload=False, 
                    name=None, verbose=True, path_handler=None):
    """Load a DataFrame from its saved file(s), or build and save it.

    Parameters:
        save_path: path or list of paths of the saved file(s).
        load_func: zero-argument callable that builds the DataFrame from scratch.
        save: save a freshly built DataFrame to ``save_path``.
        force_reload: rebuild even if the saved file(s) exist.
        name: label printed while loading (only when ``verbose``).
        verbose: print progress and a preview of the first rows.
        path_handler: when given, the saved path is flagged as existing in it.

    Returns:
        The loaded or freshly built DataFrame.

    Raises:
        ValueError: if the DataFrame must be built but more than one save path
            was given.
    """
    df = None
    save_path = convert_to_list(save_path)  # allows loading groups of saved files same way as singular paths
    if name is not None and verbose:
        print("Loading " + name)
    if not file_exists(save_path) or force_reload:
        if len(save_path) > 1:
            raise ValueError("There should only be one path to save to if no save paths already exist")
        df = load_func()
        df = remove_unnamed_cols(df)
        if verbose:
            print("df created from scratch")
        if save:
            # in case of loading df from original file, should only be one save_path
            save_df_file_type(df, save_path[0], verbose=verbose)
            if path_handler is not None:
                path_handler.update_path_exists(save_path[0])
    else:
        df = read_df_file_type(save_path, verbose=verbose)
    if verbose:
        preview = df[0:4]
        # display() renders the frame itself and returns None, so its result
        # must not be printed; it only exists inside IPython/Jupyter, hence the
        # plain-print fallback
        try:
            display(preview)
        except NameError:
            print(preview)
    return df

### Dataframes from Corpus Files

#### Topics

In [8]:
# load topics into dataframe
def __load_topics(path, verbose=True):
    """Parse topic xml file(s) into a DataFrame with one row per topic id.

    Parameters:
        path: path or list of paths of uncompressed xml topic files.
        verbose: print how many duplicate topics were dropped.

    Every "event" element becomes a row with the fields of the module-level
    ``topic_tags``. Ids are converted to int, rows without a usable id are
    removed, and only the first row per id is kept.
    """
    topics_list = []
    for p in convert_to_list(path):
        parse_markup(open_markup_file(p, gz=False, xml=True), 
                        topics_list, topic_tags, "event")
    df = list_to_dataframe(topics_list, topic_tags)
    
    df = convert_df_topic_id(df, col_name='id', remove_null=True)
    # drop any duplicates found over the files
    prev_size = len(df)
    df = df.drop_duplicates(subset=['id'], keep='first')  # no duplicate documents
    if verbose:
        num_removed = prev_size - len(df)  # rows before minus rows after
        print(str(num_removed) + " duplicate documents removed from topics df")
    return df

def load_topics(save_path, load_path=None, save=True, force_reload=False, verbose=True, path_handler=None):
    """Return the topics DataFrame, loading the saved copy or building it.

    The work is delegated to ``load_df_control``; when the frame has to be
    built it is parsed from ``load_path`` with ``__load_topics``.
    """
    def build_topics_df():
        return __load_topics(load_path, verbose=verbose)

    return load_df_control(save_path, build_topics_df, save=save, force_reload=force_reload,
                           name="topics", verbose=verbose, path_handler=path_handler)

# topics = load_topics()

#### Main Corpus Files

In [9]:
# load all formatted gzipped html files into dataframe

def __load_corpus(corpus_dir, doc_tags=None, topic_ids=None, split_every=None, split_start_doc=None,
                 drop_duplicates=True, verbose=True):
    """Parse the gzipped markup files of the given topics into one DataFrame.

    Parameters:
        corpus_dir: directory holding one sub-directory per topic id.
        doc_tags: fields to extract per <doc>; a default field list is used
            when None.
        topic_ids: iterable of topic ids (sub-directory names) to load.
        split_every: number of files to load per topic; only applied together
            with ``split_start_doc``.
        split_start_doc: index of the first file to load per topic.
        drop_duplicates: keep only the first document per 'streamid'.
        verbose: print how many duplicate documents were dropped.

    Returns:
        DataFrame with the ``doc_tags`` columns; 'epoch' is numeric.

    Raises:
        ValueError: if ``topic_ids`` is None.
    """
    if doc_tags is None:
        doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
    if topic_ids is None:
        raise ValueError("topic_ids must be given to locate the topic directories in the corpus")

    topic_frames = []
    for topic_id in topic_ids:
        print("Loading topic " + str(topic_id) + "...")
        topic_list = []
        topic_path = corpus_dir + '/' + str(topic_id)
        gz_paths = search_dir(topic_path)
        
        if split_every is not None and split_start_doc is not None:
            # slicing clamps to the end of the list by itself, so the final
            # split keeps its last file
            gz_paths = gz_paths[split_start_doc:split_start_doc + split_every]
        
        for gz_path in tqdm(gz_paths, position=0, leave=True):
            parse_markup(open_markup_file(gz_path, verbose=False),
                             topic_list, doc_tags, "doc", topic_id=topic_id)
        topic_frames.append(list_to_dataframe(topic_list, doc_tags))

    # one concat at the end instead of repeated DataFrame.append (removed in
    # pandas 2.0 and quadratic in the number of topics)
    if topic_frames:
        df = pd.concat(topic_frames)
    else:
        df = pd.DataFrame(columns=doc_tags)
    df['epoch'] = pd.to_numeric(df['epoch'])
    if drop_duplicates:
        prev_size = len(df)
        df = df.drop_duplicates(subset=['streamid'], keep='first')  # no duplicate documents
        if verbose:
            num_removed = prev_size - len(df)  # rows before minus rows after
            print(str(num_removed) + " duplicate documents removed from corpus")
    return df

def load_corpus(save_path, corpus_dir=None, doc_tags=None, topic_ids=None, split_every=None, split_start_doc=None,
                save=True, force_reload=False, verbose=True, path_handler=None, drop_duplicates=True):
    """Return the corpus DataFrame, loading the saved copy or building it.

    The work is delegated to ``load_df_control``; when the frame has to be
    built it is parsed from ``corpus_dir`` with ``__load_corpus``. With
    ``drop_duplicates`` only the first document per 'streamid' is kept, which
    also covers duplicates across several loaded files.
    """
    def build_corpus_df():
        return __load_corpus(corpus_dir, doc_tags=doc_tags,
                             topic_ids=topic_ids, split_every=split_every,
                             split_start_doc=split_start_doc,
                             drop_duplicates=drop_duplicates, verbose=verbose)

    corpus = load_df_control(save_path, build_corpus_df,
                             save=save, force_reload=force_reload, name="corpus", verbose=verbose, 
                             path_handler=path_handler)
    # remove duplicate documents from corpus if required
    if drop_duplicates and corpus['streamid'].duplicated().any():
        prev_size = len(corpus)
        corpus = corpus.drop_duplicates(subset=['streamid'], keep='first')  # get rid of them
        if verbose:
            num_removed = prev_size - len(corpus)  # rows before minus rows after
            print(str(num_removed) + " duplicate documents removed from corpus")
    
    if verbose:
        print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
    return corpus

# corpus = load_corpus(doc_tags=doc_tags)

#### Nuggets (Evaluation Technique)

In [10]:
def spacy_sents_string_list(text, nlp):
    """Split ``text`` into sentences with spaCy and return them as strings.

    Parameters:
        text: text to split.
        nlp: a loaded spaCy pipeline, or None to load "en_core_web_sm".

    Returns:
        ``(sentences, nlp)``: the list of sentence strings and the pipeline
        that was used, so the caller can reuse it.
    """
    model = spacy.load("en_core_web_sm") if nlp is None else nlp
    sentences = [str(sentence) for sentence in model(text).sents]
    return sentences, model

def find_nugget_spacy(text, match_start, nlp):
    """Find the spaCy sentence of ``text`` that covers character ``match_start``.

    Parameters:
        text: text to search.
        match_start: character offset into ``text``.
        nlp: a loaded spaCy pipeline, or None to have one loaded.

    Returns:
        ``(nug, sent_id, nlp)``: the first sentence ending after
        ``match_start`` and its index, plus the pipeline used. When no
        sentence reaches the offset, ``nug`` is None and ``sent_id`` equals
        the number of sentences.
    """
    spacy_sents, nlp = spacy_sents_string_list(text, nlp)
    nug = None
    sent_id = 0
    cursor = 0  # offset in text just past the previous sentence
    for sent in spacy_sents:
        # Sentence strings do not include the whitespace that separates them,
        # so summing their lengths drifts away from the real offsets; locate
        # each sentence in the text instead.
        start = text.find(sent, cursor)
        if start == -1:  # not found verbatim: fall back to length counting
            start = cursor
        end = start + len(sent)
        if end > match_start:
            nug = sent
            break
        cursor = end
        sent_id += 1
    return nug, sent_id, nlp

def find_nugget_in_text(text, sent_id, match_start, nlp, spacy_if_not_found=True):
    """Retrieve sentence at index sent_id

    The text is split into lines and the line at ``sent_id`` is returned. The
    index is shifted by one when the first line is empty; if that puts it out
    of range the previous index is tried. If both fail and
    ``spacy_if_not_found`` is True, the sentence is located with spaCy from
    the character offset ``match_start`` instead.

    Returns:
        ``(nug, sent_id, technique)``: the sentence (None if not found), the
        index that was finally used, and "splitlines" or "spacy" to indicate
        how the sentence was found.
    """
    technique = "splitlines"  # indicate how sent was found in df
    split = text.splitlines()
    if split and split[0] == "":  # `split` is empty for empty text
        sent_id += 1  # first entry is empty, adjust offset
    nug = None
    try:
        nug = split[sent_id]
    except IndexError:  # increment has pushed offset out of bounds
        try:
            sent_id -= 1
            nug = split[sent_id]
        except IndexError:  # sent_id does not match text indexing
            if spacy_if_not_found:
                nug, sent_id, nlp = find_nugget_spacy(text, match_start, nlp)
                technique = "spacy"
    return nug, sent_id, technique

In [11]:
def create_nugget_df(corpus_df, nuggets_tsv=None, matches_tsv=None, nuggets_tsv_path=None,
                     matches_tsv_path=None, topic_ids=None, spacy_if_not_found=True, verbose=True):
    """Build a dataframe with one row per nugget match whose document is in the corpus.

    For every row of the matches table the ``update_id`` is split into a streamid
    and a sentence index, the document is looked up in ``corpus_df`` by streamid,
    the nugget is looked up in the nuggets table by ``nugget_id``, and the matched
    sentence is extracted from the document text with ``find_nugget_in_text``.

    Args:
        corpus_df: Corpus dataframe; the columns read here are ``streamid``,
            ``text``, ``docid`` and ``epoch``.
        nuggets_tsv: Nuggets dataframe, or None to load from ``nuggets_tsv_path``.
        matches_tsv: Matches dataframe, or None to load from ``matches_tsv_path``.
        nuggets_tsv_path: Path or list of paths to tab-separated nugget files.
        matches_tsv_path: Path or list of paths to tab-separated match files.
        topic_ids: Optional iterable of topic ids; only rows whose ``query_id`` is
            one of them are kept.
        spacy_if_not_found: Forwarded to ``find_nugget_in_text``.
        verbose: Print summary statistics.

    Returns:
        DataFrame with the match columns, nugget columns, ``sent_in_text``,
        ``sent_id``, ``technique`` and corpus columns; ``query_id`` is renamed to
        ``topic_id``. An empty DataFrame is returned when nothing matched.

    Raises:
        Exception: if neither a table nor its path is given, or if a streamid
            occurs more than once in ``corpus_df``.
    """
    def check_load_tsv(tsv, path):
        """Return ``tsv`` if given, otherwise load and concatenate the file(s) at ``path``."""
        if tsv is None:
            if path is None:
                raise Exception("Must either pass the tsv file or the path to load it")
            # separator passed by keyword: recent pandas no longer accepts it positionally
            frames = [pd.read_csv(p, sep="\t") for p in convert_to_list(path)]
            tsv = pd.concat(frames, ignore_index=True, sort=False)
        return tsv

    def parse_update_id(update_id):
        """Separate update_id into component streamid and sent_id"""
        parts = update_id.split("-")
        return "-".join(parts[:-1]), int(parts[-1])

    # perform check if tsvs or their paths have been passed
    nuggets_tsv = check_load_tsv(nuggets_tsv, nuggets_tsv_path)
    matches_tsv = check_load_tsv(matches_tsv, matches_tsv_path)

    nlp = None  # spacy model, loaded by the fallback if needed
    entry_list = []  # list of dicts to build dataframe

    # what columns from each dataframe to extract to put into nugget_df
    nug_tsv_cols = ['nugget_id', 'importance', 'nugget_len', 'nugget_text']
    mat_tsv_cols = ['query_id', 'match_start', 'match_end']
    corp_cols = ['docid', 'streamid', 'epoch']
    # reference what columns to convert from string into numerical values
    num_cols = ['query_id', 'importance', 'nugget_len', 'epoch', 'sent_id', 'match_start', 'match_end']

    # convert topic_ids to int standard
    # set to new var to allow passing same unchanged nuggets/matches_tsv each time
    nug_tsv = convert_df_topic_id(nuggets_tsv, col_name='query_id', remove_null=True)
    mat_tsv = convert_df_topic_id(matches_tsv, col_name='query_id', remove_null=True)

    # target only selected topic_ids if not None.
    # isin keeps rows for ANY requested topic; filtering repeatedly with == would
    # leave nothing as soon as two different topics are requested.
    if topic_ids is not None:
        wanted_topics = list(topic_ids)
        nug_tsv = nug_tsv[nug_tsv['query_id'].isin(wanted_topics)]
        mat_tsv = mat_tsv[mat_tsv['query_id'].isin(wanted_topics)]

    missed_streamids = []  # store streamids not found for debug purposes
    missed_nuggetids = []  # debug purposes
    missed_sentid_streamids = []  # streamid where sent_id indexing out of bounds
    # wrapping the iterator advances the bar on every row (including skipped ones)
    # and closes it when the loop ends
    for index, row in tqdm(mat_tsv.iterrows(), total=len(mat_tsv), position=0, leave=True):
        entry = {}

        # get streamid and sentid of nugget occurrence
        streamid, sent_id = parse_update_id(row['update_id'])

        # find occurrence in corpus
        occur = corpus_df[corpus_df['streamid'] == streamid]
        if len(occur) == 0:
            missed_streamids.append(streamid)
            continue
        elif len(occur) > 1:
            if verbose:
                print("Number of entries with streamid: " + str(len(occur)))
                display(occur)
            raise Exception("There should be one entry in corpus with given streamid " + str(streamid))
        occur = occur.iloc[0].to_dict()

        # get text of the occurrence
        occur_text = occur['text']
        match_start = int(row['match_start'])

        # get text of the nugget
        nug_row = nug_tsv[nug_tsv['nugget_id'] == row['nugget_id']]
        if len(nug_row) != 1:
            missed_nuggetids.append(row['nugget_id'])
            continue
        nug_row = nug_row.iloc[0].to_dict()

        # add columns from each dataframe
        for col in mat_tsv_cols:
            entry[col] = mat_tsv.at[index, col]
        for col in nug_tsv_cols:
            entry[col] = nug_row[col]
        # adding these columns here to control order of columns in final df
        found_sent, sent_id, technique = find_nugget_in_text(occur_text, sent_id, match_start, nlp,
                                                            spacy_if_not_found=spacy_if_not_found)
        entry['sent_in_text'] = found_sent
        entry['sent_id'] = sent_id
        entry['technique'] = technique
        for col in corp_cols:
            entry[col] = occur[col]

        if technique == "spacy":  # sent_id indexing was wrong
            missed_sentid_streamids.append(streamid)

        entry_list.append(entry)

    if verbose:
        print("Nugget entries were generated for " + str(len(entry_list)) + " nuggets. There were "
             + str(len(missed_streamids)) + " found in matches.tsv but not in corpus")
        print("There were " + str(len(missed_nuggetids)) + " nugget_ids found in matches.tsv but not in nuggets.tsv")
        print(str(len(missed_sentid_streamids)) + " out of " + str(len(entry_list)) + 
              " streamids had out of bounds sent_ids")

    nugget_df = pd.DataFrame(entry_list)
    if len(nugget_df) > 0:
        nugget_df[num_cols] = nugget_df[num_cols].apply(pd.to_numeric, errors='coerce', axis=1)  # convert appropriate cols to numerical values
        nugget_df.rename(columns={'query_id':'topic_id'}, inplace=True)  # topic_id matches other dataframes

    if verbose:
        print("nugget_df entries: " + str(len(nugget_df)))

    return nugget_df

In [12]:
def load_nugget_df(save_path, corpus_df=None, topic_ids=None,path_handler=None, save=True, force_reload=False, 
                   verbose=True, nuggets_tsv=None, matches_tsv=None, nuggets_tsv_path=None, matches_tsv_path=None,
                  spacy_if_not_found=True):
    """Obtain the nugget dataframe through ``load_df_control``.

    ``load_df_control`` receives ``save_path`` and a zero-argument builder that
    calls ``create_nugget_df`` with the arguments given here.
    NOTE(review): presumably it loads from ``save_path`` when a saved copy exists
    and otherwise runs the builder -- confirm against ``load_df_control``.

    Returns:
        Whatever ``load_df_control`` returns (expected to be the nugget dataframe).
    """
    def build_nugget_df():
        # deferred so the dataframe is only built if load_df_control asks for it
        return create_nugget_df(corpus_df, nuggets_tsv=nuggets_tsv, matches_tsv=matches_tsv,
                                nuggets_tsv_path=nuggets_tsv_path, matches_tsv_path=matches_tsv_path,
                                topic_ids=topic_ids, spacy_if_not_found=spacy_if_not_found,
                                verbose=verbose)

    return load_df_control(save_path, build_nugget_df, save=save, force_reload=force_reload,
                           verbose=verbose, path_handler=path_handler)

### Embedding Generation

In [13]:
def create_embedding_df(emb_model, corpus_df, nugget_df, sents_default="splitlines",
                        only_docs_with_nugs=False, nlp=None, verbose=True):
    """Create one embedding row per sentence of every document in the corpus.

    Each document is split into sentences with the technique recorded for its
    nuggets in ``nugget_df`` (or ``sents_default`` when it has none), every
    sentence is encoded with ``emb_model``, and sentences whose index matches a
    nugget ``sent_id`` are flagged with ``is_nugget``.

    Documents are skipped when:
      * they have no nuggets and ``only_docs_with_nugs`` is True,
      * their nuggets were found with more than one technique,
      * the technique is ``"spacy"`` (spaCy splitting is currently disabled here).

    Args:
        emb_model: Object with ``encode(sentences, show_progress_bar=False)``
            returning one embedding per sentence.
        corpus_df: Dataframe with ``streamid``, ``text`` and ``topic_id`` columns.
        nugget_df: Dataframe with ``streamid``, ``sent_id``, ``technique`` and
            ``sent_in_text`` columns; may be completely empty (no columns).
        sents_default: Technique used for documents without nuggets.
        only_docs_with_nugs: Skip documents that have no nuggets.
        nlp: Unused while spaCy splitting is disabled; kept for the interface.
        verbose: Print the number of embedded sentences.

    Returns:
        DataFrame with columns ``topic_id``, ``streamid``, ``sent_id``,
        ``sentence``, ``embedding``, ``is_nugget`` and ``technique``.

    Raises:
        Exception: if one ``sent_id`` of a document refers to different sentences.
    """
    entry_list = []
    # a nugget_df built from zero entries has no columns at all
    nuggets_available = 'streamid' in nugget_df.columns

    for index, row in tqdm_notebook(corpus_df.iterrows(), total=len(corpus_df)):
        # look up doc in nugget_df
        streamid = row['streamid']
        if nuggets_available:
            nug = nugget_df[nugget_df['streamid'] == streamid]
        else:
            nug = nugget_df.iloc[0:0]
        nug_sent_ids = []  # sent ids of nuggets in doc
        technique = sents_default  # technique used to split sentences/get nuggets

        if len(nug) == 0:  # doc has no nuggets
            if only_docs_with_nugs:  # skip this document
                continue
        elif len(nug) > 1:  # doc has multiple possible nuggets
            # collect unique sent_ids
            uniq_sent_ids = list(nug['sent_id'].unique())
            # check if technique/way sentences were constructed match
            uniq_techs = list(nug['technique'].unique())
            if len(uniq_techs) > 1:
                continue  # inconsistent techniques for one document: skip it
            # check sents with different ids are different
            for sent_id in uniq_sent_ids:
                nug_sent_id = nug[nug['sent_id'] == sent_id]
                uniq_sents = list(nug_sent_id['sent_in_text'].unique())
                if len(uniq_sents) > 1:  # mismatch sent_ids and sentence its referencing
                    display(nug)
                    print("Unique sentences for " + str(streamid) + " at sent_id " + str(sent_id) + ": ")
                    print(uniq_sents)
                    raise Exception("Streamid with multiple nuggets, has mismatched sent_ids and referred sentences")

            technique = uniq_techs[0]  # should only be one in list
            nug_sent_ids.extend(uniq_sent_ids)  # add multiple nug sent ids

        else:  # doc has a single nugget
            # get nugget info for df columns
            nug = nug.iloc[0].to_dict()
            # look up the 'technique' KEY (not the current technique value) so the
            # nugget's recorded technique actually overrides the default
            nug_technique = nug.get('technique')
            if nug_technique is not None and pd.notnull(nug_technique):
                technique = nug_technique
            nug_sent_ids.append(nug['sent_id'])

        # split sentence in accordance with how nugget sentence was found/default method if no nuggets
        sents = []
        if technique == "splitlines":
            sents = row['text'].splitlines()
        elif technique == "spacy":
            # TODO: spacy option is disabled; refactor before re-enabling
            continue
        if not sents:
            continue  # nothing to embed for this document

        # get contextual sentence embeddings
        emb_sents = emb_model.encode(sents, show_progress_bar=False)

        # create dataframe entries
        topic_id = int(row['topic_id'])
        for i, sentence in enumerate(sents):
            entry_list.append({"topic_id": topic_id, "streamid": streamid, "sent_id": i,
                               "sentence": sentence, "embedding": emb_sents[i],
                               "is_nugget": i in nug_sent_ids, "technique": technique})

    if verbose:
        print("Embeddings generated for " + str(len(entry_list)) + " sentences")
    emb_df = pd.DataFrame(entry_list)
    return emb_df

In [14]:
def load_embeddings(save_path, emb_model=None, corpus_df=None, nugget_df=None, sents_default="splitlines", 
                    only_docs_with_nugs=False, nlp=None, 
                    force_reload=False, save=True, verbose=True, path_handler=None):
    """Obtain the embedding dataframe through ``load_df_control``.

    ``load_df_control`` receives ``save_path``, the name ``"emb_df"`` and a
    zero-argument builder that calls ``create_embedding_df`` with the arguments
    given here.
    NOTE(review): presumably it loads from ``save_path`` when a saved copy exists
    and otherwise runs the builder -- confirm against ``load_df_control``.

    Returns:
        Whatever ``load_df_control`` returns (expected to be the embedding dataframe).
    """
    def build_emb_df():
        # deferred so embeddings are only computed if load_df_control asks for them
        return create_embedding_df(emb_model, corpus_df, nugget_df, sents_default=sents_default,
                                   only_docs_with_nugs=only_docs_with_nugs, nlp=nlp,
                                   verbose=verbose)

    return load_df_control(save_path, build_emb_df, save=save, force_reload=force_reload,
                           name="emb_df", verbose=verbose, path_handler=path_handler)

#### Update Dataframe (Temporal Information)

In [15]:
# def create_update_df():
#     """Data Frame containing information about docs which have updates/multiple instances in corpus"""
#     def create_entry(row, col_tags):
#         entry = {}
#         for col in col_tags:
#             entry[col] = row[col]
#         return entry
    
#     col_tags = ['docid', 'streamid', 'epoch', 'yyyymmddhh', 'zulu']
#     entry_list = []
#     dups = find_duplicates(corpus)
#     for docid in tqdm(dups, position=0, leave=True):
#         d = corpus[corpus['docid'] == docid]
#         for index, row in d.iterrows():
#             entry = create_entry(row, col_tags)
#             entry_list.append(entry)
             
#     update_df = pd.DataFrame(entry_list)
#     update_df = update_df.set_index(col_tags)
#     return update_df

# def load_update_df(save=True, force_reload=False, verbose=True):
#     update_df = load_df_control(update_csv_path, create_update_df, 
#                                 save=save, force_reload=force_reload, name="update_df", verbose=verbose)
#     return update_df

# update_df = load_update_df()

### Embedding Label Generation

In [16]:
def emb_str_to_float_vector(emb_string):
    """Parse an embedding that was stored as text (``str(vector)``) into a float32 vector.

    Square brackets are removed, the remainder is split on newlines and single
    spaces, empty pieces are dropped, and the remaining tokens are converted to
    ``np.float32`` (the type used in the bert embeddings).

    Args:
        emb_string: Text such as ``"[ 1.  2.5\\n -3. ]"``.

    Returns:
        1-D ``np.float32`` array with one entry per number found in the string.
    """
    unbracketed = emb_string.replace("[", "").replace("]", "")
    tokens = [token
              for text_line in unbracketed.split("\n")
              for token in text_line.split(" ")
              if token != ""]
    return np.array(tokens).astype(np.float32)

# def cosine_similarity(vec_a, vec_b):
#     """Get cosine similarity between two vectors"""
# #     print("in custom cosine_similarity func, is_normed==" + str(is_normed))
#     # retain most precision by converting to 64 bit for operation
#     vec_a = vec_a.astype(np.float64)
#     vec_b = vec_b.astype(np.float64)
#     cos_sim = np.dot(vec_a, vec_b) / (np.linalg.norm(vec_a)*np.linalg.norm(vec_b))
#     # adjust marginal out of bounds floating point rounding errors
#     if cos_sim > 1.0:
#         cos_sim = np.float64(1.0)
#     elif cos_sim < -1.0:
#         cos_sim = np.float64(-1.0)
#     return cos_sim

from numba import jit
@jit(nopython=True)
def cosine_similarity(u:np.ndarray, v:np.ndarray):
    """Cosine similarity of two vectors, compiled by numba in nopython mode.

    Adapted from:
    https://gist.github.com/pranaychandekar/195dc2de1deda3cb0102e3f310071b5e#file-fast_cosine_similarity-py

    The dot product and both squared norms are accumulated in one explicit loop
    over the first axis instead of calling numpy reductions.

    Args:
        u: First vector (indexed as ``u[i]``; presumably 1-D -- confirm at call sites).
        v: Second vector; must have the same length along its first axis as ``u``.

    Returns:
        ``uv / sqrt(uu * vv)``. When either squared norm is zero the function
        returns 1 instead of dividing by zero.

    Raises:
        AssertionError: if the first-axis lengths of ``u`` and ``v`` differ.
    """
    assert(u.shape[0] == v.shape[0])
    # running sums: u.v, u.u and v.v
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i]*v[i]
        uu += u[i]*u[i]
        vv += v[i]*v[i]
    # default result, kept when at least one vector has zero norm
    cos_theta = 1
    if uu!=0 and vv!=0:
        cos_theta = uv/np.sqrt(uu*vv)
    return cos_theta

def l2_normalise_emb(x):
    """Scale ``x`` by the square root of the sum of squares of all its elements.

    For a 1-D vector this is ordinary L2 normalisation. The sum runs over every
    element, so a 2-D input is divided by a single norm for the whole array.

    Args:
        x: numpy array.

    Returns:
        Array of the same shape as ``x``.
    """
    magnitude = (x ** 2).sum() ** 0.5
    return (x.T / magnitude).T

def thread_map_cos_sim(emb, nug_embs):
    """Return the largest cosine similarity between ``emb`` and any nugget embedding.

    Written as a module-level function taking plain arguments so it can be used
    as the worker for ``Pool.starmap``.

    Args:
        emb: A single embedding vector.
        nug_embs: Iterable of nugget embedding vectors.

    Returns:
        The maximum similarity found, or ``float('-inf')`` when ``nug_embs`` is empty.
    """
    best_so_far = float('-inf')
    for candidate in nug_embs:
        score = cosine_similarity(emb, candidate)
        best_so_far = score if score > best_so_far else best_so_far
    return best_so_far

def cos_sim_nearest_nug_col(embs, nug_embs, verbose=True):
    """For every embedding compute its cosine similarity to the nearest nugget embedding.

    The work is distributed over a ``multiprocessing.Pool`` (default size), one
    task per embedding, with ``thread_map_cos_sim`` as the worker.

    Args:
        embs: Sequence of embedding vectors.
        nug_embs: Sequence of nugget embedding vectors; each task receives all of them.
        verbose: Print start/finish messages.

    Returns:
        ``pd.Series`` of dtype float64 with one value per entry of ``embs``, in order.
    """
    arg_list = [(emb, nug_embs) for emb in embs]

    if verbose:
        print("Starting cos_sim_nearest_nug for " + str(len(embs)) + " embeddings and "
             + str(len(nug_embs)) + " nugget embeddings")

    pool = Pool()
    try:
        max_cos_sims = pool.starmap(thread_map_cos_sim, arg_list)
    finally:
        # release the worker processes even if a task raised
        pool.close()
        pool.join()

    if verbose:
        print("Finished cos_sim_nearest_nug for " + str(len(embs)) + " embeddings")

    nearest_nug_col = pd.Series(max_cos_sims, dtype=np.float64)
    return nearest_nug_col

In [17]:
# from scipy.spatial import cKDTree

class EmbeddingLabelGenerator:
    """Adds similarity label columns to saved embedding dataframes.

    Supported labels (``label_options``):
      * ``cosine_similarity``   - similarity of each embedding to the average nugget embedding
      * ``cos_sim_nearest_nug`` - similarity of each embedding to its most similar nugget embedding

    Two helper files are kept in the embedding directory: the average nugget
    embedding (``avg_nug_emb_filename``) and a pickled "path history"
    (``path_history_filename``) recording which label columns have already been
    added to which dataframe file.
    """

    def __init__(self, proj_dir):
        self.proj_dir = proj_dir
        self.path_retriever = PathRetriever(proj_dir)
        self.avg_nug_emb_filename = "avg_nug_emb.npy"
        self.label_options = ["cosine_similarity", "cos_sim_nearest_nug"]
        self.path_history_filename = "label_gen_history.pickle"

    def add_labels(self, corpus_name, nested_dir, file_type=".hdf", split_step=None, 
                   verbose=True, force_reload=False, save=True, selection=None):
        """Add the selected label columns to every embedding dataframe of a corpus.

        Args:
            corpus_name, nested_dir, file_type, split_step: Forwarded to
                ``path_retriever.get_embedding_paths`` to locate the dataframes.
            verbose: Print progress information.
            force_reload: Recompute labels even for files recorded in the path history.
            save: Write modified dataframes and the updated path history back to disk.
            selection: List of labels to add; None selects all ``label_options``.

        Raises:
            Exception: if ``selection`` contains a label not in ``label_options``.
        """
        # resolve label selection
        if selection is None:  # select all labels
            selection = self.label_options
        else:  # make sure no mistakes in parameter
            invalid = [label for label in selection if label not in self.label_options]
            if invalid:
                raise Exception(str(selection) + " contains an invalid selection parameter")

        emb_df_paths, emb_dir = self.path_retriever.get_embedding_paths(
            corpus_name, nested_dir, split_step=split_step, file_type=file_type,
            verbose=False, exists=True, use_any=False, return_dir_path=True)

        # dict tracking what files have already been processed
        path_history, path_history_path = self.load_path_history(emb_dir, verbose=verbose)

        # do pre-processing for labels where required
        avg_emb = None  # cosine_similarity
        nug_embs = None
        if "cosine_similarity" in selection:
            avg_emb, nug_embs = self.get_avg_nugget_emb(corpus_name, nested_dir, emb_df_paths, emb_dir,
                                                        verbose=verbose, force_reload=False, save=save,
                                                        return_nug_embs=True)
        if "cos_sim_nearest_nug" in selection:
            if nug_embs is None:
                nug_embs = self.get_nug_embs(emb_df_paths, emb_dir, col='embedding',
                                             force_reload=False, verbose=verbose)
            # always hand the workers one float64 array, whichever way nug_embs was obtained
            nug_embs = np.asarray(nug_embs, dtype=np.float64)
        if verbose:
            colstr = ", ".join(selection)
            print("Adding " + colstr + " to " + str(len(emb_df_paths)) + " dataframes")

        # add selected labels
        for emb_path in tqdm_notebook(emb_df_paths['path']):
            # find what part of file hasn't been processed
            missing_cols = selection
            if not force_reload and emb_path in path_history:
                missing_cols = [col for col in selection if col not in path_history[emb_path]]
            if len(missing_cols) == 0:
                continue  # already processed all target columns
            if verbose:
                print("Loading " + str(emb_path))

            emb_df = load_embeddings(emb_path, verbose=False)
            embs = np.asarray(list(emb_df['embedding']), dtype=np.float64)
            for select in missing_cols:
                if verbose:
                    print("Adding " + str(select))
                # .values assigns by position; assigning the Series itself would align
                # on index labels and produce NaN if emb_df is not 0..n-1 indexed
                if select == 'cosine_similarity':
                    emb_df['cosine_similarity'] = self.cos_sim_col(embs, avg_emb).values
                elif select == "cos_sim_nearest_nug":
                    emb_df['cos_sim_nearest_nug'] = cos_sim_nearest_nug_col(
                        embs, nug_embs, verbose=verbose).values
            if save:
                if verbose:
                    print("Saving file to " + str(emb_path))
                save_df_file_type(emb_df, emb_path, verbose=verbose)
                # store path history to prevent reopening file to check if columns added
                path_history.setdefault(emb_path, set()).update(missing_cols)
                self.save_path_history(path_history, path_history_path, verbose=verbose)
        print("\nCompleted adding " + str(", ".join(selection)) + " to " + str(nested_dir))

    def load_path_history(self, emb_dir, verbose=True):
        """Load (or start) the record of which files already have which label columns.

        The record is stored in ``emb_dir`` under ``path_history_filename``.
            Keys: path string
            Values: set of columns added

        Returns:
            Tuple ``(path_history, path_history_path)`` where ``path_history`` is
            always a ``defaultdict(set)``.
        """
        path_history_path = os.path.join(emb_dir, self.path_history_filename)
        if os.path.exists(path_history_path):
            # NOTE: pickle executes code on load; only open history files written by this class
            with open(path_history_path, 'rb') as handle:
                loaded = pickle.load(handle)
            # normalise whatever mapping was stored so later .update()/.add() calls work
            path_history = defaultdict(set, {path: set(cols) for path, cols in loaded.items()})
            if verbose:
                print("Path history loaded from file at: " + str(path_history_path))
        else:
            path_history = defaultdict(set)
            if verbose:
                print("Path history created from scratch for: " + str(emb_dir))
        return path_history, path_history_path

    def save_path_history(self, path_history, path_history_path, verbose=True):
        """Pickle ``path_history`` to ``path_history_path``."""
        with open(path_history_path, 'wb') as handle:
            pickle.dump(path_history, handle, protocol=pickle.HIGHEST_PROTOCOL)
        if verbose:
            print("Path history saved to: " + str(path_history_path))

    def l2_normed_col(self, embs, verbose=True):
        """Return an object-dtype Series holding ``l2_normalise_emb(x)`` for each embedding."""
        if verbose:
            print("Adding l2_normed column")
        normed = [l2_normalise_emb(x) for x in embs]
        return pd.Series(normed, dtype=object)

    def cos_sim_col(self, embs, avg_emb):
        """Return a float64 Series with the cosine similarity of each embedding to ``avg_emb``."""
        sims = [cosine_similarity(avg_emb, x) for x in embs]
        return pd.Series(sims, dtype=np.float64)

    def get_avg_nugget_emb(self, corpus_name, nested_dir, emb_df_paths, emb_dir, verbose=True, 
                           force_reload=False, save=True, return_nug_embs=False):
        """Compute, or load from ``emb_dir``, the mean of all nugget embeddings.

        Args:
            corpus_name, nested_dir: Not used in this method; kept for the interface.
            emb_df_paths: Dataframe whose ``path`` column lists the embedding files.
            emb_dir: Directory holding the cached average and nugget list.
            verbose: Print progress information.
            force_reload: Recompute even when a cached average exists.
            save: Write a newly computed average to disk.
            return_nug_embs: Also return the list of nugget embeddings.

        Returns:
            ``avg_emb`` or ``(avg_emb, nug_embs)`` when ``return_nug_embs`` is True.
        """
        avg_emb_path = os.path.join(emb_dir, self.avg_nug_emb_filename)
        # create list of nugget embeddings or load from file
        nug_embs = self.get_nug_embs(emb_df_paths, emb_dir, col='embedding', verbose=verbose,
                                     force_reload=force_reload)
        if force_reload or not os.path.exists(avg_emb_path):  # create new file
            if verbose:
                print("Generating average nugget embedding from dataframes")
            # convert to matrix
            emb_stack = np.stack(nug_embs, axis=0)
            # get avg embedding
            avg_emb = np.mean(emb_stack, axis=0, dtype=np.float64)  # use f64 dtype for better precision in avg
            if save:
                np.save(avg_emb_path, avg_emb)
                if verbose:
                    print("Avg nugget embedding saved to: " + str(avg_emb_path))
        else:  # load from old file
            avg_emb = np.load(avg_emb_path)
            if verbose:
                print("Avg nugget embedding loaded from: " + str(avg_emb_path))
        if return_nug_embs:
            return avg_emb, nug_embs
        return avg_emb

    def get_nug_embs(self, emb_df_paths, emb_dir, col='embedding', force_reload=False, save=True, verbose=True):
        """Collect the ``col`` value of every nugget row across all embedding files.

        The list is cached in ``emb_dir`` as ``<col>_list_all.pickle`` and reloaded
        from there unless ``force_reload`` is set or the cache does not exist.

        Returns:
            List with one embedding per row where ``is_nugget`` is True. Embeddings
            stored as strings are converted with ``emb_str_to_float_vector``.
        """
        nug_embs = []
        fn = str(col) + "_list_all.pickle"
        nug_embs_path = os.path.join(emb_dir, fn)
        if force_reload or not os.path.exists(nug_embs_path):  # create new file
            if verbose:
                print("Collecting nugget embeddings from dataframes")
            for emb_path in tqdm_notebook(emb_df_paths['path']):
                emb_df = load_embeddings(emb_path, verbose=False)
                embs = list(emb_df.query('is_nugget==True')[col])
                # build a new list so the converted vectors are the ones that get kept
                nug_embs.extend(emb_str_to_float_vector(emb) if isinstance(emb, str) else emb
                                for emb in embs)
            # save
            if save:
                with open(nug_embs_path, 'wb') as handle:
                    pickle.dump(nug_embs, handle, protocol=pickle.HIGHEST_PROTOCOL)
                if verbose:
                    print("nug_embs list saved to " + str(nug_embs_path))
        else:  # load from file
            if verbose:
                print("Loading nug_embs list from " + str(nug_embs_path))
            # NOTE: pickle executes code on load; only open cache files written by this class
            with open(nug_embs_path, 'rb') as handle:
                nug_embs = pickle.load(handle)
        return nug_embs

## Filter the Larger Trects Dataset

In [18]:
class TrectsFilter:
    def __init__(self):
        self.base_dir = '/nfs/trects-kba2014'
        self.updates_dir = "/nfs/TemporalSummarization/ts14/results"
        self.updates_csv_paths = self.generate_update_paths()
        self.save_dir = '/nfs/mine-trects-kba2014-filtered'
        self.proc_history_path = self.save_dir + '/' + 'process_history.pickle'
        self.proc_history = None
        self.streamids = set()

    def generate_update_paths(self, verbose=True):
        """List the update/match files that exist under the TemporalSummarization tree.

        Candidates are ``<root>/<track>/results/<file>`` for the tracks ts13, ts14
        and ts15 and three known file names.

        Args:
            verbose: Print the candidate paths that do not exist.

        Returns:
            List of the candidate paths that exist, in track-then-file order.
        """
        root = "/nfs/TemporalSummarization"
        tracks = ["ts13", "ts14", "ts15"]
        file_names = ['updates_sampled.extended.tsv', 'updates_sampled.tsv',
                      'matches.tsv']

        candidates = [root + '/' + track + '/results/' + file_name
                      for track in tracks
                      for file_name in file_names]
        existing = []
        missing = []
        for candidate in candidates:
            bucket = existing if os.path.exists(candidate) else missing
            bucket.append(candidate)

        if verbose:
            print("Attempted to find streamids in these files, but no path exists:")
            print(missing)
            print("")
        return existing
        
        
    def create_filtered_dataset(self, force_reload=False, verbose=True, no_soup=True):
        """Write a copy of the corpus containing only documents with wanted streamids.

        Steps:
        1. Collect the wanted streamids from every file in ``updates_csv_paths``
           (via ``get_streamids``).
        2. Treat every all-digit directory name under ``base_dir`` as a topic id
           and create a matching directory under ``save_dir``.
        3. For each file of each topic (skipping files recorded in the process
           history unless ``force_reload``), extract the matching documents and
           write them to the save path for that file.
        4. Record each finished file in the process history and save the history.

        Args:
            force_reload: Process files even if the history says they are done.
            verbose: Print progress information.
            no_soup: When True, scan files line by line with ``process_file``;
                otherwise read the whole markup with ``open_markup_file`` and use
                ``retrieve_matching_docs``.
        """
        # get streamids for docs that we will filter for
        for update_csv_path in self.updates_csv_paths:
            self.get_streamids(update_csv_path)
        if verbose:
            print("Number of streamids searching for: " + str(len(self.streamids)))

        # get topicids from folder names
        topic_ids = sorted(int(name) for name in os.listdir(self.base_dir) if name.isdigit())

        # create dir to save filtered corpus to
        self.create_dir(self.save_dir)

        # load history of files already processed if exists
        self.load_process_history_dict(topic_ids)

        for topic_id in tqdm_notebook(topic_ids, position=0, leave=True):
            topic_name = str(topic_id)

            # create save directory
            self.create_dir(self.save_dir + '/' + topic_name)

            # get paths for files in target topic dir
            gz_paths = search_dir(self.base_dir + '/' + topic_name)

            # remove already processed files
            if not force_reload:
                if verbose:
                    already_done = [p for p in gz_paths if p in self.proc_history[topic_id]]
                    print("Previously processed " + str(len(already_done)) + " of " + str(len(gz_paths))
                         + " paths for topic " + topic_name)
                gz_paths = [p for p in gz_paths if p not in self.proc_history[topic_id]]

            if verbose and len(gz_paths) > 0:
                print("Processing topic " + topic_name)

            # process each file
            for gz_path in tqdm_notebook(gz_paths):
                if no_soup == True:
                    # line-by-line scan, no html parsing
                    matches = self.process_file(gz_path, verbose=verbose)
                    if verbose:
                        print("len matches: " + str(len(matches)))
                    save_path = self.get_file_save_path(topic_id, gz_path)
                    self.write_docs_to_file(matches, save_path, no_soup=True, verbose=verbose)
                else:
                    # parse the whole file (large files are slow to parse this way)
                    markup = open_markup_file(gz_path, verbose=verbose)
                    # get docs in file that are in streamids
                    matches = self.retrieve_matching_docs(markup, verbose=verbose)
                    save_path = self.get_file_save_path(topic_id, gz_path)
                    # write file and save results
                    self.write_docs_to_file(matches, save_path, verbose=verbose)
                # remember the file so an interrupted run can resume after it
                self.proc_history[topic_id].add(gz_path)
                self.save_process_history_dict(verbose=verbose)

        print("Finished filtering corpus")
        
    def process_file(self, filepath, verbose=True):
        """Scan a gzipped markup file line by line and collect the docs to keep.

        Lines are accumulated in a buffer until a closing ``</doc>`` line is met.
        The first ``streamid`` line seen in the buffer decides whether the buffered
        doc is kept (its id must be in ``self.streamids``).

        Note: the buffer is only emptied on a closing ``</doc>`` line, so lines found
        between two docs stay at the front of the next buffered doc.

        Parameters:
            filepath: path of the gzip file, opened in text mode
            verbose: print the path being processed

        Returns:
            list of kept docs, each doc being the list of its buffered lines
        """
        if verbose:
            print("Processing file at: " + str(filepath))
        kept_docs = []
        pending = []  # lines buffered since the last closing doc tag
        sid_seen = False
        keep_pending = False
        with gzip.open(filepath, 'rt') as gz_file:
            for line in gz_file:
                pending.append(line)
                if len(pending) <= 1:
                    # first buffered line: restart the buffer on an opening doc tag
                    if self.is_tag(line, tag="doc", start_tag=True, end_tag=False):
                        pending = [line]
                    continue

                # only the first streamid line of a buffered doc is inspected
                if not sid_seen and self.is_tag(line, tag="streamid"):
                    sid_seen = True
                    stream_id = self.get_inner_tag(line, tag="streamid", remove_whitespace=True)
                    keep_pending = stream_id in self.streamids

                if not self.is_tag(line, tag="doc", start_tag=False, end_tag=True):
                    continue

                # closing doc tag reached: keep the buffer if it matched, then reset state
                if keep_pending:
                    kept_docs.append(pending)
                pending = []
                keep_pending = False
                sid_seen = False
        return kept_docs
                
                
    def is_tag(self, line, tag="streamid", start_tag=True, end_tag=True):
        """Report whether `line` contains the opening and/or closing form of `tag`.

        Matching is case-insensitive. Which tags are required depends on the flags:
            start_tag and end_tag both True -> both tags must be present
            start_tag True only             -> the opening tag must be present
            start_tag not True              -> the closing tag must be present
        """
        opening, closing = self.create_tags(tag)
        has_opening = re.search(opening, line, re.IGNORECASE) is not None
        has_closing = re.search(closing, line, re.IGNORECASE) is not None

        if not (start_tag == True):
            return has_closing
        if not (end_tag == True):
            return has_opening
        return has_opening and has_closing
        
    def create_tags(self, tag):
        """Return the (opening, closing) markup strings for the tag name `tag`."""
        return "<" + tag + ">", "</" + tag + ">"
            
    def get_inner_tag(self, line, tag="streamid", remove_whitespace=False):
        """Return `line` with the opening and closing `tag` markup removed.

        The tags are removed case-insensitively, matching how `is_tag` detects them.
        (Previously the uppercase pass restarted from `line`, discarding the lowercase
        pass, so lowercase tags were never removed.)

        Parameters:
            line: text line containing the tag
            tag: tag name without angle brackets
            remove_whitespace: when True, strip leading and trailing whitespace
                               (including the newline) from the result

        Returns:
            the line content without the tag markup
        """
        import re  # local import keeps this method self-contained

        start_tag, end_tag = self.create_tags(tag)
        # escape the tags so they are matched literally, in any letter case
        tag_pattern = re.escape(start_tag) + "|" + re.escape(end_tag)
        no_tags = re.sub(tag_pattern, '', line, flags=re.IGNORECASE)
        if remove_whitespace == True:
            no_tags = no_tags.strip()
        return no_tags
        
    def save_process_history_dict(self, verbose=True):
        """Pickle `self.proc_history` to the file at `self.proc_history_path`."""
        with open(self.proc_history_path, 'wb') as history_file:
            serialised = pickle.dumps(self.proc_history, protocol=pickle.HIGHEST_PROTOCOL)
            history_file.write(serialised)
            if verbose:
                print("saved proc_history")
        
    def load_process_history_dict(self, topic_ids):
        """Populate `self.proc_history` from disk, or create a fresh one.

        Returns:
            True when the history was loaded from `self.proc_history_path`,
            False when no file existed and a new history was created for `topic_ids`.
        """
        if not os.path.exists(self.proc_history_path):
            self.proc_history = self.create_process_history_dict(topic_ids)
            return False

        with open(self.proc_history_path, 'rb') as history_file:
            self.proc_history = pickle.load(history_file)
        return True
        
    def create_process_history_dict(self, topic_ids):
        """Build the tracker of already-searched files: one empty set per topic id.

        Keys are the topic ids converted to int; sets are used for fast membership tests.
        """
        return {int(topic_id): set() for topic_id in topic_ids}

                
    def get_file_save_path(self, topic_id, gz_path):
        """Return the save location `<save_dir>/<topic_id>/<filename of gz_path>`."""
        source_name = self.get_filename_from_gz_path(gz_path)
        topic_dir = self.save_dir + '/' + str(topic_id)
        return topic_dir + '/' + source_name
        
                
    def get_filename_from_gz_path(self, gz_path):
        """Return the last '/'-separated component of `gz_path`, extension included."""
        return gz_path.rsplit("/", 1)[-1]
                

    def write_docs_to_file(self, doc_list, save_path, no_soup=False, verbose=True):
        """Write the docs as text to a gzip file at `save_path`.

        Nothing is written when `doc_list` is empty.

        Parameters:
            doc_list: docs to write. With no_soup=True each doc is an iterable of
                      pieces that are concatenated; otherwise each doc is converted
                      with str().
            save_path: destination of the gzip file
            no_soup: when True, wrap all docs in a single <html> element
            verbose: print the destination once written
        """
        if len(doc_list) == 0:  # don't write empty files
            return

        if no_soup == True:
            pieces = ["<html>\n"]
            for doc in doc_list:
                pieces.append("".join(str(part) for part in doc))
                pieces.append("\n")
            pieces.append("</html>")
            payload = "".join(pieces)
        else:
            payload = "\n".join(str(doc) for doc in doc_list)

        with gzip.open(save_path, "wt") as gz_out:
            gz_out.write(payload)
            if verbose:
                print("File written to: " + str(save_path))
        
            
    def retrieve_matching_docs(self, markup, verbose=False):
        """Retrieve docs with matching streamids from markup

        Parameters:
            markup: parsed markup exposing `find_all("doc")`; each doc must expose
                    `find("streamid")` returning an object with a `.string` attribute
                    (presumably a BeautifulSoup tree - verify against the caller)
            verbose: print the number of docs inspected and matched

        Returns:
            list of the doc elements whose streamid is in `self.streamids`
        """
        matches = []
        doc_count = 0
        for doc in markup.find_all("doc"):
            doc_count += 1
            sid_tag = doc.find("streamid")
            # a doc without a usable streamid cannot match: skip it instead of crashing
            if sid_tag is None or sid_tag.string is None:
                continue
            # strip surrounding whitespace, as process_file does for the same ids
            d_streamid = str(sid_tag.string).strip()
            if d_streamid in self.streamids:  # matching doc
                matches.append(doc)
        if verbose:
            print("doc count: " + str(doc_count) + "\nmatch_count: " + str(len(matches)))
        return matches
    
    def get_streamids(self, path):
        """Add the streamids found in a tab-separated updates file to `self.streamids`.

        Parameters:
            path: path of a TSV file with an 'update_id' column

        Returns:
            `self.streamids`, updated with the streamid parsed from every update id
        """
        # `sep` is passed by keyword: pandas 2.x no longer accepts it positionally
        updates_df = pd.read_csv(path, sep="\t")
        for update_id in updates_df['update_id']:
            self.streamids.add(self.parse_streamid(update_id))
        return self.streamids
        
    def parse_streamid(self, updateid):
        """Convert updateid in format: epoch-docid-sentid into epoch-docid"""
        # everything before the last '-' (empty string when there is no '-')
        head, _sep, _sentid = updateid.rpartition("-")
        return head
    
    def create_dir(self, dir_path):
        """Create `dir_path` (with any missing parents) unless it already exists."""
        if os.path.exists(dir_path):
            return
        os.makedirs(dir_path)
        print("Created new directory at " + str(dir_path))

In [19]:
# trectsfilter = TrectsFilter()
# trectsfilter.create_filtered_dataset(verbose=True, force_reload=False, no_soup=True)

## Generating Processed Corpus Files

In [20]:
class CorpusGenerator:
    """Generate the processed dataset files (topics, corpus, nuggets, embeddings, labels)
    for one or more corpora.

    File locations are tracked through a `FilePathHandler`; corpus and embedding files
    are written in splits of `corpus_split_step` / `embedding_split_step` items so that
    an interrupted run can resume where it stopped.
    """
    def __init__(self, proj_dir, corpus_split_step=200, embedding_split_step=50, load_default_emb_model=True):
        """
        Parameters:
            proj_dir: project directory handed to FilePathHandler
            corpus_split_step: number of source files per corpus split
            embedding_split_step: number of corpus rows per embedding split
            load_default_emb_model: load the default SentenceTransformer model; when False a
                                    model must be supplied to generate() before embeddings
                                    can be generated
        """
        self.proj_dir = proj_dir
        self.path_handler = FilePathHandler(proj_dir)
        # ["topics", "corpus", "nuggets", "embed_labels", "updates"]
        self.file_purposes = ["topics", "corpus", "nuggets", "embeddings", "labels"]
        self.corpus_split_step = corpus_split_step
        self.embedding_split_step = embedding_split_step
        self.topic_dfs = {}  # dict of topic dfs per corpus_name
        self.force_reload_options = ["topics", "corpus", "nuggets", "embeddings", "labels"]
        self.file_type_options = [".csv.gz", ".hdf"]
        # always define the attribute so generate() can test it even without a default model
        self.emb_model_dict = None
        if load_default_emb_model:
            self.emb_model_dict = [{"model":SentenceTransformer('distilbert-base-nli-stsb-mean-tokens'),
                                  "name":'distilbert-base-nli-stsb-mean-tokens'}]
        
    def generate(self, file_type=".hdf", selection=None, corpus_names=None, new_corpuses=None, 
            force_reload=False, save=True, verbose=True, emb_model_dict=None, label_selection=None):
        """
        Generate the selected file purposes for every requested corpus.

        Parameters:
            file_type: output file type for dataframes, one of self.file_type_options
                (".csv.gz" or ".hdf")
            
            selection: list of file purposes to generate; None generates all of
                self.file_purposes

            corpus_names: corpus names handed to path_handler.get_corpus_sources
                (None loads all)

            new_corpuses: optional iterable of corpus source descriptions that are registered
                with path_handler.add_corpus_source(overwrite=True) before generating

            force_reload: force generate new dataframes if files already exist
                options: True, False or list of selection i.e. ["topics", "corpus", "nuggets", "embeddings", "labels"]
            
            save: forwarded to topic loading and label generation

            verbose: print progress information

            emb_model_dict: a dict or list of dicts containing a model to generate sentence embeddings with .encode(),
                            and a name to act as an identifier in a filename. If None, use default defined
                            in __init__() method. 

            label_selection: forwarded as `selection` to EmbeddingLabelGenerator.add_labels

        Raises:
            ValueError: a force_reload entry is not in self.force_reload_options, or
                embeddings are requested while no embedding model is available
            Exception: file_type is not in self.file_type_options
        """
        # interpret force_reload input
        if type(force_reload) is not list:
            if type(force_reload) is bool:
                if force_reload == True:
                    force_reload = self.force_reload_options
                else:
                    force_reload = []  # empty list, no chosen selection
        else:
            for select in force_reload:
                if select not in self.force_reload_options:
                    raise ValueError(str(select) + " is not a force_reload option")
        # interpret file_type input
        if file_type not in self.file_type_options:
            raise Exception(str(file_type) + " is not a valid file_type option to save dataframes")
        
        if verbose:
            if len(force_reload) != 0:
                print("force reloading the following selection: " + str(force_reload))
        
        if emb_model_dict is not None:  # replace default emb_model
            emb_model_dict = convert_to_list(emb_model_dict)
            self.emb_model_dict = emb_model_dict
        
        # add new corpuses to load
        if new_corpuses is not None:
            for new_corpus in new_corpuses:
                self.path_handler.add_corpus_source(new_corpus, overwrite=True)
        # get corpus paths to load from (if corpus_names is None loads all)
        self.corpus_sources = self.path_handler.get_corpus_sources(corpus_names=corpus_names)
        corpus_names = self.corpus_sources.keys()
        
        if selection is None:  # if none do all
            selection = self.file_purposes
        
        label_gen = None
        for corpus_name in corpus_names:
            print("corpus_name generate loop: " + str(corpus_name))
            if "topics" in selection:
                # create topic_df for corpus_name
                print("Generating topic_df")
                topic_reload = self.check_force_reload(corpus_name, "topics", file_type, force_reload, change_paths=True,
                                                      verbose=verbose)
                self.load_topic_df_control(corpus_name, file_type=file_type, save=save, force_reload=topic_reload, 
                                        verbose=verbose, add_path=True)
                
            if "corpus" in selection:
                # create corpus df csvs
                print("Generating corpus_dfs")
                corp_reload = self.check_force_reload(corpus_name, "corpus", file_type, force_reload, change_paths=True,
                                                      split_step=self.corpus_split_step, verbose=verbose)
                self.corpus_splitter(corpus_name, file_type, force_reload=corp_reload, verbose=False)
                
            if "nuggets" in selection:
                print("Generating nugget files")
                nug_reload = self.check_force_reload(corpus_name, "nuggets", file_type, force_reload, change_paths=True,
                                                    verbose=verbose)
                self.nuggets_generator(corpus_name, file_type, force_reload=nug_reload, verbose=verbose)
                
            if "embeddings" in selection:
                print("Generating embedding files")
                if not self.emb_model_dict:
                    raise ValueError("No embedding model available: pass emb_model_dict to generate() "
                                     "or create the generator with load_default_emb_model=True")
                # iterate the stored models so the default is used when no emb_model_dict is passed
                for model_dict in self.emb_model_dict:
                    print("Using model " + str(model_dict['name']))
                    nested_dir = model_dict['name']
                    embed_reload = self.check_force_reload(corpus_name, "embeddings", file_type, force_reload, change_paths=True,
                                                           nested_dir=nested_dir, split_step=self.embedding_split_step,
                                                           verbose=verbose)
                    # pass the per-purpose flag, not the whole force_reload list (a list holding
                    # only e.g. "labels" is truthy and would wrongly regenerate the embeddings)
                    self.embedding_generator(model_dict, corpus_name, file_type, force_reload=embed_reload, 
                                             verbose=verbose)
            if "labels" in selection:
                if label_gen is None:
                    label_gen = EmbeddingLabelGenerator(self.proj_dir)
                # get each nested_dir/embedding type to load labels for
                nested_dirs = self.path_handler.load_path_df_slice(corpus_name, "embeddings", exists=True,
                                                                  file_type=file_type)
                nested_dirs = list(nested_dirs['nested_dir'].unique())
                label_reload = self.check_force_reload(corpus_name, "labels", file_type, force_reload, 
                                                       change_paths=False)
                
                nested_dirs.sort(reverse=True)  # debug, force stsb-roberta run first
                
                for nested_dir in nested_dirs:
                    label_gen.add_labels(corpus_name, nested_dir, verbose=verbose, force_reload=label_reload,
                                       save=save, split_step=self.embedding_split_step, selection=label_selection,
                                       file_type=file_type)
                
            
        print("Finished generating files")
        
    def check_force_reload(self, corpus_name, select, file_type, force_reload, change_paths=True, 
                           inst_identifier=None, nested_dir=None, split_step=None, verbose=True):
        """Return True when `select` is listed in `force_reload`.

        When it is and `change_paths` is set, the matching path_df entries are first
        marked as not existing (see change_force_reload_paths).
        """
        if select in force_reload:
            if change_paths:
                self.change_force_reload_paths(corpus_name, select, file_type, inst_identifier=inst_identifier, 
                                              nested_dir=nested_dir, split_step=split_step)
                if verbose:
                    print("Changed paths for " + str(select) + " in " + str(corpus_name))
            return True
        else:
            return False
        
    def change_force_reload_paths(self, corpus_name, select, file_type, inst_identifier=None, nested_dir=None,
                                 split_step=None):
        """Mark the matching path_df rows as not existing, then save the path_df.

        Rows are matched on corpus_name, file_purpose (`select`) and file_type, plus
        instance_identifier / nested_dir / split_step when those are given.
        """
        # change path df paths to not exists if force_reload
        path_df = self.path_handler.path_df
        mask = ((path_df['corpus_name'] == corpus_name)
                & (path_df['file_purpose'] == select)
                & (path_df['file_type'] == file_type))
        if inst_identifier is not None:
            mask = mask & (path_df['instance_identifier'] == inst_identifier)
        if nested_dir is not None:
            mask = mask & (path_df['nested_dir'] == nested_dir)
        if split_step is not None:
            mask = mask & (path_df['split_step'] == split_step)
        # assign through .loc on the real path_df: assigning on a filtered frame only
        # changes a temporary copy and the update is lost
        path_df.loc[mask, 'exists'] = False
        self.path_handler.save_path_df()
        
    def embedding_generator(self, model_dict, corpus_name, file_type, force_reload=False, verbose=True):
        """Create the embedding files of every topic of `corpus_name` for one model.

        Parameters:
            model_dict: dict with the embedding model under "model" and its identifier under
                        "name" (the name is used as nested directory)
            corpus_name: corpus to process
            file_type: file type of the files to read and write
            force_reload: when truthy every split is processed from the beginning; the value
                          is also forwarded to load_embeddings
            verbose: print progress information

        Raises:
            Exception: a topic does not have exactly one existing nugget path
        """
        # if not exists load topics
        self.load_topic_df_control(corpus_name, force_reload=False, verbose=False, file_type=file_type)
        
        # create nested dir named after model used to generate embeddings
        nested_dir = model_dict['name']
        
        # model to create sentence embeddings
        emb_model = model_dict['model']
        
        topic_df = self.topic_dfs[corpus_name]
        topic_ids = list(topic_df['id'].unique())
        # create embedding dfs per topic
        for topic_id in tqdm_notebook(topic_ids, position=0, leave=True):
            # paths for corpus_df and nugget_df for topic
            corp_paths = self.load_corpus_paths_control(corpus_name, topic_id, file_type=file_type)
            nug_path = self.path_handler.load_path_df_slice(corpus_name, "nuggets", file_type=file_type,
                                                            split_identifier=topic_id, exists=True)
            if len(nug_path) != 1:
                raise Exception("There are " + str(len(nug_path)) + " nugget paths for topic " + str(topic_id))
            nug_path = list(nug_path['path'])[0]
            
            # load corpus and nuggets
            corpus_df = load_corpus(corp_paths, verbose=False)
            nugget_df = load_nugget_df(nug_path, verbose=False)

            
            # get emb paths for this topic
            emb_paths = self.path_handler.load_path_df_slice(corpus_name, "embeddings", exists=True, 
                                                instance_identifier=str(topic_id), nested_dir=nested_dir,
                                               split_step=self.embedding_split_step, file_type=file_type)
            
            # first step computed will be after this step
            # adding emb_split_step to this number will give us our start point  (e.g. 0)
            prev_split = 0 - self.embedding_split_step
            num_splits = 0
            
            if len(emb_paths) == 0 or force_reload:
                # prev_split unchanged, start from initial value
                if verbose:
                    print("Processing topic " + str(topic_id) + " from beginning")
            else:
                # check what has already been processed
                num_splits = list(emb_paths['num_splits'])[0]
                if len(emb_paths) == num_splits:  # no missing paths, already processed
                    continue
                else:  # missing paths, continue where left off
                    prev_split = max(list(map(int, list(emb_paths['split_identifier']))))
            
            
            # split every self.embedding_split_step
            split_indexes = []
            temp = prev_split + self.embedding_split_step
            while temp < len(corpus_df):
                split_indexes.append(temp)
                temp = split_indexes[-1] + self.embedding_split_step
                
            if num_splits == 0:  # no previous splits processed
                num_splits = len(split_indexes)
                
            for split_index in split_indexes:
                # make sure final index doesn't go over corpus length
                end_index = split_index + self.embedding_split_step
                if end_index > len(corpus_df):
                    end_index = len(corpus_df)
                # get desired section of corpus_df to pass through
                split_corpus_df = corpus_df[split_index:end_index]
                
                # create save path for resultant file
                save_path = self.path_handler.get_path(corpus_name, "embeddings", str(topic_id), file_type,
                                        split_identifier=split_index, num_splits=num_splits, 
                                        split_step=self.embedding_split_step, nested_dir=nested_dir, add_path=True)
                
                if verbose:
                    print("Loading topic " + str(topic_id) + " documents. " + str(split_index) 
                          + " - " + str(end_index) + " (Total: " + str(len(corpus_df)) + ")")
                
                # create embeddings for this section of corpus_df for this topic
                load_embeddings(save_path, emb_model=emb_model, corpus_df=split_corpus_df, nugget_df=nugget_df,
                               sents_default="splitlines", nlp=None, force_reload=force_reload, save=True,
                               verbose=verbose, path_handler=self.path_handler, only_docs_with_nugs=False)
            
            
    def nuggets_generator(self, corpus_name, file_type, force_reload=False, verbose=True):
        """Create one nugget file per topic of `corpus_name`.

        The corpus of each topic is loaded and handed to load_nugget_df together with the
        nuggets/matches source paths registered for the corpus.
        """
        # if not exists load topics
        self.load_topic_df_control(corpus_name, force_reload=False, verbose=False, file_type=file_type)
        
        # this is used for the filename of the resulting saved file
        base_identifier = "nuggets"
        
        topic_df = self.topic_dfs[corpus_name]
        topic_ids = list(topic_df['id'].unique())
        # create nuggets_df per topic
        for topic_id in tqdm_notebook(topic_ids):
            # get paths for corpus files for this topic_id (inelegantly)
            print("nuggets_generator topic_id: " + str(topic_id))
            corp_paths = self.load_corpus_paths_control(corpus_name, topic_id, file_type=file_type)
            
            # load corpus for a given topic
            corpus_df = load_corpus(corp_paths, save=False,
                                    force_reload=False, verbose=False, path_handler=self.path_handler)
            
            # nugget save destination
            save_path = self.path_handler.get_path(corpus_name, "nuggets", base_identifier, file_type,
                                        split_identifier=str(topic_id), num_splits=len(topic_ids), 
                                        add_path=True)
            
            nuggets_tsv_path = self.corpus_sources[corpus_name]['nuggets_path']
            matches_tsv_path = self.corpus_sources[corpus_name]['matches_path']
            # generate nugget file
            load_nugget_df(save_path, corpus_df=corpus_df, topic_ids=[topic_id], matches_tsv_path=matches_tsv_path, 
                            nuggets_tsv_path=nuggets_tsv_path, save=True, force_reload=force_reload, 
                           verbose=verbose, path_handler=self.path_handler, spacy_if_not_found=True)
            
            
                    
    def corpus_splitter(self, corpus_name, file_type, force_reload=False, verbose=True):
        """Create the corpus dataframe files of every topic, split every
        `self.corpus_split_step` source files.

        Topics whose source directory is missing or empty are removed from the topic_df
        (which is saved) and skipped. Topics that are already fully processed are skipped,
        partially processed topics resume from their last recorded split.
        """
        # split by topic and then every 200 html gz files, then parse together in loading
        # add check for what's been done already (i.e. check current topics, if all splits taken place)
        
        # if not exists load topics
        self.load_topic_df_control(corpus_name, force_reload=False, verbose=verbose, file_type=file_type)
        topic_df = self.topic_dfs[corpus_name]
        
        corpus_dir = self.corpus_sources[corpus_name]["dir_path"]
        if verbose:
            print("corpus_dir:" + str(corpus_dir))
            
        for topic_id in tqdm_notebook(topic_df['id'].unique()):
            # confirm dir exists
            t_dir = corpus_dir + '/' + str(topic_id)
            if not file_exists(t_dir):
                warnings.warn("Corpus loading path at " + t_dir + " does not exist. Removing from topic_df")
                topic_df = self.remove_topic_and_save(corpus_name, topic_df, topic_id, 
                                                      file_type=file_type, verbose=verbose)
                # keep the cached topic_df in sync so later stages skip the removed topic
                self.topic_dfs[corpus_name] = topic_df
                continue
            
            # check if path exists
            t_df_paths = self.path_handler.load_path_df_slice(corpus_name, "corpus", 
                                                              instance_identifier=str(topic_id), exists=True,
                                                             file_type=file_type,
                                                             split_step=self.corpus_split_step)
            start_split = 0
            num_splits = 0
            if len(t_df_paths) == 0 or force_reload:  # not yet processed
                start_split = 0
            else:
                # check if all splits been processed
                num_splits = list(t_df_paths['num_splits'])[0]  # ensure same num_splits is inputted into path_df
                if len(t_df_paths) == num_splits:  # already fully processed
                    continue
                # get start point if partway through
                start_split = max(list(map(int, list(t_df_paths['split_identifier']))))
            
            num_files = len(search_dir(t_dir))
            if num_files == 0:
                warnings.warn("No files found in directory " + str(t_dir) + ". Removing " + str(topic_id)
                             + " from topic_df")
                topic_df = self.remove_topic_and_save(corpus_name, topic_df, topic_id, 
                                                      file_type=file_type, verbose=verbose)
                self.topic_dfs[corpus_name] = topic_df
                # nothing to load for a removed topic
                continue
            
            # create split indexes to feed to load_corpus
            splits = [start_split]
            add = splits[-1] + self.corpus_split_step
            while add < num_files:
                splits.append(int(add))
                add = splits[-1] + self.corpus_split_step
            
            if start_split == 0:  
                num_splits = int(len(splits))  # for inputting into path_df
            
            if verbose:
                print("creating corpus df for topic " + str(topic_id) + " starting at file no. " 
                      + str(start_split) + " of " + str(num_files) + " splitting every " 
                      + str(self.corpus_split_step) + " files")
            # create corpus_df files
            for split_num in splits:
                # get save path
                save_path = self.path_handler.get_path(corpus_name, "corpus", str(topic_id), file_type,
                                        split_identifier=str(split_num), num_splits=num_splits, 
                                        split_step=self.corpus_split_step, add_path=True)
                
                load_corpus(save_path, corpus_dir=corpus_dir, topic_ids=[topic_id], 
                            split_every=self.corpus_split_step, split_start_doc=split_num, 
                            save=True, force_reload=force_reload, 
                            verbose=verbose, path_handler=self.path_handler)
                
    def load_corpus_paths_control(self, corpus_name, topic_id, file_type=".csv.gz", verbose=False):
        """Return the list of existing corpus file paths of a topic.

        Files of `file_type` are preferred; when none exist the first other file type found
        is used instead.

        Raises:
            Exception: no corpus file exists for the topic, or the number of files found
                       differs from the recorded `num_splits`
        """
        corp_paths = self.path_handler.load_path_df_slice(corpus_name, "corpus", instance_identifier=str(topic_id),
                                                exists=True, split_step=self.corpus_split_step) 
        # try loading selected file type, otherwise any file type
        corp_file_types = corp_paths['file_type'].unique()
        if len(corp_file_types) > 0:
            load_type = corp_file_types[0]
            if file_type in corp_file_types:
                load_type = file_type
            corp_paths = corp_paths[corp_paths['file_type'] == load_type]
        else:
            # repeating this for speed of refactoring
            raise Exception("Corpus files for topic " + str(topic_id) + " have not been fully loaded")
        # check that all paths are loaded
        try:
            num_split = int(corp_paths.iloc[0]['num_splits'])
            # fully loaded means exactly num_splits files, as in corpus_splitter
            if len(corp_paths) != num_split:
                raise IndexError()
        except IndexError:
            if verbose:
                display(corp_paths)
            raise Exception("Corpus files for topic " + str(topic_id) + " have not been fully loaded")
        # return the list of path addresses
        corp_paths = list(corp_paths['path'])
        return corp_paths
    
    def remove_topic_and_save(self, corpus_name, topic_df, topic_id, file_type=".csv.gz", verbose=True):
        """Return `topic_df` without the rows of `topic_id`, after saving it to the topics
        file registered in the path_df for `corpus_name` and `file_type`."""
        # remove topic_id from topic_df and save
        topic_df = topic_df[topic_df['id'] != topic_id]
        path = self.path_handler.path_df
        path = path[(path['corpus_name'] == corpus_name) & (path['file_purpose'] == "topics")
                   & (path['file_type'] == file_type)]['path']
        path = list(path)[0]
        save_df_file_type(topic_df, path, verbose=verbose)
        if verbose:
            print(str(topic_id) + " removed from topic_df and saved to " + str(path))
        return topic_df
                
    def load_topic_df_control(self, corpus_name, save=True, force_reload=False, verbose=True, add_path=False,
                             file_type=".csv.gz"):
        """Load the topic_df of `corpus_name` into `self.topic_dfs` unless it is already cached."""
        if self.topic_dfs is None:
            self.topic_dfs = {}
        if corpus_name not in self.topic_dfs:
            self.topic_dfs[corpus_name] = self.load_topic_df(corpus_name, save=save, force_reload=force_reload,
                                                verbose=verbose, add_path=add_path, file_type=file_type)
                

    def load_topic_df(self, corpus_name, save=True, force_reload=False, verbose=True, add_path=False,
                     file_type=".csv.gz"):
        """Load the topic_df of `corpus_name` through load_topics, using the corpus source's
        "topics_path" as input and the path handler's "topics" path as save location."""
        load_path = self.corpus_sources[corpus_name]["topics_path"]
        save_path = self.path_handler.get_path(corpus_name, "topics", "topics_df", file_type, add_path=add_path)
        
        topic_df = load_topics(save_path, load_path=load_path, save=save, force_reload=force_reload, 
                               verbose=verbose, path_handler=self.path_handler)
        return topic_df
    
    def fix_emb_df(self, emb_df_path, emb_model_dict):
        """Function is used to debug, hdf files are prone to error if stopped during saving process
        Allows manual recreating an emb_df based on specified path and supplied model

        Parameters:
            emb_df_path: path of the embedding file to recreate; must have exactly one path_df entry
            emb_model_dict: single dict with "model" and "name"; the name must equal the
                            nested_dir recorded for the path

        Raises:
            Exception: the path is not registered exactly once, the supplied model does not
                       match, or the topic does not have exactly one existing nugget path
        """
        paths = self.path_handler.path_df
        paths = paths[paths['path'] == emb_df_path]
        if len(paths) != 1:
            raise Exception(str(len(paths)) + " found for " + str(emb_df_path))
        print("Path df entry:")
        display(paths)
        paths = paths.iloc[0].to_dict()
        if paths['nested_dir'] != emb_model_dict['name']:
            raise Exception("Wrong model dict supplied. \n"
                           + "Target model: " + str(paths['nested_dir']) + "\n"
                           + "Supplied model: " + str(emb_model_dict['name']))
        # get values to load corpus/nuggets
        topic_id = paths['instance_identifier']
        file_type = paths['file_type']
        start_split = int(paths['split_identifier'])
        split_step = int(paths['split_step'])
        corpus_name = paths['corpus_name']
        
        # load corpus
        corp_paths = self.load_corpus_paths_control(corpus_name, topic_id, file_type=file_type)
        corpus_df = load_corpus(corp_paths, verbose=False)
        end_split = int(start_split + split_step)
        if end_split > len(corpus_df):
            end_split = len(corpus_df)
        split_corpus_df = corpus_df[start_split:end_split]
        
        # load nuggets (single path, same handling as in embedding_generator)
        nug_path = self.path_handler.load_path_df_slice(corpus_name, "nuggets", file_type=file_type,
                                                            split_identifier=int(topic_id), exists=True)
        if len(nug_path) != 1:
            raise Exception("There are " + str(len(nug_path)) + " nugget paths for topic " + str(topic_id))
        nug_path = list(nug_path['path'])[0]
        nugget_df = load_nugget_df(nug_path, verbose=False)
        
        # force reload emb_df
        load_embeddings(emb_df_path, emb_model=emb_model_dict["model"], corpus_df=split_corpus_df, nugget_df=nugget_df,
                   sents_default="splitlines", nlp=None, force_reload=True, save=True,
                   verbose=True, path_handler=self.path_handler, only_docs_with_nugs=False)
        print("Fixed " + str(emb_df_path))

In [21]:
# Project directory handed to CorpusGenerator.
proj_dir = '/nfs/proj-repo/AAARG-dissertation'

# Corpus source with a single topics / nuggets / matches file.
orig_tr14_filtered_dict = dict(
    corpus_name="original-trects-kba2014-filtered",
    dir_path="/nfs/original-trects-kba2014-filtered",
    topics_path="/nfs/original-trects-kba2014-filtered/test-topics.xml",
    nuggets_path="/nfs/TemporalSummarization/ts13/results/nuggets.tsv",
    matches_path="/nfs/TemporalSummarization/ts13/results/matches.tsv",
)

# Corpus source whose topics / nuggets / matches are spread over several files.
mine_tr14_filtered_dict = dict(
    corpus_name="mine-trects-kba2014-filtered",
    dir_path="/nfs/mine-trects-kba2014-filtered",
    topics_path=[
        "/nfs/TemporalSummarization/ts13/test-topics.xml",
        "/nfs/TemporalSummarization/ts14/trec2014-ts-topics-test.xml",
        "/nfs/TemporalSummarization/ts15/trec2015-ts-topics-test.xml",
    ],
    nuggets_path=[
        "/nfs/TemporalSummarization/ts13/results/nuggets.tsv",
        "/nfs/TemporalSummarization/ts14/results/nuggets.tsv",
        "/nfs/TemporalSummarization/ts15/results/nuggets.tsv",
    ],
    matches_path=[
        "/nfs/TemporalSummarization/ts13/results/matches.tsv",
        "/nfs/TemporalSummarization/ts14/results/matches.tsv",
        "/nfs/TemporalSummarization/ts15/results/matches.tsv",
    ],
)

# Sentence embedding models, each paired with the name used to identify its files.
emb_model_dict = [
    {"model": SentenceTransformer(model_name), "name": model_name}
    for model_name in ("stsb-roberta-base", "distilbert-base-nli-stsb-mean-tokens")
]

In [None]:
# The embedding models are passed explicitly to generate() below, which replaces the
# generator's default model, so skip loading the default SentenceTransformer here.
corp_gen = CorpusGenerator(proj_dir, embedding_split_step=200, load_default_emb_model=False)

# To recreate a single corrupted embedding file:
# debug_path = "/nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/12_400.hdf"
# corp_gen.fix_emb_df(debug_path, emb_model_dict[0])

# --- what to (re)generate ---
# force_reload = ["embeddings"]
# force_reload = ["labels"]
force_reload = False
# selection = ["topics", "corpus", "nuggets", "embeddings"]
# selection = None  # None generates every file purpose
selection = ["labels"]
# label_selection = ["cos_sim_nearest_nug"]
label_selection = ["cosine_similarity", "cos_sim_nearest_nug"]

# --- which corpora, and in which order ---
# corpus_names = ["original-trects-kba2014-filtered", "mine-trects-kba2014-filtered"]
# corpus_names = ["original-trects-kba2014-filtered"]
# corpus_names = ["mine-trects-kba2014-filtered"]
corpus_names = ["mine-trects-kba2014-filtered", "original-trects-kba2014-filtered"]

# --- dataframe file format ---
# file_type = ".csv.gz"
file_type = ".hdf"

corp_gen.generate(corpus_names=corpus_names, 
                  force_reload=force_reload, verbose=True, selection=selection, file_type=file_type,
                 emb_model_dict=emb_model_dict, label_selection=label_selection)

corpus_name generate loop: mine-trects-kba2014-filtered
Path history loaded from file at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading nug_embs list from /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/embedding_list_all.pickle
Avg nugget embedding loaded from: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/avg_nug_emb.npy
Adding cosine_similarity, cos_sim_nearest_nug to 732 dataframes


  0%|          | 0/732 [00:00<?, ?it/s]

Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_2000.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 46219 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 46219 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_2000.hdf


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['streamid', 'sentence', 'embedding', 'technique', 'l2_normed'], dtype='object')]

  encoding=encoding,


df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_2000.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_2200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 46174 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 46174 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_2200.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_2200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 44047 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 44047 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_4400.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_4400.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_4600.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 44878 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 44878 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_6600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_6800.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 43835 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 43835 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_6800.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_6800.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 47073 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 47073 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_9000.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_9000.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/24_9200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 46158 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 46158 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/25_200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/25_400.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 45478 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 45478 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/25_400.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/25_400.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 54070 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 54070 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/25_2600.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/25_2600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/25_2800.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 86273 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 86273 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/25_4800.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_0.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 35879 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 35879 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_0.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_0.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickl

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 33250 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 33250 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_2200.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_2200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_2400.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 37213 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 37213 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_4400.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_4600.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 43459 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 43459 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_4600.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_4600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 39559 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 39559 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_6800.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_6800.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_7000.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 30455 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 30455 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_9000.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_9200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 50105 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 50105 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_9200.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/26_9200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 39351 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 39351 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/28_400.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/28_400.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/28_600.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 45001 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 45001 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-ro

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 48974 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 48974 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/31_0.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/31_0.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/31_200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 53988 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 53988 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-robert

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 38484 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 38484 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_0.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_0.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 39912 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 39912 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-robert

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_2200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_2400.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 48736 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 48736 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_2400.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_2400.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 24058 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 24058 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_4600.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/33_4600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/34_0.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 46029 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 46029 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-ro

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/34_2000.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/34_2200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 45330 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 45330 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/34_2200.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/34_2200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 37263 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 37263 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_600.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_800.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 37983 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 37983 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-ro

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_2800.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_3000.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 32232 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 32232 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_3000.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_3000.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 34483 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 34483 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_5200.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_5200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/35_5400.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 36949 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 36949 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/37_0.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/37_200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 25945 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 25945 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/37_200.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/37_200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pi

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 26345 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 26345 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/40_0.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/40_0.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/40_200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 26328 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 26328 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-robert

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 41759 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 41759 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_1400.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_1400.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_1600.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 49177 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 49177 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_3600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_3800.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 38461 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 38461 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_3800.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_3800.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 39893 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 39893 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_6000.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_6000.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/41_6200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 34782 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 34782 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/42_400.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/42_600.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 65110 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 65110 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/42_600.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/42_600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 42398 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 42398 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_0.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_0.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 36934 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 36934 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-robert

df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_2200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_2400.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 40317 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 40317 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_2400.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_2400.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_hist

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 45682 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 45682 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_4600.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_4600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/43_4800.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 38374 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 38374 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb

Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 31475 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 31475 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/45_600.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/45_600.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/45_800.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 37265 embeddings and 17877 nugget embeddings
Finished cos_sim_nearest_nug for 37265 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-ro

  0%|          | 0/732 [00:00<?, ?it/s]

Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_0.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 8675 embeddings and 18117 nugget embeddings
Finished cos_sim_nearest_nug for 8675 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_0.hdf


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block3_values] [items->Index(['streamid', 'sentence', 'embedding', 'technique', 'l2_normed'], dtype='object')]

  encoding=encoding,


df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_0.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/label_gen_history.pickle
Loading /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_200.hdf
Adding cosine_similarity
Adding cos_sim_nearest_nug
Starting cos_sim_nearest_nug for 8132 embeddings and 18117 nugget embeddings
Finished cos_sim_nearest_nug for 8132 embeddings
Saving file to /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_200.hdf
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/1_200.hdf
Path history saved to: /nfs/proj-repo/AAARG-dissertati

## Testing Area

In [None]:
# proj_dir = '/nfs/proj-repo/AAARG-dissertation'
# corpus_name = "original-trects-kba2014-filtered"
# nested_dir = 'distilbert-base-nli-stsb-mean-tokens'

# retr = PathRetriever(proj_dir)
# retr.get_embedding_paths(corpus_name, nested_dir)

In [None]:
import random
import sklearn.neighbors
from sklearn.neighbors import KDTree, DistanceMetric, NearestNeighbors
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics import pairwise
from sklearn.preprocessing import normalize
from scipy.spatial import cKDTree

# from sklearn.metrics.pairwise import cosine_similarity

def get_embs(proj_dir='/nfs/proj-repo/AAARG-dissertation',
             corpus_name="mine-trects-kba2014-filtered",
             nested_dir="stsb-roberta-base",
             topic_ids=None,
             skip_topic_ids=(7,)):
    """Report which embedding files contain positive 'cos_sim_nearest_nug' values.

    For every requested topic, the embedding file paths are looked up through
    ``PathRetriever.get_embedding_paths``, each file is loaded with
    ``load_embeddings``, and the number of rows whose 'cos_sim_nearest_nug'
    value is greater than 0 is counted. Findings are printed as the scan runs,
    followed by a per-topic summary and the raw ``{topic_id: [counts]}`` dict.

    Args:
        proj_dir: Project root handed to ``PathRetriever``.
        corpus_name: Corpus whose embeddings are inspected.
        nested_dir: Embedding sub-directory (model name), e.g.
            "stsb-roberta-base" or "distilbert-base-nli-stsb-mean-tokens".
        topic_ids: Iterable of topic ids to scan. ``None`` (default) scans
            topics 1 through 46.
        skip_topic_ids: Topic ids to leave out of the scan. Defaults to topic 7
            (NOTE(review): the reason topic 7 is excluded is not visible in
            this cell -- confirm against the corpus).

    Returns:
        None. All results are reported via ``print``.
    """
    if topic_ids is None:
        topic_ids = range(1, 47)
    skipped = set(skip_topic_ids or ())

    retriever = PathRetriever(proj_dir)

    topic_pos_counts = {}
    for topic_id in tqdm_notebook(topic_ids):
        # Skip inside the loop so the progress bar still spans every requested id.
        if topic_id in skipped:
            continue
        path_df = retriever.get_embedding_paths(corpus_name, nested_dir,
                                                topic_ids=[topic_id], verbose=False)
        emb_paths = list(path_df['path'])

        pos_counts = []
        for index, emb_path in enumerate(emb_paths):
            emb_df = load_embeddings(emb_path, verbose=False)
            if 'cos_sim_nearest_nug' in emb_df.columns:
                # Vectorised count of strictly positive similarities; int() keeps
                # the printed tuples as plain Python ints rather than numpy scalars.
                pos_count = int((emb_df['cos_sim_nearest_nug'] > 0).sum())
            else:
                pos_count = 0
                print("no col found at topic_id " + str(topic_id) + " idx" + str(index))

            pos_counts.append(pos_count)
            if pos_count > 0:
                print("pos_count found in topic " + str(topic_id) + " idx " + str(index))
        topic_pos_counts[topic_id] = pos_counts

    # Summarise, per topic, which file indexes held any positive values.
    for topic_id, pos_counts in topic_pos_counts.items():
        pos_indexes = [(index, pos_count)
                       for index, pos_count in enumerate(pos_counts)
                       if pos_count != 0]
        if len(pos_indexes) != 0:
            print(str(topic_id) + " has positive values at these indexes (out of " + str(len(pos_counts)) + "): ")
            print(str(pos_indexes))
            print("")
        else:
            print(str(topic_id) + " has no positive values")
            print("")

    print("")
    print(topic_pos_counts)
    return None

# Run the scan; it reports through print() and returns None, so nothing is captured.
get_embs()

In [None]:
# potato = "/nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/distilbert-base-nli-stsb-mean-tokens/11_200.hdf"
# potato_df = load_embeddings(potato)
# print(display(potato_df))