# Notebook for Generating Summaries

In [1]:
import os
num_threads = 32
os.environ['NUMEXPR_MAX_THREADS'] = str(num_threads)

import pandas as pd
import numpy as np
import pickle
import copy
import math
from collections import defaultdict
# from tqdm import tqdm
from tqdm import tqdm_notebook
from sentence_transformers import SentenceTransformer
import ipynb.fs

from sklearn.neighbors import KDTree

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import LSTM

import kerastuner as kt
from kerastuner.engine.hyperparameters import HyperParameters
from kerastuner.tuners import Hyperband

In [2]:
# Ask TensorFlow which physical GPU devices it can see and report them.
gpus = tf.config.list_physical_devices('GPU')
print("gpus:", gpus, sep="\n")

gpus:
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:3', device_type='GPU')]


## Keras NN Model

In [102]:
class NNTuner:
    """Hyperband hyperparameter search over fully connected Keras models.

    The search space is defined in ``build_model``; ``search`` runs the tuner and
    optionally saves the best models to disk.
    """
    def __init__(self, save_dir, save_name, input_shape, tuning_iterations=2, max_epochs=15, reduction_factor=3,
                 batch_size=32, force_reload=False, output_dims=1):
        """Set up the Hyperband tuner.

        The tuner stores its state under ``save_dir``/``save_name``. ``force_reload`` is passed
        on as the tuner's ``overwrite`` flag: when False the tuner can resume from a previous
        run stored there (see the KerasTuner ``Tuner`` documentation).

        Args:
            save_dir: passed to the tuner as ``directory``.
            save_name: passed to the tuner as ``project_name``.
            input_shape: length of one input vector (used as ``(input_shape,)``).
            tuning_iterations: passed as ``hyperband_iterations``.
            max_epochs: passed as ``max_epochs``.
            reduction_factor: passed as ``factor``.
            batch_size: fixed batch size declared on the input layer.
            force_reload: passed as ``overwrite``.
            output_dims: number of units in the output layer.
        """
        self.input_shape = input_shape
        self.output_dims = output_dims
        self.batch_size = batch_size
        # every model built by the tuner is kept here (see build_model)
        self.models = []
        self.tuner = Hyperband(self.build_model,
                               objective='mean_squared_error',
                               max_epochs=max_epochs,
                               hyperband_iterations=tuning_iterations,
                               factor=reduction_factor,  # keras-tuner default is 3
                               directory=save_dir,
                               project_name=save_name,
                               overwrite=force_reload,
                               tune_new_entries=True,
                               allow_new_entries=True,
                               distribution_strategy=tf.distribute.MirroredStrategy(
                                   cross_device_ops=tf.distribute.HierarchicalCopyAllReduce()))

    def build_model(self, hp):
        """Build and compile one candidate model from the hyperparameters ``hp``.

        Search space: 1-6 hidden Dense layers (tanh) with 32-2048 units each in steps of 32,
        and an Adam learning rate between 0.0001 and 0.1. The output layer is a tanh Dense
        layer with ``output_dims`` units. The model is compiled with huber loss and reports
        mean squared error, which is the tuner objective.

        Returns:
            The compiled ``Sequential`` model (also appended to ``self.models``).
        """
        model = Sequential()
        # specify input layer to ensure correct input shape
        ilayer = InputLayer(input_shape=(self.input_shape,),
                            batch_size=self.batch_size,
                            name='input_layer')
        model.add(ilayer)

        # add hidden layers
        for i in range(hp.Int('num_hidden_layers', min_value=1, max_value=6)):
            model.add(Dense(units=hp.Int('hidden_units_' + str(i),
                                         min_value=32, max_value=2048, step=32),
                            activation='tanh',
                            name='hidden_layer_' + str(i)))

        # add output layer
        model.add(Dense(units=self.output_dims,
                        activation='tanh'))

        opt = tf.keras.optimizers.Adam(
                learning_rate=hp.Float('learning_rate', min_value=0.0001, max_value=0.1))

        model.compile(optimizer=opt, loss='huber', metrics=['mean_squared_error'])  # add metrics here

        self.models.append(model)
        return model

    def search(self, batch_generator, best_model_dir):
        """Find the optimal model for the given dataset.

        Args:
            batch_generator: data source handed to ``tuner.search`` as ``x``.
            best_model_dir: directory in which the best models (up to 5) are saved, each in
                a sub-path named after its rank ("0", "1", ...). Pass None to skip saving.

        Returns:
            Tuple ``(best_model, best_hyperparameters, best_model_path)``.
            ``best_model_path`` is the save path of the top model, or None when
            ``best_model_dir`` is None.
        """
        self.tuner.search(x=batch_generator, verbose=1, use_multiprocessing=False, workers=num_threads)
        best_models = self.tuner.get_best_models(num_models=5)
        # fetched unconditionally so the return value is defined even when nothing is saved
        hyperparams = self.tuner.get_best_hyperparameters(num_trials=1)[0]
        best_model_path = None
        if best_model_dir is not None:
            for i, model in enumerate(best_models):
                print("Saving best model number: " + str(i))
                model.save(os.path.join(best_model_dir, str(i)))
            best_model_path = os.path.join(best_model_dir, str(0))
        return best_models[0], hyperparams, best_model_path
    

# from collections import OrderedDict
# from collections import deque

class BatchGenerator(keras.utils.Sequence):
    """Keras ``Sequence`` serving pre-batched inputs and labels by batch index.

    Intended for datasets too large to hold in memory at once: ``X`` and ``y`` only need to
    support ``obj[idx]`` and return the idx-th batch. The caller is responsible for making
    sure ``X`` and ``y`` hold the same number of batches.

    https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
    """
    def __init__(self, X, y, batch_size, num_batches):
        # a missing batch size is recorded as 1
        self.batch_size = 1 if batch_size is None else batch_size
        self.num_batches = num_batches
        self.X = X
        self.y = y

    def __len__(self):
        """Number of batches in one epoch."""
        return self.num_batches

    def __getitem__(self, idx):
        """Return the ``(inputs, labels)`` pair for batch ``idx``."""
        return self.load_samples(self.X, idx), self.load_samples(self.y, idx)

    def load_samples(self, path, idx):
        """Return batch ``idx`` of the indexable container ``path``."""
        return path[idx]

## Generating the samples 

In [4]:
from .defs.corpus_loader import PathRetriever, load_embeddings, load_topics, read_df_file_type, save_df_file_type
from .defs.corpus_loader import convert_to_list

In [5]:
def resolve_input_params(path_ret, corpus_names, nested_dirs, col_labels, input_col=None):
    """Fill in defaults for the parameters that select which data to load/generate.

    - ``corpus_names``: when None, taken from ``path_ret.get_corpus_names()``.
    - ``col_labels``: when None, a copy of ``default_col_labels`` plus ``input_col`` (if given).
    - ``nested_dirs``: when not already a dict, expanded to ``{corpus_name: [nested_dir, ...]}``
      from the "embeddings" dirs of each corpus, keeping only the dirs named in
      ``nested_dirs`` when a selection was passed. A dict (output of a previous call) is
      passed through untouched.

    Returns:
        Tuple ``(corpus_names, nested_dirs, col_labels)``.

    Raises:
        Exception: if no corpus is available or every corpus ends up with no nested dirs.
    """
    # corpus names
    if corpus_names is None:
        corpus_names = path_ret.get_corpus_names()
        if len(corpus_names) == 0:
            raise Exception("There are no corpuses to load from")

    # column labels to generate files for
    if col_labels is None:
        col_labels = default_col_labels.copy()
        if input_col is not None:
            col_labels.append(input_col)

    # nested dirs: only expand when we were not handed an already resolved dict
    if type(nested_dirs) is not dict:
        resolved = {}
        for name in corpus_names:
            available = path_ret.get_nested_dirs(name, "embeddings")
            if nested_dirs is not None:
                # keep only the selected nested dirs for this corpus
                available = [d for d in available if d in nested_dirs]
            resolved[name] = available
        nested_dirs = resolved

    # at least one corpus must have something to work on
    if all(len(dirs) == 0 for dirs in nested_dirs.values()):
        raise Exception("There are no nested_dirs matching the selection")
    return corpus_names, nested_dirs, col_labels

def corpus_name_topic_ids(path_retriever, corpus_name):
    """Return the unique values of the 'id' column of the corpus' topic file, as a list."""
    topics = load_topics(path_retriever.get_topic_path(corpus_name, verbose=False), verbose=False)
    return list(topics['id'].unique())

def find_combinations(path_df, corpus_names, nested_dirs, col_labels, add_topics=False, col_labels_as_list=False,
                      as_tuples=True, force_reload=False, path_retriever=None, batch_size=None, file_type=None,
                     exists_only=False):
    """Find the combinations that have not been generated/trained already in path_df

    Tuple ordering: (corpus_name, nested_dir, col_label/[col_labels], **topic_id**)
    (topic_id is only present when add_topics is True).

    Args:
        path_df: dataframe of already generated entries; must have 'corpus_name',
            'nested_dir' and 'col_label' columns, plus 'topic_id' when add_topics is True and
            'exists' / 'batch_size' / 'file_type' when the matching filter is used.
        corpus_names: corpus names to build combinations for.
        nested_dirs: dict mapping corpus_name -> list of nested dirs.
        col_labels: column labels to check.
        add_topics: also combine with every topic id of the corpus.
        col_labels_as_list: emit one combination holding the list of missing labels instead
            of one combination per missing label.
        as_tuples: return tuples instead of lists.
        force_reload: treat every label as missing.
        path_retriever: required when add_topics is True (used to look up topic ids).
        batch_size: when given, only path_df rows with this batch_size count as existing.
        file_type: when given, only path_df rows with this file_type count as existing.
        exists_only: only path_df rows whose 'exists' is True count as existing.

    Returns:
        List of combinations (tuples, or lists when as_tuples is False).

    Raises:
        Exception: if add_topics is True but path_retriever is None.
    """
    if exists_only:
        path_df = path_df[path_df['exists'] == True]  # checking of path_df is only concerned with existing files
    if batch_size is not None:
        path_df = path_df[path_df['batch_size'] == batch_size]
    if file_type is not None:
        path_df = path_df[path_df['file_type'] == file_type]

    topic_ids = {}
    if add_topics:  # find topic_ids for each corpus
        for corpus_name in corpus_names:
            if path_retriever is None:
                raise Exception("If add_topics is True then path_retriever must be set to an instance of PathRetriever")
            topic_ids[corpus_name] = corpus_name_topic_ids(path_retriever, corpus_name)

    combinations = []
    for corpus_name in corpus_names:
        for nested_dir in nested_dirs[corpus_name]:
            combo_path = path_df[(path_df['corpus_name'] == corpus_name)
                                 & (path_df['nested_dir'] == nested_dir)]
            combo = [corpus_name, nested_dir]
            if add_topics:
                # topic_id -> labels that still have to be generated for that topic
                missing_by_topic = defaultdict(list)
                for label in col_labels:
                    for topic_id in topic_ids[corpus_name]:
                        topic_path = combo_path[(combo_path['col_label'] == label)
                                                & (combo_path['topic_id'] == topic_id)]
                        if len(topic_path) == 0 or force_reload:
                            missing_by_topic[topic_id].append(label)
                for topic_id, labels in missing_by_topic.items():
                    if col_labels_as_list:
                        # single entry with all missing col_labels for topic_id
                        combinations.append(combo + [labels, topic_id])
                    else:
                        # one entry per missing col_label; the label has to be part of the
                        # entry to honour the documented tuple ordering
                        combinations.extend(combo + [label, topic_id] for label in labels)
            else:
                if force_reload:
                    add_labels = copy.deepcopy(col_labels)  # force_reload adds all labels
                else:
                    # only labels that don't exist already
                    exist_labels = set(combo_path['col_label'].unique())
                    add_labels = [x for x in col_labels if x not in exist_labels]
                if col_labels_as_list:
                    combinations.append(combo + [add_labels])
                else:
                    combinations.extend(combo + [add_label] for add_label in add_labels)

    if as_tuples:
        combinations = [tuple(x) for x in combinations]
    return combinations

In [6]:
class MemmapGenerator:
    """Write columns of the embedding dataframes into flat ``np.memmap`` files and reopen them.

    A bookkeeping dataframe (``self.path_df``, persisted at ``self.path_df_path``) holds one
    row per (corpus_name, nested_dir, col_label, topic_ids) combination and records what is
    needed to reopen the map: 'path', 'dtype', 'vector_len' and 'total_nums', plus a
    'complete' flag.
    """
    def __init__(self, proj_dir):
        """
        Args:
            proj_dir: project root directory; when None the hard-coded default is used.
        """
        if proj_dir is None:
            self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
        else:
            self.proj_dir = proj_dir
        self.default_file_type = ".hdf"
        # use the resolved directory so that the None default is honoured here too
        self.path_ret = PathRetriever(self.proj_dir)
        self.path_df_cols = ['corpus_name', 'nested_dir', 'col_label', 'path', 'dtype', 'vector_len',
                             'total_nums', 'offset_step', 'topic_ids', 'complete']
        self.dataset_dir = self.path_ret.path_handler.dataset_dir
        self.sample_dir = os.path.join(self.dataset_dir, "samples")
        self.path_df_path = os.path.join(self.dataset_dir, "memmap_paths.hdf")
        self.path_df = self.load_path_df()
        self.order = 'C'

    @staticmethod
    def _show(obj):
        """Render ``obj`` with IPython's ``display`` when available, otherwise print it."""
        try:
            display(obj)
        except NameError:  # not running inside IPython/Jupyter
            print(obj)

    def create_maps(self, corpus_name, nested_dir, col_labels, topic_ids, verbose=True, force_reload=False):
        """Create the memmaps for ``col_labels`` that are missing or incomplete.

        Every embedding file of the selection is read in turn and the values of each column
        are appended, flattened, to that column's memmap. Afterwards the path_df rows are
        updated and marked complete.

        Args:
            corpus_name, nested_dir, topic_ids: select the embedding files (``topic_ids``
                must not be a str here, see ``topic_ids_str``).
            col_labels: dataframe columns to store.
            verbose: print progress information.
            force_reload: rebuild maps that are already marked complete.
        """
        # check if already completed
        path_slice = self.slice_path_df(corpus_name, nested_dir, topic_ids)
        emb_paths = self.path_ret.get_embedding_paths(corpus_name, nested_dir,
                                    file_type=self.default_file_type, verbose=False,
                                    return_dir_path=False, topic_ids=topic_ids)
        emb_paths = list(emb_paths['path'])
        # load partial information on maps that need completed
        meta_dict = self.create_meta_dict(path_slice, corpus_name, nested_dir, col_labels,
                                          self.topic_ids_str(topic_ids), force_reload=force_reload)

        if len(meta_dict) == 0:
            print("Already loaded " + str(col_labels))
            return

        if verbose:
            print("Creating memmaps for " + str(", ".join(col_labels)) + "\nwith topics: " + str(topic_ids))
        for emb_path in tqdm_notebook(emb_paths):
            # scrape only the columns that still have to be written
            label_data = self.scrape_col_data(emb_path, meta_dict.keys())
            for col_label, data in label_data.items():
                self._append_to_memmap(meta_dict[col_label], data)

        for col_label, meta in meta_dict.items():
            self.update_path_df_entry(meta['path'], col_label, meta['dtype'], meta['vector_len'],
                                      meta['offset_step'], meta['total_nums'])

        if verbose:
            # slice again: the slice taken above predates the added/updated rows
            self._show(self.slice_path_df(corpus_name, nested_dir, topic_ids))
        print("Completed creating memmaps")

    def _append_to_memmap(self, col_dict, data):
        """Append ``data`` (flattened) to the memmap described by ``col_dict``; update its counters."""
        if data.size == 0:
            # nothing to write; also avoids deriving vector_len from an empty array
            return
        if not col_dict['initialised']:
            # the first chunk fixes dtype and vector length of the whole map
            col_dict['dtype'] = data.dtype
            if data.ndim == 1:
                col_dict['vector_len'] = 1
            elif data.ndim == 2:
                col_dict['vector_len'] = data.shape[1]
            else:
                raise Exception("Too many dimensions: " + str(data.shape))
            col_dict['offset_step'] = data.dtype.itemsize
            col_dict['initialised'] = True

        total_nums = col_dict['total_nums']
        flat = data.ravel()
        num_to_add = len(flat)

        # 'w+' creates/overwrites the file for the first chunk, 'r+' reopens it at the
        # enlarged size for the following ones
        mode = 'r+' if total_nums != 0 else 'w+'
        memmap = np.memmap(col_dict['path'], dtype=col_dict['dtype'], mode=mode, offset=0,
                           order=self.order, shape=(total_nums + num_to_add,))
        memmap[total_nums:total_nums + num_to_add] = flat[:]
        if not np.array_equal(memmap[total_nums:total_nums + num_to_add], flat):
            print("memmap: " + str(memmap[total_nums:total_nums + num_to_add]))
            print("flat: " + str(flat))
            raise Exception("Memmap and flat not equal")
        memmap.flush()
        del memmap  # drop the mapping before the file is reopened for the next chunk

        col_dict['total_nums'] += num_to_add

    def update_path_df_entry(self, path, col_label, dtype, vector_len, offset_step, total_nums):
        """Store the final map metadata on the matching path_df row, mark it complete and save."""
        mask = (self.path_df['path'] == path) & (self.path_df['col_label'] == col_label)
        change_cols = ['dtype', 'vector_len', 'offset_step', 'total_nums', 'complete']
        self.path_df.loc[mask, change_cols] = dtype, vector_len, offset_step, total_nums, True
        self.save_path_df()

    def add_path_df_entry(self, corpus_name, nested_dir, col_label, path, topic_ids, return_row_dict=False):
        """Add a new, incomplete row to path_df and save it.

        Returns:
            The row as a dict when ``return_row_dict`` is True, otherwise None.
        """
        row = {'corpus_name':corpus_name, 'nested_dir':nested_dir, 'col_label':col_label,
               'path':path, 'dtype':None, 'vector_len':np.nan, 'total_nums':0,
               'offset_step':0, 'topic_ids':topic_ids, 'complete':False}
        new_row = pd.DataFrame([row])
        if self.path_df.empty:
            # keep the column layout of the empty frame instead of concatenating with it
            cols = list(self.path_df.columns) + [c for c in row if c not in self.path_df.columns]
            self.path_df = new_row.reindex(columns=cols)
        else:
            # DataFrame.append was removed in pandas 2.0
            self.path_df = pd.concat([self.path_df, new_row], ignore_index=True)
        self.save_path_df()
        if return_row_dict:
            return row

    def create_meta_dict(self, path_slice, corpus_name, nested_dir, col_labels, topic_ids,
                        force_reload=False):
        """Collect the working metadata of every column that still has to be written.

        Columns without a path_df row get a new row (and a new map path). Columns whose row
        is incomplete, or any existing row when ``force_reload`` is set, are restarted from
        scratch. Completed columns are left out.

        Args:
            path_slice: path_df rows of this corpus_name/nested_dir/topic_ids selection.
            topic_ids: the topic ids as stored in path_df (comma separated string).

        Returns:
            Dict mapping col_label -> metadata dict with at least 'path', 'dtype',
            'vector_len', 'offset_step', 'total_nums' and 'initialised'.

        Raises:
            Exception: if a column has more than one row in ``path_slice``.
        """
        meta_dict = {}
        for col_label in col_labels:
            col_slice = path_slice[path_slice['col_label'] == col_label]
            if len(col_slice) > 1:
                self._show(col_slice)
                raise Exception("Multiple entries in path_df")
            if len(col_slice) == 0:
                # unknown column: register it in path_df
                row_dict = self.add_path_df_entry(corpus_name, nested_dir, col_label,
                                                  self.generate_new_map_path(col_label),
                                                  topic_ids, return_row_dict=True)
                row_dict['initialised'] = False
                meta_dict[col_label] = row_dict
                continue
            complete = list(col_slice['complete'])[0]
            if not complete or force_reload:
                # reuse the previous values
                previous = col_slice.to_dict(orient='list')
                meta_dict[col_label] = {"dtype":previous['dtype'][0], "path":previous['path'][0],
                                        "vector_len":previous['vector_len'][0],
                                        "offset_step":previous['offset_step'][0],
                                        "total_nums":0,  # set to 0 to restart
                                        "initialised":False, "completed":False}
        return meta_dict

    def load_memmap(self, corpus_name, nested_dir, topic_ids, col_label, batch_size=None,
                   return_vector_len=False):
        """Open the memmap of ``col_label`` read-only.

        The map is shaped ``(num_items, vector_len)``, or
        ``(num_batches, batch_size, vector_len)`` when ``batch_size`` is given, in which case
        items that do not fill a whole batch are left out.

        Returns:
            The memmap, or ``(memmap, vector_len)`` when ``return_vector_len`` is True.

        Raises:
            Exception: unless exactly one path_df row matches.
        """
        path_slice = self.slice_path_df(corpus_name, nested_dir, topic_ids)
        col_slice = path_slice[path_slice['col_label'] == col_label]
        if len(col_slice) != 1:
            self._show(path_slice)
            raise Exception(str(len(col_slice)) + " entries for col_label " + str(col_label)
                            + " (expected exactly 1)")

        col_dict = col_slice.to_dict(orient='list')
        dtype = col_dict['dtype'][0]
        vector_len = int(col_dict['vector_len'][0])
        total_nums = int(col_dict['total_nums'][0])
        path = col_dict['path'][0]

        num_items = total_nums // vector_len
        if batch_size is not None:
            num_batches = math.floor(num_items / batch_size)
            shape = (num_batches, batch_size, vector_len)
        else:
            shape = (num_items, vector_len)
        memmap = np.memmap(path, dtype=dtype, mode='r', shape=shape, order=self.order)
        if return_vector_len:
            return memmap, vector_len
        return memmap

    def slice_path_df(self, corpus_name, nested_dir, topic_ids):
        """Return the path_df rows of a corpus_name/nested_dir/topic_ids selection.

        ``topic_ids`` may be a collection of ids or the already joined string.
        """
        topic_id_str = topic_ids
        if not isinstance(topic_id_str, str):
            topic_id_str = self.topic_ids_str(topic_ids)

        mask = ((self.path_df['corpus_name'] == corpus_name)
                & (self.path_df['nested_dir'] == nested_dir)
                & (self.path_df['topic_ids'] == topic_id_str))
        return self.path_df.loc[mask]

    def topic_ids_str(self, topic_ids):
        """Return the sorted topic ids joined by commas, e.g. ``[3, 1]`` -> ``"1,3"``.

        Raises:
            Exception: if ``topic_ids`` is already a str.
        """
        if isinstance(topic_ids, str):
            raise Exception(str(topic_ids) + " is already type str")
        return ",".join(str(x) for x in sorted(topic_ids))

    def save_path_df(self):
        """Persist path_df at ``self.path_df_path``."""
        save_df_file_type(self.path_df, self.path_df_path, verbose=False)

    def load_path_df(self):
        """Load path_df from disk, or start an empty one when the file does not exist."""
        if os.path.exists(self.path_df_path):
            path_df = read_df_file_type(self.path_df_path, verbose=True)
        else:
            path_df = pd.DataFrame(columns=self.path_df_cols)
            print("memmap path df created from scratch")
        return path_df

    def incompleted_col_labels(self, path_slice, col_labels):
        """Return the labels of ``col_labels`` that have no row, or an incomplete one, in ``path_slice``.

        Raises:
            Exception: if a label has more than one row in ``path_slice``.
        """
        incompleted = []
        for col_label in col_labels:
            col_slice = path_slice[path_slice['col_label'] == col_label]
            if len(col_slice) > 1:
                self._show(col_slice)
                raise Exception("Multiple entries in path_df")
            if len(col_slice) == 0 or not list(col_slice['complete'])[0]:
                incompleted.append(col_label)
        return incompleted

    def generate_new_map_path(self, col_label):
        """Return a new map path inside ``self.sample_dir`` (created when missing)."""
        # putting topic_ids in filename too long, use count instead
        count = len(self.path_df)
        base = str(count) + "_" + str(col_label)
        if not os.path.exists(self.sample_dir):
            os.makedirs(self.sample_dir)
        return os.path.join(self.sample_dir, base + ".memmap")

    def scrape_col_data(self, emb_path, col_labels):
        """Load the embedding file at ``emb_path`` and return ``{col_label: np.ndarray}``.

        Raises:
            ValueError: if a requested column is not in the file.
        """
        labels = {}
        emb_df = load_embeddings(emb_path, verbose=False)
        for col_label in col_labels:
            if col_label not in emb_df.columns:
                raise ValueError("Target label " + str(col_label) + " is not in file at " + str(emb_path))
            # collect label values from df
            labels[col_label] = np.array(list(emb_df[col_label]))
        return labels
        

In [43]:
class ModelPathHandler:
    """Bookkeeping of tuned models: one dataframe row per tuner run.

    The dataframe (``self.df``, persisted at ``self.df_path``) records, per
    corpus_name/nested_dir/X_col/y_col/batch_size combination, where the tuner and its best
    model were saved.
    """
    def __init__(self, proj_dir='/nfs/proj-repo/AAARG-dissertation', base_dir_name="tuning_models",
                verbose=True):
        """
        Args:
            proj_dir: project root directory.
            base_dir_name: sub directory of ``proj_dir`` holding the models and the dataframe.
            verbose: print information while loading the dataframe.
        """
        self.proj_dir = proj_dir
        self.base_dir_name = base_dir_name
        self.model_dir_path = os.path.join(self.proj_dir, self.base_dir_name)
        self.df_name = "nn_path_df.hdf"
        self.df_path = os.path.join(self.model_dir_path, self.df_name)
        self.df_cols = ["corpus_name", "nested_dir", "X_col", "y_col", "tuner_dir",
                        "tuner_name", "best_hyperparams", "batch_size", "best_model_path",
                        "input_param_text_path", "redundancy_threshold"]
        self.verbose = verbose
        self.df = self.load_df(verbose=verbose)

    def load_df(self, verbose=True):
        """Load the dataframe from disk, or start an empty one when the file does not exist."""
        if os.path.exists(self.df_path):
            df = read_df_file_type(self.df_path, verbose=verbose)
        else:
            df = pd.DataFrame(columns=self.df_cols)
            if verbose:
                print("model path df created from scratch")
        return df

    def save_df(self, verbose=False):
        """Persist the dataframe at ``self.df_path``."""
        save_df_file_type(self.df, self.df_path, verbose=verbose)

    def add_path(self, corpus_name, nested_dir, X_col, y_col, tuner_dir, tuner_name,
                          best_hyperparams, batch_size, best_model_path, input_param_text_path, verbose=True):
        """Record a tuner run (update the matching row or append a new one) and save."""
        self._upsert_path_row(corpus_name, nested_dir, X_col, y_col, tuner_dir, tuner_name,
                              best_hyperparams, batch_size, best_model_path, input_param_text_path)
        # save new entry
        self.save_df(verbose=verbose)

    def _upsert_path_row(self, corpus_name, nested_dir, X_col, y_col, tuner_dir, tuner_name,
                         best_hyperparams, batch_size, best_model_path, input_param_text_path):
        """Update the in-memory row(s) matching the run, or append a new row (no saving).

        A run is identified by corpus_name, nested_dir, X_col, y_col, tuner_dir, tuner_name
        and batch_size. On update only the result columns are written, so an already stored
        'redundancy_threshold' is kept.
        """
        df = self.df
        mask = ((df['corpus_name'] == corpus_name) & (df['nested_dir'] == nested_dir)
                & (df['X_col'] == X_col) & (df['y_col'] == y_col)
                & (df['tuner_dir'] == tuner_dir) & (df['tuner_name'] == tuner_name)
                & (df['batch_size'] == batch_size))
        if mask.any():  # update existing row
            print("Saving to existing row on nn_path_df")
            results = {"best_hyperparams": best_hyperparams,
                       "best_model_path": best_model_path,
                       "input_param_text_path": input_param_text_path}
            # column by column: the identifying columns already hold the right values
            for col, value in results.items():
                self.df.loc[mask, col] = value
        else:  # append new row
            print("Appending new row to nn_path_df")
            row = {"corpus_name":corpus_name, "nested_dir":nested_dir, "X_col":X_col,
                   "y_col":y_col, "tuner_dir":tuner_dir, "tuner_name":tuner_name,
                   "best_hyperparams":best_hyperparams, "batch_size":batch_size, "best_model_path":best_model_path,
                   "input_param_text_path":input_param_text_path, "redundancy_threshold":np.nan}
            new_row = pd.DataFrame([row])
            if self.df.empty:
                # keep the column layout of the empty frame instead of concatenating with it
                cols = list(self.df.columns) + [c for c in row if c not in self.df.columns]
                self.df = new_row.reindex(columns=cols)
            else:
                # DataFrame.append was removed in pandas 2.0
                self.df = pd.concat([self.df, new_row], ignore_index=True)

    def generate_nn_save_path(self, corpus_name, nested_dir, X_col, y_col, batch_size, create_dir=True):
        """Return ``(tuner_dir, tuner_name)`` for a run.

        The directory is ``<model_dir_path>/<corpus_name>/<nested_dir>/<X_col>_<y_col>_<batch_size>``
        and is created (with its parents) when ``create_dir`` is True.
        """
        col_dir = str(X_col) + "_" + str(y_col) + "_" + str(int(batch_size))
        dir_path = os.path.join(self.model_dir_path, corpus_name, nested_dir, col_dir)
        if create_dir:
            os.makedirs(dir_path, exist_ok=True)
        # generate name
        save_name = "tuner_proj"
        return dir_path, save_name

    def update_redundancy_threshold(self, corpus_name, nested_dir, X_col, y_col, batch_size,
                                   redundancy_threshold, verbose=True):
        """Store ``redundancy_threshold`` on the matching row(s) and save."""
        if "redundancy_threshold" not in self.df.columns:
            # dataframes saved before this column existed
            self.df['redundancy_threshold'] = np.nan

        mask = self.create_df_mask(corpus_name, nested_dir, X_col, y_col, batch_size)
        self.df.loc[mask, ['redundancy_threshold']] = redundancy_threshold
        self.save_df(verbose=verbose)

    def create_df_mask(self, corpus_name, nested_dir, X_col, y_col, batch_size):
        """Return the boolean row mask of a corpus_name/nested_dir/X_col/y_col/batch_size selection."""
        mask = ((self.df['corpus_name'] == corpus_name) & (self.df['nested_dir'] == nested_dir)
                & (self.df['X_col'] == X_col) & (self.df['y_col'] == y_col)
                & (self.df['batch_size'] == batch_size))
        return mask

    def load_best_model(self, corpus_name, nested_dir, X_col, y_col, batch_size):
        """Load the Keras model saved at 'best_model_path' of the first matching row.

        Raises:
            IndexError: if no row matches the selection.
        """
        mask = self.create_df_mask(corpus_name, nested_dir, X_col, y_col, batch_size)
        model_paths = list(self.df.loc[mask, 'best_model_path'])
        if len(model_paths) == 0:
            raise IndexError("No model recorded for " + str((corpus_name, nested_dir, X_col, y_col, batch_size)))
        return tf.keras.models.load_model(model_paths[0])
        

In [105]:
from time import gmtime, strftime

class NNTrainer:
    """Run hyperparameter tuning for one dataset/column combination and record where results live.

    Each tuning run is keyed by (corpus_name, nested_dir, X_col, y_col, batch_size). For every
    run a row is kept in ``nn_path_df`` (persisted at ``self.nn_path_df_path``) holding the
    tuner directory, tuner name, best hyperparameters, best model path and the path of a
    text file describing the run's parameters.
    """

    def __init__(self, proj_dir='/nfs/proj-repo/AAARG-dissertation', nn_base_save_dir_name=None):
        """
        Args:
            proj_dir: project root under which the tuning directory is placed.
            nn_base_save_dir_name: name of the tuning directory inside ``proj_dir``;
                defaults to "tuning_models" when None.
        """
        self.proj_dir = proj_dir
        self.nn_base_save_dir_name = nn_base_save_dir_name
        if self.nn_base_save_dir_name is None:
            self.nn_base_save_dir_name = "tuning_models"
        self.nn_base_save_dir_path = os.path.join(self.proj_dir, self.nn_base_save_dir_name)
        self.nn_path_df_name = "nn_path_df.hdf"
        self.nn_path_df_path = os.path.join(self.nn_base_save_dir_path, self.nn_path_df_name)
        self.nn_path_df_cols = ["corpus_name", "nested_dir", "X_col", "y_col", "tuner_dir",
                                "tuner_name", "best_hyperparams", "batch_size", "best_model_path",
                                "input_param_text_path"]
        self.default_test_topics = [1, 2, 3, 4, 5, 6, 8, 9, 10]
        self.default_train_ratio = 0.8
        self.min_train_ratio = 0.5

    def train(self, corpus_name, nested_dir, topic_ids, X_col, y_col, tuning_iterations=5, max_epochs=15,
              reduction_factor=3, batch_size=32, force_reload=False, verbose=True):
        """Tune a network on the memmapped ``X_col``/``y_col`` data of ``topic_ids``.

        Steps performed:
            1. Load (or create) ``nn_path_df``.
            2. Create the memmaps for ``X_col`` and ``y_col`` if needed and load them.
            3. Write the run parameters to "parameter_details.txt" in the tuner directory
               (only when the file is missing or ``force_reload`` is true).
            4. Add/refresh the ``nn_path_df`` row for this run with empty results.
            5. Run the ``NNTuner`` search and store the best hyperparameters and model path
               in the ``nn_path_df`` row.
            6. Append an end timestamp to the parameter file.

        Args:
            corpus_name, nested_dir, X_col, y_col, batch_size: identify the data and the run.
            topic_ids: topics whose data is used for tuning.
            tuning_iterations, max_epochs, reduction_factor: forwarded to ``NNTuner``.
            force_reload: forwarded to ``NNTuner`` and forces rewriting of the parameter file.
            verbose: print/display progress information.
        """
        self.nn_path_df = self.load_nn_path_df(verbose=verbose)

        if verbose:
            # display() renders the dataframe itself; wrapping it in print() only printed "None"
            display(self.nn_path_df)
            print("")

        # generate data
        mmap_gen = MemmapGenerator(self.proj_dir)
        mmap_gen.create_maps(corpus_name, nested_dir, [X_col, y_col], topic_ids, verbose=verbose,
                             force_reload=False)  # setting to False for debug

        # get paths for inputs and total_len of samples
        X_map, vector_len = mmap_gen.load_memmap(corpus_name, nested_dir, topic_ids, X_col,
                                                 batch_size=batch_size, return_vector_len=True)
        # get paths for labels
        y_map = mmap_gen.load_memmap(corpus_name, nested_dir, topic_ids, y_col, batch_size=batch_size,
                                     return_vector_len=False)

        # create a generator to feed NN samples/batches
        num_batches = X_map.shape[0]
        batch_generator = BatchGenerator(X_map, y_map, batch_size, num_batches)

        # create paths to save NN tuning files to
        save_dir, save_name = self.generate_nn_save_path(corpus_name, nested_dir, X_col, y_col, batch_size,
                                                         create_dir=True)

        # Log params in text file (saved alongside the tuner output)
        param_str = self._build_param_str(corpus_name, nested_dir, topic_ids, X_col, y_col, batch_size,
                                          max_epochs, tuning_iterations, reduction_factor)
        input_param_text_path = os.path.join(save_dir, "parameter_details.txt")
        if not os.path.exists(input_param_text_path) or force_reload:
            with open(input_param_text_path, "w") as param_file:
                param_file.write(param_str)
        print(param_str)

        # add/save new nn_path_df entry (results are filled in once tuning has finished)
        self.add_to_nn_path_df(corpus_name, nested_dir, X_col, y_col, save_dir,
                               save_name, None, batch_size, None,
                               input_param_text_path, verbose=verbose)

        # generate optimised neural network
        tuner = NNTuner(save_dir, save_name, vector_len, tuning_iterations=tuning_iterations,
                        max_epochs=max_epochs, reduction_factor=reduction_factor, force_reload=force_reload,
                        batch_size=batch_size)

        best_model_dir = os.path.join(save_dir, "best_models")
        best_model, best_hyperparams, best_model_path = tuner.search(batch_generator, best_model_dir)

        best_hyperparams = best_hyperparams.values  # convert to dict form
        print("best_hyperparams: " + str(best_hyperparams))

        # save nn_path_df entry with completed values
        self.add_to_nn_path_df(corpus_name, nested_dir, X_col, y_col, save_dir,
                               save_name, best_hyperparams, batch_size, best_model_path,
                               input_param_text_path, verbose=verbose)

        # add time stamp to param file when finished
        cur_time = self.get_cur_time()
        end_msg = "Ended at: " + str(cur_time) + "\n"
        with open(input_param_text_path, "a") as param_file:
            param_file.write(end_msg)

        print("Finished tuning neural network")

    def _build_param_str(self, corpus_name, nested_dir, topic_ids, X_col, y_col, batch_size,
                         max_epochs, tuning_iterations, reduction_factor):
        """Return the multi-line description of a tuning run that is written to the parameter file."""
        trials_in_iter = max_epochs * (math.log(max_epochs, reduction_factor) ** 2)
        cur_time = self.get_cur_time()
        lines = [
            "corpus_name: " + str(corpus_name),
            "nested_dir: " + str(nested_dir),
            "X_input: " + str(X_col),
            "y_labels: " + str(y_col),
            "batch_size: " + str(batch_size),
            # log the topics actually passed to train(), not a notebook-level global
            "train_topics: " + str(topic_ids),
            "max_epochs: " + str(max_epochs),
            "tuning_iterations: " + str(tuning_iterations),
            "reduction_factor: " + str(reduction_factor),
            "estimated trials per iteration: " + str(trials_in_iter),
            "total estimated trials: " + str(trials_in_iter * tuning_iterations),
            "Started at: " + str(cur_time),
        ]
        return "\n".join(lines) + "\n"

    def get_cur_time(self):
        """Return the current UTC time formatted like 'Tue, 02 Mar 2021 18:03:38 +0000'."""
        cur_time = strftime("%a, %d %b %Y %H:%M:%S +0000", gmtime())
        return cur_time

    def load_nn_path_df(self, verbose=True):
        """Load ``nn_path_df`` from disk, or create an empty one with the expected columns."""
        if os.path.exists(self.nn_path_df_path):
            nn_path_df = read_df_file_type(self.nn_path_df_path, verbose=verbose)
        else:
            nn_path_df = pd.DataFrame(columns=self.nn_path_df_cols)
            if verbose:
                print("nn_path_df created from scratch")
        return nn_path_df

    def add_to_nn_path_df(self, corpus_name, nested_dir, X_col, y_col, tuner_dir, tuner_name,
                          best_hyperparams, batch_size, best_model_path, input_param_text_path, verbose=True):
        """Insert or update the ``nn_path_df`` row for one tuning run and save the dataframe.

        A row is updated in place when exactly one existing row matches the identifying
        columns (corpus_name, nested_dir, X_col, y_col, tuner_dir, tuner_name, batch_size);
        otherwise a new row is appended.
        """
        # check if exists in dataframe
        df = self.nn_path_df
        mask = (df['corpus_name']==corpus_name)&(df['nested_dir']==nested_dir)&(df['X_col']==X_col)&(df['y_col']==y_col)&(df['tuner_dir']==tuner_dir)&(df['tuner_name']==tuner_name)&(df['batch_size']==batch_size)
        exist_slice = df.loc[mask]
        if len(exist_slice) == 1:  # update existing row
            print("Saving to existing row on nn_path_df")
            self.nn_path_df.loc[mask, self.nn_path_df_cols] = corpus_name, nested_dir, X_col, y_col, tuner_dir, tuner_name, best_hyperparams, batch_size, best_model_path, input_param_text_path
        else:  # append new row
            print("Appending new row to nn_path_df")
            row = {"corpus_name":corpus_name, "nested_dir":nested_dir, "X_col":X_col,
                  "y_col":y_col, "tuner_dir":tuner_dir, "tuner_name":tuner_name,
                  "best_hyperparams":best_hyperparams, "batch_size":batch_size, "best_model_path":best_model_path,
                  "input_param_text_path":input_param_text_path}
            # DataFrame.append was removed in pandas 2.0; concat works on old and new versions
            self.nn_path_df = pd.concat([self.nn_path_df, pd.DataFrame([row])], ignore_index=True)
        # save new entry
        save_df_file_type(self.nn_path_df, self.nn_path_df_path, verbose=verbose)

    def generate_nn_save_path(self, corpus_name, nested_dir, X_col, y_col, batch_size, create_dir=True):
        """Build the tuner directory path and project name for one tuning run.

        The directory is
        ``<nn_base_save_dir_path>/<corpus_name>/<nested_dir>/<X_col>_<y_col>_<batch_size>``
        and is created (with any missing parents) when ``create_dir`` is true.

        Returns:
            (dir_path, save_name) where save_name is always "tuner_proj".
        """
        col_dir = str(X_col) + "_" + str(y_col) + "_" + str(int(batch_size))
        dir_path = os.path.join(self.nn_base_save_dir_path, corpus_name, nested_dir, col_dir)
        if create_dir and not os.path.exists(dir_path):
            os.makedirs(dir_path)
        # generate name
        save_name = "tuner_proj"
        return dir_path, save_name

In [108]:
# Tuning Parameters
# Notebook-level settings passed to NNTrainer.train(...) in the cells below.

train_topics = np.arange(11, 47).tolist()  # 11 - 46
tuning_iterations = 1
max_epochs = 10
batch_size = 1024
reduction_factor = 8

force_reload = False

# name of the input column fed to the network
X_col = "embedding"

In [None]:
# Tuning run: nested_dir "stsb-roberta-base" with "cos_sim_nearest_nug" as the label column.
# Uses the shared tuning parameters defined in the cell above.
corpus_name = "mine-trects-kba2014-filtered"
nested_dir = "stsb-roberta-base"
y_col = "cos_sim_nearest_nug"
trainer = NNTrainer()
trainer.train(corpus_name, nested_dir, train_topics, X_col, y_col, tuning_iterations=tuning_iterations,
              max_epochs=max_epochs, batch_size=batch_size, force_reload=force_reload, verbose=True,
              reduction_factor=reduction_factor)

loaded from .hdf file


Unnamed: 0,corpus_name,nested_dir,X_col,y_col,tuner_dir,tuner_name,best_hyperparams,batch_size,best_model_path,input_param_text_path
0,mine-trects-kba2014-filtered,stsb-roberta-base,embedding,cos_sim_nearest_nug,/nfs/proj-repo/AAARG-dissertation/tuning_model...,tuner_proj,,32,,/nfs/proj-repo/AAARG-dissertation/tuning_model...
1,mine-trects-kba2014-filtered,stsb-roberta-base,embedding,cos_sim_nearest_nug,/nfs/proj-repo/AAARG-dissertation/tuning_model...,tuner_proj,,1024,,/nfs/proj-repo/AAARG-dissertation/tuning_model...


None

loaded from .hdf file
Already loaded ['embedding', 'cos_sim_nearest_nug']
corpus_name: mine-trects-kba2014-filtered
nested_dir: stsb-roberta-base
X_input: embedding
y_labels: cos_sim_nearest_nug
batch_size: 1024
train_topics: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]
max_epochs: 10
tuning_iterations: 1
reduction_factor: 8
estimated trials per iteration: 12.26134029733554
total estimated trials: 12.26134029733554
Started at: Tue, 02 Mar 2021 18:03:38 +0000

Saving to existing row on nn_path_df
df saved as hdf complevel 9 at: /nfs/proj-repo/AAARG-dissertation/tuning_models/nn_path_df.hdf
INFO:tensorflow:Using MirroredStrategy with devices ('/job:localhost/replica:0/task:0/device:GPU:0', '/job:localhost/replica:0/task:0/device:GPU:1', '/job:localhost/replica:0/task:0/device:GPU:2', '/job:localhost/replica:0/task:0/device:GPU:3')
INFO:tensorflow:Reloading Oracle from existing projec

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->Index(['corpus_name', 'nested_dir', 'X_col', 'y_col', 'tuner_dir',
       'tuner_name', 'best_hyperparams', 'batch_size', 'best_model_path',
       'input_param_text_path'],
      dtype='object')]

  encoding=encoding,



Search: Running Trial #1

Hyperparameter    |Value             |Best Value So Far 
num_hidden_layers |2                 |?                 
hidden_units_0    |1088              |?                 
learning_rate     |0.090382          |?                 
tuner/epochs      |2                 |?                 
tuner/initial_e...|0                 |?                 
tuner/bracket     |1                 |?                 
tuner/round       |0                 |?                 

Epoch 1/2

In [89]:
# Tuning run: nested_dir "stsb-roberta-base" with "cosine_similarity" as the label column.
# Uses the shared tuning parameters defined in the "Tuning Parameters" cell.
corpus_name = "mine-trects-kba2014-filtered"
nested_dir = "stsb-roberta-base"
y_col = "cosine_similarity"
trainer = NNTrainer()
trainer.train(corpus_name, nested_dir, train_topics, X_col, y_col, tuning_iterations=tuning_iterations,
              max_epochs=max_epochs, batch_size=batch_size, force_reload=force_reload, verbose=True,
              reduction_factor=reduction_factor)

loaded from .hdf file


Unnamed: 0,corpus_name,nested_dir,X_col,y_col,tuner_dir,tuner_name,best_hyperparams,batch_size,best_model_path,input_param_text_path
0,mine-trects-kba2014-filtered,stsb-roberta-base,embedding,cos_sim_nearest_nug,/nfs/proj-repo/AAARG-dissertation/tuning_model...,tuner_proj,,32,,/nfs/proj-repo/AAARG-dissertation/tuning_model...
1,mine-trects-kba2014-filtered,stsb-roberta-base,embedding,cos_sim_nearest_nug,/nfs/proj-repo/AAARG-dissertation/tuning_model...,tuner_proj,,1024,,/nfs/proj-repo/AAARG-dissertation/tuning_model...


None

loaded from .hdf file
Creating memmaps for embedding, cosine_similarity
with topics: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/692 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [99]:
# Tuning run: nested_dir "distilbert-base-nli-stsb-mean-tokens" with "cos_sim_nearest_nug" as the label column.
# Uses the shared tuning parameters defined in the "Tuning Parameters" cell.
corpus_name = "mine-trects-kba2014-filtered"
nested_dir = "distilbert-base-nli-stsb-mean-tokens"
y_col = "cos_sim_nearest_nug"
trainer = NNTrainer()
trainer.train(corpus_name, nested_dir, train_topics, X_col, y_col, tuning_iterations=tuning_iterations,
              max_epochs=max_epochs, batch_size=batch_size, force_reload=force_reload, verbose=True,
              reduction_factor=reduction_factor)

loaded from .hdf file


Unnamed: 0,corpus_name,nested_dir,X_col,y_col,tuner_dir,tuner_name,best_hyperparams,batch_size,best_model_path,input_param_text_path
0,mine-trects-kba2014-filtered,stsb-roberta-base,embedding,cos_sim_nearest_nug,/nfs/proj-repo/AAARG-dissertation/tuning_model...,tuner_proj,,32,,/nfs/proj-repo/AAARG-dissertation/tuning_model...
1,mine-trects-kba2014-filtered,stsb-roberta-base,embedding,cos_sim_nearest_nug,/nfs/proj-repo/AAARG-dissertation/tuning_model...,tuner_proj,,1024,,/nfs/proj-repo/AAARG-dissertation/tuning_model...


None

loaded from .hdf file
Creating memmaps for embedding, cos_sim_nearest_nug
with topics: [11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46]


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->Index(['corpus_name', 'nested_dir', 'col_label', 'path', 'dtype', 'vector_len',
       'total_nums', 'offset_step', 'topic_ids', 'complete'],
      dtype='object')]

  encoding=encoding,
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/692 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Tuning run: nested_dir "distilbert-base-nli-stsb-mean-tokens" with "cosine_similarity" as the label column.
# Uses the shared tuning parameters defined in the "Tuning Parameters" cell.
corpus_name = "mine-trects-kba2014-filtered"
nested_dir = "distilbert-base-nli-stsb-mean-tokens"
y_col = "cosine_similarity"
trainer = NNTrainer()
trainer.train(corpus_name, nested_dir, train_topics, X_col, y_col, tuning_iterations=tuning_iterations,
              max_epochs=max_epochs, batch_size=batch_size, force_reload=force_reload, verbose=True,
              reduction_factor=reduction_factor)

## Calculating Similarity Threshold for Redundant Sentences

In [None]:
from .defs.corpus_loader import cosine_similarity
import matplotlib.pyplot as plt

In [76]:
def form_predict_batches(emb_list, num_dims, batch_size):
    """Stack embeddings into an array of shape (num_batches, batch_size, num_dims).

    The last batch is completed with all-zero rows when ``len(emb_list)`` is not a
    multiple of ``batch_size``. No padding is added when it already divides evenly,
    and the caller's ``emb_list`` is left unmodified.

    Args:
        emb_list: sequence of embeddings, each of length ``num_dims``.
        num_dims: length of a single embedding.
        batch_size: number of rows per batch.

    Returns:
        (emb_arr, num_empty_rows): the batched array and the number of zero rows
        appended at the end.
    """
    # rows needed to reach the next multiple of batch_size (0 when already aligned)
    num_empty_rows = (-len(emb_list)) % batch_size
    # work on a copy so the padding never leaks into the caller's list
    rows = list(emb_list)
    if num_empty_rows > 0:
        rows.extend([np.zeros(num_dims)] * num_empty_rows)
    # shape into batches
    emb_arr = np.asarray(rows)
    num_batches = len(rows) // batch_size
    emb_arr = emb_arr.reshape(num_batches, batch_size, num_dims)
    return emb_arr, num_empty_rows

def predict_emb_list(model, emb_list, batch_size=256):
    """Predict on ``emb_list`` in padded batches and return the flattened predictions.

    The embeddings are batched with ``form_predict_batches``; the trailing entries
    of the flattened output that correspond to its padding rows are dropped.
    """
    # embedding width is taken from the first entry
    emb_width = len(emb_list[0])
    batched_embs, pad_count = form_predict_batches(emb_list, emb_width, batch_size)
    raw_preds = model.predict(
        batched_embs,
        batch_size=batch_size,
        verbose=1,
        use_multiprocessing=True,
        workers=num_threads,
    )

    # undo the batch shape, then cut off the predictions made for padding rows
    flat_preds = raw_preds.flatten()
    keep_count = len(flat_preds) - pad_count
    return flat_preds[0:keep_count]

In [74]:
class RedundancyThresholdGenerator:
    """Helpers for choosing a cosine-similarity threshold above which sentences count as redundant.

    Provides: lookup of embedding file paths, selection of the top-k sentences by
    model score, pairwise similarity statistics split by whether two samples share
    the same nearest nugget, and a plot of the two resulting distributions.
    """

    def __init__(self, proj_repo='/nfs/proj-repo/AAARG-dissertation'):
        self.path_ret = PathRetriever(proj_repo)

    def get_emb_paths(self, corpus_name, nested_dir, topic_ids):
        """Return (list of embedding file paths, embedding directory) for the given topics."""
        emb_paths, emb_dir = self.path_ret.get_embedding_paths(corpus_name, nested_dir, topic_ids=topic_ids,
                                                               return_dir_path=True)
        emb_paths = list(emb_paths['path'])
        return emb_paths, emb_dir

    @staticmethod
    def _merge_top_k(score_df, pred_df, k):
        """Return the k highest-scoring rows of ``score_df`` and ``pred_df`` combined, ascending by score.

        ``score_df`` must be sorted ascending by "score" (row 0 is the current lowest).
        Only predictions that beat that lowest score are considered.
        """
        if score_df.empty:
            return score_df
        bottom_score = score_df['score'].iloc[0]
        candidates = pred_df.loc[pred_df['score'] > bottom_score, ['sentence', 'score']]
        merged = pd.concat([score_df, candidates], ignore_index=True)
        merged = merged.sort_values(by="score", ignore_index=True, ascending=True)
        # keep only the k best; the lowest rows fall off the front
        return merged.tail(k).reset_index(drop=True)

    def get_top_sentence_scores(self, emb_paths, model, k=10000, batch_size=256):
        """Score every sentence in ``emb_paths`` with ``model`` and keep the k highest.

        Returns:
            DataFrame with columns "sentence" and "score", k rows, sorted ascending
            by score. Slots that were never filled keep the placeholder sentence
            "empty" with a score of -inf.
        """
        # create empty df with default vals
        score_df = pd.DataFrame({"sentence": ["empty"] * k, "score": [-np.inf] * k})

        # find top k scoring sentences from emb_paths
        for emb_path in tqdm_notebook(emb_paths):
            # load emb_df
            emb_df = load_embeddings(emb_path, verbose=False)

            # scrape embeddings
            embs = list(emb_df['embedding'])
            if not embs:
                continue  # nothing to score in this file

            # predict embeddings
            print("Predicting emb_df's embeddings")
            preds = predict_emb_list(model, embs, batch_size=batch_size)
            print("predictions 0-20")
            print(preds[0:20])

            # tie sentence to predicted score
            print("Creating prediction dataframe")
            pred_dict = {"sentence": emb_df['sentence'], "score": preds}
            pred_df = pd.DataFrame(pred_dict)
            pred_df = pred_df.sort_values(by="score", ascending=False, ignore_index=True)  # descending order

            # add scores where values within top k
            bottom_score = score_df['score'][0]
            pred_df_highest = pred_df['score'][0]
            print("bottom_score/pred_df_highest: " + str(bottom_score) + " / " + str(pred_df_highest))
            if pred_df_highest > bottom_score:
                print("Adding new rows")
                # merge rather than overwrite the lowest rows blindly: an incoming score
                # may beat the current minimum without beating the row it would replace
                score_df = self._merge_top_k(score_df, pred_df, k)
            else:
                print("No new rows to be added")
        return score_df

    def compare_sim_nuggets(self, emb_dir, sample_embs):
        """
        Retrieve similarities between sample_embs where:
            - sample_embs share the same nearest nugget
            - sample_embs do not share the same nearest nugget

        Nugget embeddings are read from "embedding_list_all.pickle" inside ``emb_dir``.
        Every unordered pair of distinct samples is compared exactly once.

        Returns:
            (same_nug_sims, dif_nug_sims): two ascending-sorted numpy arrays of
            cosine similarities.

        Raises:
            ValueError: if there are samples but the nugget embedding list is empty.
        """
        # load nugget embeddings
        nug_embs_path = os.path.join(emb_dir, "embedding_list_all.pickle")
        with open(nug_embs_path, 'rb') as handle:
            nug_embs = pickle.load(handle)

        num_samples = len(sample_embs)
        if num_samples > 0 and len(nug_embs) == 0:
            raise ValueError("No nugget embeddings found in " + str(nug_embs_path))

        # for each sample_emb, find index of nearest nug
        nearest_nug_indexes = np.zeros(num_samples, dtype=int)  # sample_emb_idx -> nearest_nug_idx
        print("Finding nearest nug for each sample embedding")
        for sample_index, sample_emb in tqdm_notebook(enumerate(sample_embs)):
            # get index of max for this sample_emb
            max_idx = 0
            max_sim = -np.inf
            for nug_index, nug_emb in enumerate(nug_embs):
                cur_sim = cosine_similarity(sample_emb, nug_emb)
                if cur_sim > max_sim:
                    max_sim = cur_sim
                    max_idx = nug_index
            # store max idx
            nearest_nug_indexes[sample_index] = max_idx

        # collect cosine similarities of sample embeddings that have same nearest nug, and those that don't
        same_nug_sims = []
        dif_nug_sims = []
        print("Finding cosine similarities between sample embeddings")
        for x in tqdm_notebook(range(num_samples - 1)):
            # start at x + 1 so a sample is never compared with itself
            for y in range(x + 1, num_samples):
                # get cosine_similarity between sample embs x and y
                sim = cosine_similarity(sample_embs[x], sample_embs[y])
                # check if x and y have same nearest nug
                if nearest_nug_indexes[x] == nearest_nug_indexes[y]:  # share same nearest_nug
                    same_nug_sims.append(sim)
                else:
                    dif_nug_sims.append(sim)

        # sort similarities
        same_nug_sims = np.sort(np.asarray(same_nug_sims))
        dif_nug_sims = np.sort(np.asarray(dif_nug_sims))
        return same_nug_sims, dif_nug_sims

    def plot_sim_nugget_distributions(self, same_nug_sims, dif_nug_sims):
        """Plot the cumulative counts of both similarity arrays as step curves.

        Both inputs are expected to be sorted ascending (as returned by
        ``compare_sim_nuggets``). An empty array is skipped instead of raising.
        """
        # https://stackoverflow.com/a/24567715
        # plot cumulative distribution stepwise
        plt.subplot(2, 2, 1)
        series = ((same_nug_sims, 'shared most similar nugget'),
                  (dif_nug_sims, 'different most similar nugget'))
        for sims, label in series:
            sims = np.asarray(sims)
            if sims.size == 0:
                continue  # nothing to draw for this group
            # repeat the last value so the final step is visible
            plt.step(np.concatenate([sims, sims[[-1]]]),
                     np.arange(sims.size + 1),
                     label=label)

        plt.title("Distribution of cosine similarities between sample sentences (cumulative)")
        plt.ylabel('Count')
        plt.xlabel('Cosine Similarity')
        plt.legend(loc='upper left')  # matplotlib location strings use a space, not an underscore
        plt.grid()

        plt.show()
        
# printed once the definitions above have been executed, so the cell output confirms it ran
print("cell loaded")

cell loaded


In [56]:
# def get_debug_model():
#     model_path = '/nfs/proj-repo/debug_model'
#     model = tf.keras.models.load_model(model_path)
#     print("loaded model")
#     return model

# Create the handler and show its dataframe of tuner entries.
# NOTE(review): print(display(...)) also prints "None" because display() returns nothing.
model_path_handler = ModelPathHandler()
print(display(model_path_handler.df))

loaded from .hdf file


Unnamed: 0,corpus_name,nested_dir,X_col,y_col,tuner_dir,tuner_name,best_hyperparams,batch_size,best_model_path,input_param_text_path
0,mine-trects-kba2014-filtered,stsb-roberta-base,embedding,cos_sim_nearest_nug,/nfs/proj-repo/AAARG-dissertation/tuning_model...,tuner_proj,,32,,/nfs/proj-repo/AAARG-dissertation/tuning_model...
1,mine-trects-kba2014-filtered,stsb-roberta-base,embedding,cos_sim_nearest_nug,/nfs/proj-repo/AAARG-dissertation/tuning_model...,tuner_proj,"{'num_hidden_layers': 2, 'hidden_units_0': 897...",1024,/nfs/proj-repo/AAARG-dissertation/tuning_model...,/nfs/proj-repo/AAARG-dissertation/tuning_model...


None


In [79]:
# Identify the tuner entry to load (these five values select the dataframe row).
corpus_name = "mine-trects-kba2014-filtered"
nested_dir = "stsb-roberta-base"
X_col = "embedding"
y_col = "cos_sim_nearest_nug"
batch_size = 1024

# Load the saved best model and look up the hyperparameters stored for the same entry.
model = model_path_handler.load_best_model(corpus_name, nested_dir, X_col, y_col, batch_size)
tuner_inst_mask = model_path_handler.create_df_mask(corpus_name, nested_dir, X_col, y_col, batch_size)
model_df = model_path_handler.df
tuner_inst = model_df.loc[tuner_inst_mask]
# first matching row is used
best_hyperparams = list(tuner_inst['best_hyperparams'])[0]
print("best_hyperparams")
print(best_hyperparams)

# model_path = "/nfs/proj-repo/AAARG-dissertation/tuning_models/mine-trects-kba2014-filtered/stsb-roberta-base/embedding_cos_sim_nearest_nug_1024/best_models/0"
# model = tf.keras.models.load_model(model_path)

# NOTE(review): model.summary() prints by itself and returns None, so this also prints "None".
print(model.summary())

best_hyperparams
{'num_hidden_layers': 2, 'hidden_units_0': 897, 'hidden_activ_0': 'tanh', 'output_activ': 'tanh', 'learning_rate': 0.05305450445016687, 'tuner/epochs': 15, 'tuner/initial_epoch': 2, 'tuner/bracket': 1, 'tuner/round': 1, 'hidden_units_1': 1, 'hidden_activ_1': 'relu', 'tuner/trial_id': 'b182cd2325c30a3b47721e93f6de69b1'}
Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
hidden_layer_0 (Dense)       (256, 897)                689793    
_________________________________________________________________
hidden_layer_1 (Dense)       (256, 1)                  898       
_________________________________________________________________
output_layer (Dense)         (256, 1)                  2         
Total params: 690,693
Trainable params: 690,693
Non-trainable params: 0
_________________________________________________________________
None


In [71]:
# instance used by the following cells to fetch embedding paths and score sentences
sum_gen = RedundancyThresholdGenerator()

In [59]:
# Small subset of topics (11 - 14) used to try out the threshold code.
test_ids = np.arange(11, 15).tolist()
test_paths, test_emb_dir = sum_gen.get_emb_paths(corpus_name, nested_dir, test_ids)

Loading embeddings paths for mine-trects-kba2014-filtered


In [51]:
def show_df_test(emb_path):
    """Display the first 200 rows of the embedding dataframe stored at ``emb_path``."""
    preview = load_embeddings(emb_path, verbose=False)[0:200]
    print(display(preview))
    
# show_df_test(test_paths[0])

In [75]:
# Score the test-topic sentences with the loaded model and keep the k highest-scoring ones.
k = 10000
batch_size = 256  # overrides the notebook-level batch_size set in earlier cells
test_top_scores = sum_gen.get_top_sentence_scores(test_paths, model, k=k, batch_size=batch_size)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


  0%|          | 0/109 [00:00<?, ?it/s]

Predicting emb_df's embeddings
emb_batches 0-5
[[[ 1.31808281e+00 -1.35149360e+00 -2.16791630e-01 ...  8.92709494e-01
    7.69516230e-01 -1.19040012e+00]
  [ 2.24484995e-01 -2.57765383e-01 -3.50602478e-01 ...  1.90732732e-01
    3.17092270e-01 -9.16033447e-01]
  [-2.82743037e-01  9.51191068e-01  1.54983687e+00 ... -1.56537127e+00
   -5.88145435e-01  1.73127079e+00]
  ...
  [-8.03759634e-01 -9.36938167e-01 -1.48505911e-01 ...  7.28978217e-01
   -4.84286100e-02  7.45753944e-02]
  [ 4.77898121e-01  3.02714594e-02  5.03240060e-03 ... -1.11172009e+00
   -1.85167104e-01 -8.78115356e-01]
  [ 4.35933620e-01 -5.11561990e-01  8.28448772e-01 ... -2.12003872e-01
   -1.27906501e-02  4.35475916e-01]]

 [[-5.42698860e-01 -3.42831537e-02 -5.71567416e-01 ... -1.72929430e+00
    4.67046857e-01  7.81671047e-01]
  [ 9.47753862e-02 -4.22628522e-01 -4.77715105e-01 ... -1.01307905e+00
    5.91164291e-01 -8.80811661e-02]
  [ 5.52666187e-01 -5.20555489e-02 -1.00115508e-01 ... -3.36523950e-01
    7.93765113e-02

predictions 0-20
[-0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045
 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045
 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045
 -0.00539045 -0.00539045]
Creating prediction dataframe
bottom_score/pred_df_highest: -0.0053904485 / -0.0053904485
No new rows to be added
Predicting emb_df's embeddings
emb_batches 0-5
[[[ 1.31808305 -1.35149384 -0.21679243 ...  0.89271069  0.76951575
   -1.19039869]
  [ 0.28974602 -0.06294903  0.85951364 ... -0.71902603  1.33882415
    0.38705596]
  [ 1.31808305 -1.35149384 -0.21679243 ...  0.89271069  0.76951575
   -1.19039869]
  ...
  [ 0.85323685 -0.57414776  0.35997775 ... -0.60646826  0.82066584
    0.08563385]
  [ 0.01932153 -0.7820065   0.20269561 ...  0.7233519   0.18285726
    1.39887452]
  [ 0.10104913 -0.51993024 -0.07946158 ...  0.15104468  0.33584976
    0.27905038]]

 [[ 0.64689952 -0.72453994  0.13737181 ... -0.21017061  0.36940745
    

predictions 0-20
[-0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045
 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045
 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045 -0.00539045
 -0.00539045 -0.00539045]
Creating prediction dataframe
bottom_score/pred_df_highest: -0.0053904485 / -0.0053904485
No new rows to be added
Predicting emb_df's embeddings
emb_batches 0-5
[[[ 1.31808281 -1.3514936  -0.21679163 ...  0.89270949  0.76951623
   -1.19040012]
  [ 0.33735931 -0.1040054   1.00903106 ... -1.33279955  0.87681776
    0.31744558]
  [ 0.01791837 -0.97980791  0.46298274 ... -1.93654048  0.3613238
    0.36071238]
  ...
  [ 0.11713862  0.42606425 -0.15007584 ... -0.41017073  0.2193383
    0.51430464]
  [-0.10173391 -0.70218891 -0.83921206 ...  1.12638533 -0.23936138
   -0.39608955]
  [-0.14516385  0.58596158  0.58762747 ... -1.25435221  0.40228564
    0.42066166]]

 [[ 0.23356158 -0.15426749  0.40086606 ... -0.67714918  0.78808224
    0.

KeyboardInterrupt: 

## Generate Summaries

In [None]:
def gen_score_df_path(tuner_dir, topic_id, X_col, corpus_name):
    """Return ``<tuner_dir>/<topic_id>_<X_col>_<corpus_name>.hdf``."""
    name_parts = [str(topic_id), str(X_col), str(corpus_name)]
    return os.path.join(tuner_dir, "_".join(name_parts) + ".hdf")

def gen_ranked_df_path(tuner_dir, rank_method, topic_id, X_col, corpus_name):
    """Return ``<tuner_dir>/<rank_method>_<topic_id>_<X_col>_<corpus_name>.hdf``."""
    name_parts = [str(rank_method), str(topic_id), str(X_col), str(corpus_name)]
    return os.path.join(tuner_dir, "_".join(name_parts) + ".hdf")

def gen_varname_path(tuner_dir, varname, rank_method, topic_id, X_col, corpus_name):
    """Return ``<tuner_dir>/<varname>_<rank_method>_<topic_id>_<X_col>_<corpus_name>.pickle``.

    ``varname`` names the pickled variable so that several variables belonging to the
    same ranking run get distinct files. All parts are joined with "_", matching the
    naming used by ``gen_score_df_path`` and ``gen_ranked_df_path``.
    """
    name_parts = [varname, rank_method, topic_id, X_col, corpus_name]
    fn = "_".join(str(part) for part in name_parts) + ".pickle"
    return os.path.join(tuner_dir, fn)

In [None]:
class SummaryGenerator:
    def __init__(self, model_path_handler, proj_repo='/nfs/proj-repo/AAARG-dissertation'):
        """
        Store the collaborators used to generate summaries.

        model_path_handler: object kept on the instance as-is (not used in __init__ itself)
        proj_repo: project repository root, handed to PathRetriever and kept on the instance
        """
        # assign the argument; the previous code read self.model_path_handler
        # before it existed, which raised AttributeError on construction
        self.model_path_handler = model_path_handler
        self.path_ret = PathRetriever(proj_repo)
        self.proj_repo = proj_repo
        # ranking strategies reported in the invalid rank_method error message
        self.rank_methods = ['k', 'k_non_redund']
        
    def generate_ranked_df(self, rank_method, model, tuner_dir, corpus_name, nested_dir, topic_id, k=10000,
                         X_col='embedding', batch_size=256, redundancy_score=None, verbose=verbose,
                          force_reload=False):
        
        # create/load predicted scores
        score_df_path = gen_score_df_path(tuner_dir, topic_id, X_col, corpus_name)
        score_df = None
        if os.path.exists(score_df_path):  # load
            score_df = read_df_file_type(score_df_path, verbose=verbose)
        else:  
            # create df of predicted scores
            emb_paths = self.path_ret.get_embedding_paths(corpus_name, nested_dir, topic_ids=topic_id, 
                                                      return_dir_path=False)
            emb_paths = list(emb_paths['path'])
            score_df = self.get_prediction_scores(model, emb_paths, X_col=X_col, batch_size=batch_size,
                                                 sort_scores=True)
            # save df
            score_df = save_df_file_type(score_df_path, verbose=verbose)
        
        # get ranked df
        ranked_df = None
        ranked_df_path = gen_ranked_df_path(tuner_dir, rank_method, topic_id, X_col, corpus_name)
        if not os.path.exists(ranked_df_path) or force_reload:
            # load new ranked_df with selected method
            ranked_idxs = None
            if rank_method == "k":
                ranked_df = self.retrieve_top_k_sentences(score_df, k, is_sorted=True)
            elif rank_method == "k_non_redund":
                ranked_df, ranked_idxs = self.ret_top_k_non_redundant(score_df, 
                                                                    k, 
                                                                    redundancy_threshold, 
                                                                    X_col='embedding', 
                                                                    is_sorted=True,
                                                                    return_indexes=True)
            else:
                raise Exception(str(rank_method) + " is not a valid rank_method: " + str(self.rank_methods))
            # save ranked_idxs
            if ranked_idxs is not None:
                ranked_idxs_path = gen_varname_path(tuner_dir, "ranked_idxs", rank_method, topic_id, X_col,
                                                   corpus_name)
                with open(ranked_idxs_path, 'wb') as handle:
                    pickle.dump(ranked_idxs, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            ranked_df = read_df_file_type(ranked_df_path, verbose=verbose)
            
        return ranked_df
        
        
        
    def get_prediction_scores(self, model, emb_paths, X_col='embedding',
                             batch_size=256, sort_scores=True, add_X_col_to_df=True):
        """
        Predict a score for every sentence stored in the given embedding files.

        model: passed through to predict_emb_list
        emb_paths: iterable of embedding file paths, each loaded with load_embeddings
        X_col: column of the loaded embedding dataframe to predict on
        batch_size: passed through to predict_emb_list
        sort_scores: when True, rows are sorted by score in descending order
        add_X_col_to_df: when True, the X_col values are kept in an 'embedding' column

        Return dataframe with columns:
            sentence: string of sentence
            score: predicted score from model
            relative_pos: relative_position of sentence in its article (sent_id)
            embedding (optional): embedding of sentence (determined by X_col parameter)
        When emb_paths is empty the dataframe has these columns and no rows.
        """
        # pre-create the columns so an empty emb_paths still yields a frame
        # that can be sorted by "score"
        score_columns = {'sentence': [], 'score': [], 'relative_pos': []}
        if add_X_col_to_df:
            score_columns['embedding'] = []

        # predict scores for target embeddings to place into df
        print("Predicting scores...")
        for emb_path in tqdm_notebook(emb_paths):
            emb_df = load_embeddings(emb_path, verbose=False)
            embs = list(emb_df[X_col])  # was `embs[X_col]`: used before assignment

            # was called with the undefined name `emb_list`
            # presumably returns a numpy array (it must support .tolist()) -- confirm
            scores = predict_emb_list(model, embs, batch_size=batch_size)

            score_columns['sentence'].extend(list(emb_df['sentence']))
            score_columns['score'].extend(scores.tolist())
            score_columns['relative_pos'].extend(list(emb_df['sent_id']))  # relative position in article
            if add_X_col_to_df:
                score_columns['embedding'].extend(embs)

        # tie sentence <-> score together and access DataFrame functionality
        score_df = pd.DataFrame(score_columns)
        if sort_scores:
            score_df = score_df.sort_values(by="score", ascending=False)  # descending order
        return score_df
        
    def retrieve_top_k_sentences(self, score_df, k, is_sorted=True):
        """
        Return the first k rows of score_df in descending score order.

        score_df: dataframe with a "score" column
        k: number of rows to keep
        is_sorted: set to False when score_df is not yet sorted by score
                   (descending); it is then sorted here before slicing
        """
        ranked = score_df if is_sorted else score_df.sort_values(by="score", ascending=False)
        return ranked[slice(0, k)]
    
    def ret_top_k_non_redundant(self, score_df, k, redundancy_threshold, X_col='embedding', is_sorted=True,
                               return_indexes=True):
        """
        Retrieve top k sentences from df, not repeating redundant sentences.

        Rows are visited in descending score order. A row is kept only when its
        cosine similarity to every row kept so far is below redundancy_threshold.

        score_df: dataframe with a "score" column and an embedding column
        k: maximum number of rows to keep (k <= 0 keeps nothing, None means no limit)
        redundancy_threshold: a row whose similarity to a kept row is >= this value is skipped
        X_col: name of the column holding the embeddings
        is_sorted: set to False when score_df is not yet sorted by score (descending)
        return_indexes: when True, also return the index labels of the kept rows

        Returns (top_k_df, top_k_row_idxs) when return_indexes is True, otherwise
        only top_k_df. The previous version never returned the indexes it
        collected. Rows stay in score order; use order_by_article_position to
        reorder them.
        """
        if not is_sorted:
            score_df = score_df.sort_values(by="score", ascending=False)

        top_k_rows = []
        top_k_row_idxs = []

        # iteratively find top k sentences where cosine similarity is under redundancy_threshold
        for index, row in score_df.iterrows():
            # checked before the row is examined so that k <= 0 selects nothing
            if k is not None and len(top_k_rows) >= k:
                break
            row_emb = row[X_col]
            # compare cosine similarity with each sentence kept so far
            is_redundant = any(
                cosine_similarity(row_emb, kept_row[X_col]) >= redundancy_threshold
                for kept_row in top_k_rows
            )
            if not is_redundant:  # add when dissimilar enough
                top_k_rows.append(row)
                top_k_row_idxs.append(index)

        top_k_df = pd.DataFrame(top_k_rows)
        if return_indexes:
            return top_k_df, top_k_row_idxs
        return top_k_df
    
    def order_by_article_position(self, top_k_df):
        """Return top_k_df sorted in ascending order of its "relative_pos" column."""
        return top_k_df.sort_values("relative_pos")
    
    def 