# Notebook for Generating Summaries

In [1]:
# temp while cluster is full
# !pip install keras-tuner
# !pip install tables

In [2]:
import os
# Thread budget: exported to numexpr below and reused as the `workers` count in NNTuner.search.
num_threads = 32
# presumably set ahead of the pandas import so numexpr sees the limit when it loads -- TODO confirm
os.environ['NUMEXPR_MAX_THREADS'] = str(num_threads)

import pandas as pd
import numpy as np
import pickle
import copy
import math
from collections import defaultdict
from tqdm import tqdm
from tqdm import tqdm_notebook
from sentence_transformers import SentenceTransformer
import ipynb.fs

from sklearn.neighbors import KDTree

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import LSTM

import kerastuner as kt
from kerastuner.engine.hyperparameters import HyperParameters
from kerastuner.tuners import Hyperband

In [3]:
# Label columns used when a caller passes col_labels=None (copied inside resolve_input_params).
default_col_labels = ['cosine_similarity', 'cos_sim_nearest_nug']
# Name of the model-input column.
# NOTE(review): not referenced anywhere else in the visible code; callers pass "embedding" directly.
default_input_col = "embedding"

## Keras NN Model

In [4]:
class NNTuner:
    """Hyperband search over small fully-connected Keras models.

    The tuner is created in ``__init__``; ``build_model`` is the hypermodel callback it invokes
    for every trial, and ``search`` runs the tuning loop over a batch generator.
    """
    def __init__(self, save_dir, save_name, input_shape, tuning_iterations=50, batch_size=32, force_reload=False):
        """Configure the Hyperband tuner.

        Parameters:
            save_dir: passed to the tuner as ``directory``
            save_name: passed to the tuner as ``project_name``
            input_shape: passed to the model's InputLayer as ``input_shape``
            tuning_iterations: passed to the tuner as ``hyperband_iterations``
            batch_size: passed to the model's InputLayer as ``batch_size``
            force_reload: passed to the tuner as ``overwrite``; if False the tuner reloads from
                          where it started (see the Tuner class documentation)
        """
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.models = []  # every model handed to the tuner by build_model
        self.tuner = Hyperband(self.build_model,
                               objective='mean_squared_error',
                               max_epochs=25,
                               hyperband_iterations=tuning_iterations,
                               directory=save_dir,
                               project_name=save_name,
                               overwrite=force_reload)

    def build_model(self, hp):
        """Build and compile one candidate model from the hyperparameters in ``hp``.

        Tuned values: number of Dense layers (1-4), units and activation per layer, the Adam
        learning rate and the loss function. The compiled model is also appended to ``self.models``.
        """
        model = Sequential()
        model.add(InputLayer(input_shape=self.input_shape, batch_size=self.batch_size))
        # NOTE(review): the width of the last Dense layer is itself a tuned value, so the output
        # size is not fixed to the label width -- confirm this is intended.
        for i in range(hp.Int('num_layers', min_value=1, max_value=4)):
            layer_units = hp.Int('units_' + str(i), min_value=1, max_value=1024, step=32)
            layer_activation = hp.Choice('activ_' + str(i), ['relu', 'tanh', 'sigmoid'])
            model.add(Dense(units=layer_units, activation=layer_activation))
        opt = tf.keras.optimizers.Adam(
                learning_rate=hp.Float('learning_rate', min_value=0.00001, max_value=0.1))
        losses = hp.Choice('loss_func', ['MSE', 'huber', 'binary_crossentropy', 'categorical_crossentropy'])
        model.compile(optimizer=opt, loss=losses, metrics=['mean_squared_error'])  # add metrics here
        self.models.append(model)
        return model

    def search(self, batch_generator, save_path=None, return_hyperparams=False):
        """Find optimal model given dataset

        Parameters:
            batch_generator: passed to the tuner's ``search`` as ``x``
            save_path: if not None, the best model is saved to this path
            return_hyperparams: if True, also return the best hyperparameters
        Return:
            The list returned by ``get_best_models(num_models=1)``; when return_hyperparams is
            True, a tuple of that list and the list from ``get_best_hyperparameters(num_trials=1)``.
        """
        self.tuner.search(x=batch_generator, verbose=1, use_multiprocessing=False, workers=num_threads)
        best_model = self.tuner.get_best_models(num_models=1)
        if save_path is not None and len(best_model) > 0:
            # Models are saved through their own save() method; tf.keras has no module-level save().
            best_model[0].save(save_path)
        if return_hyperparams:
            hyperparams = self.tuner.get_best_hyperparameters(num_trials=1)
            return best_model, hyperparams
        return best_model
    

# from collections import OrderedDict
from collections import deque

class BatchGenerator(keras.utils.Sequence):
    """Class to load in dataset that is too large to load into memory at once

    Each entry of X_paths / y_paths points at one pre-built batch file; batch ``idx`` is the
    pair (X_paths[idx], y_paths[idx]). The caller must make sure both lists have the same length.

    https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
    """
    def __init__(self, X_paths, y_paths, batch_size, file_type):
        """
        Parameters:
            X_paths: list of paths to input batch files
            y_paths: list of paths to label batch files, aligned with X_paths
            batch_size: stored on the instance (1 when None); files are loaded whole
            file_type: format of the batch files, only '.pickle' is supported
        """
        self.batch_size = 1 if batch_size is None else batch_size
        self.X_paths = X_paths
        self.y_paths = y_paths
        self.file_type = file_type

    def __len__(self):
        """Denotes the number of batches per epoch"""
        return len(self.X_paths)

    def __getitem__(self, idx):
        """Generates one batch of data"""
        # Use the paths stored on the instance; no module-level X_paths / y_paths exist.
        inputs = self.load_samples(self.X_paths[idx])
        labels = self.load_samples(self.y_paths[idx])
        return inputs, labels

    def load_samples(self, path):
        """Load one batch file from ``path``.

        Raises:
            Exception: if self.file_type is not '.pickle'
        """
        if self.file_type != '.pickle':
            raise Exception("Invalid file type: " + str(self.file_type))
        # NOTE: pickle.load executes arbitrary code; only point this at files this project wrote.
        with open(path, 'rb') as handle:
            return pickle.load(handle)

## Generating the samples 

In [5]:
from .defs.corpus_loader import PathRetriever, load_embeddings, load_topics, read_df_file_type, save_df_file_type
from .defs.corpus_loader import convert_to_list

In [6]:
def resolve_input_params(path_ret, corpus_names, nested_dirs, col_labels, input_col=None):
    """Fill in whichever of corpus_names / nested_dirs / col_labels the caller left unspecified.

    Parameters:
        path_ret: object providing get_corpus_names() and get_nested_dirs(corpus_name, "embeddings")
        corpus_names: list of corpus names, or None to use every corpus path_ret reports
        nested_dirs: None (keep all), a collection of names to keep, or an already-resolved dict
                     {corpus_name: [nested_dir, ...]} which is passed through untouched
        col_labels: list of column labels, or None to use a copy of default_col_labels
        input_col: appended to the defaulted col_labels only (ignored when col_labels is given)
    Return:
        (corpus_names, nested_dirs as dict, col_labels)
    Raises:
        Exception: if no corpus is available, or every corpus ends up with no nested_dirs
    """
    # --- corpora ---
    if corpus_names is None:
        corpus_names = path_ret.get_corpus_names()
        if len(corpus_names) == 0:
            raise Exception("There are no corpuses to load from")

    # --- column labels ---
    if col_labels is None:
        col_labels = list(default_col_labels)
        if input_col is not None:
            col_labels.append(input_col)

    # --- nested dirs: anything that is not already a dict gets expanded per corpus ---
    if type(nested_dirs) is not dict:
        wanted = nested_dirs
        per_corpus = {}
        for name in corpus_names:
            available = path_ret.get_nested_dirs(name, "embeddings")
            if wanted is not None:
                available = [entry for entry in available if entry in wanted]
            per_corpus[name] = available
        nested_dirs = per_corpus

    # at least one corpus must have something to work on
    if all(len(dirs) == 0 for dirs in nested_dirs.values()):
        raise Exception("There are no nested_dirs matching the selection")
    return corpus_names, nested_dirs, col_labels

def corpus_name_topic_ids(path_retriever, corpus_name):
    """Return the distinct values of the 'id' column of the corpus's topic file, as a list."""
    topics = load_topics(path_retriever.get_topic_path(corpus_name, verbose=False), verbose=False)
    return list(topics['id'].unique())

def find_combinations(path_df, corpus_names, nested_dirs, col_labels, add_topics=False, col_labels_as_list=False,
                      as_tuples=True, force_reload=False, path_retriever=None, batch_size=None, file_type=None,
                     exists_only=False):
    """Find the combinations that have not been generated/trained already in path_df

    Tuple ordering: (corpus_name, nested_dir, col_label/[col_labels], **topic_id**)

    Parameters:
        path_df: DataFrame with at least 'corpus_name', 'nested_dir' and 'col_label' columns, plus
                 'topic_id' (add_topics), 'exists' (exists_only), 'batch_size' / 'file_type'
                 (when those filters are given)
        corpus_names: corpus names to consider
        nested_dirs: dict {corpus_name: [nested_dir, ...]}
        col_labels: column labels to look for
        add_topics: if True, combinations are resolved per topic_id (needs path_retriever)
        col_labels_as_list: if True, emit one combination holding the list of missing labels
                            instead of one combination per missing label
        as_tuples: if True, return tuples instead of lists
        force_reload: if True, every label counts as missing
        path_retriever: PathRetriever used to look up topic ids when add_topics is True
        batch_size, file_type: if not None, only rows of path_df with that value are considered
        exists_only: if True, only rows of path_df with exists == True are considered
    Return:
        List of combinations; nothing is emitted for a (corpus, nested_dir[, topic]) that has no
        missing labels.
    Raises:
        Exception: if add_topics is True and path_retriever is None
    """
    if exists_only:
        path_df = path_df[path_df['exists'] == True]  # checking of path_df is only concerned with existing files
    if batch_size is not None:
        path_df = path_df[path_df['batch_size'] == batch_size]
    if file_type is not None:
        path_df = path_df[path_df['file_type'] == file_type]

    topic_ids = {}
    if add_topics:  # find topic_ids for each corpus
        for corpus_name in corpus_names:
            if path_retriever is None:
                raise Exception("If add_topics is True then path_retriever must be set to an instance of PathRetriever")
            topic_ids[corpus_name] = corpus_name_topic_ids(path_retriever, corpus_name)

    combinations = []
    for corpus_name in corpus_names:
        for nested_dir in nested_dirs[corpus_name]:
            combo_path = path_df[(path_df['corpus_name'] == corpus_name)
                                 & (path_df['nested_dir'] == nested_dir)]
            combo = [corpus_name, nested_dir]
            if add_topics:
                # topic_id -> labels that have no row yet (or all labels when force_reload)
                missing_by_topic = defaultdict(list)
                for label in col_labels:
                    for topic_id in topic_ids[corpus_name]:
                        existing = combo_path[(combo_path['col_label'] == label)
                                              & (combo_path['topic_id'] == topic_id)]
                        if len(existing) == 0 or force_reload:
                            missing_by_topic[topic_id].append(label)
                for topic_id, labels in missing_by_topic.items():
                    if col_labels_as_list:  # single entry with all missing col_labels for topic_id
                        combinations.append(combo + [labels, topic_id])
                    else:  # one entry per missing col_label; the label must precede the topic_id
                        combinations.extend(combo + [label, topic_id] for label in labels)
            else:
                if force_reload:
                    add_labels = list(col_labels)
                else:  # find which col_labels don't exist already
                    exist_labels = list(combo_path['col_label'].unique())
                    add_labels = [x for x in col_labels if x not in exist_labels]
                if col_labels_as_list:
                    if add_labels:  # an empty list means nothing is missing here
                        combinations.append(combo + [add_labels])
                else:
                    combinations.extend(combo + [label] for label in add_labels)

    if as_tuples:
        combinations = [tuple(x) for x in combinations]
    return combinations

In [7]:
class InputLabelHandler:
    """Class that will load and store an instance of the dataset to be fed to a model

    Batch files are saved in a dir entitled 'samples' in nested_dir. Every batch file is also
    recorded as one row of ``label_path_df`` (persisted to label_path_df.hdf in the dataset dir),
    which ``get_paths`` later queries. ``label_path_df`` only exists after ``generate`` has run.
    """
    def __init__(self, proj_dir=None, input_col_name="embedding"):
        """
        Parameters:
            proj_dir: project root; a hard-coded default is stored when None
            input_col_name: embedding-file column that holds the model inputs
        """
        if proj_dir is None:
            self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
        else:
            self.proj_dir = proj_dir
        self.default_file_type = ".hdf"  # file type of the embedding files that are read
        # NOTE(review): receives the raw proj_dir argument (possibly None), not self.proj_dir;
        # presumably PathRetriever applies its own default -- confirm.
        self.path_ret = PathRetriever(proj_dir)
        self.label_options = ['cosine_similarity', 'cos_sim_nearest_nug']
        self.input_col_name = input_col_name
        # label_path_df variables
        self.label_path_df_dir = self.path_ret.path_handler.dataset_dir
        self.sample_dir_name = "samples"
        self.label_path_df_path = os.path.join(self.label_path_df_dir, "label_path_df.hdf")
        self.label_path_df_cols = ['corpus_name', 'nested_dir', 'topic_id', 'col_label', 'batch_instance',
                                   'batches_in_topic', 'batch_size', 'shape', 'file_type', 'path', 'exists']
        self.possible_file_types = ['.pickle']  # file types batches can be written as

    def generate(self, corpus_names=None, nested_dirs=None, col_labels=None, emb_file_type=None, batch_size=32,
                    file_type='.pickle', force_reload=False, verbose=True):
        """Generate easily loadable inputs/labels files to be fed to NN when needed

        For every (corpus, nested_dir, topic) that still misses batch files for some column label,
        the column is read from the embedding files, cut into full batches of ``batch_size`` and
        each batch is written to its own file. label_path_df is saved after every label.

        Parameters:
            corpus_names, nested_dirs, col_labels: selection, resolved by resolve_input_params
                (the input column is only added automatically when col_labels is None)
            emb_file_type: file type of the embedding files to read (default self.default_file_type)
            batch_size: number of samples per batch file
            file_type: file type of the batch files, must be in self.possible_file_types
            force_reload: if True, regenerate and overwrite existing batch files
            verbose: print progress
        Raises:
            Exception: on an unsupported file_type or when no embedding files are found
        """
        self.label_path_df = self.load_label_path_df(verbose=verbose)

        corpus_names, nested_dirs, col_labels = resolve_input_params(self.path_ret, corpus_names,
                                                                    nested_dirs, col_labels,
                                                                    input_col=self.input_col_name)

        if file_type not in self.possible_file_types:
            raise Exception(str(file_type) + " is not a valid file type")
        if emb_file_type is None:  # target file type to load from
            emb_file_type = self.default_file_type

        if verbose:
            print("Retrieving the following with batch_size(" + str(batch_size) +"): "
                  + str(", ".join(col_labels)))

        combinations = find_combinations(self.label_path_df, corpus_names, nested_dirs, col_labels,
                                         add_topics=True, col_labels_as_list=True, as_tuples=True,
                                         force_reload=force_reload, path_retriever=self.path_ret,
                                        batch_size=batch_size, file_type=file_type, exists_only=True)

        if len(combinations) == 0:
            print("Input/label combinations fully loaded")
            return

        for corpus_name, nested_dir, missing_labels, topic_id in tqdm_notebook(combinations):
            if verbose:
                print("corpus_name: " + str(corpus_name) + "\n"
                     + "nested_dir: " + str(nested_dir) + "\n"
                     + "col_labels: " + str(missing_labels) + "\n"
                     + "topic_id: " + str(topic_id))

            emb_paths, nested_dir_path = self.path_ret.get_embedding_paths(corpus_name, nested_dir,
                                                        file_type=emb_file_type, verbose=False,
                                                        return_dir_path=True, topic_ids=[topic_id])
            if len(emb_paths) == 0:
                raise Exception("No paths for " + str(corpus_name) + ", " + str(nested_dir) + ", "
                               + str(emb_file_type) + ", topic_id: " + str(topic_id))

            # load the selected labels
            loaded_labels = self.retrieve_col_data(emb_paths, missing_labels, verbose=verbose)

            # create and save batches
            for label, label_data in loaded_labels.items():
                batches = self.create_batches(label_data, batch_size)
                if len(batches) == 0:
                    # fewer samples than batch_size: no full batch, so nothing to save or index
                    if verbose:
                        print("No full batch available for " + str(label) + ", topic_id: " + str(topic_id))
                    continue
                shape = batches[0].get_shape()
                update_paths = []
                new_rows = []
                pbar = None
                if verbose:
                    print("Saving batches for " + str(label))
                    pbar = tqdm_notebook(total=len(batches))
                for index, batch in enumerate(batches):
                    # create file name for batch
                    path = self.generate_path(nested_dir_path, topic_id, index, label, file_type=file_type)
                    update_paths.append(path)
                    if not os.path.exists(path) or force_reload:  # saves resaving files
                        new_rows.append(self._build_row(corpus_name, nested_dir, topic_id, label, index,
                                                        len(batches), batch_size, shape, file_type, path, False))
                        self.save_object(batch, path, file_type)
                    if pbar is not None:
                        pbar.update()
                if pbar is not None:
                    pbar.close()
                # add all rows of this label at once instead of growing the frame row by row
                self._append_rows(new_rows)
                self.update_path_exists(update_paths)
                self.save_label_path_df()
                if verbose:
                    print(str(len(batches)) + " files saved for " + str(label))
        print("Completed generating inputs/labels")

    def get_paths(self, corpus_name, nested_dir, col_label, topic_ids=None, file_type='.pickle',
                  batch_size=32, return_shape=False):
        """Look up the batch files recorded in label_path_df for one column.

        Parameters:
            corpus_name, nested_dir, col_label: identify the column whose batch files are wanted
            topic_ids: if not None, only batch files of these topics are returned
            file_type, batch_size: only rows recorded with these values are returned
            return_shape: if True, also return the (single) batch shape recorded for the files
        Return:
            List of paths ordered by topic_id, then batch_instance; with return_shape=True a
            tuple (path_list, shape).
        Raises:
            Exception: if no rows match, or the matching rows record more than one shape
        """
        paths = self.label_path_df
        paths = paths[(paths['corpus_name'] == corpus_name)
                      & (paths['nested_dir'] == nested_dir)
                      & (paths['col_label'] == col_label)
                      & (paths['batch_size'] == batch_size)
                      & (paths['file_type'] == file_type)]
        if topic_ids is not None:
            paths = paths[paths['topic_id'].isin(topic_ids)]

        # sort column so consistent ordering
        paths = paths.sort_values(by=['topic_id', 'batch_instance'], ascending=True)
        path_list = list(paths['path'])

        shapes = list(paths['shape'].unique())
        if len(shapes) == 0:
            raise Exception("No sample files found for " + str(corpus_name) + ", " + str(nested_dir)
                            + ", " + str(col_label) + ", topic_ids: " + str(topic_ids))
        if len(shapes) > 1:
            raise Exception("Dimensions of list objects varies: " + str(shapes))
        shape = shapes[0]

        if return_shape:
            return path_list, shape
        return path_list

    def corpus_topic_ids(self, corpus_name):
        """Distinct topic ids recorded in label_path_df for the given corpus_name"""
        corpus_rows = self.label_path_df[self.label_path_df['corpus_name'] == corpus_name]
        return list(corpus_rows['topic_id'].unique())

    def load_label_path_df(self, verbose=True):
        """Read label_path_df from disk, or create an empty frame when the file does not exist"""
        if verbose:
            print("Loading label_path_df")
        if os.path.exists(self.label_path_df_path):
            return read_df_file_type(self.label_path_df_path, verbose=verbose)
        label_path_df = pd.DataFrame(columns=self.label_path_df_cols)
        if verbose:
            print("label_path_df created from scratch")
        return label_path_df

    @staticmethod
    def _build_row(corpus_name, nested_dir, topic_id, col_label, batch_instance, batches_in_topic,
                   batch_size, shape, file_type, path, exists):
        """Assemble one label_path_df row as a dict"""
        return {"corpus_name":corpus_name, "nested_dir":nested_dir, "topic_id":topic_id, "col_label":col_label,
                "batch_instance":batch_instance, "batches_in_topic":batches_in_topic, "batch_size":batch_size,
                "shape":shape, "file_type":file_type, "path":path, "exists":exists}

    def _append_rows(self, rows):
        """Append a list of row dicts to label_path_df with a single concat"""
        if len(rows) == 0:
            return
        # DataFrame.append no longer exists in current pandas; concat works on old and new versions
        self.label_path_df = pd.concat([self.label_path_df, pd.DataFrame(rows)], ignore_index=True)

    def add_path_to_df(self, corpus_name, nested_dir, topic_id, col_label, batch_instance, batches_in_topic,
                      batch_size, shape, file_type, path, exists):
        """Append a single row describing one batch file to label_path_df (not saved to disk)"""
        self._append_rows([self._build_row(corpus_name, nested_dir, topic_id, col_label, batch_instance,
                                           batches_in_topic, batch_size, shape, file_type, path, exists)])

    def save_label_path_df(self):
        """Persist label_path_df to self.label_path_df_path"""
        save_df_file_type(self.label_path_df, self.label_path_df_path, verbose=False)

    def update_path_exists(self, path):
        """Set 'exists' to True for the row(s) whose 'path' is in the given path(s)"""
        path = convert_to_list(path)
        self.label_path_df.loc[self.label_path_df['path'].isin(path), 'exists'] = True

    def create_batches(self, samples, batch_size):
        """Cut samples into consecutive tensors of exactly batch_size rows.

        Scalar samples get an extra axis so every batch has shape (batch_size, input_dim).
        A trailing remainder smaller than batch_size is dropped.

        Raises:
            Exception: if a batch does not have shape (batch_size, input_dim)
        """
        batch_list = []
        for start in range(0, len(samples), batch_size):
            end = start + batch_size
            if end > len(samples):  # leave last potential batch if doesn't divide evenly
                break
            batch_slice = samples[start:end]  # end step is exclusive
            first_sample = batch_slice[0]
            is_scalar = np.isscalar(first_sample)
            batch_slice = tf.convert_to_tensor(batch_slice)
            if is_scalar:
                input_dim = 1
                batch_slice = tf.expand_dims(batch_slice, 1)  # add dimension to get appropriate shape
            else:
                input_dim = len(first_sample)
            if batch_slice.shape != (batch_size, input_dim):
                raise Exception("Wrong shape for batch_slice: " + str(batch_slice.shape)
                               + "\nExpected shape: " + str((batch_size, input_dim)))
            batch_list.append(batch_slice)
        return batch_list

    def retrieve_col_data(self, emb_paths, col_labels, verbose=True):
        """Collect the values of each requested column across all embedding files.

        Parameters:
            emb_paths: object with a 'path' entry listing the embedding files (iterated in order)
            col_labels: columns to collect
        Return:
            Dict {col_label: list of values, concatenated in file order}
        Raises:
            ValueError: if a requested column is missing from one of the files
        """
        labels = {col_label: [] for col_label in col_labels}
        pbar = None
        if verbose:
            print("Retrieving samples from dataframes")
            pbar = tqdm_notebook(total=len(emb_paths))
        for emb_path in emb_paths['path']:
            emb_df = load_embeddings(emb_path, verbose=False)
            for col_label in col_labels:
                if col_label not in emb_df.columns:
                    raise ValueError("Target label " + str(col_label) + " is not in file at " + str(emb_path))
                # collect label values from df
                labels[col_label].extend(list(emb_df[col_label]))
            if pbar is not None:
                pbar.update()
        if pbar is not None:
            pbar.close()
        return labels

    def save_object(self, obj, path, file_type):
        """Write obj to path in the given file_type.

        Raises:
            Exception: for any file_type other than '.pickle' ('.memmap' is not implemented)
        """
        if file_type == '.pickle':
            with open(path, 'wb') as handle:
                pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)
        else:
            raise Exception("Invalid file type: " + str(file_type))

    def generate_path(self, nested_dir_path, topic_id, instance_num, col_label, file_type='.pickle'):
        """Build the path of one batch file, creating the samples directory if needed.

        The file name is <col_label>_<topic_id>_<instance_num><file_type>.
        """
        filename = str(col_label) + "_" + str(topic_id) + "_" + str(instance_num) + str(file_type)
        dir_path = os.path.join(nested_dir_path, self.sample_dir_name)
        os.makedirs(dir_path, exist_ok=True)
        return os.path.join(dir_path, filename)

    def delete_files(self):
        """Delete every batch file listed in label_path_df and drop their rows, then save the df"""
        paths = list(self.label_path_df['path'])
        print("Deleting " + str(len(paths)) + " paths")
        for path in tqdm_notebook(paths):
            if os.path.exists(path):
                os.remove(path)
        # keep the filtered frame; the result of the selection was previously thrown away
        self.label_path_df = self.label_path_df[~self.label_path_df['path'].isin(paths)]
        self.save_label_path_df()
        print("deleted")


## Training/Tuning Driver

In [8]:
class NNTrainer:
    """Driver that generates batch files, tunes one network per dataset/label and records it.

    Tuned networks are indexed in ``nn_path_df`` (nn_path_df.hdf under the model save directory),
    which is also what decides whether a (corpus, nested_dir, col_label) still needs tuning.
    """
    def __init__(self, proj_dir=None, nn_base_save_dir_name=None):
        """
        Parameters:
            proj_dir: project root; a hard-coded default is used when None
            nn_base_save_dir_name: directory (inside proj_dir) for tuner output,
                                   "summarization_models" when None
        """
        if proj_dir is None:
            self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
        else:
            self.proj_dir = proj_dir
        self.input_handler = InputLabelHandler(self.proj_dir)
        self.nn_base_save_dir_name = nn_base_save_dir_name
        if self.nn_base_save_dir_name is None:
            self.nn_base_save_dir_name = "summarization_models"
        self.nn_base_save_dir_path = os.path.join(self.proj_dir, self.nn_base_save_dir_name)
        self.nn_path_df_name = "nn_path_df.hdf"
        self.nn_path_df_path = os.path.join(self.nn_base_save_dir_path, self.nn_path_df_name)
        self.nn_path_df_cols = ['corpus_name', 'nested_dir', 'col_label', 'dir_path']
        self.default_test_topics = [1,2,3,4,5,6,8,9,10]
        self.default_train_ratio = 0.8
        self.min_train_ratio = 0.5

    def train(self, corpus_names=None, nested_dirs=None, col_labels=None, tuning_iterations=100,
              train_topics = None, test_topics=None, input_col_name="embedding", batch_size=32,
              train_ratio=None, sample_file_type='.pickle', force_reload=False, verbose=True):
        """
        1. Generate Data if needed
        2. Determine combinations to try
        3. Load combination
        4. Tune a network on the train topics
        5. Save tuned network and database entry

        Parameters:
            corpus_names, nested_dirs, col_labels: dataset selection (None = everything / defaults)
            tuning_iterations: Hyperband iterations per network
            train_topics, test_topics: explicit topic split (see resolve_topics_per_corpus)
            input_col_name: column used as network input
            batch_size: samples per batch file and per training batch
            train_ratio: fallback train fraction, must be >= self.min_train_ratio
            sample_file_type: file type of the batch files
            force_reload: regenerate data and retune even if already present
            verbose: print progress
        Raises:
            Exception: if train_ratio is below the minimum
        """
        self.nn_path_df = self.load_nn_path_df(verbose=verbose)

        # explicitly chosen labels do not include the input column, so add it for generation
        generate_labels = col_labels
        if col_labels is not None and input_col_name not in col_labels:
            generate_labels = list(col_labels) + [input_col_name]

        # generate data with the same batch size / file type that is looked up below
        self.input_handler.generate(corpus_names=corpus_names, nested_dirs=nested_dirs,
                                    col_labels=generate_labels, batch_size=batch_size,
                                    file_type=sample_file_type, force_reload=force_reload, verbose=verbose)

        # get our dataset identifiers, used to load correct inputs/labels
        corpus_names, nested_dirs, col_labels = resolve_input_params(self.input_handler.path_ret,
                                                                     corpus_names, nested_dirs, col_labels)

        # resolve train/test split
        if train_ratio is None:
            train_ratio = self.default_train_ratio
        elif train_ratio < self.min_train_ratio:
            raise Exception("Train ratio must be at least 0.5")
        corpus_topics = self.resolve_topics_per_corpus(corpus_names, train_topics, test_topics, train_ratio)

        combinations = find_combinations(self.nn_path_df, corpus_names, nested_dirs, col_labels, add_topics=False,
                                        col_labels_as_list=False, as_tuples=True, force_reload=force_reload)
        if len(combinations) == 0:
            print("All neural networks have previously been tuned")
            return

        for corpus_name, nested_dir, col_label in tqdm_notebook(combinations):
            corpus_train_topics = corpus_topics[corpus_name]['train']
            if verbose:
                print("corpus_name: " + str(corpus_name) + "\n"
                     + "nested_dir: " + str(nested_dir) + "\n"
                     + "X_input: " + str(input_col_name) + "\n"
                     + "y_labels: " + str(col_label) + "\n"
                     + "train_topics: " + str(corpus_train_topics))

            # get paths for inputs and the shape of one input batch
            X_paths, batch_shape = self.input_handler.get_paths(corpus_name, nested_dir,
                                        input_col_name, topic_ids=corpus_train_topics,
                                        file_type=sample_file_type, batch_size=batch_size,
                                        return_shape=True)
            # get paths for labels
            y_paths = self.input_handler.get_paths(corpus_name, nested_dir, col_label,
                                        topic_ids=corpus_train_topics, file_type=sample_file_type,
                                        batch_size=batch_size, return_shape=False)

            # ensure matching path lists
            if len(X_paths) != len(y_paths):
                print("Length of X and y paths do not match: " + str(len(X_paths)) + " / " + str(len(y_paths)))

            # create a generator to feed NN samples/batches
            batch_generator = BatchGenerator(X_paths, y_paths, batch_size, sample_file_type)

            save_dir, save_name = self.generate_nn_save_path(corpus_name, nested_dir, col_label,
                                                            create_dir=True)
            # The recorded shape is (batch_size, input_dim); the Keras InputLayer takes the
            # per-sample shape plus a separate batch_size, so drop the leading batch axis.
            input_shape = tuple(batch_shape[1:])
            tuner = NNTuner(save_dir, save_name, input_shape, tuning_iterations=tuning_iterations,
                            force_reload=force_reload, batch_size=batch_size)

            best_model_path = os.path.join(save_dir, "best_model")
            best_model, best_hyperparams = tuner.search(batch_generator, save_path=best_model_path,
                         return_hyperparams=True)

            self.add_to_nn_path_df(corpus_name, nested_dir, input_col_name, col_label, save_dir,
                                   save_name, best_hyperparams, batch_size, best_model_path,
                                   verbose=verbose)
        print("Finished tuning neural networks")

    def resolve_topics_per_corpus(self, corpus_names, train_topics, test_topics, train_ratio):
        """Resolve the train/test topics for each corpus.

        Test topics default to self.default_test_topics and train topics to every corpus topic
        that is not a test topic. If the train topics cover less than self.min_train_ratio of the
        corpus topics, the split is replaced: the first topics (in the order reported by the input
        handler) become the test set and the remaining ``train_ratio`` share the train set.

        Return:
            Dict {corpus_name: {'train': [topic_id, ...], 'test': [topic_id, ...]}}
        Raises:
            Exception: if a corpus has no topics, or train and test topics overlap
        """
        corpus_topics_dict = defaultdict(dict)
        for corpus_name in corpus_names:
            corpus_topics = list(self.input_handler.corpus_topic_ids(corpus_name))
            if len(corpus_topics) == 0:
                raise Exception("No topics found for corpus: " + str(corpus_name))

            corp_test = self.default_test_topics if test_topics is None else test_topics
            if train_topics is None:
                corp_train = [x for x in corpus_topics if x not in corp_test]
            else:
                corp_train = train_topics

            # get rid of repeats
            corp_test, corp_train = list(set(corp_test)), list(set(corp_train))

            # share of the corpus that is used for training
            cur_train_ratio = len(corp_train) / len(corpus_topics)
            if cur_train_ratio < self.min_train_ratio:
                # set to train_ratio instead
                num_train = math.floor(len(corpus_topics) * train_ratio)
                num_test = len(corpus_topics) - num_train
                corp_test = corpus_topics[0:num_test]
                corp_train = corpus_topics[num_test:]

            # check for overlap in train/test topics
            if not set(corp_test).isdisjoint(corp_train):
                raise Exception("Train and test sets contain overlapping topic_ids\nTrain: " + str(corp_train)
                               +"\nTest: " + str(corp_test))

            corpus_topics_dict[corpus_name]['train'] = corp_train
            corpus_topics_dict[corpus_name]['test'] = corp_test
        return corpus_topics_dict

    def load_nn_path_df(self, verbose=True):
        """Read nn_path_df from disk, or create an empty frame when the file does not exist"""
        if os.path.exists(self.nn_path_df_path):
            # read from the file path (self.nn_path_df itself is not loaded yet at this point)
            nn_path_df = read_df_file_type(self.nn_path_df_path, verbose=verbose)
        else:
            nn_path_df = pd.DataFrame(columns=self.nn_path_df_cols)
            if verbose:
                print("nn_path_df created from scratch")
        return nn_path_df

    def add_to_nn_path_df(self, corpus_name, nested_dir, input_col_name, label_col_name, tuner_dir, tuner_name,
                          best_hyperparams, batch_size, best_model_path, verbose=True):
        """Append one tuned network to nn_path_df and save the frame to disk.

        'col_label' and 'dir_path' duplicate 'label_col_names' and 'tuner_dir': they are the
        columns declared in nn_path_df_cols, and 'col_label' is what find_combinations filters on
        to recognise networks that were already tuned.
        """
        row = {"corpus_name":corpus_name, "nested_dir":nested_dir, "col_label":label_col_name,
               "dir_path":tuner_dir, "input_col_name":input_col_name,
               "label_col_names":label_col_name, "tuner_dir":tuner_dir, "tuner_name":tuner_name,
               "best_hyperparams":best_hyperparams, "batch_size":batch_size, "best_model_path":best_model_path}
        # DataFrame.append no longer exists in current pandas; concat works on old and new versions
        self.nn_path_df = pd.concat([self.nn_path_df, pd.DataFrame([row])], ignore_index=True)
        save_df_file_type(self.nn_path_df, self.nn_path_df_path, verbose=verbose)

    def generate_nn_save_path(self, corpus_name, nested_dir, col_labels, create_dir=True):
        """Build <base>/<corpus_name>/<nested_dir>/<labels joined by '_'> for the tuner output.

        Return:
            (dir_path, save_name); the directory is created when create_dir is True
        """
        col_dir = "_".join(convert_to_list(col_labels))
        dir_path = os.path.join(self.nn_base_save_dir_path, corpus_name, nested_dir, col_dir)
        if create_dir:
            os.makedirs(dir_path, exist_ok=True)
        # generate name
        save_name = "tuner_proj"
        return dir_path, save_name

In [None]:
"""

https://numpy.org/doc/stable/reference/generated/numpy.memmap.html#numpy.memmap
memmap might provide way to access segments of an array from a binary file

numpy also provides functions to read/save individual arrays to text files (could be slow)
https://numpy.org/doc/stable/reference/generated/numpy.savetxt.html#numpy.savetxt

can save individual ndarrays as binary files  - might have an issue with precision, need to check
https://numpy.org/doc/stable/reference/generated/numpy.ndarray.tofile.html#numpy.ndarray.tofile

might have issues with it being ndarray and not np.array, difference?

"""


# Dataset and sample format used for this tuning run
sample_file_type = '.pickle'
corpus_names = ["mine-trects-kba2014-filtered"]

# Trainer with the default project directory; generates missing batch files, then tunes
trainer = NNTrainer()
trainer.train(
    corpus_names=corpus_names,
    sample_file_type=sample_file_type,
    force_reload=False,
    verbose=True,
)

nn_path_df created from scratch
Loading label_path_df
loaded from .hdf file
Retrieving the following with batch_size(32): cosine_similarity, cos_sim_nearest_nug, embedding


HBox(children=(IntProgress(value=0, max=73), HTML(value='')))

corpus_name: mine-trects-kba2014-filtered
nested_dir: stsb-roberta-base
col_labels: ['cosine_similarity', 'cos_sim_nearest_nug', 'embedding']
topic_id: 22
Retrieving samples from dataframes


HBox(children=(IntProgress(value=0, max=50), HTML(value='')))

Saving batches for cosine_similarity


HBox(children=(IntProgress(value=0, max=76155), HTML(value='')))

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['corpus_name', 'nested_dir', 'topic_id', 'col_label', 'batch_instance', 'batches_in_topic', 'batch_size', 'shape', 'file_type', 'path', 'exists']]

  pytables.to_hdf(path_or_buf, key, self, **kwargs)


76155 files saved for cosine_similarity
Saving batches for cos_sim_nearest_nug


HBox(children=(IntProgress(value=0, max=76155), HTML(value='')))

In [None]:
# class NNTrainer:
#     def __init__(self, proj_dir=None):
#         if proj_dir is None:
#             self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
#         else:
#             self.proj_dir = proj_dir
#         self.paths = PathRetriever(self.proj_dir)
#         self.model_path = self.proj_dir + "/" + "test_nn"
        
#     def train(self, force_reload=False):
#         print("Creating NN")
#         if os.path.exists(self.model_path) and not force_reload:
#             nn = tf.keras.models.load_model(self.model_path)
#             print("loaded from file at " + str(self.model_path))
#         else:
#             tuner = NNTuner()
#             print("Getting X,y")
#             x,y = self.get_x_y()
#             print("Fitting NN")
#             nn = tuner.search(inputs=x,labels=y, save_path=self.model_path)
#             print("Completed fitting")
#         print("Comparing predictions")
#         results = self.compare_predict(nn)
#         print(self.format_results(results))
        
#     def compare_predict(self, nn):
#         knear = KNearest()
#         k_sents = knear.get_k_nearest(k=20)
#         results = []
        
#         embs = []
#         for index, sent in k_sents.iterrows():
#             emb = sent['embedding']
#             embs.append(emb)
#             result = [sent['cosine_similarity']]
#             results.append(result)
#         embs = np.asarray(embs)  # turn into matrix
#         preds = nn.predict(embs)
#         for result, pred in zip(results, preds):
#             result.append(pred)
        
# #         for index, sent in k_sents.iterrows():
# #             emb = sent['embedding']
# # #             emb = np.expand_dims(emb, axis=1)
# #             emb = (emb,)
# # #             print("shape emb: " + str(emb.shape))
# #             results[str(emb)] = []
# #             results[str(emb)].append(sent['cosine_similarity'])  # actual
# #             pred = nn.predict(emb)
# #             results[embedding].append(pred)  # prediction
        
#         return results
    
#     def format_results(self, results):
#         outstr = ""
#         for result in results:
#             outstr += "actual: " + str(result[0]) + "\n"
#             outstr += "pred: " + str(result[1]) + "\n"
#             outstr += "\n"
# #             outstr += str(emb) + "\n"
# #             outstr += "actual: " + str(result[0]) + "\n"
# #             outstr += "pred: " + str(result[1]) + "\n"
# #             outstr += "\n"
#         return outstr
        
#     def get_x_y(self):
#         corpus_name = "original-trects-kba2014-filtered"
#         nested_dir = 'distilbert-base-nli-stsb-mean-tokens'
#         x_y_paths = self.paths.get_embedding_paths(corpus_name, nested_dir)
#         x = []
#         y = []
#         for path in tqdm_notebook(list(x_y_paths['path'])):
#             emb_df = load_embeddings(path, verbose=False)
#             emb_x = list(emb_df['embedding'])
#             emb_y = list(emb_df['cosine_similarity'])
# #             # keras compatibility wrap singular floats in ndarrays
#             emb_y = [np.asarray(num) for num in emb_y]
#             x.extend(emb_x)
#             y.extend(emb_y)
#         x = np.asarray(x)
#         y = np.asarray(y)
#         return x, y

In [None]:
# trainer = NNTrainer()
# trainer.train(force_reload=True)

In [None]:
# class NeuralNetwork:
#     def __init__(self, nn_config, learning_rate=0.0001, input_dim=768, output_dim=1, loss_func="huber",
#                 epochs=10):
#         # nn structure params
#         self.nn_config = nn_config  # currently just list of layer sizes, can expand to include diff types layers
#         self.learning_rate = learning_rate
#         self.input_dim = input_dim
#         self.output_dim = output_dim
#         self.loss_func = loss_func
        
#         # nn fit execution params
#         self.epochs = epochs
        
#         self.model = self.build_model()
        
#     def build_model(self):
#         model = Sequential()
#         ilayer = InputLayer(input_shape=(self.input_dim,))
#         model.add(ilayer)
#         for num_neurons in self.nn_config:
#             # GRU has better memory performance
#             # use tanh bc cos similarity is between -1 and 1
#             model.add(Dense(num_neurons, activation='tanh'))  
#         # output layer
# #         model.add(Dense(self.output_dim, activation='tanh'))
#         # build model
#         opt = keras.optimizers.Adam(learning_rate=self.learning_rate)
#         model.compile(loss=self.loss_func, optimizer=opt)
#         return model
    
#     def fit(self, X, y=None, save_path=None):
#         if y is not None:
#             self.model.fit(x=X, y=y, epochs=self.epochs, verbose=1,
#                           use_multiprocessing=True, workers=32)
#         else:
#             self.model.fit(x=X, epochs=self.epochs, verbose=1,
#                           use_multiprocessing=True, workers=32)
#         if save_path is not None:
#             self.model.save(save_path)
    
#     def predict(self, s, a=None):              
#         if a==None:            
#             return self._predict_nn(s)
#         else:                        
#             return self._predict_nn(s)[a]
        
#     def _predict_nn(self,state_hat):                          
#         """
#         Predict the output of the neural network (note: these can be vectors)
#         """                
#         x = self.model.predict(state_hat)                                                    
#         return x

In [None]:
# class NNTrainer:
#     def __init__(self, proj_dir=None):
#         if proj_dir is None:
#             self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
#         else:
#             self.proj_dir = proj_dir
#         self.paths = PathRetriever(self.proj_dir)
#         self.nn_config = [752, 128]
#         self.model_path = self.proj_dir + "/" + "test_nn"
        
#     def train(self, force_reload=False):
#         print("Creating NN")
#         if os.path.exists(self.model_path) and not force_reload:
#             nn = tf.keras.models.load_model(self.model_path)
#             print("loaded from file at " + str(self.model_path))
#         else:
#             nn = HyperNNs(self.nn_config, epochs=1)
#             print("Getting X,y")
#             x,y = self.get_x_y()
#             print("Fitting NN")
#             nn.fit(x,y=y, save_path=self.model_path)
#             print("Completed fitting")
#         print("Comparing predictions")
#         results = self.compare_predict(nn)
#         print(self.format_results(results))
        
#     def compare_predict(self, nn):
#         knear = KNearest()
#         k_sents = knear.get_k_nearest(k=20)
#         results = []
        
#         embs = []
#         for index, sent in k_sents.iterrows():
#             emb = sent['embedding']
#             embs.append(emb)
#             result = [sent['cosine_similarity']]
#             results.append(result)
#         embs = np.asarray(embs)  # turn into matrix
#         preds = nn.predict(embs)
#         for result, pred in zip(results, preds):
#             result.append(pred)
        
# #         for index, sent in k_sents.iterrows():
# #             emb = sent['embedding']
# # #             emb = np.expand_dims(emb, axis=1)
# #             emb = (emb,)
# # #             print("shape emb: " + str(emb.shape))
# #             results[str(emb)] = []
# #             results[str(emb)].append(sent['cosine_similarity'])  # actual
# #             pred = nn.predict(emb)
# #             results[embedding].append(pred)  # prediction
        
#         return results
    
#     def format_results(self, results):
#         outstr = ""
#         for result in results:
#             outstr += "actual: " + str(result[0]) + "\n"
#             outstr += "pred: " + str(result[1]) + "\n"
#             outstr += "\n"
# #             outstr += str(emb) + "\n"
# #             outstr += "actual: " + str(result[0]) + "\n"
# #             outstr += "pred: " + str(result[1]) + "\n"
# #             outstr += "\n"
#         return outstr
        
#     def get_x_y(self):
#         corpus_name = "original-trects-kba2014-filtered"
#         nested_dir = 'distilbert-base-nli-stsb-mean-tokens'
#         x_y_paths = self.paths.get_embedding_paths(corpus_name, nested_dir)
#         x = []
#         y = []
#         for path in tqdm_notebook(list(x_y_paths['path'])):
#             emb_df = load_embeddings(path, verbose=False)
#             emb_x = list(emb_df['embedding'])
#             emb_y = list(emb_df['cosine_similarity'])
# #             # keras compatibility wrap singular floats in ndarrays
#             emb_y = [np.asarray(num) for num in emb_y]
#             x.extend(emb_x)
#             y.extend(emb_y)
#         x = np.asarray(x)
#         y = np.asarray(y)
#         return x, y

## Simple K-Nearest

In [None]:
class KNearest:
    """Select the rows with the highest stored ``cosine_similarity``.

    Embedding files are located through ``PathRetriever`` and read with
    ``load_embeddings``; both are defined elsewhere in this notebook.
    """

    def __init__(self, proj_dir=None):
        """Create the path retriever for the project.

        Args:
            proj_dir: project root handed to ``PathRetriever``. When None, the
                default '/nfs/proj-repo/AAARG-dissertation' is used.
        """
        # Always assign self.proj_dir: it used to be set only when proj_dir was
        # None, so an explicit directory raised AttributeError on the next line.
        self.proj_dir = '/nfs/proj-repo/AAARG-dissertation' if proj_dir is None else proj_dir
        self.paths = PathRetriever(self.proj_dir)

    def get_k_nearest(self, k=10, topic_ids=None):
        """Return the ``k`` rows with the largest ``cosine_similarity``.

        Args:
            k: number of rows to return.
            topic_ids: forwarded to ``get_emb_paths``; None (the default)
                keeps the previous behaviour of applying no topic restriction
                at this level.

        Returns:
            pandas.DataFrame: result of ``nlargest(k)`` on the concatenation of
            all loaded embedding frames.

        Raises:
            ValueError: if no embedding paths were returned, so there is
                nothing to concatenate.
        """
        emb_paths = self.get_emb_paths(topic_ids=topic_ids)

        # Read each file exactly once (the previous loop loaded every file twice).
        frames = [load_embeddings(path, verbose=False)
                  for path in tqdm_notebook(list(emb_paths['path']))]
        if not frames:
            # pd.concat would raise a ValueError here anyway; give a clearer message
            raise ValueError("No embedding files found; cannot compute k nearest")
        emb_df = pd.concat(frames, ignore_index=True)

        return emb_df.nlargest(k, columns=['cosine_similarity'])

    def get_emb_paths(self, topic_ids=None):
        """Look up the embedding file paths for the fixed corpus / model pair.

        Args:
            topic_ids: forwarded unchanged to ``PathRetriever.get_embedding_paths``.

        Returns:
            Whatever ``get_embedding_paths`` returns; ``get_k_nearest`` reads
            its 'path' column.
        """
        corpus_name = "original-trects-kba2014-filtered"
        nested_dir = 'distilbert-base-nli-stsb-mean-tokens'
        return self.paths.get_embedding_paths(corpus_name, nested_dir, topic_ids=topic_ids)

In [None]:
# Use the default project directory and display the rows with the highest
# cosine_similarity (get_k_nearest defaults to k=10).
knear = KNearest()
knear.get_k_nearest()

In [None]:
# Scratch check of how adding a dimension changes an array's shape.
arr = np.array([1,5,6,2,2])
# NumPy has no `expand_dimensions`; the function is `np.expand_dims`, which
# needs an explicit axis. axis=0 prepends a length-1 dimension: (5,) -> (1, 5).
arr = np.expand_dims(arr, axis=0)
print(arr.shape)