# Notebook for Generating Summaries

In [1]:
# temp while cluster is full
# !pip install keras-tuner
# !pip install tables

In [2]:
import os
# Thread budget for the notebook: exported to numexpr below and also passed as
# `workers` to the tuner in NNTuner.search.
num_threads = 32
# Set before pandas/numpy are imported further down — presumably so numexpr reads
# the limit when it is first loaded; TODO confirm.
os.environ['NUMEXPR_MAX_THREADS'] = str(num_threads)

import pandas as pd
import numpy as np
import pickle
import copy
import math
from collections import defaultdict
from tqdm import tqdm
from tqdm import tqdm_notebook
from sentence_transformers import SentenceTransformer
import ipynb.fs

from sklearn.neighbors import KDTree

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import LSTM

import kerastuner as kt
from kerastuner.engine.hyperparameters import HyperParameters
from kerastuner.tuners import Hyperband

In [3]:
# Column labels that resolve_input_params falls back to when no col_labels are given.
default_col_labels = ['cosine_similarity', 'cos_sim_nearest_nug']
# Name of the default input column (not referenced by the functions visible in this notebook section).
default_input_col = "embedding"

## Keras NN Model

In [4]:
class NNTuner:
    """Hyperband hyper-parameter search over small fully-connected Keras networks.

    Attributes:
        input_shape: shape handed to the model's InputLayer.
        batch_size: batch size handed to the model's InputLayer.
        models: every model that build_model has constructed so far.
        tuner: the underlying keras-tuner Hyperband instance.
    """
    def __init__(self, save_dir, save_name, input_shape, tuning_iterations=50, batch_size=32, force_reload=False):
        """Create the Hyperband tuner.

        Args:
            save_dir: passed to Hyperband as `directory`.
            save_name: passed to Hyperband as `project_name`.
            input_shape: input shape of the networks that are built.
            tuning_iterations: passed to Hyperband as `hyperband_iterations`.
            batch_size: batch size of the networks' input layer.
            force_reload: passed to Hyperband as `overwrite`; when False the tuner
                is expected to resume from what is stored under save_dir/save_name
                (see the keras-tuner Tuner documentation).
        """
        self.input_shape = input_shape
        self.batch_size = batch_size
        self.models = []
        self.tuner = Hyperband(self.build_model,
                               objective='mean_squared_error',
                               max_epochs=25,
                               hyperband_iterations=tuning_iterations,
                               directory=save_dir,
                               project_name=save_name,
                               overwrite=force_reload)

    def build_model(self, hp):
        """Build and compile one candidate network from the hyper-parameters in `hp`.

        The search space is: 1-4 Dense layers, each with its own width and
        activation, the Adam learning rate, and the loss function.

        Returns:
            The compiled Sequential model (also appended to self.models).
        """
        model = Sequential()
        model.add(InputLayer(input_shape=self.input_shape, batch_size=self.batch_size))
        for i in range(hp.Int('num_layers', min_value=1, max_value=4)):
            layer_units = hp.Int('units_' + str(i), min_value=1, max_value=1024, step=32)
            layer_activation = hp.Choice('activ_' + str(i), ['relu', 'tanh', 'sigmoid'])
            model.add(Dense(units=layer_units, activation=layer_activation))
        opt = tf.keras.optimizers.Adam(
                learning_rate=hp.Float('learning_rate', min_value=0.00001, max_value=0.1))
        losses = hp.Choice('loss_func', ['MSE', 'huber', 'binary_crossentropy', 'categorical_crossentropy'])
        # mean_squared_error is tracked as a metric because it is the tuner's objective
        model.compile(optimizer=opt, loss=losses, metrics=['mean_squared_error'])
        self.models.append(model)
        return model

    def search(self, batch_generator, save_path=None, return_hyperparams=False):
        """Run the hyper-parameter search and return the best model found.

        Args:
            batch_generator: data source passed to the tuner as `x`.
            save_path: when given, the top-ranked model is saved to this path.
            return_hyperparams: when True, also return the best hyper-parameters.

        Returns:
            The list returned by `tuner.get_best_models(num_models=1)`, or a
            `(best_models, hyperparams)` tuple when return_hyperparams is True.

        Raises:
            RuntimeError: if save_path is given but the tuner produced no model.
        """
        self.tuner.search(x=batch_generator, verbose=1, use_multiprocessing=False, workers=num_threads)
        best_models = self.tuner.get_best_models(num_models=1)
        if save_path is not None:
            if len(best_models) == 0:
                raise RuntimeError("Tuner returned no model to save to " + str(save_path))
            # get_best_models returns a list; persist its single (top-ranked) entry
            best_models[0].save(save_path)
        if return_hyperparams:
            hyperparams = self.tuner.get_best_hyperparameters(num_trials=1)
            return best_models, hyperparams
        return best_models
    

# from collections import OrderedDict
from collections import deque

class BatchGenerator(keras.utils.Sequence):
    """Keras Sequence that serves one batch per entry of X_paths / y_paths.

    Intended for datasets that are too large to load into memory at once.
    X_paths and y_paths must have the same length; that check is expected to
    be done by the caller before constructing the generator.

    https://stanford.edu/~shervine/blog/keras-how-to-generate-data-on-the-fly
    """
    def __init__(self, X_paths, y_paths, batch_size, file_type):
        """
        Args:
            X_paths: indexable collection of inputs; file paths for '.pickle',
                or an already indexable array-like for '.memmap'.
            y_paths: same as X_paths, for the labels.
            batch_size: stored on the instance; None is treated as 1.
            file_type: '.pickle' or '.memmap', selects how a batch is loaded.
        """
        super().__init__()
        self.batch_size = 1 if batch_size is None else batch_size
        self.X_paths = X_paths
        self.y_paths = y_paths
        self.file_type = file_type

    def __len__(self):
        """Denotes the number of batches per epoch"""
        return len(self.X_paths)

    def __getitem__(self, idx):
        """Generates one batch of data as an (inputs, labels) pair"""
        inputs = self.load_samples(self.X_paths, idx)
        labels = self.load_samples(self.y_paths, idx)
        return inputs, labels

    def load_samples(self, path, index):
        """Load the batch stored at position `index` of `path`.

        Raises:
            Exception: if self.file_type is neither '.pickle' nor '.memmap'.
        """
        if self.file_type == '.pickle':
            # NOTE(review): pickle.load executes arbitrary code from the file;
            # only use this with files this project has written itself.
            with open(path[index], 'rb') as handle:
                return pickle.load(handle)
        if self.file_type == '.memmap':
            # the collection is already indexable, so the batch is read directly
            return path[index]
        raise Exception("Invalid file type: " + str(self.file_type))

## Generating the samples 

In [5]:
from .defs.corpus_loader import PathRetriever, load_embeddings, load_topics, read_df_file_type, save_df_file_type
from .defs.corpus_loader import convert_to_list

In [6]:
def resolve_input_params(path_ret, corpus_names, nested_dirs, col_labels, input_col=None):
    """Fill in defaults for the parameters that select which data to load/generate.

    Returns a (corpus_names, nested_dirs, col_labels) tuple in which nested_dirs
    is always a dict mapping each corpus name to a list of nested directories.
    A dict passed in as nested_dirs is returned untouched, so the output of this
    function can be fed back into it.

    Raises an Exception when no corpus is available or when no nested directory
    matches the selection.
    """
    # corpus names: fall back to everything the retriever knows about
    if corpus_names is None:
        corpus_names = path_ret.get_corpus_names()
        if not len(corpus_names):
            raise Exception("There are no corpuses to load from")

    # column labels: defaults plus the optional input column
    if col_labels is None:
        col_labels = list(default_col_labels)
        if input_col is not None:
            col_labels.append(input_col)

    # nested dirs: build the per-corpus mapping unless one was passed in already
    if type(nested_dirs) != dict:
        selection = nested_dirs
        per_corpus = {}
        for name in corpus_names:
            available = path_ret.get_nested_dirs(name, "embeddings")
            if selection is not None:
                # keep only the requested nested dirs for this corpus
                available = [entry for entry in available if entry in selection]
            per_corpus[name] = available
        nested_dirs = per_corpus

    # at least one corpus must have something left to work on
    if all(len(dirs) == 0 for dirs in nested_dirs.values()):
        raise Exception("There are no nested_dirs matching the selection")
    return corpus_names, nested_dirs, col_labels

def corpus_name_topic_ids(path_retriever, corpus_name):
    """Return the distinct values of the 'id' column of the corpus' topic file, as a list."""
    topics = load_topics(path_retriever.get_topic_path(corpus_name, verbose=False), verbose=False)
    return list(topics['id'].unique())

def find_combinations(path_df, corpus_names, nested_dirs, col_labels, add_topics=False, col_labels_as_list=False,
                      as_tuples=True, force_reload=False, path_retriever=None, batch_size=None, file_type=None,
                     exists_only=False):
    """Find the combinations that have not been generated/trained already in path_df

    Tuple ordering: (corpus_name, nested_dir, col_label/[col_labels], **topic_id**)

    Args:
        path_df: DataFrame of known entries; must have 'corpus_name', 'nested_dir'
            and 'col_label' columns ('topic_id' as well when add_topics is True).
        corpus_names: corpus names to build combinations for.
        nested_dirs: dict mapping each corpus name to its nested directories.
        col_labels: column labels to build combinations for.
        add_topics: when True, a combination is built per topic id of the corpus.
        col_labels_as_list: when True, the missing labels are grouped into one
            list per combination instead of one combination per label.
        as_tuples: when True the combinations are returned as tuples, else lists.
        force_reload: when True, every label is treated as missing.
        path_retriever: PathRetriever used to look up topic ids (add_topics only).
        batch_size: when given, only path_df rows with this 'batch_size' count.
        file_type: when given, only path_df rows with this 'file_type' count.
        exists_only: when True, only path_df rows whose 'exists' is True count.

    Returns:
        List of combinations that are missing from (the filtered) path_df.

    Raises:
        Exception: if add_topics is True but no path_retriever was given.
    """
    if exists_only:
        path_df = path_df[path_df['exists'] == True]  # checking of path_df is only concerned with existing files
    if batch_size is not None:
        path_df = path_df[path_df['batch_size'] == batch_size]
    if file_type is not None:
        path_df = path_df[path_df['file_type'] == file_type]

    topic_ids = {}
    if add_topics:  # find topic_ids for each corpus
        for corpus_name in corpus_names:
            if path_retriever is None:
                raise Exception("If add_topics is True then path_retriever must be set to an instance of PathRetriever")
            topic_ids[corpus_name] = corpus_name_topic_ids(path_retriever, corpus_name)

    combinations = []
    for corpus_name in corpus_names:
        for nested_dir in nested_dirs[corpus_name]:
            combo_path = path_df[(path_df['corpus_name'] == corpus_name)
                                 & (path_df['nested_dir'] == nested_dir)]
            combo = [corpus_name, nested_dir]
            if add_topics:
                # collect, per topic id, the labels that have no entry yet
                missing_by_topic = defaultdict(list)
                for label in col_labels:
                    for topic_id in topic_ids[corpus_name]:
                        has_entry = ((combo_path['col_label'] == label)
                                     & (combo_path['topic_id'] == topic_id)).any()
                        if force_reload or not has_entry:
                            missing_by_topic[topic_id].append(label)
                for topic_id, labels in missing_by_topic.items():
                    if col_labels_as_list:  # single combination holding all missing labels
                        combinations.append(combo + [labels, topic_id])
                    else:  # one combination per missing label, label placed before the topic id
                        combinations.extend(combo + [label, topic_id] for label in labels)
            else:
                if force_reload:
                    add_labels = copy.deepcopy(col_labels)  # force_reload add all labels
                else:  # find which col_labels don't exist already
                    exist_labels = list(combo_path['col_label'].unique())
                    add_labels = [x for x in col_labels if x not in exist_labels]
                if col_labels_as_list:  # single combination holding all missing labels
                    combinations.append(combo + [add_labels])
                else:
                    combinations.extend(combo + [add_label] for add_label in add_labels)

    if as_tuples:
        combinations = [tuple(x) for x in combinations]
    return combinations

In [70]:
#                             row = {'corpus_name':corpus_name, 'nested_dir':nested_dir, 'col_label':col_label, 
#                                    'path':path, 'dtype':dtype, 'vector_len':vector_len, 'total_nums':total_nums, 
#                                    'offset_step':offset_step, 'topic_ids':self.topic_ids_str(topic_ids), 
#                                    'history_path':history_path, 'complete':False}
#                             self.path_df = self.path_df.append(row, ignore_index=True)

import pprint
from collections import defaultdict

class MemmapGenerator:
    def __init__(self, proj_dir):
        if proj_dir is None:
            self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
        else:
            self.proj_dir = proj_dir
        self.default_file_type = ".hdf"
        self.path_ret = PathRetriever(proj_dir)
        self.path_df_cols = ['corpus_name', 'nested_dir', 'col_label', 'path', 'dtype', 'vector_len', 
                             'total_nums', 'offset_step', 'topic_ids', 'complete']
        self.dataset_dir = self.path_ret.path_handler.dataset_dir
        self.path_df_path = os.path.join(self.dataset_dir, "memmap_paths.hdf")
        self.path_df = self.load_path_df()
        self.order = 'C'
        
        
    def create_maps(self, corpus_name, nested_dir, col_labels, topic_ids, verbose=True, force_reload=False):
        # check if already completed
        path_slice = self.slice_path_df(corpus_name, nested_dir, topic_ids)
        emb_paths, nested_dir_path = self.path_ret.get_embedding_paths(corpus_name, nested_dir, 
                                                                file_type=self.default_file_type, verbose=False, 
                                                                return_dir_path=True, topic_ids=topic_ids)
        emb_paths = list(emb_paths['path'])
        # load partial information on maps that need completed
        meta_dict = self.create_meta_dict(path_slice, corpus_name, nested_dir, col_labels, 
                                          self.topic_ids_str(topic_ids), nested_dir_path, force_reload=force_reload)
        
        if len(meta_dict) > 0:
            if verbose:
                print("Creating memmaps for " + str(", ".join(col_labels)))
            # debug vars
            count = 0
            count_lim = 3
            nums_to_add = defaultdict(list)
            total_nums_d = defaultdict(list)
            
#             mmap_dict = {}
            for emb_path in tqdm_notebook(emb_paths):
                print("loop count: " + str(count))
                if count >= count_lim:
                    break
                # get the cols that haven't been loaded for this path
                # scrape data from dataframe
                label_data = self.scrape_col_data(emb_path, meta_dict.keys())
                # add data to memmap
                for col_label, data in label_data.items():
                    print("processing " + str(col_label))
                    
                    col_dict = meta_dict[col_label]
                    if not col_dict['initialised']:
                        col_dict['dtype'] = data.dtype
                        ndim = data.ndim
                        if ndim == 1:  # 1d
                            col_dict['vector_len'] = 1
                        elif ndim == 2:  # 2d
                            col_dict['vector_len'] = data.shape[1]
                        else:
                            raise Exception("Too many dimensions: " + str(data.shape))
                        col_dict['offset_step'] = data.dtype.itemsize
                        col_dict['initialised'] = True
                        
#                         mmap_dict[col_label] = np.memmap(col_dict['path'], dtype=col_dict['dtype'], mode='w+',
#                                                         order=self.order)
                        
                    # load meta_dict vars, save hashing time
                    total_nums = col_dict['total_nums']
                    offset_step = col_dict['offset_step']
                    path = col_dict['path']
                    dtype = col_dict['dtype']

                    # add data to map
                    flat = data.ravel()
                    offset = 0
                    num_to_add = len(flat)
                    nums_to_add[col_label].append(num_to_add)  # debug
                    
                    memmap = None
                    if total_nums != 0:
#                         offset = offset_step * (total_nums + 1)  # +1 to get space after last add
                        offset = offset_step * (total_nums + 1)  # +1 to get space after last add
                        memmap = np.memmap(path, dtype=dtype, mode='r+', offset=0, 
                                       order=self.order, shape=(total_nums + num_to_add,))
                    else:
                        print("total_nums is zero : " + str(total_nums))
                        memmap = np.memmap(path, dtype=dtype, mode='w+', offset=0, 
                                       order=self.order, shape=(num_to_add,))
                    
                    print("main loop memmap shape: " + str(memmap.shape))
                    print("main loop flat shape: " + str(flat.shape))
                    memmap[total_nums:total_nums+num_to_add] = flat[:]
                    print("memmap after flat assignment")
                    print(str(memmap))
                    if not np.array_equal(memmap[total_nums:total_nums+num_to_add], flat):
                        print("memmap and flat not equal")
                        print("memmap: " + str(memmap[total_nums:total_nums+num_to_add]))
                        print("flat: " + str(flat))
                    
                    memmap.flush()
#                     del memmap
                    
#                     if total_nums == 0:
#                         mmap = np.memmap(path, dtype=dtype, mode='r', shape=(num_to_add,), order=self.order,
#                                         offset=0)
#                         print("load check mmap main loop")
#                         for i in range(10):
#                             print(str(mmap[i]))

                    # update fields
                    col_dict['total_nums'] += num_to_add
                    total_nums_d[col_label].append(col_dict['total_nums'] / col_dict['vector_len'])
                count += 1
                    
#                     pprint.pprint(col_dict)
#                     print("")
                    
            for col_label, meta in meta_dict.items():
                self.update_path_df_entry(meta['path'], col_label, meta['dtype'], meta['vector_len'],
                         meta['offset_step'], meta['total_nums'])

            batch_size = 32
            # test correctness
            for col_label in meta_dict.keys():
                # load couple emb_paths
                emb_data = []
                for emb_path in emb_paths[0:count_lim]:
                    print("emb_path in check loop: " + str(emb_path))
                    emb_data.extend(self.scrape_col_data(emb_path, [col_label])[col_label])
                emb_data = np.asarray(emb_data)
                # load memmap
                mmap = self.load_memmap(corpus_name, nested_dir, topic_ids, col_label, batch_size=None)
                print("checking equality for " + str(col_label))
                print("mmap shape: " + str(mmap.shape))
                print("emb_data shape: " + str(emb_data.shape))
                total_added = sum(nums_to_add[col_label]) / meta_dict[col_label]['vector_len']
                total_minus_first = int(mmap.shape[0]) - (nums_to_add[col_label][-1] / meta_dict[col_label]['vector_len'])
                print("total_added / emb_data shape: " + str(total_added) + " / " + str(mmap.shape))
                print("minus first: " + str(total_minus_first))
                print("total_nums_d: " + str(total_nums_d[col_label]))
                
                print("looping checking 0 equality of memmap")
                zero_arr = np.zeros(meta_dict[col_label]['vector_len'])
                on_zeros = True
                for i in tqdm_notebook(range(int(meta_dict[col_label]['total_nums'] / meta_dict[col_label]['vector_len']))):
                    if not np.array_equal(mmap[i], zero_arr):  # array not zeros
                        if on_zeros:
                            print("not zeroes, count==" + str(i))
#                             print(str(mmap[i]))
                            on_zeros = False
#                             raise Exception("stop zero check")
                    else:  # array is zeros
                        if not on_zeros:
                            print("is zeroes, count==" + str(i))
#                             print(str(mmap[i]))
                            on_zeros = True
    
                for i in tqdm_notebook(range(emb_data.shape[0])):
                    m_row = mmap[i]
                    emb_row = emb_data[i]
                    if np.isscalar(emb_row):
                        emb_row = np.asarray([emb_row])
                    if not np.array_equal(emb_row, m_row):
                        raise Exception("Not equal\nemb_row: " + str(emb_row) + "\nm_row: " + str(m_row))
                print("All equal for " + str(col_label))
            
            raise Exception("stop check")

                    
            if verbose:
                print(display(path_slice))
            print("Completed creating memmaps")
        else:
            print("Already loaded " + str(col_labels))
            
    def update_path_df_entry(self, path, col_label, dtype, vector_len, offset_step, total_nums):
        mask = (self.path_df['path'] == path) & (self.path_df['col_label'] == col_label)
        change_cols = ['dtype', 'vector_len', 'offset_step', 'total_nums', 'complete']
        self.path_df.loc[mask, change_cols] = dtype, vector_len, offset_step, total_nums, True
        self.save_path_df()
        
            
    def add_path_df_entry(self, corpus_name, nested_dir, col_label, path, topic_ids, return_row_dict=False):
        row = {'corpus_name':corpus_name, 'nested_dir':nested_dir, 'col_label':col_label, 
               'path':path, 'dtype':None, 'vector_len':np.nan, 'total_nums':0, 
               'offset_step':0, 'topic_ids':topic_ids, 'complete':False}
        self.path_df = self.path_df.append(row, ignore_index=True)
        self.save_path_df()
        if return_row_dict:
            return row
        
    def create_meta_dict(self, path_slice, corpus_name, nested_dir, col_labels, topic_ids, nested_dir_path,
                        force_reload=False):
        meta_dict = {}
        for col_label in col_labels:
            col_slice = path_slice[path_slice['col_label'] == col_label]
            if len(col_slice) > 0:
                if len(col_slice) == 1:
                    complete = list(col_slice['complete'])[0]
                    if not complete or force_reload:
                        # add previous values
                        col_slice = col_slice.to_dict(orient='list')
                        col_slice['path'][0]
                        row_dict = {"dtype":col_slice['dtype'][0], "path":col_slice['path'][0], 
                                    "vector_len":col_slice['vector_len'][0], 
                                    "offset_step":col_slice['offset_step'][0], "total_nums":0, # set to 0 to restart
                                    "initialised":False, "completed":False}  
                        meta_dict[col_label] = row_dict
                else:
                    print(display(col_slice))
                    raise Exception("Multiple entries in path_df")
            else:
                # add to path df
                row_dict = self.add_path_df_entry(corpus_name, nested_dir, col_label,
                                                 self.generate_new_map_path(nested_dir_path, col_label),
                                                 topic_ids, return_row_dict=True)
                row_dict['initialised'] = False
                meta_dict[col_label] = row_dict
        return meta_dict
        
    def load_memmap(self, corpus_name, nested_dir, topic_ids, col_label, batch_size=None,
                   return_input_shape=False):
        path_slice = self.slice_path_df(corpus_name, nested_dir, topic_ids)
        col_slice = path_slice[path_slice['col_label'] == col_label]
        if len(col_slice) == 1:
            col_dict = col_slice.to_dict(orient='list')
            dtype = col_dict['dtype'][0]
            vector_len = int(col_dict['vector_len'][0])
            total_nums = int(col_dict['total_nums'][0])
            path = col_dict['path'][0]
            
            shape = None
            num_items = int(total_nums / vector_len)
            print("total_items load_memap: " + str(num_items))
#             print("num_items/batch_size: " + str(num_items/batch_size))
#             print("num_items % batch_size: " + str(num_items%batch_size))
            if batch_size is not None:
                shape = (num_items, batch_size, vector_len)
            else:
                shape = (num_items, vector_len)
            print("shape load_memmap: " + str(shape))
            memmap = np.memmap(path, dtype=dtype, mode='r', shape=shape, order=self.order)
            print("loaded memmap from: " + str(path))
            if return_input_shape:
                input_shape = None
                if batch_size is None:
                    input_shape = (vector_len,)
                else:
                    input_shape = (batch_size, vector_len)
                return memmap, input_shape
            return memmap
        else:
            print(display(path_slice))
            raise Exception(str(len(path_slice)) + " entries for ")
    
    def slice_path_df(self, corpus_name, nested_dir, topic_ids):
        topic_id_str = topic_ids
        if type(topic_id_str) != str:
            topic_id_str = self.topic_ids_str(topic_ids)
            
        mask = (self.path_df['corpus_name'] == corpus_name) & (self.path_df['nested_dir'] == nested_dir) & (self.path_df['topic_ids'] == topic_id_str)
        path_slice = self.path_df.loc[mask]
        return path_slice
        
    def topic_ids_str(self, topic_ids):
        if type(topic_ids) != str:
            sort = sorted(topic_ids)
            sort = [str(x) for x in sort]
            string = ",".join(sort)
            return string
        else:
            raise Exception(str(topic_ids) + " is already type str")
        
    def save_path_df(self):
        save_df_file_type(self.path_df, self.path_df_path, verbose=False)
                
    def load_path_df(self):
        if os.path.exists(self.path_df_path):
            path_df = read_df_file_type(self.path_df_path, verbose=True)
        else:
            path_df = pd.DataFrame(columns=self.path_df_cols)
            print("memmap path df created from scratch")
        return path_df
        
    def incompleted_col_labels(self, path_slice, col_labels):
        incompleted = []
        for col_label in col_labels:
            col_slice = path_slice[path_slice['col_label'] == col_label]
            if len(col_slice) > 0:
                if len(col_slice) == 1:
                    complete = list(col_slice['complete'])[0]
                    if not complete:
                        incompleted.append(col_label)
                else:
                    print(display(col_slice))
                    raise Exception("Multiple entries in path_df")
            else:
                incompleted.append(col_label)
        return incompleted
            
    def generate_new_map_path(self, nested_dir_path, col_label):
        # putting topic_ids in filename too long, use count instead
        count = len(self.path_df)
        base = str(count) + "_" + str(col_label)
        mappath = os.path.join(nested_dir_path, base + ".memmap")
        return mappath          
            
    def scrape_col_data(self, emb_path, col_labels):
        # setup return variables
        labels = {}
        emb_df = load_embeddings(emb_path, verbose=False)
        for col_label in col_labels:
            if col_label not in emb_df.columns:
                raise ValueError("Target label " + str(col_label) + " is not in file at " + str(emb_path))
            # collect label values from df
            labs = np.array(list(emb_df[col_label]))
            labels[col_label] = labs
        return labels
        

In [8]:
class NNTrainer:
    """Driver that builds memmaps for one input/label column pair and tunes a network on them.

    Tuned networks are stored under ``<proj_dir>/<nn_base_save_dir_name>`` and indexed in
    ``nn_path_df``.
    """

    def __init__(self, proj_dir=None, nn_base_save_dir_name=None):
        """
        Parameters:
            proj_dir: project root; defaults to '/nfs/proj-repo/AAARG-dissertation'
            nn_base_save_dir_name: directory (inside proj_dir) for tuned models; defaults to
                                   'summarization_models'
        """
        if proj_dir is None:
            self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
        else:
            self.proj_dir = proj_dir
        self.memmap_generator = MemmapGenerator(self.proj_dir)
        self.nn_base_save_dir_name = nn_base_save_dir_name
        if self.nn_base_save_dir_name is None:
            self.nn_base_save_dir_name = "summarization_models"
        self.nn_base_save_dir_path = os.path.join(self.proj_dir, self.nn_base_save_dir_name)
        self.nn_path_df_name = "nn_path_df.hdf"
        self.nn_path_df_path = os.path.join(self.nn_base_save_dir_path, self.nn_path_df_name)
        # NOTE(review): these columns differ from the keys written by add_to_nn_path_df - confirm
        self.nn_path_df_cols = ['corpus_name', 'nested_dir', 'col_label', 'dir_path']
        self.default_test_topics = [1,2,3,4,5,6,8,9,10]
        self.default_train_ratio = 0.8
        self.min_train_ratio = 0.5

    def train(self, corpus_name, nested_dir, topic_ids, X_col, y_col, tuning_iterations=100, batch_size=32,
                force_reload=False, verbose=True):
        """
        1. Load (or create) nn_path_df
        2. Generate memmaps for the input and label columns if needed
        3. Load the input and label memmaps
        4. Tune a network on them
        5. Record the tuned network in nn_path_df

        Parameters:
            corpus_name, nested_dir: identify the embeddings to train on
            topic_ids: topics whose samples are used for training
            X_col: name of the input column
            y_col: name of the label column
            tuning_iterations: passed to NNTuner
            batch_size: batch size used for loading and tuning
            force_reload: passed to the memmap generator and NNTuner
            verbose: print progress information
        """
        self.nn_path_df = self.load_nn_path_df(verbose=verbose)

        # generate data
        self.memmap_generator.create_maps(corpus_name, nested_dir, [X_col, y_col], topic_ids, verbose=verbose,
                                         force_reload=force_reload)

        if verbose:
            # previously printed an undefined `train_topics`; the topics trained on are topic_ids
            print("corpus_name: " + str(corpus_name) + "\n"
                 + "nested_dir: " + str(nested_dir) + "\n"
                 + "X_input: " + str(X_col) + "\n"
                 + "y_labels: " + str(y_col) + "\n"
                 + "train_topics: " + str(topic_ids))

        # NOTE(review): load_memmap is defined outside this class and receives this trainer
        # as its first argument - confirm that is the intended receiver
        X_map, input_shape = load_memmap(self, corpus_name, nested_dir, topic_ids, X_col, batch_size=batch_size,
                                           return_input_shape=True)
        y_map = load_memmap(self, corpus_name, nested_dir, topic_ids, y_col, batch_size=batch_size,
                                           return_input_shape=False)

        # create a generator to feed NN samples/batches (was given undefined X_paths / y_paths)
        # NOTE(review): assumes BatchGenerator accepts the loaded memmaps for '.memmap' - confirm
        batch_generator = BatchGenerator(X_map, y_map, batch_size, '.memmap')

        # the save directory is keyed on the label column (was an undefined `col_label`)
        save_dir, save_name = self.generate_nn_save_path(corpus_name, nested_dir, y_col,
                                                        create_dir=True)
        # generate optimised neural network
        tuner = NNTuner(save_dir, save_name, input_shape, tuning_iterations=tuning_iterations,
                        force_reload=force_reload, batch_size=batch_size)

        best_model_path = os.path.join(save_dir, "best_model")
        best_model, best_hyperparams = tuner.search(batch_generator, save_path=best_model_path,
                     return_hyperparams=True)

        # was calling a non-existent add_path_to_nn_path_df
        self.add_to_nn_path_df(corpus_name, nested_dir, X_col, y_col, save_dir,
                               save_name, best_hyperparams, batch_size, best_model_path,
                               verbose=verbose)
        print("Finished tuning neural network")

    def load_nn_path_df(self, verbose=True):
        """Return the stored nn_path_df if its file exists, otherwise a new empty frame."""
        if os.path.exists(self.nn_path_df_path):
            # read from the file path (previously passed self.nn_path_df, the frame being loaded)
            nn_path_df = read_df_file_type(self.nn_path_df_path, verbose=verbose)
        else:
            nn_path_df = pd.DataFrame(columns=self.nn_path_df_cols)
            if verbose:
                print("nn_path_df created from scratch")
        return nn_path_df

    def add_to_nn_path_df(self, corpus_name, nested_dir, input_col_name, label_col_name, tuner_dir, tuner_name,
                          best_hyperparams, batch_size, best_model_path, verbose=True):
        """Append one tuned-network record to nn_path_df and save the frame."""
        row = {"corpus_name":corpus_name, "nested_dir":nested_dir, "input_col_name":input_col_name,
              "label_col_names":label_col_name, "tuner_dir":tuner_dir, "tuner_name":tuner_name,
              "best_hyperparams":best_hyperparams, "batch_size":batch_size, "best_model_path":best_model_path}

        # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
        self.nn_path_df = pd.concat([self.nn_path_df, pd.DataFrame([row])], ignore_index=True)
        save_df_file_type(self.nn_path_df, self.nn_path_df_path, verbose=verbose)

    def generate_nn_save_path(self, corpus_name, nested_dir, col_labels, create_dir=True):
        """Return ``(dir_path, save_name)`` for a tuner project.

        dir_path is ``<nn_base_save_dir_path>/<corpus_name>/<nested_dir>/<labels joined by '_'>``
        and is created (with parents) when ``create_dir`` is True.
        """
        col_dir = "_".join(convert_to_list(col_labels))
        dir_path = os.path.join(self.nn_base_save_dir_path, corpus_name, nested_dir, col_dir)
        if create_dir:
            os.makedirs(dir_path, exist_ok=True)
        # generate name
        save_name = "tuner_proj"
        return dir_path, save_name

In [71]:
# Tuning run configuration: which embeddings to train on and which columns act as X / y
corpus_name, nested_dir = "mine-trects-kba2014-filtered", "stsb-roberta-base"
X_col, y_col = "embedding", "cos_sim_nearest_nug"
train_topics = list(range(11, 47))  # 11 - 46
force_reload = True

trainer = NNTrainer()
trainer.train(
    corpus_name, nested_dir, train_topics, X_col, y_col,
    tuning_iterations=100, batch_size=32, force_reload=force_reload, verbose=True,
)

loaded from .hdf file
nn_path_df created from scratch
Creating memmaps for embedding, cos_sim_nearest_nug


HBox(children=(IntProgress(value=0, max=692), HTML(value='')))

loop count: 0
processing embedding
total_nums is zero : 0
main loop memmap shape: (23933952,)
main loop flat shape: (23933952,)
memmap after flat assignment
[ 1.3180828  -1.3514936  -0.21679163 ...  0.89270985  0.7695162
 -1.1903995 ]
processing cos_sim_nearest_nug
total_nums is zero : 0
main loop memmap shape: (31164,)
main loop flat shape: (31164,)
memmap after flat assignment
[-0.00916366 -0.00437354 -0.00419822 ... -0.00509296 -0.00916364
 -0.00916364]
loop count: 1
processing embedding
main loop memmap shape: (48629760,)
main loop flat shape: (24695808,)
memmap after flat assignment
[ 1.3180828  -1.3514936  -0.21679163 ... -0.40571207 -0.22572818
 -0.28645784]
processing cos_sim_nearest_nug
main loop memmap shape: (63320,)
main loop flat shape: (32156,)
memmap after flat assignment
[-0.00916366 -0.00437354 -0.00419822 ... -0.00850286 -0.00414493
 -0.00387387]
loop count: 2
processing embedding
main loop memmap shape: (75329280,)
main loop flat shape: (26699520,)
memmap after flat 

HBox(children=(IntProgress(value=0, max=98085), HTML(value='')))

not zeroes, count==0



HBox(children=(IntProgress(value=0, max=98085), HTML(value='')))


All equal for embedding
emb_path in check loop: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/11_0.hdf
emb_path in check loop: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/11_200.hdf
emb_path in check loop: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/11_400.hdf
total_items load_memap: 98085
shape load_memmap: (98085, 1)
loaded memmap from: /nfs/proj-repo/AAARG-dissertation/dataset/mine-trects-kba2014-filtered/embeddings/stsb-roberta-base/1_cos_sim_nearest_nug.memmap
checking equality for cos_sim_nearest_nug
mmap shape: (98085, 1)
emb_data shape: (98085,)
total_added / emb_data shape: 98085.0 / (98085, 1)
minus first: 63320.0
total_nums_d: [31164.0, 63320.0, 98085.0]
looping checking 0 equality of memmap


HBox(children=(IntProgress(value=0, max=98085), HTML(value='')))

not zeroes, count==0



HBox(children=(IntProgress(value=0, max=98085), HTML(value='')))


All equal for cos_sim_nearest_nug


Exception: stop check

In [None]:
class InputLabelHandler:
    """Class that will load and store an instance of the dataset to be fed to a model

    Samples are cut into fixed-size batches, each batch is saved as its own file in a dir
    entitled 'samples' in nested_dir, and every saved file is indexed in ``label_path_df``.
    """
    def __init__(self, proj_dir=None, input_col_name="embedding"):
        """
        Parameters:
            proj_dir: project root; defaults to '/nfs/proj-repo/AAARG-dissertation'
            input_col_name: name of the column holding the model inputs
        """
        if proj_dir is None:
            self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
        else:
            self.proj_dir = proj_dir
        self.default_file_type = ".hdf"
        # NOTE(review): receives the raw proj_dir argument (possibly None) rather than
        # self.proj_dir - confirm PathRetriever resolves None to the same default
        self.path_ret = PathRetriever(proj_dir)
        self.label_options = ['cosine_similarity', 'cos_sim_nearest_nug']
        self.input_col_name = input_col_name
        # label_path_df variables
        self.label_path_df_dir = self.path_ret.path_handler.dataset_dir
        self.sample_dir_name = "samples"
        self.label_path_df_path = os.path.join(self.label_path_df_dir, "label_path_df.hdf")
        self.label_path_df_cols = ['corpus_name', 'nested_dir', 'topic_id', 'col_label', 'batch_instance',
                                   'batches_in_topic', 'batch_size', 'shape', 'file_type', 'path', 'exists']
        self.possible_file_types = ['.pickle']

    def generate(self, corpus_names=None, nested_dirs=None, col_labels=None, emb_file_type=None, batch_size=32,
                    file_type='.pickle', force_reload=False, verbose=True):
        """Generate easily loadable inputs/labels files to be fed to NN when needed

        Parameters:
            corpus_names, nested_dirs, col_labels: dataset identifiers, resolved through
                                                   ``resolve_input_params``
            emb_file_type: file type of the embedding files to read; defaults to '.hdf'
            batch_size: number of samples per saved batch
            file_type: file type of the saved batches, must be in ``possible_file_types``
            force_reload: re-save batch files even when they already exist
            verbose: print progress information
        Raises:
            Exception: for an unsupported ``file_type`` or when no embedding paths are found
        """
        self.label_path_df = self.load_label_path_df(verbose=verbose)

        corpus_names, nested_dirs, col_labels = resolve_input_params(self.path_ret, corpus_names,
                                                                    nested_dirs, col_labels, input_col="embedding")

        if file_type not in self.possible_file_types:
            raise Exception(str(file_type) + " is not a valid file type")
        if emb_file_type is None:  # target file type to load from
            emb_file_type = self.default_file_type

        if verbose:
            print("Retrieving the following with batch_size(" + str(batch_size) +"): "
                  + str(", ".join(col_labels)))

        combinations = find_combinations(self.label_path_df, corpus_names, nested_dirs, col_labels,
                                         add_topics=True, col_labels_as_list=True, as_tuples=True,
                                         force_reload=force_reload, path_retriever=self.path_ret,
                                        batch_size=batch_size, file_type=file_type, exists_only=True)

        if len(combinations) == 0:
            print("Input/label combinations fully loaded")
            return

        for corpus_name, nested_dir, combo_labels, topic_id in tqdm_notebook(combinations):
            if verbose:
                print("corpus_name: " + str(corpus_name) + "\n"
                     + "nested_dir: " + str(nested_dir) + "\n"
                     + "col_labels: " + str(combo_labels) + "\n"
                     + "topic_id: " + str(topic_id))

            emb_paths, nested_dir_path = self.path_ret.get_embedding_paths(corpus_name, nested_dir,
                                                        file_type=emb_file_type, verbose=False,
                                                        return_dir_path=True, topic_ids=[topic_id])
            if len(emb_paths) == 0:
                raise Exception("No paths for " + str(corpus_name) + ", " + str(nested_dir) + ", "
                               + str(emb_file_type) + ", topic_id: " + str(topic_id))

            # load the selected labels
            loaded_labels = self.retrieve_col_data(emb_paths, combo_labels, verbose=verbose)

            # create and save batches
            for label, label_data in loaded_labels.items():
                batches = self.create_batches(label_data, batch_size)
                if not batches:
                    # fewer samples than one full batch: nothing to save, and batches[0]
                    # below would raise an IndexError
                    print("No full batch of size " + str(batch_size) + " for " + str(label)
                          + ", topic_id: " + str(topic_id))
                    continue
                shape = batches[0].get_shape()
                update_paths = []
                pbar = None
                if verbose:
                    print("Saving batches for " + str(label))
                    pbar = tqdm_notebook(total=len(batches))
                for index, batch in enumerate(batches):
                    # create file name for batch
                    path = self.generate_path(nested_dir_path, topic_id, index, label, file_type=file_type)
                    update_paths.append(path)
                    # add to path_df
                    if not os.path.exists(path) or force_reload:  # saves resaving files
                        self.add_path_to_df(corpus_name, nested_dir, topic_id, label, index,
                                            len(batches), batch_size, shape, file_type, path, False)
                        # save file
                        self.save_object(batch, path, file_type)
                    if verbose:
                        pbar.update()
                self.update_path_exists(update_paths)
                self.save_label_path_df()
                if verbose:
                    print(str(len(batches)) + " files saved for " + str(label))
        print("Completed generating inputs/labels")

    def get_paths(self, corpus_name, nested_dir, col_label, topic_ids=None, file_type='.pickle',
                  batch_size=32, return_shape=False):
        """Look up the saved batch files for one column.

        Parameters:
            corpus_name, nested_dir, col_label: identify the rows of label_path_df to use
            topic_ids: optional list of topics to restrict the result to
            file_type, batch_size: must match the values the batches were saved with
            return_shape: also return the (single) shape recorded for the batches
        Return:
            list of paths ordered by topic_id then batch_instance, or ``(paths, shape)``
            when ``return_shape`` is True
        Raises:
            Exception: when the matching rows record more than one distinct shape
            IndexError: when no rows match
        """
        paths = self.label_path_df
        paths = paths[(paths['corpus_name'] == corpus_name)
                      & (paths['nested_dir'] == nested_dir)
                      & (paths['col_label'] == col_label)
                      & (paths['batch_size'] == batch_size)
                      & (paths['file_type'] == file_type)]
        if topic_ids is not None:
            paths = paths[paths['topic_id'].isin(topic_ids)]

        # sort column so consistent ordering
        paths = paths.sort_values(by=['topic_id', 'batch_instance'], ascending=True)
        path_list = list(paths['path'])

        # the check used to test an undefined `input_dim`; it is the unique shapes that matter
        shapes = list(paths['shape'].unique())
        if len(shapes) > 1:
            raise Exception("Dimensions of list objects varies: " + str(shapes))
        if len(shapes) == 0:
            raise IndexError("No saved batches match " + str(corpus_name) + ", " + str(nested_dir)
                             + ", " + str(col_label) + ", batch_size: " + str(batch_size)
                             + ", file_type: " + str(file_type))
        shape = shapes[0]

        if return_shape:
            return path_list, shape
        return path_list

    def corpus_topic_ids(self, corpus_name):
        """Unique topic_ids recorded in label_path_df for the given corpus_name"""
        corpus_rows = self.label_path_df[self.label_path_df['corpus_name'] == corpus_name]
        return list(corpus_rows['topic_id'].unique())

    def load_label_path_df(self, verbose=True):
        """Return the stored label_path_df if its file exists, otherwise a new empty frame."""
        if verbose:
            print("Loading label_path_df")
        if os.path.exists(self.label_path_df_path):
            # honour the caller's verbose flag (was hard-coded to True)
            label_path_df = read_df_file_type(self.label_path_df_path, verbose=verbose)
        else:
            label_path_df = pd.DataFrame(columns=self.label_path_df_cols)
            if verbose:
                print("label_path_df created from scratch")
        return label_path_df

    def add_path_to_df(self, corpus_name, nested_dir, topic_id, col_label, batch_instance, batches_in_topic,
                      batch_size, shape, file_type, path, exists):
        """Append one batch-file record to the in-memory label_path_df (not saved here)."""
        row = {"corpus_name":corpus_name, "nested_dir":nested_dir, "topic_id":topic_id, "col_label":col_label,
               "batch_instance":batch_instance, "batches_in_topic":batches_in_topic, "batch_size":batch_size,
               "shape":shape, "file_type":file_type, "path":path, "exists":exists}
        # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
        self.label_path_df = pd.concat([self.label_path_df, pd.DataFrame([row])], ignore_index=True)

    def save_label_path_df(self):
        """Hand label_path_df to ``save_df_file_type`` for storage at label_path_df_path."""
        save_df_file_type(self.label_path_df, self.label_path_df_path, verbose=False)

    def update_path_exists(self, path):
        """Set 'exists' to True for every row whose path is in ``path`` (single path or list)."""
        path = convert_to_list(path)
        self.label_path_df.loc[self.label_path_df['path'].isin(path), 'exists'] = True

    def create_batches(self, samples, batch_size):
        """Cut ``samples`` into tensors of exactly ``batch_size`` rows.

        A trailing group smaller than ``batch_size`` is dropped. Scalar samples get an
        extra dimension so each batch has shape (batch_size, 1).

        Raises:
            Exception: when a batch does not have shape (batch_size, input_dim)
        """
        batch_list = []
        for start in range(0, len(samples), batch_size):
            end = start + batch_size
            if end > len(samples):  # leave last potential batch if doesn't divide evenly
                break
            batch_slice = samples[start:end]  # end step is exclusive
            first_sample = batch_slice[0]
            is_scalar = np.isscalar(first_sample)
            batch_slice = tf.convert_to_tensor(batch_slice)
            if is_scalar:
                input_dim = 1
                batch_slice = tf.expand_dims(batch_slice, 1)  # add dimension to get appropriate shape
            else:
                input_dim = len(first_sample)
            if batch_slice.shape != (batch_size, input_dim):
                raise Exception("Wrong shape for batch_slice: " + str(batch_slice.shape)
                               + "\nExpected shape: " + str((batch_size, input_dim)))
            batch_list.append(batch_slice)
        return batch_list

    def retrieve_col_data(self, emb_paths, col_labels, verbose=True):
        """Collect the values of ``col_labels`` across all embedding files.

        Parameters:
            emb_paths: object indexable by 'path', yielding the files to read
            col_labels: column names to collect
        Return:
            dict mapping each column name to one list with the values from every file, in file order
        Raises:
            ValueError: when a requested column is missing from a file
        """
        # setup return variables
        labels = {col_label: [] for col_label in col_labels}
        # search through paths for labels
        pbar = None
        if verbose:
            print("Retrieving samples from dataframes")
            pbar = tqdm_notebook(total=len(emb_paths))
        for emb_path in emb_paths['path']:
            emb_df = load_embeddings(emb_path, verbose=False)
            for col_label in col_labels:
                if col_label not in emb_df.columns:
                    raise ValueError("Target label " + str(col_label) + " is not in file at " + str(emb_path))
                # collect label values from df
                labels[col_label].extend(list(emb_df[col_label]))
            if verbose:
                pbar.update()
        return labels

    def save_object(self, obj, path, file_type, offset=None):
        """Pickle ``obj`` to ``path`` when file_type is '.pickle'; other file types are ignored.

        ``offset`` is accepted but not used.
        """
        if file_type == '.pickle':
            with open(path, 'wb') as handle:
                pickle.dump(obj, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def generate_path(self, nested_dir_path, topic_id, instance_num, col_label, file_type='.pickle'):
        """Return ``<nested_dir_path>/samples/<col_label>_<topic_id>_<instance_num><file_type>``.

        The 'samples' directory is created if it does not exist yet.
        """
        filename = str(col_label) + "_" + str(topic_id) + "_" + str(instance_num) + str(file_type)
        dir_path = os.path.join(nested_dir_path, self.sample_dir_name)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        file_path = os.path.join(dir_path, filename)
        return file_path

    def delete_files(self):
        """Remove every file listed in label_path_df, drop those rows and save the frame."""
        paths = self.label_path_df['path']
        deleted_paths = []
        print("Deleting " + str(len(paths)) + " paths")
        for path in tqdm_notebook(paths):
            if os.path.exists(path):
                os.remove(path)
            # rows for already-missing files are dropped as well
            deleted_paths.append(path)
        # the filtered frame used to be discarded, so the index was never pruned
        self.label_path_df = self.label_path_df[~self.label_path_df['path'].isin(deleted_paths)]
        self.save_label_path_df()
        print("deleted")


## Training/Tuning Driver

In [None]:
class NNTrainer:
    """Driver that generates batch files through InputLabelHandler and tunes one network per
    (corpus_name, nested_dir, col_label) combination.

    Tuned networks are stored under ``<proj_dir>/<nn_base_save_dir_name>`` and indexed in
    ``nn_path_df``.
    """

    def __init__(self, proj_dir=None, nn_base_save_dir_name=None):
        """
        Parameters:
            proj_dir: project root; defaults to '/nfs/proj-repo/AAARG-dissertation'
            nn_base_save_dir_name: directory (inside proj_dir) for tuned models; defaults to
                                   'summarization_models'
        """
        if proj_dir is None:
            self.proj_dir = '/nfs/proj-repo/AAARG-dissertation'
        else:
            self.proj_dir = proj_dir
        self.input_handler = InputLabelHandler(self.proj_dir)
        self.nn_base_save_dir_name = nn_base_save_dir_name
        if self.nn_base_save_dir_name is None:
            self.nn_base_save_dir_name = "summarization_models"
        self.nn_base_save_dir_path = os.path.join(self.proj_dir, self.nn_base_save_dir_name)
        self.nn_path_df_name = "nn_path_df.hdf"
        self.nn_path_df_path = os.path.join(self.nn_base_save_dir_path, self.nn_path_df_name)
        # NOTE(review): these columns differ from the keys written by add_to_nn_path_df - confirm
        self.nn_path_df_cols = ['corpus_name', 'nested_dir', 'col_label', 'dir_path']
        self.default_test_topics = [1,2,3,4,5,6,8,9,10]
        self.default_train_ratio = 0.8
        self.min_train_ratio = 0.5

    def train(self, corpus_names=None, nested_dirs=None, col_labels=None, tuning_iterations=100,
              train_topics = None, test_topics=None, input_col_name="embedding", batch_size=32,
              train_ratio=None, sample_file_type='.pickle', force_reload=False, verbose=True):
        """
        1. Generate batch files if needed
        2. Resolve the train/test topics of every corpus
        3. Determine combinations to try
        4. Load the batch paths of each combination
        5. Tune a network on them
        6. Save the tuned network and its nn_path_df entry

        Parameters:
            corpus_names, nested_dirs, col_labels: dataset identifiers, resolved through
                                                   ``resolve_input_params``
            tuning_iterations: passed to NNTuner
            train_topics, test_topics: optional explicit topic lists
            input_col_name: column used as network input
            batch_size: batch size used for generating, loading and tuning
            train_ratio: fallback train share; defaults to ``default_train_ratio``
            sample_file_type: file type of the saved batches
            force_reload: regenerate data and retune
            verbose: print progress information
        Raises:
            Exception: when ``train_ratio`` is below ``min_train_ratio``
        """
        self.nn_path_df = self.load_nn_path_df(verbose=verbose)

        # generate data; batch_size is forwarded so the files match what get_paths looks up below
        self.input_handler.generate(corpus_names=corpus_names, nested_dirs=nested_dirs, col_labels=col_labels,
                                   batch_size=batch_size, force_reload=force_reload, verbose=verbose,
                                   file_type=sample_file_type)

        # get our dataset identifiers, used to load correct inputs/labels
        corpus_names, nested_dirs, col_labels = resolve_input_params(self.input_handler.path_ret,
                                                                     corpus_names, nested_dirs, col_labels)

        # resolve train/test split
        if train_ratio is None:
            train_ratio = self.default_train_ratio
        elif train_ratio < self.min_train_ratio:
            raise Exception("Train ratio must be at least " + str(self.min_train_ratio))
        corpus_topics = self.resolve_topics_per_corpus(corpus_names, train_topics, test_topics, train_ratio)

        combinations = find_combinations(self.nn_path_df, corpus_names, nested_dirs, col_labels, add_topics=False,
                                        col_labels_as_list=False, as_tuples=True, force_reload=force_reload)
        if len(combinations) == 0:
            print("All neural networks have previously been tuned")
            return

        for corpus_name, nested_dir, col_label in tqdm_notebook(combinations):
            corpus_train_topics = corpus_topics[corpus_name]['train']
            if verbose:
                print("corpus_name: " + str(corpus_name) + "\n"
                     + "nested_dir: " + str(nested_dir) + "\n"
                     + "X_input: " + str(input_col_name) + "\n"
                     + "y_labels: " + str(col_label) + "\n"
                     + "train_topics: " + str(corpus_train_topics))

            # get paths for inputs and the shape of one batch
            X_paths, input_shape = self.input_handler.get_paths(corpus_name, nested_dir,
                                        input_col_name, topic_ids=corpus_train_topics,
                                        file_type=sample_file_type, batch_size=batch_size,
                                        return_shape=True)
            # get paths for labels
            y_paths = self.input_handler.get_paths(corpus_name, nested_dir, col_label,
                                        topic_ids=corpus_train_topics, file_type=sample_file_type,
                                        batch_size=batch_size, return_shape=False)

            # ensure matching path lists
            if len(X_paths) != len(y_paths):
                print("Length of X and y paths do not match: " + str(len(X_paths)) + " / " + str(len(y_paths)))

            # create a generator to feed NN samples/batches
            batch_generator = BatchGenerator(X_paths, y_paths, batch_size, sample_file_type)

            save_dir, save_name = self.generate_nn_save_path(corpus_name, nested_dir, col_label,
                                                            create_dir=True)
            # generate optimised neural network
            tuner = NNTuner(save_dir, save_name, input_shape, tuning_iterations=tuning_iterations,
                            force_reload=force_reload, batch_size=batch_size)

            best_model_path = os.path.join(save_dir, "best_model")
            best_model, best_hyperparams = tuner.search(batch_generator, save_path=best_model_path,
                         return_hyperparams=True)

            # was calling a non-existent add_path_to_nn_path_df
            self.add_to_nn_path_df(corpus_name, nested_dir, input_col_name, col_label, save_dir,
                                   save_name, best_hyperparams, batch_size, best_model_path,
                                   verbose=verbose)
        print("Finished tuning neural networks")

    def resolve_topics_per_corpus(self, corpus_names, train_topics, test_topics, train_ratio):
        """Resolve the train/test topics for each corpus

        Test topics default to ``default_test_topics`` and train topics to every other topic
        of the corpus. When the train topics make up less than ``min_train_ratio`` of the
        corpus topics, the split is replaced by one based on ``train_ratio``: the first
        topics of the corpus become the test set and the rest the train set.

        Return:
            dict: corpus_name -> {'train': [topic_ids], 'test': [topic_ids]}
        Raises:
            Exception: when the train and test topics overlap
        """
        corpus_topics_dict = defaultdict(dict)
        for corpus_name in corpus_names:
            corpus_topics = self.input_handler.corpus_topic_ids(corpus_name)
            corp_test, corp_train = test_topics, train_topics
            # resolve test_topics for corpus
            if corp_test is None:
                corp_test = self.default_test_topics
            if corp_train is None:
                corp_train = [x for x in corpus_topics if x not in corp_test]

            # get rid of repeats
            corp_test, corp_train = set(corp_test), set(corp_train)

            # share of the corpus topics used for training (the division used to be inverted
            # and failed on an empty train set)
            if len(corpus_topics) > 0:
                cur_train_ratio = len(corp_train) / len(corpus_topics)
            else:
                cur_train_ratio = 0.0
            if cur_train_ratio < self.min_train_ratio:
                # set to train_ratio instead
                num_train = math.floor(len(corpus_topics) * train_ratio)
                num_test = len(corpus_topics) - num_train
                # keep these as sets so the overlap check below can use isdisjoint
                corp_test = set(corpus_topics[0:num_test])
                corp_train = set(corpus_topics[num_test:])

            # check for overlap in train/test topics
            if not corp_test.isdisjoint(corp_train):  # overlap between topics
                raise Exception("Train and test sets contain overlapping topic_ids\nTrain: " + str(corp_train)
                               +"\nTest: " + str(corp_test))

            corpus_topics_dict[corpus_name]['train'] = list(corp_train)
            corpus_topics_dict[corpus_name]['test'] = list(corp_test)
        return corpus_topics_dict

    def load_nn_path_df(self, verbose=True):
        """Return the stored nn_path_df if its file exists, otherwise a new empty frame."""
        if os.path.exists(self.nn_path_df_path):
            # read from the file path (previously passed self.nn_path_df, the frame being loaded)
            nn_path_df = read_df_file_type(self.nn_path_df_path, verbose=verbose)
        else:
            nn_path_df = pd.DataFrame(columns=self.nn_path_df_cols)
            if verbose:
                print("nn_path_df created from scratch")
        return nn_path_df

    def add_to_nn_path_df(self, corpus_name, nested_dir, input_col_name, label_col_name, tuner_dir, tuner_name,
                          best_hyperparams, batch_size, best_model_path, verbose=True):
        """Append one tuned-network record to nn_path_df and save the frame."""
        row = {"corpus_name":corpus_name, "nested_dir":nested_dir, "input_col_name":input_col_name,
              "label_col_names":label_col_name, "tuner_dir":tuner_dir, "tuner_name":tuner_name,
              "best_hyperparams":best_hyperparams, "batch_size":batch_size, "best_model_path":best_model_path}

        # pd.concat instead of DataFrame.append, which was removed in pandas 2.0
        self.nn_path_df = pd.concat([self.nn_path_df, pd.DataFrame([row])], ignore_index=True)
        save_df_file_type(self.nn_path_df, self.nn_path_df_path, verbose=verbose)

    def generate_nn_save_path(self, corpus_name, nested_dir, col_labels, create_dir=True):
        """Return ``(dir_path, save_name)`` for a tuner project.

        dir_path is ``<nn_base_save_dir_path>/<corpus_name>/<nested_dir>/<labels joined by '_'>``
        and is created (with parents) when ``create_dir`` is True.
        """
        col_dir = "_".join(convert_to_list(col_labels))
        dir_path = os.path.join(self.nn_base_save_dir_path, corpus_name, nested_dir, col_dir)
        if create_dir:
            os.makedirs(dir_path, exist_ok=True)
        # generate name
        save_name = "tuner_proj"
        return dir_path, save_name

In [None]:
"""

https://numpy.org/doc/stable/reference/generated/numpy.memmap.html#numpy.memmap
memmap might provide a way to access segments of an array stored in a binary file without loading the whole file

numpy also provides functions to read/save individual arrays to text files (could be slow)
https://numpy.org/doc/stable/reference/generated/numpy.savetxt.html#numpy.savetxt

can save individual ndarrays as binary files  - might have an issue with precision, need to check
https://numpy.org/doc/stable/reference/generated/numpy.ndarray.tofile.html#numpy.ndarray.tofile

might have issues with it being ndarray and not np.array - is there a difference? (np.array() is a function that returns an ndarray, so both are the same type)

"""


# corpus_names = ["mine-trects-kba2014-filtered"]
# sample_file_type = '.pickle'

# trainer = NNTrainer()

# trainer.train(corpus_names=corpus_names, sample_file_type=sample_file_type, verbose=True, force_reload=False)