# Loading/Processing Corpus

In [1]:
from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
from tqdm import tqdm_notebook
# from tqdm.notebook import tqdm
import os
from collections import OrderedDict
import pickle
import warnings
import copy

### Meta File Variables

In [2]:
corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
topics_path = corpus_path + "/test-topics.xml"
doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
test_file_addr = corpus_path + "/1/2012-02-22-15.gz"
proj_dir = '/nfs/proj-repo/AAARG-dissertation'
csv_dir = proj_dir + '/' + 'load_data'
# csv file addresses
corp_csv_name = 'corpus_loaded.csv.gz'
corp_csv_path = csv_dir + '/' + corp_csv_name
topics_csv_name = 'topics_loaded.csv.gz'
topics_csv_path = csv_dir + '/' + topics_csv_name
# nugget/update dataframes
nugget_dir = "/nfs/TemporalSummarization/ts13/results"
updates_sampled_path = nugget_dir + "/updates_sampled.tsv"
nuggets_path = nugget_dir + "/nuggets.tsv"
nug_matches_path = nugget_dir + "/matches.tsv"
# saving nugget and update files
nugget_csv = 'nugget_df.csv.gz'
update_csv = 'update_df.csv.gz'
nugget_csv_path = csv_dir + '/' + nugget_csv
update_csv_path = csv_dir + '/' + update_csv
# supervised input/labels
# supervised_csv = 'supervised_df.csv.gz'
# supervised_csv_path = csv_dir + '/' + supervised_csv

In [3]:
class FilePathHandler:
    """Paths will be in the format:
    
    proj_dir/dataset_dir/corpus_name/file_purpose/instance_identifier+split_identifier+sfile_type
    """
    def __init__(self, proj_dir, dataset_dir="dataset", compression='gzip'):
        self.proj_dir = proj_dir
        self.dataset_dir = proj_dir + '/' + dataset_dir
        self.create_dir_if_not_exists(self.dataset_dir)
        self.path_df_path = self.dataset_dir + '/' + 'file_path_df.csv.gz'
        self.corpus_sources_pickle_path = self.dataset_dir + '/' + 'corpus_sources.pickle'
        self.compression = compression
        self.file_purposes = ["topics", "corpus", "nuggets", "embed_labels", "updates"]
        self.path_df_cols = ["corpus_name", "file_purpose", "split_identifier", "num_splits",
                            "instance_identifier", "file_type", "path", "exists"]
        
        # load meta files
        self.load_corpus_sources()
        self.load_path_df()


    def paths_in_corpus_name(self, corpus_name, selection=None):
        def get_identity_paths(ident_df, exists=True):
            ident_paths = list(ident_df[ident_df['exists']==exists]['relative_path'])
            if len(ident_paths) == 0:
                ident_paths = None
            return ident_paths
        
        if selection is None:  # get all file purposes if none selected
            selection = self.file_purposes
        
        paths = {}
        name_df = self.path_df[self.path_df['corpus_name'] == corpus_name]
        for file_purpose in selection:  # loop for each file purpose/stage of generation
            purp_df = name_df[name_df['file_purpose'] == file_purpose]
            idents = purp_df['instance_identifier'].unique()  # segregate any unique identifiers/instances
            if len(idents) == 0:
                # case where corpus_name or file_purpose not in paths_df
                # will create same dict with None entries
                idents.append(None)  
            for ident in idents:
                ident_df = purp_df[purp_df['instance_identifier'] == ident]
                paths[file_purpose][ident]['exists'] = get_identity_paths(ident_df, exists=True)
                paths[file_purpose][ident]['not_exists'] = get_identity_paths(ident_df, exists=False)
        return paths
        
    
    def get_path(self, corpus_name, file_purpose, inst_identifier, file_type, add_path=True, exists=False,
                split_identifier=None, num_splits=None):
        # do check here make sure filename compatible, or elsewhere
        path = self.dataset_dir + '/' + corpus_name + '/' + file_purpose + '/' + inst_identifier
        if split_identifier is not None:
            path += '_' + split_identifier
        path += file_type
        
        if add_path:
            self.add_path_to_df(corpus_name, file_purpose, split_identifier, num_splits, inst_identifier,
                            file_type, path, exists, save=True)
        return path
            
    def add_path_to_df(self, corpus_name, file_purpose, split_identifier, num_splits, inst_identifier,
                       file_type, path, exists, save=True):
        if not (self.path_df['path'] == path).any():  # check if row exists
            # create appropriate dir if needed
            new_dir_path = self.dataset_dir + '/' + corpus_name + '/' + file_purpose
            self.create_dir_if_not_exists(new_dir_path)
            # add to path_df
            row = {"corpus_name":corpus_name, "file_purpose":file_purpose, "split_identifier":split_identifier,
                  "num_splits":num_splits, "inst_identifier":inst_identifier, "file_type":file_type,
                  "path":path, "exists":exists}
            self.path_df = self.path_df.append(row, ignore_index=True)
#             row = [corpus_name, file_purpose, split_identifier, num_splits, inst_identifier, file_type,
#                   path, exists]
#             self.path_df.loc[len(self.path_df)] = row
            if save:  # save new path_df
                self.save_path_df()
        else:
            warnings.warn("Path already exists in dataframe: " + str(path))
            
    def update_path_exists(self, path, save=True):
        self.path_df.loc[self.path_df['path'] == path, 'exists'] = True
        if save:
            self.save_path_df()
        

    def create_dir_if_not_exists(self, dir_path, warn=True):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            warnings.warn("Created new directory at " + str(dir_path))
            return True
        return False
    
    def search_path_df(self, search_dict, df_slice=None):
        if df_slice is None:
            df_slice = self.path_df
        for col_name, value in search_dict.items():
            df_slice = df_slice[col_name == value]
        return df_slice
    
    def path_exists(self, path):
        return os.path.exists(path)
    
    def source_dict_correct(self, source_dict):
        false_paths = []
        for path_type, path in source_dict.items():
            if path_type == "corpus_name":  # dict entry not a path, don't check
                continue
            if not self.path_exists(path):
                false_paths.append(str(path_type) + " does not exist at " + str(path))
        if len(false_paths) > 0:
            error_str = "\n".join(false_paths)
            raise FileNotExistsError(error_str)
        else:
            return True
    
    def create_corpus_source_dict(self, corpus_name, dir_path, topics_file_path, nuggets_file_path):
        s_dict = {"corpus_name":corpus_name, "dir_path":dir_path, 
                  "topics_path":topics_file_path, "nuggets_path":nuggets_file_path}
        self.source_dict_correct(s_dict)
        return p_dict
    
    def add_corpus_source(self, corpus_source_dict, overwrite=False):
        """Add a corpus directory to load from and its meta files"""
        # check paths exist
        self.source_dict_correct(corpus_source_dict)
        corpus_name = copy.deepcopy(corpus_source_dict["corpus_name"])
        # store new entry
        if corpus_name in self.corpus_sources and overwrite==False:
            warnings.warn(str(corpus_name) + " is already present in corpus source dictionary. \n Proceeding with dict entry")
        else:
            del corpus_source_dict["corpus_name"]  # remove corpus_name from dict to add as a key
            self.corpus_sources[corpus_name] = corpus_source_dict
            # create folder for outputting new source files
            new_corpus_dir = self.dataset_dir + '/' + corpus_name
            self.create_dir_if_not_exists(new_corpus_dir)
            self.save_corpus_sources()
            
    def get_corpus_sources(self, corpus_names=None):
        """Retrieve file paths from corpus_load dicts
        Parameters:
            corpus_names: list of corpus names retrieve, if None then retrieve all
        
        Returns:
            A dictionary where keys are the corpus names and values are target file paths
        """
        if corpus_names is None:
            corpus_names = self.corpus_sources.keys()
        corpus_paths = {}
        for name in corpus_names:
            corpus_paths[name] = self.corpus_sources[name]
        return corpus_paths
    
    def save_corpus_sources(self):
        with open(self.corpus_sources_pickle_path, 'wb') as handle:
            pickle.dump(self.corpus_sources, handle, protocol=pickle.HIGHEST_PROTOCOL)
    
    def load_corpus_sources(self):
        if os.path.exists(self.corpus_sources_pickle_path):
            with open(self.corpus_sources_pickle_path, 'rb') as handle:
                self.corpus_sources = pickle.load(handle)
            return True
        else:
            self.corpus_sources = {} # create empty dictionary
            return False
        
    def save_path_df(self):
        self.path_df.to_csv(self.path_df_path, compression=self.compression)
        
    def load_path_df(self):
        """File containing info about file paths to systematically load files"""
        if os.path.exists(self.path_df_path):
            self.path_df = pd.read_csv(self.path_df_path, compression=self.compression)
            return True
        else:
            self.path_df = pd.DataFrame(columns=self.path_df_cols)  # create empty dataframe
            return False

In [4]:
# test_path_f = FilePathHandler(proj_dir)
# del test_path_f.corpus_sources['tr14_init_filtered']
# print(test_path_f.corpus_sources)

In [5]:
# # def supervised_path_generator(identifier, base="supervised_df", handle=".csv.gz"):
# #     fn = base + "_" + identifier + handle
# #     path = csv_dir + '/' + fn
# #     return path

# def create_dir_if_not_exists(dir_path, warn=True):
#     if not os.path.exists(dir_path):
#         os.makedirs(dir_path)
#         warnings.warn("Created new directory at " + str(base_dir))
#         return True
#     return False

# def save_path_generator(corpus_dir, file_purpose_dir, file_name, file_type, identifier=None, part=None):
#     proj_dir = '/nfs/proj-repo/AAARG-dissertation'
#     base_dir = proj_dir + '/' + 'dataset'  # base folder for storing corpus files
#     create_dir_if_not_exists(base_dir)
#     # select appropriate corpus directory (e.g. trects-filtered-2014)
#     path = base_dir + '/' + corpus_dir
#     create_dir_if_not_exists(path)
#     # ensure is from pre-selected file_purposes
#     file_purposes = ["corpus", "nuggets", "topics", "embed_labels", "updates"]
#     if file_purpose_dir not in file_purposes:
#         raise ValueError("File purpose must be in defined file purposes")
#     path += '/' + file_purpose_dir
#     create_dir_if_not_exists(path)
    
#     # create file name
#     path += '/' + file_name
#     if identifier is not None:
#         path += '_' + identifier
#     if part is not None:
#         path += '_' + part
#     path += file_type
    
#     return path
        

## Markup Loading Functions

In [None]:
# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    markup = None
    f = None
    
    if verbose:
        print(addr)

    if gz:
        f = gzip.open(addr)
        if verbose:
            print("gz file opened")
            print("first line: " + str(f.readline())
    else:
        f = open(addr)
        
    if xml == False:
        markup = bs(f, "lxml")  # using lxml parser for speed
    else:
        markup = bs(f, "xml")
        
    f.close()
    return markup


# parse markup and return 2D list [entry:tags]
def parse_markup(markup, entry_list, tag_list, find_tag, topic_id=None):
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for c in e.children:  # children use direct children, descendants uses all
            if c.name in entry:
                entry[c.name] = str(c.string)
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = str(c.string)
        entry_list.append(list(entry.values()))
        
            
# recursively find gz html files from a directory address
def search_dir(path):    
    # separate the subdirectories and html files 
    # (help maintain sequential order of insertion)
    gz_paths = []
    for f in os.scandir(path):
        if os.path.splitext(f.path)[-1].lower() == ".gz":
            gz_paths.append(f.path)
    
    return gz_paths


def list_to_dataframe(markup_list, tags):
    return pd.DataFrame(markup_list, columns=tags)

## Dataframe Loading

### Control Functions

In [7]:
def find_duplicates(df):
    seen = set()
    seen_twice = set()
    for docid in df['docid']:
        if docid not in seen:
            seen.add(docid)
        else:
            seen_twice.add(docid)
    return seen_twice

def file_exists(path):
    return os.path.isfile(path)

def load_df_control(saved_path, load_func, save=True, force_reload=False, compression='gzip', 
                    name=None, verbose=True, path_handler=None):
    df = None
    if name is not None and verbose:
        print("Loading " + name)
    if not file_exists(saved_path) or force_reload:
        df = load_func()
        if verbose:
            print("df loaded")
        if save:
            df.to_csv(saved_path, compression=compression)
            if path_handler is not None:
                path_handler.update_path_exists(saved_path)
            if verbose:
                print("saved at: " + str(saved_path))
    else:
        df = pd.read_csv(saved_path, compression=compression)
        if verbose:
            print("loaded from file")
    if verbose:
        print(display(df[0:4]))
    return df

### Dataframes from Corpus Files

#### Topics

In [8]:
# load topics into dataframe
def __load_topics(path):
    topics_list = []
    parse_markup(open_markup_file(path, gz=False, xml=True), 
                    topics_list, topic_tags, "event")
    df = list_to_dataframe(topics_list, topic_tags)
    df['id'] = pd.to_numeric(df['id'])
    return df

def load_topics(saved_path, load_path=None, save=True, force_reload=False, verbose=True, path_handler=None):
    topics = load_df_control(saved_path, lambda: __load_topics(load_path), 
                             save=save, force_reload=force_reload, name="topics", verbose=verbose, path_handler=path_handler)
    return topics

# topics = load_topics()

#### Main Corpus Files

In [25]:
# load all formatted gzipped html files into dataframe

def __load_corpus(corpus_dir, doc_tags=None, topic_ids=None, split_every=None, split_start_doc=None):
    if doc_tags is None:
        doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
    df = pd.DataFrame(columns=doc_tags)
    
    for topic_id in topic_ids:
        print("Loading topic " + str(topic_id) + "...")
        topic_list = []
        topic_path = corpus_dir + '/' + str(topic_id)
        gz_paths = search_dir(topic_path)
        
        if split_every is not None and split_start_doc is not None:
            end_split = split_start_doc + split_every
            if end_split >= len(gz_paths):  # last section
                end_split = len(gz_paths) - 1
            gz_paths = gz_paths[split_start_doc:end_split]
        
        print("creating corpus df for topic " + str(topic_id) + " starting at file no. " + str(split_start_doc)
             + " splitting every " + str(split_every) + " docs")
        for gz_path in tqdm(gz_paths, position=0, leave=True):
            parse_markup(open_markup_file(gz_path, verbose=False),
                             topic_list, doc_tags, "doc", topic_id=topic_id)
        topic_df = list_to_dataframe(topic_list, doc_tags)
        df = df.append(topic_df)
    df['epoch'] = pd.to_numeric(df['epoch'])
    return df

def load_corpus(save_path, corpus_dir=None, doc_tags=None, topic_ids=None, split_every=None, split_start_doc=None,
                save=True, force_reload=False, verbose=True, path_handler=None):
    
    corpus = load_df_control(save_path, 
                             lambda: __load_corpus(corpus_dir, doc_tags=doc_tags, 
                                                   topic_ids=topic_ids, split_every=split_every,
                                                   split_start_doc=split_start_doc), 
                             save=save, force_reload=force_reload, name="corpus", verbose=verbose, path_handler=path_handler)
    if verbose:
        print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
    return corpus

# corpus = load_corpus(doc_tags=doc_tags)

#### Nuggets (Evaluation Technique)

In [10]:
import re

def create_nugget_df():
    """Dataframe containing nugget data and its appearances in corpus"""
    def create_entry(row, reg_cols, multi_col_vals=None):
        entry_dict = {}
        for col in reg_cols:
            entry_dict[col] = row[col]
        if multi_cols is not None:
            for k,v in multi_col_vals.items():
                entry_dict[k] = v
        return entry_dict
    nuggets_tsv = pd.read_csv(nuggets_path, "\t")
    entry_list = []
    reg_cols = ['query_id', 'nugget_id', 'importance', 'nugget_len', 'nugget_text']
    multi_cols = ['docid', 'streamid', 'epoch', 'yyyymmddhh']  # multiindex cols
    num_cols = ['query_id', 'importance', 'nugget_len', 'epoch']
    
    pbar = tqdm(total=len(nuggets_tsv), position=0, leave=True)
    for index, row in nuggets_tsv.iterrows():
        # find where nugget appears in text
        nug_text = row['nugget_text']
        topic_id = 0
        try:
            topic_id = int(row['query_id'])  # make sure pattern match in correct topic
        except ValueError:
            pbar.update()
            continue  # topic_id is unknown string in tsv file, e.g. "TS13.07"
        appears = corpus[corpus['topic_id'] == topic_id]
        appears = appears[appears['text'].str.contains(re.escape(nug_text))]  # make sure no accidental regex pattern
        
        # gather information on docs it appears in
        dups = find_duplicates(appears)  # get docids where nugget appears
        for docid in dups:
            upd = appears[appears['docid'] == docid]  # get docs with this docid
            for i, r in upd.iterrows():  # gather info on each doc with this docid (e.g. streamid, epoch etc.)
                multi_col_vals = {}
                for multi_col in multi_cols:
                    multi_col_vals[multi_col] = r[multi_col]
                entry = create_entry(row, reg_cols, multi_col_vals=multi_col_vals)
                entry_list.append(entry)
        pbar.update()
    pbar.close()
    
    # form multi-index nugget dataframe
    reg_cols.extend(multi_cols)  # get new multiindex order
    nugget_df = pd.DataFrame(entry_list)
    nugget_df[num_cols] = nugget_df[num_cols].apply(pd.to_numeric, errors='coerce', axis=1)  # convert appropriate cols to numerical values
    nugget_df.rename(columns={'query_id':'topic_id'}, inplace=True)  # topic_id matches other dataframes
    return nugget_df

def load_nugget_df(save=True, force_reload=False, verbose=True):
    nugget_df = load_df_control(nugget_csv_path, create_nugget_df, 
                                save=save, force_reload=force_reload, name="nugget_df", verbose=verbose)
    return nugget_df

# nugget_df = load_nugget_df()

#### Update Dataframe (Temporal Information)

In [11]:
def create_update_df():
    """Data Frame containing information about docs which have updates/multiple instances in corpus"""
    def create_entry(row, col_tags):
        entry = {}
        for col in col_tags:
            entry[col] = row[col]
        return entry
    
    col_tags = ['docid', 'streamid', 'epoch', 'yyyymmddhh', 'zulu']
    entry_list = []
    dups = find_duplicates(corpus)
    for docid in tqdm(dups, position=0, leave=True):
        d = corpus[corpus['docid'] == docid]
        for index, row in d.iterrows():
            entry = create_entry(row, col_tags)
            entry_list.append(entry)
             
    update_df = pd.DataFrame(entry_list)
    update_df = update_df.set_index(col_tags)
    return update_df

def load_update_df(save=True, force_reload=False, verbose=True):
    update_df = load_df_control(update_csv_path, create_update_df, 
                                save=save, force_reload=force_reload, name="update_df", verbose=verbose)
    return update_df

# update_df = load_update_df()

## Supervised Learning Input Data (Embeddings) / Labels (Model Summary)

In [12]:
# preprocess corpus into cleaned sentences
# create sentence embeddings of corpus text
# create embeddings from where nuggets appear in article
# match them together in df
import spacy
from sentence_transformers import SentenceTransformer

In [13]:
class SupervisedTrainingGenerator:
    """Currently not matching nuggets to correct sentence_id"""
    def __init__(self, spacy_model_selector="en_core_web_sm"):
        self.sent_model = self.init_sent_model()
        self.spacy_model_selector = spacy_model_selector
        self.nlp = None
        
    def generate(self, corpus_df, nugget_df, topic_ids=None, save=True, force_reload=False, verbose=True):
        if topic_ids is not None:
            self.supervised_df = {}
            for topic_id in topic_ids:
                print("Processing topic " + str(topic_id))
                t_corpus = corpus_df[corpus_df['topic_id'] == topic_id]
                t_nugget = nugget_df[nugget_df['topic_id'] == topic_id]
                t_path = supervised_path_generator("topic" + str(topic_id))
                self.supervised_df[topic_id] = load_df_control(t_path, 
                                                lambda: self.__generate(t_corpus, t_nugget), 
                                                save=save, force_reload=force_reload, 
                                                verbose=verbose, name="supervised_df" + str(topic_id))
                self.sent_model = self.init_sent_model()
        else:
            # maybe check here load different csv with different topic_ids
            self.supervised_df = load_df_control(supervised_csv_path, 
                                                lambda: self.__generate(corpus_df, nugget_df), 
                                                save=save, force_reload=force_reload, 
                                                verbose=verbose, name="supervised_df")
        return self.supervised_df
    
    def __generate(self, corpus_df, nugget_df):
        # create df each row being a sentence, it's embedding, sent_id, is_nugget, nugget_text, topic_id, streamid, docid etc.
        supervised = []
        for index, article in tqdm_notebook(corpus_df.iterrows(), total=corpus_df.shape[0], position=0, leave=True):
            # preprocess sentences
            sentences = self.preprocess_text(article['text'])
            sent_ids, sentences, embeddings = self.sent_embeddings(sentences)
            
            # if nuggets in article, get the index of the sentence
            streamid = article['streamid']
            article_nugs = self.nugget_matching_sent(streamid, nugget_df, sentences)
            
            # create dictionary for later creating dataframe
            for sent_id, sent, emb in zip(sent_ids, sentences, embeddings):
                t_id = article['topic_id']
                epoch = article['epoch']
                is_nugget = False
                nugget_text = None
                nugget_id = None
                
                nug_dict_index = None
                # check if nugget
                try:
                    nug_dict_index = article_nugs['sent_id'].index(sent_id)  # throws if not in list
                    is_nugget = True
                    nugget_text = article_nugs['nugget_text'][nug_dict_index]
                    nugget_id = article_nugs['nugget_id'][nug_dict_index]
                except ValueError:
                    pass # current sentence is not nugget
                
                s_dict = {"topic_id":t_id, "streamid":streamid, "epoch":epoch, "sent_id":sent_id, 
                          "sentence":sent, "embedding":emb, "is_nugget":is_nugget, 
                          "nugget_id":nugget_id, "nugget_text":nugget_text}
                supervised.append(s_dict)
                
        supervised_df = pd.DataFrame(supervised)
        return supervised_df
                
                
    def nugget_matching_sent(self, streamid, nugget_df, sentences):
        # find sent index of where nugget appears in text
        matches = {"sent_id":[], "nugget_text":[], "nugget_id":[]}
        for index, nug in self.nuggets_in_article(streamid, nugget_df).iterrows():
            match = None
            nug_text = nug['nugget_text']
            for i in range(len(sentences)):
                match = None
                if sentences[i] in nug_text:
                    match = i  # only take first appearance in article if multiple exist
                    break
            if match is not None:
                matches["sent_id"].append(match)
                matches["nugget_text"].append(nug_text)
                matches['nugget_id'].append(nug['nugget_id'])
        return matches
        
                
    def nuggets_in_article(self, streamid, nugget_df):
        # find streamid in nugget_df
        nug_rows = nugget_df[nugget_df['streamid'] == streamid]
        return nug_rows
                
            
    def preprocess_text(self, text, use_spacy=False):
        # remove first char if \n
#         if text[:1] == "\n":
#             text = text[1:]
        
        sentences = None
        if spacy:
            if self.nlp is None:
                self.nlp = spacy.load(self.spacy_model_selector)
            text = self.nlp(text)
            sentences = list(text.sents)
            sentences = [s.text for s in sentences if len(s) != 0]
        else:  # split by newline
            sentences = text.splitlines()
        return sentences
        
    def sent_embeddings(self, sentences):
        # use sentence-transformers embeddings
        result = self.sent_model.encode(sentences, show_progress_bar=False)
        sent_ids = []
        tokens = []  # sentences as text
        embeddings = []
        for i, (tok, emb) in enumerate(zip(sentences,result)):
            sent_ids.append(i)
            tokens.append(tok)
            embeddings.append(emb)
#         embeddings = np.stack(embeddings)
        return sent_ids, tokens, embeddings
#         # we normalize embeddings, so that euclidian distance is equivalent to cosine distance
#         self.normed_embeddings = (embeddings.T / (embeddings**2).sum(axis=1) ** 0.5).T

    def init_sent_model(self):
        sent_model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
        return sent_model

In [14]:
# super_gen = SupervisedTrainingGenerator()
# super_df = super_gen.generate(corpus, nugget_df)

In [15]:
# print(display_df[0:5])

In [16]:
# sli = pd.DataFrame.copy(super_gen.supervised_df[1][0:5], deep=True)
# # testees = sli['nugget_text']
# # testees[2] = "poop"
# # sli['nugget_text'] = testees
# # print(sli['nugget_text'].unique())
# e_df_test = pd.DataFrame(columns=sli.columns)
# print(list(e_df_test[e_df_test['epoch']==None]['topic_id']))

## Filter the Larger Trects Dataset

In [98]:
class TrectsFilter:
    def __init__(self):
        self.base_dir = '/nfs/trects-kba2014'
        self.updates_dir = "/nfs/TemporalSummarization/ts14/results"
        self.updates_csv_path = self.updates_dir + '/' + "updates_sampled.extended.tsv"  # using extended version
        self.save_dir = '/nfs/trects-kba2014-filtered-mine'
        self.proc_history_path = self.save_dir + '/' + 'process_history.pickle'
        self.proc_history = None
#         self.parser = etree.HTMLParser()
#         self.streamid_strainer = SoupStrainer("streamid")  # quick search for streamid tags
        
        
    def create_filtered_dataset(self, force_reload=False, verbose=True):
        """ Outline of Process
        1. Find streamids
            1.1 open updates_sampled.tsv file (or updates_sampled.extended.tsv)
            1.2 Scrape update_id column
            1.3 transform into streamid (drop last hyphenated numbers (these are sentenceids))
            1.4 Put streamids into datastructure for comparing (e.g. set)
        2. Create a new directory for each topic folder there is in target dir
        3. Opening up documents
            3.1 Go for each topic folder
            3.2 Open up each document
            3.3 Parse into html tree
            3.4 if streamid matches, store locally in memory buffer
            3.5 when buffer is size of however many docs are in other html.gz files, print those docs to a new file
            3.6 name this file something like a number, not dates like other files, save with same file extension/format
        4. Quick test
            4.1 Open up a topicid using load corpus
        5. Final check
            5.1 Add in final check that will only run this if it hasn't already been generated
        """
        # get streamids for docs that we will filter for
        self.streamids = self.get_streamids()

        # get topicids from folder names
        topic_ids = [int(tid) for tid in os.listdir(self.base_dir) if tid.isdigit()]
        topic_ids.sort()
        
        # create dir to save filtered corpus to
        self.create_dir(self.save_dir)
        
        # load history of files already processed if exists
        self.load_process_history_dict(topic_ids)
        
        
        for topic_id in tqdm(topic_ids, position=0, leave=True):
            # create save directory
            topic_dir = self.base_dir + '/' + str(topic_id)
            self.create_dir(topic_dir)
            
            # get paths for files in target topic dir
            gz_paths = search_dir(topic_dir)
            
            # remove already processed files
            if not force_reload:
                gz_paths = [x for x in gz_paths if x not in self.proc_history[topic_id]]
            
            # process each file
            for gz_path in tqdm(gz_paths, position=1, leave=True):
                # get file markup
                markup = open_markup_file(gz_path, verbose=verbose)  # 50MB file proving hard for beautifulsoup
                # get docs in file that are in streamids
                matches = self.retrieve_matching_docs(markup, verbose=verbose)
                save_path = self.get_file_save_path(topic_id, gz_path)
                # write file and save results
                self.write_docs_to_file(matches, save_path, verbose=verbose)
                self.proc_history[topic_id].add(gz_path)
                self.save_process_history_dict()
        print("Finished filtering corpus")
        
    def save_process_history_dict(self):
        with open(self.proc_history_path, 'wb') as handle:
            pickle.dump(self.proc_history, handle, protocol=pickle.HIGHEST_PROTOCOL)
        
    def load_process_history_dict(self, topic_ids):
        if os.path.exists(self.proc_history_path):
            with open(self.proc_history_path, 'rb') as handle:
                self.proc_history = pickle.load(handle)
            return True
        else:
            self.proc_history = self.create_process_history_dict(topic_ids)
            return False
        
    def create_process_history_dict(self, topic_ids):
        """Create a dictionary to keep track of what files have already been searched"""
        proc_history = {}
        for topic_id in topic_ids:
            proc_history[int(topic_id)] = set()  # sets have faster indexing
        return proc_history

                
    def get_file_save_path(self, topic_id, gz_path):
        filename = self.get_filename_from_gz_path(gz_path)
        save_path = self.save_dir + '/' + str(topic_id) + '/' + filename
        return save_path
        
                
    def get_filename_from_gz_path(self, gz_path):
        split = gz_path.split("/")
        filename = split[-1]
        return filename  # return with file extension on
                

    def write_docs_to_file(self, doc_list, save_path, verbose=True):
        # transform docs into string
        if len(doc_list) > 0:  # don't write empty files
            out = "\n".join(list(map(str, doc_list)))
            # write
            with gzip.open(save_path, "wt") as f:
                f.write(out)
                if verbose:
                    print("File written to: " + str(save_path))
        
            
    def retrieve_matching_docs(self, markup, verbose=False):
        """Retrieve docs with matching streamids from markup"""
        matches = []
        doc_count = 0
        match_count = 0
        for doc in markup.find_all("doc"):
            d_streamid = str(doc.find("streamid").string)
            if d_streamid in self.streamids:  # matching doc
                matches.append(doc)
                match_count += 1
            doc_count +=1
        if verbose:
            print("doc count: " + str(doc_count) + "\nmatch_count: " + str(match_count))
        return matches

    def get_streamids(self):
        # read tsv file
        updates_csv = pd.read_csv(self.updates_csv_path, "\t")
        # take column with streamids
        updateids = list(updates_csv['update_id'])
        streamids = set()  # can do lookups in constant time
        for updateid in updateids:
            streamid = self.parse_streamid(updateid)
            streamids.add(streamid)
        return streamids
        
    def parse_streamid(self, updateid):
        """Convert updateid in format: epoch-docid-sentid into epoch-docid"""
        split = updateid.split("-")
        split = split[:-1]  # remove sentid from end
        streamid = "-".join(split)
        return streamid
    
    def create_dir(self, dir_path):
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
            print("Created new directory at " + str(dir_path))

In [99]:
trectsfilter = TrectsFilter()
trectsfilter.create_filtered_dataset(verbose=True, force_reload=False)

  0%|          | 0/45 [00:00<?, ?it/s]
  0%|          | 0/241 [00:00<?, ?it/s][A

/nfs/trects-kba2014/1/2012-02-27-00.gz
gz file opened



  0%|          | 1/241 [06:28<25:53:30, 388.38s/it][A

doc count: 15823
match_count:2
File written to: /nfs/trects-kba2014-filtered-mine/1/2012-02-27-00.gz
/nfs/trects-kba2014/1/2012-02-26-15.gz
gz file opened



  1%|          | 2/241 [09:13<21:20:00, 321.34s/it][A

doc count: 7763
match_count:23
File written to: /nfs/trects-kba2014-filtered-mine/1/2012-02-26-15.gz
/nfs/trects-kba2014/1/2012-02-26-17.gz
gz file opened


KeyboardInterrupt: 

In [103]:
# test1_path = "/nfs/trects-kba2014-filtered-mine/1/2012-02-27-00.gz"
# test2_path = "/nfs/trects-kba2014-filtered-mine/1/2012-02-26-15.gz"
# test1_markup = open_markup_file(test1_path)
# for doc in test1_markup.find_all("doc"):
#     print(doc.find("zulu"))

<zulu>2012-02-27T00:55:00.000000Z</zulu>
<zulu>2012-02-27T00:39:00.000000Z</zulu>


## Control Generate and Load a Corpus

In [26]:
class CorpusGenerator:
    def __init__(self, proj_dir, corpus_split_step=200):
        self.path_handler = FilePathHandler(proj_dir)
        # ["topics", "corpus", "nuggets", "embed_labels", "updates"]
        self.file_purposes = self.path_handler.file_purposes
        self.corpus_split_step = corpus_split_step
        self.topic_dfs = {}  # dict of topic dfs per corpus_name
        
    def generate(self, selection=None, corpus_names=None, new_corpuses=None, 
             force_reload=False, save=True, verbose=True):
        
        # add new corpuses to load
        if new_corpuses is not None:
            for new_corpus in new_corpuses:
                self.path_handler.add_corpus_source(new_corpus, overwrite=True)
        # get corpus paths to load from (if corpus_names is None loads all)
        self.corpus_sources = self.path_handler.get_corpus_sources(corpus_names=corpus_names)
        corpus_names = self.corpus_sources.keys()
        
        if selection is None:  # if none do all
            selection = self.file_purposes
        
        for corpus_name in corpus_names:
            print("corpus_name generate loop: " + str(corpus_name))
            for select in selection:
                if select == "topics":
                    # create topics_df csv
                    
                    self.topic_dfs[corpus_name] = self.get_topic_df(corpus_name, save=save, force_reload=force_reload, 
                                                      verbose=verbose, add_path=True)
                elif select == "corpus":
                    # create corpus df csvs
                    self.corpus_splitter(corpus_name, force_reload=force_reload, verbose=False)
                
#                 elif select == "nuggets":
#                     # create nuggets df
#                     # need to edit create_nuggets to not use local df file
#                     # need to iteratively load corpus to process nuggets
                    
                    
    def corpus_splitter(self, corpus_name, force_reload=False, verbose=True):
        # split by topic and then every 200 html gz files, then parse together in loading
        # add check for what's been done already (i.e. check current topics, if all splits taken place)
        
        # if not exists load topics
        if self.topic_dfs is None or corpus_name not in self.topic_dfs:
            self.topic_dfs = self.get_topic_df(corpus_name)
        
        corpus_dir = self.corpus_sources[corpus_name]["dir_path"]
        print("corpus_dir:" + str(corpus_dir))
            
        for topic_id in self.topic_dfs[corpus_name]['id'].unique():
            # check if path exists
            p_df = self.path_handler.path_df
            p_df = p_df[p_df['file_purpose'] == "corpus"]
            t_df_paths = p_df[p_df['instance_identifier'] == str(topic_id)]
            t_df_paths = t_df_paths[t_df_paths['exists'] == True]  # only concerned with created files
            start_split = 0
            num_split = 0
            if len(t_df_paths) == 0 or force_reload:  # not yet processed
                start_split = 0
            else:
                # check if all splits been processed
                num_split = t_df_paths['num_splits'][0]  # ensure same num_splits is inputted into path_df
                if len(t_df_paths) < num_split:  # not counting from zero
                    break
                # get start point if partway through
                start_split = max(list(map(int, list(t_df_paths['split_indentifier']))))
            
            t_dir = corpus_dir + '/' + str(topic_id)
            num_files = len(search_dir(t_dir))
            # create split indexes to feed to load_corpus
            splits = [start_split]
            add = splits[-1] + self.corpus_split_step
            while add < num_files:
                splits.append(add)
                add = splits[-1] + self.corpus_split_step
            
            if start_split == 0:  
                num_splits = len(splits)  # for inputting into path_df
            
            # create corpus_df files
            for split_num in splits:
                # get save path
                save_path = self.path_handler.get_path(corpus_name, "corpus", str(topic_id), ".csv.gz",
                                        split_identifier=str(split_num), num_splits=num_splits, add_path=True)
                
                load_corpus(save_path, corpus_dir=corpus_dir, topic_ids=[topic_id], 
                            split_every=self.corpus_split_step, split_start_doc=split_num, 
                            save=True, force_reload=force_reload, 
                            verbose=verbose, path_handler=self.path_handler)
                

    def get_topic_df(self, corpus_name, save=True, force_reload=False, verbose=True, add_path=False):
        load_path = self.corpus_sources[corpus_name]["topics_path"]
        save_path = self.path_handler.get_path(corpus_name, "topics", "topics_df", ".csv.gz", add_path=add_path)
        
        topic_df = load_topics(save_path, load_path=load_path, save=save, force_reload=force_reload, 
                               verbose=verbose, path_handler=self.path_handler)
        return topic_df

In [27]:
proj_dir = '/nfs/proj-repo/AAARG-dissertation'
tr13_filtered_dict = { "corpus_name":"tr13_filtered",
                        "dir_path":"/nfs/trects-kba2013-filtered", 
                      "topics_path":"/nfs/trects-kba2013-filtered/test-topics.xml", 
                      "nuggets_path":"/nfs/TemporalSummarization/ts13/results/nuggets.tsv"}


corp_gen = CorpusGenerator(proj_dir)


corp_gen.generate(new_corpuses=[tr13_filtered_dict])

corpus_name generate loop: tr13_filtered
Loading topics
loaded from file




Unnamed: 0.1,Unnamed: 0,id,title,description,start,end,query,type
0,0,1,2012 Buenos Aires Rail Disaster,http://en.wikipedia.org/wiki/2012_Buenos_Aires...,1329910380,1330774380,buenos aires train crash,accident
1,1,2,2012 Pakistan garment factory fires,http://en.wikipedia.org/wiki/2012_Pakistan_gar...,1347368400,1348232400,pakistan factory fire,accident
2,2,3,2012 Aurora shooting,http://en.wikipedia.org/wiki/2012_Aurora_shooting,1342766280,1343630280,colorado shooting,shooting
3,3,4,Wisconsin Sikh temple shooting,http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...,1344180300,1345044300,sikh temple shooting,shooting


  1%|          | 2/200 [00:00<00:15, 12.84it/s]

None
corpus_dir:/nfs/trects-kba2013-filtered
Loading topic 1...
creating corpus df for topic 1 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [00:08<00:00, 22.40it/s]
  8%|▊         | 3/40 [00:00<00:01, 27.69it/s]

Loading topic 1...
creating corpus df for topic 1 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:00<00:00, 52.44it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Loading topic 2...
creating corpus df for topic 2 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [03:47<00:00,  1.14s/it]
  0%|          | 0/40 [00:00<?, ?it/s]

Loading topic 2...
creating corpus df for topic 2 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:24<00:00,  1.66it/s]
  2%|▏         | 3/200 [00:00<00:08, 22.92it/s]

Loading topic 3...
creating corpus df for topic 3 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [01:45<00:00,  1.89it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

Loading topic 3...
creating corpus df for topic 3 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:14<00:00,  2.81it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Loading topic 4...
creating corpus df for topic 4 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [02:08<00:00,  1.56it/s]
  2%|▎         | 1/40 [00:00<00:06,  6.40it/s]

Loading topic 4...
creating corpus df for topic 4 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:32<00:00,  1.22it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Loading topic 5...
creating corpus df for topic 5 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [01:05<00:00,  3.03it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

Loading topic 5...
creating corpus df for topic 5 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:11<00:00,  3.56it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Loading topic 6...
creating corpus df for topic 6 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [00:45<00:00,  4.42it/s]
  0%|          | 0/40 [00:00<?, ?it/s]

Loading topic 6...
creating corpus df for topic 6 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:13<00:00,  3.03it/s]
0it [00:00, ?it/s]
  0%|          | 0/200 [00:00<?, ?it/s]

Loading topic 7...
creating corpus df for topic 7 starting at file no. 0 splitting every 200 docs
Loading topic 8...
creating corpus df for topic 8 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [00:18<00:00, 10.90it/s]
  8%|▊         | 3/40 [00:00<00:01, 19.22it/s]

Loading topic 8...
creating corpus df for topic 8 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:02<00:00, 16.45it/s]
  0%|          | 1/200 [00:00<00:32,  6.03it/s]

Loading topic 9...
creating corpus df for topic 9 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [00:07<00:00, 27.62it/s]
  8%|▊         | 3/40 [00:00<00:01, 19.22it/s]

Loading topic 9...
creating corpus df for topic 9 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:01<00:00, 23.36it/s]
  0%|          | 1/200 [00:00<00:23,  8.55it/s]

Loading topic 10...
creating corpus df for topic 10 starting at file no. 0 splitting every 200 docs


100%|██████████| 200/200 [01:11<00:00,  2.79it/s]
  5%|▌         | 2/40 [00:00<00:02, 13.38it/s]

Loading topic 10...
creating corpus df for topic 10 starting at file no. 200 splitting every 200 docs


100%|██████████| 40/40 [00:06<00:00,  6.28it/s]


In [None]:
# class CorpusLoader:
#     def __init__(self):
#         self.path_handler = FilePathHandler()
#         self.file_purposes = self.path_handler.file_purposes
        
        
#     def load(self, selection=None, corpus_names=None, new_corpuses=None, force_reload=False, save=True, 
#              create_only=False, verbose=True):
#         """
#         Parameters:
#             selection: the data to load (e.g. corpus/nuggets), if None then load options
#             corpus_names: list of corpus names to load
#             new_corpuses: list of dicts of paths with keys {"dir_path", "topics_path", "nuggets_path"}
#             force_reload: force rebuild corpus files from original files
#             save: save built corpus files
#             create_only: only generate missing corpus files, do not load into ram
#         """
#         """Put flags in here to control process too
#         Steps:
#         1. Load corpus from gz html files
#         2. Load topics from topics file
#         3. Create nuggets_df from nuggets file
#         4. Create embeddings from nuggets and corpus
#         """
#         # add new corpuses to load
#         if new_corpuses is not None:
#             for new_corpus in new_corpuses:
#                 self.path_handler.add_corpus(new_corpus, overwrite=False)
#         # get corpus paths to load from (if corpus_names is None loads all)
#         corpus_sources = self.path_handler.get_corpus_sources(corpus_names=corpus_names)
#         corpus_names = corpus_sources.keys()
        
#         # get paths for generated files in corpus
#         if selection is None:  # if None selection get all
#             selection = self.file_purposes
#         corpus_names_paths = {}
#         for corpus_name in corpus_names:
#             name_paths = self.path_handler.paths_in_corpus_name(corpus_name, selection=selection)
#             corpus_names_paths[corpus_name] = name_paths
        
#         # go through each selected corpus_name
#         for corpus_name, corpus_paths in corpus_names_paths.items():
#             # go through selected tasks
#             for select in selection:  # maybe add tqdm here?
#                 # for each identifier
#                 # may behave differently when no entries in paths_df
#                 for identifier, ident_paths in corpus_paths[select]:
#                     exists = ident_paths['exists']
#                     not_exists = ident_paths['not_exists']
#                     if exists is None or force_reload:  # no paths loaded
#                         # create all appropriate files
# #                         not_exists = self.path_handler.convert_relative_path(not_exists)
#                         self.select_load_func(select, paths=not_exists, save=save, 
#                                     force_reload=force_reload, verbose=verbose, create_only=create_only)
#                     else:
#                         if not_exists is None:  # only exists has paths
#                             if create_only:
#                                 # change this to something better
#                                 warnings.warn("There are no new files to create")
#                             else:
# #                                 exists = self.path_handler.convert_relative_path(exists)
#                                 self.select_load_func(select, paths=exists, save=save, 
#                                     force_reload=force_reload, verbose=verbose, create_only=create_only)
#                         else:  # both have paths
#                             # need method to load/generate incomplete missing parts
#                             # self.path_handler.convert_relative_paths(...) dont forget
    
#     def fix_partially_missing_paths(self, selection):
#         """Function to organise missing paths from partially-saved/generated dataset"""
#         if selection == "topics":
#             raise ValueError("There can only be one topics file for a corpus")
    
#     def select_load_func(self, selection, paths=None, save=True, force_reload=False, 
#                              verbose=True, create_only=False, **identifiers):
#         # ["topics", "corpus", "nuggets", "embed_labels", "updates"]
#         if paths is None:
#             # create paths appropriately, add to paths_df
#             # also check paths/behaviour is correct in each load_func/after
#             # also need to account for other identifiers
#         if selection == "topics":
#             # should only need to be one meta topics_df
#             topic_dfs = []
#             # need to add save paths into paths_df
#             topic_df = load_topics(paths[0], save=save, force_reload=force_reload, verbose=verbose)
                
#         elif selection == "corpus":
            
#         elif selection == "nuggets":
            
#         elif selection == "embed_labels":
            
#         elif selection == "updates":
        
        
        