## Loading Data Set

In [2]:
# `IPython.display` is the public import location; importing `display` from
# `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Stretch the notebook container to the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [3]:
## IMPORT DEPENDENCIES

from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import OrderedDict
import sqlite3
sqlite3.register_adapter(np.int64, lambda val: int(val))  # sqlite3 won't accept int > 8 bytes, turns into blob datatype
sqlite3.register_adapter(np.int32, lambda val: int(val))
import ipynb.fs
print ("loading data set dependencies successful")

loading data set dependencies successful


In [4]:
## SET FILE META VARIABLES

# Root directory of the corpus; __load_corpus reads one sub-folder per topic id from it.
corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
# Topic definitions file (parsed as XML by __load_topics).
topics_path = corpus_path + "/test-topics.xml"
# Column order of the corpus dataframe; 'topic_id' is filled in by parse_markup, the rest come from the markup.
doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
# Column order of the topics dataframe.
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
# Single corpus file address; not referenced elsewhere in the visible part of the notebook.
test_file_addr = corpus_path + "/1/2012-02-22-15.gz"
# Directory where the cached CSV files are written and read back.
proj_dir = '/nfs/proj-repo/AAARG-dissertation'
# csv file addresses
corp_csv_name = 'corpus_loaded.csv.gz'
corp_csv_path = proj_dir + '/' + corp_csv_name
topics_csv_name = 'topics_loaded.csv.gz'
topics_csv_path = proj_dir + '/' + topics_csv_name

In [5]:
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    """Open a markup file and return it parsed as a BeautifulSoup object.

    Parameters
    ----------
    addr : path of the file to read.
    gz : when True the file is opened with ``gzip.open``, otherwise with the
        built-in ``open``.
    xml : when True the content is parsed with the "xml" parser; otherwise
        BeautifulSoup is called without an explicit parser.
    verbose : when True the address is printed before the file is opened.

    Returns
    -------
    The BeautifulSoup object built from the file content.
    """
    if verbose:
        print(addr)

    opener = gzip.open if gz else open
    # The context manager closes the handle even when parsing raises;
    # the previous explicit close() was skipped on error.
    with opener(addr) as f:
        if xml:
            return bs(f, "xml")
        # NOTE(review): no parser is named here, so BeautifulSoup picks
        # whichever HTML parser is installed - consider pinning one.
        return bs(f)


def parse_markup(markup, entry_list, find_tag="doc", tag_list=doc_tags, topic_id=None):
    """Append one row (a list of field values) to `entry_list` per `find_tag` element.

    Each row holds the values for `tag_list`, in that order; fields that are
    not found stay None.  Only direct children of the element are inspected:
    a child whose tag name is a known field fills that field, and an untagged
    child whose string is not a lone newline is stored under 'text'.  When
    `topic_id` is given it is stored under 'topic_id'.

    Nothing is returned; `entry_list` is extended in place.
    """
    for element in markup.find_all(find_tag):
        record = OrderedDict((tag, None) for tag in tag_list)
        if topic_id is not None:
            record['topic_id'] = topic_id
        # .children yields direct children only (.descendants would recurse)
        for child in element.children:
            tag_name = child.name
            if tag_name in record:
                record[tag_name] = str(child.string)
            elif tag_name is None and child.string != '\n':
                # untagged node: the inner body of the element
                record['text'] = str(child.string)
        entry_list.append([record[key] for key in record])
        
            
def search_dir(path):
    """Return the paths of the entries directly inside `path` that end in ".gz".

    The extension check is case-insensitive.  Sub-directories are not
    descended into, and entries are returned in the order `os.scandir`
    yields them.
    """
    return [
        entry.path
        for entry in os.scandir(path)
        if os.path.splitext(entry.path)[-1].lower() == ".gz"
    ]


def list_to_dataframe(markup_list, tags):
    """Build a DataFrame from a list of rows, using `tags` as the column names."""
    frame = pd.DataFrame(data=markup_list, columns=tags)
    return frame

In [6]:
def file_exists(path):
    """Return True when `path` names an existing regular file."""
    return os.path.isfile(path)

def load_df_control(path, load_func, save=True, force_reload=False, compression='gzip'):
    """Return a dataframe, read from a CSV cache when one exists.

    Parameters
    ----------
    path : location of the cached CSV file.
    load_func : zero-argument callable that builds the dataframe; it is only
        called when the cache is missing or `force_reload` is True.
    save : when True a freshly built dataframe is written to `path`.
    force_reload : when True the cache is ignored and `load_func` is called.
    compression : passed to both `to_csv` and `read_csv`.

    Returns
    -------
    The freshly built or cached dataframe.
    """
    if force_reload or not file_exists(path):
        df = load_func()
        print("df loaded")
        if save:
            # A default RangeIndex carries no information; writing it makes
            # every reload grow an extra "Unnamed: 0" column, so the cached
            # frame no longer matches the freshly built one.  Any other
            # index (e.g. a MultiIndex) is still written out.
            write_index = not isinstance(df.index, pd.RangeIndex)
            df.to_csv(path, compression=compression, index=write_index)
            print("saved at: " + str(path))
    else:
        df = pd.read_csv(path, compression=compression)
        print("loaded from file")
    return df

In [7]:
def __load_topics(path):
    """Parse the topics file at `path` into a dataframe with `topic_tags` columns.

    The file is read uncompressed and parsed as XML; every "event" element
    becomes one row.  The 'id' column is converted with `pd.to_numeric`.
    """
    parsed_rows = []
    topic_markup = open_markup_file(path, gz=False, xml=True)
    parse_markup(topic_markup, parsed_rows, find_tag="event", tag_list=topic_tags)
    topics_frame = list_to_dataframe(parsed_rows, topic_tags)
    topics_frame['id'] = pd.to_numeric(topics_frame['id'])
    return topics_frame

def load_topics(save=True, force_reload=False):
    """Return the topics dataframe, using the CSV cache at `topics_csv_path`.

    `save` and `force_reload` are forwarded to `load_df_control`.
    """
    # The loader must be __load_topics: the previous lambda called
    # load_topics itself, which recursed without end whenever the cache
    # was missing or force_reload was set.
    topics = load_df_control(topics_csv_path, lambda: __load_topics(topics_path), save=save, force_reload=force_reload)
    return topics

# Module-level topics frame; __load_corpus iterates over its 'id' column.
topics = load_topics()

loaded from file


In [8]:
print("Topics loaded successfuly")
# display() renders the frame itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
display(topics[0:4])

Topics loaded successfuly


Unnamed: 0.1,Unnamed: 0,id,title,description,start,end,query,type
0,0,1,2012 Buenos Aires Rail Disaster,http://en.wikipedia.org/wiki/2012_Buenos_Aires...,1329910380,1330774380,buenos aires train crash,accident
1,1,2,2012 Pakistan garment factory fires,http://en.wikipedia.org/wiki/2012_Pakistan_gar...,1347368400,1348232400,pakistan factory fire,accident
2,2,3,2012 Aurora shooting,http://en.wikipedia.org/wiki/2012_Aurora_shooting,1342766280,1343630280,colorado shooting,shooting
3,3,4,Wisconsin Sikh temple shooting,http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...,1344180300,1345044300,sikh temple shooting,shooting


None


### Loading Topics Into Database

In [9]:
from .defs.database_management_mysql import get_connection, create_tables, populate_topics  # import database_management functions
conn, cursor = get_connection()
try:
    create_tables(conn, cursor)
    populate_topics(conn, cursor, topics)
finally:
    # Release the connection even if table creation or population fails.
    conn.close()

Finshed adding tables
is_empty_table count: 1




## Load Corpus

In [10]:
def __load_corpus(path, doc_tags=doc_tags, save=True, force_reload=False):
    """Load every gzipped markup file of every topic into one dataframe.

    Parameters
    ----------
    path : corpus root directory; each topic id in the module-level `topics`
        frame is expected to have a sub-folder named after it.
    doc_tags : column names / document fields to extract.
    save, force_reload : accepted for signature compatibility; not used here.

    Returns
    -------
    A dataframe with `doc_tags` columns, one row per parsed document and a
    fresh 0..n-1 index.  The 'epoch' column is converted with `pd.to_numeric`.
    """
    topic_frames = []
    for topic_id in topics['id'].to_numpy():
        print("Loading topic " + str(topic_id) + "...")
        topic_list = []
        # every topic id correlates to subfolder named after it; use the
        # `path` argument rather than the global so the parameter is honoured
        id_path = path + "/" + str(topic_id) + "/"
        gz_paths = search_dir(id_path)
        for gz_path in tqdm(gz_paths, position=0, leave=True):
            # forward doc_tags so the parsed rows match the columns used below
            parse_markup(open_markup_file(gz_path, verbose=False),
                            topic_list, tag_list=doc_tags, topic_id=topic_id)
        topic_frames.append(list_to_dataframe(topic_list, doc_tags))

    # Concatenate once: DataFrame.append inside the loop copied the whole
    # frame on every topic and no longer exists in pandas 2.
    # ignore_index avoids the repeated 0..k labels of the per-topic frames.
    if topic_frames:
        df = pd.concat(topic_frames, ignore_index=True)
    else:
        df = pd.DataFrame(columns=doc_tags)
    df['epoch'] = pd.to_numeric(df['epoch'])
    return df

def load_corpus(save=True, force_reload=False):
    """Return the corpus dataframe, using the CSV cache at `corp_csv_path`.

    `save` and `force_reload` are forwarded to `load_df_control`.
    """
    # The loader must be __load_corpus: the previous lambda called
    # load_corpus itself, which recursed without end whenever the cache
    # was missing or force_reload was set.
    corpus = load_df_control(corp_csv_path, lambda: __load_corpus(corpus_path), save=save, force_reload=force_reload)
    return corpus

# Module-level corpus frame; create_update_df and create_nugget_df read it as a global.
corpus = load_corpus()

print("Corpus loaded Successfully")

loaded from file
Corpus loaded Successfully


In [24]:
print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
# display() renders the frame itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
display(corpus[0:4])
# there is an error in the dataset that article at 1 is misplaced in topic 1

Corpus loaded succesfully: 12261 documents loaded.


Unnamed: 0.1,Unnamed: 0,topic_id,streamid,docid,yyyymmddhh,kbastream,zulu,epoch,title,text,url
0,0,1,1330269540-995ed81eafa60498872335da7dce1386,995ed81eafa60498872335da7dce1386,2012-02-26-15,news,2012-02-26T15:19:00.000000Z,1330269540,US says it's steadfast in rebuilding Afghanist...,\nUS says it's steadfast in rebuilding Afghani...,http://www.elpasotimes.com/politics/ci_20049216
1,1,1,1330268520-f42a863b58b2cc53cc716953c40f6065,f42a863b58b2cc53cc716953c40f6065,2012-02-26-15,news,2012-02-26T15:02:00.000000Z,1330268520,Argentina Train Crash: Driver Blames Faulty Br...,\nArgentina Train Crash: Driver Blames Faulty ...,http://www.thisdaylive.com/articles/argentina-...
2,2,1,1330270020-e47e013ec518f5fdd253ce28231f509f,e47e013ec518f5fdd253ce28231f509f,2012-02-26-15,news,2012-02-26T15:27:00.000000Z,1330270020,The Alaska Journal of Commerce Local News Oil ...,\nThe Alaska Journal of Commerce Local News Oi...,http://ap.alaskajournal.com/pstories/20120226/...
3,3,1,1330268700-8078290575c82c8dd0e4e99370447bd2,8078290575c82c8dd0e4e99370447bd2,2012-02-26-15,news,2012-02-26T15:05:00.000000Z,1330268700,U.S. military receives remains of last soldier...,\nU.S. military receives remains of last soldi...,http://www.islandpacket.com/2012/02/26/1978117...


None


In [25]:
# # duplicates are updates to the page
# find_nug = corpus[corpus['streamid'] == "1329915660-47ed792a77d798dda8697654e8fcbb43"]
# # 1329915300-46c9b2db03fbaf7d2a903bbfa7ff3c93-3
# # duplicate found when -3 taken away
# dup_nug = corpus[corpus['streamid'] == "1329915300-46c9b2db03fbaf7d2a903bbfa7ff3c93"]
# print(corpus[corpus['docid'] == "47ed792a77d798dda8697654e8fcbb43"])
# print(find_nug)
# print(dup_nug)
# # print(dup_nug['text'])

## Preprocessing

- Topic Modelling needs better preprocessing (stop words/lemmas etc.)
    - stop words
    - lemmatization (stemming is faster but is rule-based with more false transformations)
    - special char removal
- Could try removing junk at top of docs through REs/spacy

In [26]:
# ## IMPORT DEPENDENCIES

# import spacy

# print("preprocessing dependencies import successful")

In [27]:
# nlp = spacy.load("en_core_web_sm")  # try experimenting disabling parts of spacy pipeline see if .sents still works

# nlp.add_pipe(nlp.create_pipe('sentencizer'), before="parser")
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')

In [28]:
# test_docs = corpus.loc[0:3,:]  # work on just top 3 for now

# # in data frame, split sentences into list by the newline delimiter
# #test_docs['text'] = test_docs['text'].map(lambda x: x.splitlines())

# # map the non-preprocessed string list to a preprocessed string list

# #@Tokenize
# def spacy_tokenize(string):
#     tokens = list()
#     doc = nlp(string)
#     for token in doc:
#         if not token.is_stop:
#             tokens.append(token)
#     return tokens

# #@Normalize
# def normalize(tokens):
#     normalized_tokens = list()
#     for token in tokens:
#         normalized = token.text.lower().strip()
#         if ((token.is_alpha or token.is_digit)):
#             normalized_tokens.append(normalized)
#     return normalized_tokens

# #@Tokenize and normalize
# def tokenize_normalize(string):
#     return normalize(spacy_tokenize(string))

# # test_prep = []
# # for doc in test_docs['text']:
# #     d = []
# #     for sent in doc:
# #         d.append(tokenize_normalize(sent))
# #     test_prep.append(d)
        
# print("cell finished")

In [29]:
#print(test_prep[0])

## Retrieve Word and Sentence Level Embeddings

In [30]:
# from sentence_transformers import SentenceTransformer
# from transformers import AutoModel, AutoTokenizer
# #sent_model = AutoModel.from_pretrained('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')
# sent_tokenizer = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# #word_model = AutoModel.from_pretrained('distilbert-base-uncased')
# word_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

## Topic Modelling

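- LDA needs the number of topics (K) to be chosen up front, much like choosing k in K-means clustering
- HDP (Hierarchical Dirichlet Process) learns the number of topics automatically (Bayesian non-parametric)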

In [31]:
# # word level topic modelling
# # needs better preprocessing (remove stopwords/lemmitization etc)
# # maybe add REs/other preprocessing remove uninformative junk at top of docs

# import gensim
# from gensim import corpora

# print("loaded dependencies")

In [32]:
# class TopicModeller: 
#     def __init__(self):
#         self.model = None
#         self.corpus_dict = None
#         self.weighted_tokens = None
#         self.print_topics = None
        
#     def weigh_tokens(self, texts, method="bow"):
#         """Perform token weighting scheme on text and return with dict"""
#         def create_dictionary(texts):
#             """Create a gensim dictionary of index-word mappings"""
#             return corpora.Dictionary(texts)
    
#         flat_texts = [token for sent in texts for token in sent]  # should be fast
#         self.corpus_dict = create_dictionary(flat_texts)
#         if method == "bow":
#             self.weighted_tokens = [self.corpus_dict.doc2bow(text) for text in flat_texts]
#         else:
#             raise Exception("Incorrect method parameter")
            
#     def model_topics(self, method="lda", num_topics=10):
#         if method == "lda":
#     #         model = gensim.models.ldamodel.LdaModel(weighted_tokens, num_topics=NUM_TOPICS, 
#     #                                                 id2word=corpus_dict, passes=15)
#             self.model = gensim.models.ldamulticore.LdaMulticore(self.weighted_tokens, num_topics=num_topics, 
#                                                     id2word=self.corpus_dict, passes=15)
#         else:
#             raise Exception("Incorrect method parameter")
            
#         self.print_topics = self.model.print_topics()
#         return self.print_topics
# print("loaded cell")

In [33]:
# topic_model = TopicModeller()
# topic_model.weigh_tokens(test_prep)
# print(topic_model.model_topics())

## Sentence Level Clustering

In [34]:
# # first get sentences which are nearest neighbors to the identified topics
# # https://scikit-learn.org/stable/modules/neighbors.html
# # https://stackoverflow.com/questions/60996584/bert-embedding-for-semantic-similarity
# # https://stackoverflow.com/questions/59865719/how-to-find-the-closest-word-to-a-vector-using-bert
# # https://gist.github.com/avidale/c6b19687d333655da483421880441950


# # then compare sentence results from pure extractive summariser maybe?

# from sklearn.neighbors import KDTree
# #import mxnet as mx
# from bert_embedding import BertEmbedding

# # ctx = mx.gpu(0)
# # bert = BertEmbedding(ctx=ctx)
# bert_emb = BertEmbedding()

In [35]:
# """
# Trying code from here
# https://gist.github.com/avidale/c6b19687d333655da483421880441950

# Preprocess embeddings in a formatted way as such can track sentences, words, embeddings

# do this, then pass the LDA topics into the query
# """ 

# class EmbeddingHandler:
#     def __init__(self, sentences, model):
#         self.sentences = sentences
#         self.model = model
        
#     def generate_embeddings(self):
#         result = self.model(self.sentences)
# #         result = list()
# #         for sent in self.sentences:
# #             result.append(self.model.encode(sent, is_pretokenized=True))
# #         #result = self.model.encode(self.sentences, is_pretokenized=True, show_progress_bar=True)
#         #print(result)
        
#         self.sent_ids = []
#         self.token_ids = []
#         self.tokens = []
#         embeddings = []
#         for i, (toks, embs) in enumerate(tqdm(result)):
#             for j, (tok, emb) in enumerate(zip(toks, embs)):
#                 self.sent_ids.append(i)
#                 self.token_ids.append(j)
#                 self.tokens.append(tok)
#                 embeddings.append(emb)
#         embeddings = np.stack(embeddings)
#         # we normalize embeddings, so that euclidian distance is equivalent to cosine distance
#         self.normed_embeddings = (embeddings.T / (embeddings**2).sum(axis=1) ** 0.5).T
        
#     def generate_sent_embeddings(self):
#         """test sent vs word embeddings"""
#         # use sentence-transformers embeddings
#         result = self.model.encode(self.sentences)
#         self.sent_ids = []
#         self.tokens = []
#         embeddings = []
#         for i, (tok, emb) in enumerate(tqdm(zip(self.sentences,result))):
#             self.sent_ids.append(i)
#             self.tokens.append(tok)
#             embeddings.append(emb)
#         embeddings = np.stack(embeddings)
#         # we normalize embeddings, so that euclidian distance is equivalent to cosine distance
#         self.normed_embeddings = (embeddings.T / (embeddings**2).sum(axis=1) ** 0.5).T
        
#     def create_comparitor(self):
#         # this takes some time
#         self.indexer = KDTree(self.normed_embeddings)
#         print("created KDTree")
    
#     def query(self, query_sent, query_word, k=10, filter_same_word=False):
#         toks, embs = self.model([query_sent])[0]

#         found = False
#         for tok, emb in zip(toks, embs):
#             if tok == query_word:
#                 found = True
#                 break
#         if not found:
#             raise ValueError('The query word {} is not a single token in sentence {}'.format(query_word, toks))
#         emb = emb / sum(emb**2)**0.5

#         if filter_same_word:
#             initial_k = max(k, 100)
#         else:
#             initial_k = k
#         di, idx = self.indexer.query(emb.reshape(1, -1), k=initial_k)  # this is returning our neighbours
#         distances = []
#         neighbors = []
#         contexts = []
#         # this is filtering for word matching
#         for i, index in enumerate(idx.ravel()):
#             token = self.tokens[index]
#             if filter_same_word and (query_word in token or token in query_word):  # take this away
#                 continue
#             distances.append(di.ravel()[i])
#             neighbors.append(token)
#             contexts.append(self.sentences[self.sent_ids[index]])
#             if len(distances) == k:
#                 break
#         return distances, neighbors, contexts
    
#     def topic_neighbors(self, topic_word, k=10):
#         # get average embedding of topic word
#         # maybe instead return context sentence that is closest to averaged embedding?
#         # that way can use context to get right meaning
#         topic_emb = self.avg_embedding(self.retrieve_embeddings(topic_word))
        
#         # get neighbors
#         # do I need reshape?
#         di, idx = self.indexer.query(topic_emb.reshape(1,-1), k=k)
#         distances = []
#         neighbors = []
#         contexts = []
#         for i, index in enumerate(idx.ravel()):
#             token = self.tokens[index]
#             distances.append(di.ravel()[i])
#             neighbors.append(token)
#             contexts.append(self.sentences[self.sent_ids[index]])
#         return distances, neighbors, contexts
        
        
#     def retrieve_embeddings(self, token):
#         idxs = []
#         for i, t in enumerate(self.tokens):
#             if t == token:
#                 idxs.append(i)
#             elif token in t:  # sent-embeddings temp workaround
#                 idxs.append(i)
#         embs = []
#         for i in idxs:
#             embs.append(self.normed_embeddings[i])
#         return embs
    
#     def avg_embedding(self, emb_list):
#         return np.mean(emb_list, axis=0)

In [36]:
# emb_handler = EmbeddingHandler(emb_handler_corp, bert_emb)  # [0] index taking first doco
# emb_handler.generate_embeddings()
# emb_handler.create_comparitor()

## Summarization

In [37]:
from summarizer import Summarizer
#from summarizer.coreference_handler import CoreferenceHandler
#co_handler = CoreferenceHandler(greedyness=0.4)
#sum_model = Summarizer(sentence_handler=co_handler)
# Instantiate the summariser with its default settings.
# NOTE(review): SummarizationHandler.summarize builds its own Summarizer() and
# does not read `sum_model` - confirm whether this instance is still needed.
sum_model = Summarizer()
print("loaded summarisation model")

I0107 12:20:15.007286 140183057614656 filelock.py:274] Lock 140182366188488 acquired on /root/.cache/torch/transformers/6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=434, style=ProgressStyle(description_width=…

I0107 12:20:15.425395 140183057614656 filelock.py:318] Lock 140182366188488 released on /root/.cache/torch/transformers/6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8.lock





I0107 12:20:18.944191 140183057614656 filelock.py:274] Lock 140177465659176 acquired on /root/.cache/torch/transformers/73e65a4648c1a5eab31ecea94e04a92a7168cd7089d588b68e5bc057aff40421.4d5343a4b979c4beeaadef17a0453d1bb183dd9b084f58b84c7cc781df343ae6.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=1344997306, style=ProgressStyle(description…

I0107 12:20:35.279127 140183057614656 filelock.py:318] Lock 140177465659176 released on /root/.cache/torch/transformers/73e65a4648c1a5eab31ecea94e04a92a7168cd7089d588b68e5bc057aff40421.4d5343a4b979c4beeaadef17a0453d1bb183dd9b084f58b84c7cc781df343ae6.lock





I0107 12:20:44.631939 140183057614656 filelock.py:274] Lock 140177274339224 acquired on /root/.cache/torch/transformers/9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock


HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…

I0107 12:20:45.761188 140183057614656 filelock.py:318] Lock 140177274339224 released on /root/.cache/torch/transformers/9b3c03a36e83b13d5ba95ac965c9f9074a99e14340c523ab405703179e79fc46.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084.lock



loaded summarisation model


In [38]:
# for processing massive strings
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [39]:
"""
ValueError: [E088] Text of length 3496277 exceeds maximum of 1000000. 
The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. 
This means long texts may cause memory allocation errors. 
If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. 
The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.
"""

class SummarizationHandler:
    """Summarise a collection of texts that may be too long for one model call.

    The texts are concatenated into chunks no longer than roughly `max_len`
    characters, each chunk is summarised, and the joined chunk summaries are
    summarised once more.
    """

    def summarize(self, texts, max_len=500000, smallest_ratio=0.1):
        """Return a single summary string for `texts`.

        Parameters
        ----------
        texts : iterable of strings; it is iterated more than once.
        max_len : upper bound (in characters) aimed at for each chunk handed
            to the model.
        smallest_ratio : lowest summarisation ratio to use; while the ratio
            is below this, the chunk groups are halved so that each group is
            summarised at a higher ratio.

        Returns
        -------
        The summary string, or "" when `texts` holds no characters at all.
        """
        def sum_texts(texts_list, ratio=None):
            # Summarise every piece and join the results into one string.
            sums = []
            print("Summarising text in " + str(len(texts_list)) + " pieces.")
            for text in tqdm(texts_list):
                if ratio is None:
                    sums.append(self.model(text))
                else:
                    sums.append(self.model(text, ratio=ratio))
            return ". ".join(sums)

        def half_list(a_list):
            half = len(a_list) // 2
            return a_list[:half], a_list[half:]

        def group_ratio(group):
            # Ratio that shrinks a group to about one chunk's worth of text.
            length = self.total_length(group)
            return size_split / length if length else 1.0

        total_len = self.total_length(texts)
        if total_len == 0:
            # Nothing to summarise; the ratio computation below would
            # otherwise divide by zero.
            return ""

        self.model = Summarizer()  # try reset with a fresh model each time

        split_texts, size_split = self.split_texts(texts, total_len, max_len)
        split_ratio = size_split / total_len

        new_splits = [split_texts]
        new_ratios = [split_ratio]
        while split_ratio < smallest_ratio:
            if not any(len(group) > 1 for group in new_splits):
                # No group can be halved further; halving a one-chunk group
                # used to create an empty group and divide by zero.
                break
            halved = []
            for group in new_splits:
                if len(group) > 1:
                    halved.extend(half_list(group))
                else:
                    halved.append(group)
            new_splits = halved
            new_ratios = [group_ratio(group) for group in new_splits]
            split_ratio = min(new_ratios)

        if new_ratios[0] <= 1:
            split_sums = [
                sum_texts(group, ratio=ratio)
                for group, ratio in zip(new_splits, new_ratios)
            ]
            # summarise the joined group summaries once more
            return sum_texts([". ".join(split_sums)])
        return sum_texts(new_splits[0])

    def split_texts(self, texts, total_len, max_len):
        """Concatenate `texts` into chunks of roughly equal size.

        Returns ``(chunks, size_split)`` where `size_split` is the target
        chunk length from `optimal_split`.  Texts are never cut, so a single
        text longer than `size_split` becomes a chunk of its own that exceeds
        the target.  Texts are concatenated without a separator.
        """
        num_split, size_split = self.optimal_split(total_len, max_len)
        splits = []
        cur_split = ""
        i = 1  # number of the chunk currently being filled
        for text in texts:
            if i == num_split:
                # just add to last no check
                cur_split += text
            elif cur_split and len(cur_split) + len(text) > size_split:
                # Only close a chunk that holds something: an oversized first
                # text used to push an empty chunk into the result.
                splits.append(cur_split)
                cur_split = text
                i += 1
            else:
                cur_split += text
        if cur_split or not splits:
            # keep at least one chunk, but never a trailing empty one
            splits.append(cur_split)
        return splits, size_split

    def optimal_split(self, total_len, max_len):
        """Find the smallest even division of `total_len` that fits `max_len`.

        Returns ``(number_of_parts, part_size)``; `part_size` is a float.
        """
        # use slightly under max_len for safety; at least 1 so the loop
        # below always terminates
        under_len = max(1, int(max_len * 0.95))
        cur_div = 1
        cur_size = total_len / cur_div
        while cur_size > under_len:
            cur_div += 1
            cur_size = total_len / cur_div
        return cur_div, cur_size

    def total_length(self, texts):
        """Return the summed length of all items in `texts`."""
        return sum(len(t) for t in texts)



In [40]:
# # sum_handler = SummarizationHandler(sum_model)
# sum_handler = SummarizationHandler()

# test_topic = corpus.loc[corpus['topic_id'] == 1]['text'][0:10]
# test_topic_sum = sum_handler.summarize(test_topic)

# print(test_topic_sum)
# print("complete")

## Evaluation Metrics

### Reading 'Nugget' Values

In [41]:
# Directory holding the nugget result files.
nugget_dir = "/nfs/TemporalSummarization/ts13/results"
updates_sampled_path = nugget_dir + "/updates_sampled.tsv"
# Tab-separated nugget file read by create_nugget_df.
nuggets_path = nugget_dir + "/nuggets.tsv"
nug_matches_path = nugget_dir + "/matches.tsv"
# saving nugget and update files
nugget_csv = 'nugget_df.csv.gz'
update_csv = 'update_df.csv.gz'
# Cache locations used by load_nugget_df / load_update_df.
nugget_csv_path = proj_dir + '/' + nugget_csv
update_csv_path = proj_dir + '/' + update_csv

In [46]:
import re

def find_duplicates(df):
    """Return the set of values that occur more than once in ``df['docid']``."""
    first_seen = set()
    repeated = set()
    for doc_id in df['docid']:
        if doc_id in first_seen:
            repeated.add(doc_id)
        else:
            first_seen.add(doc_id)
    return repeated

def create_update_df():
    """Data Frame containing information about docs which have updates/multiple instances in corpus

    For every docid that occurs more than once in the module-level `corpus`
    frame, all of its rows are collected (restricted to the columns below),
    and those columns are then turned into the index of the result.
    """
    col_tags = ['docid', 'streamid', 'epoch', 'yyyymmddhh', 'zulu']
    dups = find_duplicates(corpus)
    # One slice per duplicated docid, concatenated once at the end.
    frames = [
        corpus.loc[corpus['docid'] == docid, col_tags]
        for docid in tqdm(dups, position=0, leave=True)
    ]
    if frames:
        update_df = pd.concat(frames, ignore_index=True)
    else:
        # No duplicates: keep the expected columns so set_index below does
        # not fail with a KeyError on an empty, column-less frame.
        update_df = pd.DataFrame(columns=col_tags)
    update_df = update_df.set_index(col_tags)
    return update_df
                    
                
def create_nugget_df():
    """Dataframe containing nugget data and its appearances in corpus

    Reads the tab-separated nugget file at `nuggets_path`, looks for each
    nugget's text (as a literal substring) in the documents of the matching
    topic in the module-level `corpus` frame, and emits one row per matching
    document.  The 'query_id' column is renamed to 'topic_id' in the result.
    """
    def create_entry(row, reg_cols, multi_col_vals=None):
        entry_dict = {col: row[col] for col in reg_cols}
        # Test the argument itself; the previous check looked at the
        # enclosing `multi_cols` list and only worked by accident.
        if multi_col_vals is not None:
            entry_dict.update(multi_col_vals)
        return entry_dict

    # `sep` must be given by keyword; positional use is rejected by newer pandas.
    nuggets_tsv = pd.read_csv(nuggets_path, sep="\t")
    entry_list = []
    reg_cols = ['query_id', 'nugget_id', 'importance', 'nugget_len', 'nugget_text']
    multi_cols = ['docid', 'streamid', 'epoch', 'yyyymmddhh']  # multiindex cols
    num_cols = ['query_id', 'importance', 'nugget_len', 'epoch']

    # the context manager closes the progress bar even if a row raises
    with tqdm(total=len(nuggets_tsv), position=0, leave=True) as pbar:
        for index, row in nuggets_tsv.iterrows():
            # find where nugget appears in text
            nug_text = row['nugget_text']
            if not isinstance(nug_text, str):
                pbar.update()
                continue  # missing nugget text (NaN): nothing to search for
            try:
                topic_id = int(row['query_id'])  # make sure pattern match in correct topic
            except ValueError:
                pbar.update()
                continue  # topic_id is unknown string in tsv file, e.g. "TS13.07"
            appears = corpus[corpus['topic_id'] == topic_id]
            # literal substring match (no regex); rows without text never match
            appears = appears[appears['text'].str.contains(nug_text, regex=False, na=False)]

            # gather information on docs it appears in
            # NOTE(review): find_duplicates keeps only docids occurring at
            # least twice among the matches, so a nugget found in a docid
            # with a single row is dropped - confirm this is intended.
            dups = find_duplicates(appears)
            for docid in dups:
                upd = appears[appears['docid'] == docid]  # get docs with this docid
                for i, r in upd.iterrows():  # gather info on each doc with this docid (e.g. streamid, epoch etc.)
                    multi_col_vals = {multi_col: r[multi_col] for multi_col in multi_cols}
                    entry_list.append(create_entry(row, reg_cols, multi_col_vals=multi_col_vals))
            pbar.update()

    # Explicit columns keep the frame well-formed when nothing matched.
    nugget_df = pd.DataFrame(entry_list, columns=reg_cols + multi_cols)
    # convert appropriate cols to numerical values, one column at a time
    nugget_df[num_cols] = nugget_df[num_cols].apply(pd.to_numeric, errors='coerce')
    nugget_df = nugget_df.rename(columns={'query_id': 'topic_id'})  # topic_id matches other dataframes
    return nugget_df

def load_nugget_df(save=True, force_reload=False):
    """Return the nugget dataframe via `load_df_control`, cached at `nugget_csv_path`."""
    return load_df_control(
        nugget_csv_path,
        create_nugget_df,
        save=save,
        force_reload=force_reload,
    )

def load_update_df(save=True, force_reload=False):
    """Return the update dataframe via `load_df_control`, cached at `update_csv_path`."""
    return load_df_control(
        update_csv_path,
        create_update_df,
        save=save,
        force_reload=force_reload,
    )

In [44]:
# Module-level nugget frame; passed to populate_nuggets further down.
nugget_df = load_nugget_df()

loaded from file


In [45]:
# Preview the first five nugget rows.
display(nugget_df[0:5])

Unnamed: 0.1,Unnamed: 0,topic_id,nugget_id,importance,nugget_len,nugget_text,docid,streamid,epoch,yyyymmddhh
0,0,1,VMTS13.01.052,3,2,Hundreds injured,dd95d5dbbff443c3ddae4e34a5d2e9c1,1330041420-dd95d5dbbff443c3ddae4e34a5d2e9c1,1330041420,2012-02-23-23
1,1,1,VMTS13.01.052,3,2,Hundreds injured,dd95d5dbbff443c3ddae4e34a5d2e9c1,1330041420-dd95d5dbbff443c3ddae4e34a5d2e9c1,1330041420,2012-02-23-23
2,2,1,VMTS13.01.054,1,3,"February 22, 2012",f66f6668504592a391345e012800469c,1329944400-f66f6668504592a391345e012800469c,1329944400,2012-02-22-21
3,3,1,VMTS13.01.054,1,3,"February 22, 2012",f66f6668504592a391345e012800469c,1329944400-f66f6668504592a391345e012800469c,1329944400,2012-02-22-21
4,4,1,VMTS13.01.054,1,3,"February 22, 2012",ecda22bcfc10da137b49f0089bd5d7f5,1329916140-ecda22bcfc10da137b49f0089bd5d7f5,1329916140,2012-02-22-13


In [47]:
# Module-level frame of documents that occur more than once in the corpus.
update_df = load_update_df()

loaded from file


In [48]:
# Preview the first five update rows.
display(update_df[0:5])

Unnamed: 0,docid,streamid,epoch,yyyymmddhh,zulu
0,be52a6b7690622e7ffb5fc82928ae889,1347372454-be52a6b7690622e7ffb5fc82928ae889,1347372454,2012-09-11-14,2012-09-11T14:07:34.0Z
1,be52a6b7690622e7ffb5fc82928ae889,1347372718-be52a6b7690622e7ffb5fc82928ae889,1347372718,2012-09-11-14,2012-09-11T14:11:58.0Z
2,be52a6b7690622e7ffb5fc82928ae889,1347372377-be52a6b7690622e7ffb5fc82928ae889,1347372377,2012-09-11-14,2012-09-11T14:06:17.0Z
3,be52a6b7690622e7ffb5fc82928ae889,1347372289-be52a6b7690622e7ffb5fc82928ae889,1347372289,2012-09-11-14,2012-09-11T14:04:49.0Z
4,be52a6b7690622e7ffb5fc82928ae889,1347372452-be52a6b7690622e7ffb5fc82928ae889,1347372452,2012-09-11-14,2012-09-11T14:07:32.0Z


### Insert Nuggets into Database

In [41]:
from .defs.database_management_mysql import populate_nuggets  # import database_management functions

conn, cursor = get_connection()
try:
    populate_nuggets(conn, cursor, nugget_df)
finally:
    # Release the connection even if population fails.
    conn.close()

is_empty_table count: 1




## Evaluation Metrics

In [35]:
from rouge_score import rouge_scorer

In [36]:
class MetricHandler:
    """Scores a summary against the reference nuggets of the documents it was built from.

    Two kinds of metric are produced by `evaluate_summary`:
      * importance -- summed `importance` of the distinct nuggets whose text occurs
        verbatim in the summary, next to the total that was attainable;
      * ROUGE -- scores from `rouge_scorer.RougeScorer` comparing the summary with
        the concatenated text of all candidate nuggets.

    Args:
        rouge_scores: ROUGE variants handed to `RougeScorer` (e.g. ['rouge1']).
        nugget_source: optional DataFrame of nuggets with at least the columns
            'streamid', 'nugget_id', 'importance' and 'nugget_text'. When omitted
            the notebook-level `nugget_df` is used, as before.
    """

    def __init__(self, rouge_scores=['rouge1'], nugget_source=None):
        # Copy so the shared default list can never be mutated through the scorer.
        self.rouge = rouge_scorer.RougeScorer(list(rouge_scores))
        self.nugget_source = nugget_source
        self.nugget_df = None     # candidate nuggets of the most recent evaluation
        self.nugget_dict = None
        self.streamids = None
        self.summary = None       # only populated through update_summary()

    def evaluate_summary(self, summary, streamids, rouge=True, importance=True):
        """Compute the requested metrics for `summary`.

        Args:
            summary: summary text to score.
            streamids: stream ids of the documents the summary was produced from;
                they select the candidate nuggets.
            rouge: include one entry per configured ROUGE variant.
            importance: include the 'importance' entry.

        Returns:
            (metrics, nugget_df): `metrics` is a nested dict, e.g.
            {'importance': {'cur_imp': ..., 'total_imp': ...},
             'rouge1': {'precision': ..., 'recall': ..., 'fmeasure': ...}};
            `nugget_df` holds the candidate nuggets with a boolean 'in_summary' column.
        """
        self.streamids = streamids
        self.nugget_df = self.nugget_frame(streamids)
        self.add_in_summary_col(summary)

        metrics = {}
        if importance:
            cur_imp, total_imp = self.importance_score()
            metrics['importance'] = {'cur_imp': cur_imp, 'total_imp': total_imp}
        if rouge:
            target_text = self.target_nugget_text()
            for rouge_type, score in self.rouge_score(target_text, summary).items():
                # score is a named tuple; its fields become the nested dict keys
                metrics[rouge_type] = dict(score._asdict())
        return metrics, self.nugget_df  # metrics plus the candidate nuggets

    def rouge_score(self, target, summary):
        """Return the scorer's result for `summary` measured against `target`."""
        return self.rouge.score(target, summary)

    def target_nugget_text(self, str_divider=" "):
        """Join the text of every candidate nugget row with `str_divider`."""
        return str_divider.join(list(self.nugget_df['nugget_text']))

    def importance_score(self):
        """Return (achieved, attainable) importance, counting each nugget_id once."""
        nugs = self.nugget_df.drop_duplicates('nugget_id')  # no over counting
        total_score = nugs['importance'].sum()  # potential score
        found_nugs = nugs[nugs['in_summary'] == True]
        cur_score = found_nugs['importance'].sum()  # actual summary score
        return cur_score, total_score

    def _text_in_summary(self, summary):
        """Boolean Series: True where a row's nugget_text is a substring of `summary`."""
        # Column-wise map instead of a row-wise apply: the result is always a Series,
        # including when there are no candidate nuggets at all.
        return self.nugget_df['nugget_text'].map(lambda text: text in summary).astype(bool)

    def add_in_summary_col(self, summary):
        """Add/overwrite the boolean 'in_summary' column on `self.nugget_df`.

        A nugget_id is flagged on all of its rows as soon as any one of its rows'
        text occurs in the summary.
        """
        hits = self._text_in_summary(summary)
        found_ids = list(self.nugget_df.loc[hits.values, 'nugget_id'])
        self.nugget_df['in_summary'] = self.nugget_df['nugget_id'].isin(found_ids)

    def nuggets_in_summary(self, summary):
        """Return the candidate rows whose nugget_text occurs in `summary`."""
        return self.nugget_df[self._text_in_summary(summary).values]

    def nugget_frame(self, streamids, keep_columns=None):
        """Select the nuggets belonging to `streamids` and store them on the handler.

        Args:
            streamids: iterable of stream ids to keep.
            keep_columns: columns to retain; defaults to
                ['nugget_id', 'importance', 'nugget_text'].

        Returns:
            The filtered frame (also stored as `self.nugget_df`).
        """
        if keep_columns is None:
            keep_columns = ['nugget_id', 'importance', 'nugget_text']
        source = self.nugget_source if self.nugget_source is not None else nugget_df
        nug_rows = source[source['streamid'].isin(streamids)]
        # Copy: add_in_summary_col writes a new column, which must not happen on a
        # slice of the shared source frame (SettingWithCopy hazard).
        self.nugget_df = nug_rows[keep_columns].copy()
        return self.nugget_df

    def update_summary(self, summary):
        """Remember `summary` on the handler."""
        self.summary = summary

In [37]:
# Stand-alone handler for ad-hoc scoring in the notebook; SummaryFeeder creates
# its own MetricHandler internally and does not use this instance.
metric_handler = MetricHandler()
# metric_handler.evaluate_summary(test_nug_sum, test_nug['streamid'])

In [38]:
# # find difference in epoch for a day
# def epoch_diff():
#     # find instances in epoch where there is a day gap
#     day_gap['']

## Summarisation Driver

In [47]:
from datetime import datetime

class SummaryFeeder:
    """Runs a summariser over every topic of a corpus and records the results.

    One call to `summarize_topics` registers the technique (if new), opens a row in
    `instances`, then per topic: summarises the topic's text, scores the summary
    with MetricHandler, writes one `instance_meta` row and one `nugget_instances`
    row per candidate nugget.

    TODO (original author's note): nugget metric info still needs to be added to a
    database table from here, and then also to the nugget_instances table.

    Args:
        sum_handler: object exposing `summarize(texts)`; called with the topic's
            'text' column and expected to return the summary string.
        tech_name: technique name, used as key in the `techniques` table.
        tech_descr: description; required the first time a technique is stored.
        is_temporal: stored in the `temporal` column of `instances`.
    """

    def __init__(self, sum_handler, tech_name, tech_descr=None, is_temporal=False):
        self.sum_handler = sum_handler
        self.tech_name = tech_name
        self.is_temporal = is_temporal
        self.tech_descr = tech_descr
        self.metric_handler = MetricHandler()

    def summarize_topics(self, corp_df):
        """Summarise each topic in `corp_df` and persist summaries and metrics.

        Args:
            corp_df: corpus frame with at least 'topic_id', 'text', 'epoch' and
                'streamid' columns.

        Raises:
            ValueError: the technique is not yet stored and no description was given.
        """
        # pre-determine some columns to be inputted into meta table
        self.meta_columns = self.get_meta_columns(corp_df)
        self.conn, self.cursor = get_connection()
        try:
            self.insert_technique(tech_descr=self.tech_descr)
            start_exec = self.cur_datetime()
            instance_id = self.store_instance(start_exec)
            self.conn.commit()

            for topic_id in tqdm(corp_df['topic_id'].unique(), position=0, leave=True):
                # currently the whole topic is summarised in one go
                topic_df = corp_df[corp_df['topic_id'] == topic_id]
                summary = self.sum_handler.summarize(topic_df['text'])
                self.store_topic(topic_id, instance_id, summary)
                self.conn.commit()

            end_exec = self.cur_datetime()
            self.update_end_exec(end_exec, instance_id)
            self.conn.commit()
        finally:
            # Release the connection even when summarising or storing fails.
            self.conn.close()
        print("summarize_topics complete")

    def store_topic(self, topic_id, instance_id, summary, is_complete_summary=True, update_num=None):
        """Score `summary` for one topic and write its meta and nugget rows."""
        metrics, nuggets = self.metric_handler.evaluate_summary(summary, self.meta_columns[topic_id]['streamids'])
        self.store_meta(topic_id, instance_id, summary, metrics)
        self.store_nugget_instances(topic_id, instance_id, nuggets, is_complete_summary=is_complete_summary, update_num=update_num)

    def store_nugget_instances(self, topic_id, instance_id, nuggets, is_complete_summary=True, update_num=None):
        """Insert one `nugget_instances` row per row of `nuggets`.

        Columns written: nugget_id, topic_id, instance, technique,
        is_complete_summary, found. `update_num` is accepted but not stored yet
        (the table's update_num / is_update columns are left to their defaults).

        Args:
            nuggets: frame with 'nugget_id' and boolean 'in_summary' columns.
        """
        # Plain Python int/bool values: numpy scalars are not reliably accepted by
        # database drivers.
        insert_list = [
            (nugget_id, int(topic_id), int(instance_id), self.tech_name, is_complete_summary, bool(found))
            for nugget_id, found in zip(nuggets['nugget_id'], nuggets['in_summary'])
        ]
        if insert_list:  # nothing to send when the topic has no candidate nuggets
            self.cursor.executemany("""INSERT INTO nugget_instances (nugget_id,  topic_id, instance, technique, is_complete_summary, found)
            VALUES (%s, %s, %s, %s, %s, %s)""", insert_list)
        self.conn.commit()

    def store_meta(self, topic_id, instance_id, summary, metrics):
        """Insert the `instance_meta` row for one topic.

        Columns written: topic_id, instance, summary, streamids (comma-joined text),
        epoch_start, epoch_end, importance_score, total_importance, r1_precision,
        r1_recall, r1_fmeasure. (`has_updates` is not written here.)

        Args:
            metrics: dict from MetricHandler.evaluate_summary; must contain the
                'importance' and 'rouge1' entries.
        """
        cur_meta = self.meta_columns[topic_id]
        imp = metrics['importance']
        r1 = metrics['rouge1']
        insert_tuple = (
            int(topic_id), int(instance_id), summary, ",".join(cur_meta['streamids']),
            int(cur_meta['epoch_start']), int(cur_meta['epoch_end']),
            int(imp['cur_imp']), int(imp['total_imp']),
            float(r1['precision']), float(r1['recall']), float(r1['fmeasure']),
        )
        self.cursor.execute("""INSERT INTO instance_meta 
                            (topic_id, instance, summary, streamids, epoch_start, epoch_end, importance_score,
                            total_importance, r1_precision, r1_recall, r1_fmeasure)
                            VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""", insert_tuple)
        self.conn.commit()

    def get_meta_columns(self, corp_df):
        """Collect per-topic values needed for the meta table.

        Returns:
            dict mapping topic_id -> {'epoch_start', 'epoch_end', 'streamids'},
            where the epochs are the smallest/largest 'epoch' of the topic and
            'streamids' lists the topic's stream ids in epoch order.
        """
        meta_fields = {}
        for topic_id in corp_df['topic_id'].unique():
            topic_docs = corp_df[corp_df['topic_id'] == topic_id].sort_values("epoch")
            meta_fields[topic_id] = {
                'epoch_start': topic_docs['epoch'].iloc[0],
                'epoch_end': topic_docs['epoch'].iloc[-1],
                'streamids': list(topic_docs['streamid']),
            }
        return meta_fields

    def store_instance(self, start_exec):
        """Create the `instances` row for this run and return its instance id."""
        # Next id = highest existing id + 1 (0 for an empty table). A plain row count
        # would hand out an id that is already taken once rows have been deleted or
        # ids do not start at 0.
        self.cursor.execute('SELECT COALESCE(MAX(instance), -1) + 1 FROM instances')
        instance_id = int(self.cursor.fetchone()[0])

        # end_exec is filled in later by update_end_exec
        self.cursor.execute('INSERT INTO instances (instance, technique, temporal, start_exec) VALUES (%s, %s, %s, %s)',
                           (instance_id, self.tech_name, self.is_temporal, start_exec))
        self.conn.commit()
        return instance_id  # returned for ease of later storage

    def update_end_exec(self, end_exec, instance_id):
        """Set `end_exec` on the run's `instances` row (the caller commits)."""
        self.cursor.execute('UPDATE instances SET end_exec = %s WHERE instance = %s', (end_exec, instance_id))

    def fetch_technique_entry(self):
        """Return the `techniques` row for this technique name, or a falsy value if absent."""
        self.cursor.execute('SELECT * FROM techniques WHERE name=%s', (self.tech_name,))
        return self.cursor.fetchone()

    def insert_technique(self, tech_descr=None):
        """Store the technique if it is not in the database yet.

        Returns:
            True when a row was inserted, False when the technique already existed.

        Raises:
            ValueError: the technique is new and `tech_descr` is None.
        """
        if self.fetch_technique_entry():  # technique in database
            print("Technique " + str(self.tech_name) + " in database")
            return False
        if tech_descr is None:
            raise ValueError("Tech description must not equal none if technique not in database")
        self.cursor.execute('INSERT INTO techniques (name, description) VALUES(%s, %s)', (self.tech_name, tech_descr))
        self.conn.commit()
        print("Technique " + str(self.tech_name) + " inserted into database")
        return True

    def cur_datetime(self):
        """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

In [48]:
# Full corpus caused float division by zero error at new_ratios.append(size_split/self.total_length(s))

def small_corpus(start_topic, end_topic):
    """Build a reduced corpus by taking the same row slice from every topic.

    Despite their names, `start_topic` and `end_topic` are the start/stop of a
    positional row slice applied to each topic's documents in the notebook-level
    `corpus` frame; the per-topic slices are concatenated in order of first
    topic appearance.
    """
    per_topic_slices = [
        corpus[corpus['topic_id'] == tid][start_topic:end_topic]
        for tid in corpus['topic_id'].unique()
    ]
    return pd.concat(per_topic_slices)

# Technique name and description for this trial run; SummaryFeeder stores them in
# the `techniques` table the first time the name is seen.
test_feeder_name = "bes_naive_datasplit_[0:5]"
test_feeder_descr = """
Using bert-extractive_summarizer with original naive datasplit.
Uses first 5 documents of each topic
Split entire topic into portions above 0.1 ratio, summarise iteratively
"""
# test_feeder = SummaryFeeder(SummarizationHandler(sum_model), test_feeder_name, tech_descr=test_feeder_descr)
test_feeder = SummaryFeeder(SummarizationHandler(), test_feeder_name, tech_descr=test_feeder_descr)

# NOTE: despite its name, `first_20` holds the first 5 rows of every topic
# (small_corpus slices each topic with [0:5]).
first_20 = small_corpus(0, 5)
# Long-running: summarises every topic and writes the results to the database
# (about 16 minutes for 9 topics in the recorded output).
test_feeder.summarize_topics(first_20)

  0%|          | 0/9 [00:00<?, ?it/s]

Technique bes_naive_datasplit_[0:5] in database



  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [00:11<00:00, 11.65s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [00:06<00:00,  6.07s/it][A
 11%|█         | 1/9 [00:30<04:05, 30.70s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [00:08<00:00,  8.95s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [00:05<00:00,  5.08s/it][A
 22%|██▏       | 2/9 [00:52<03:15, 27.98s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [00:12<00:00, 12.36s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [00:06<00:00,  6.50s/it][A
 33%|███▎      | 3/9 [01:18<02:44, 27.48s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [01:26<00:00, 86.53s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [00:26<00:00, 26.26s/it][A
 44%|████▍     | 4/9 [03:19<04:37, 55.58s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [05:45<00:00, 345.78s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [01:22<00:00, 82.36s/it][A
 56%|█████▌    | 5/9 [10:35<11:18, 169.70s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [01:45<00:00, 105.09s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [00:31<00:00, 31.55s/it][A
 67%|██████▋   | 6/9 [13:01<08:07, 162.40s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [00:57<00:00, 57.21s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [00:18<00:00, 18.01s/it][A
 78%|███████▊  | 7/9 [14:24<04:37, 138.65s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [01:09<00:00, 69.70s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [00:17<00:00, 17.88s/it][A
 89%|████████▉ | 8/9 [15:59<02:05, 125.71s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.


  return_n_iter=True)

100%|██████████| 1/1 [00:09<00:00,  9.58s/it][A

  0%|          | 0/1 [00:00<?, ?it/s][A

Summarising text in 1 pieces.



100%|██████████| 1/1 [00:05<00:00,  5.56s/it][A
100%|██████████| 9/9 [16:22<00:00, 109.20s/it]

summarize_topics complete





## Database Interaction

In [None]:
# import sqlite3

# db_dir = '/nfs/proj-repo/AAARG-dissertation'
# db_name = 'sumresults.db'
# db_path = db_dir + '/' + db_name

# conn = sqlite3.connect(db_path)  # creates db if doesn't exist
# c = conn.cursor()  # allows send commands to db

In [None]:
# c.execute("""CREATE TABLE results (
#     topic_id integer,
#     summary text
# )""")