## Loading Data Set

In [263]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [38]:
## IMPORT DEPENDENCIES

from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import OrderedDict
print ("loading data set dependencies successful")

loading data set dependencies successful


In [39]:
## SET FILE META VARIABLES

corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
topics_path = corpus_path + "/test-topics.xml"
doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
test_file_addr = corpus_path + "/1/2012-02-22-15.gz"

In [40]:
# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    markup = None
    f = None
    
    if verbose:
        print(addr)

    if gz:
        f = gzip.open(addr)
    else:
        f = open(addr)
        
    if xml == False:
        markup = bs(f)  # open as html
    else:
        markup = bs(f, "xml")
        
    f.close()
    return markup


# parse markup and return 2D list [entry:tags]
def parse_markup(markup, entry_list, find_tag="doc", tag_list=doc_tags, topic_id=None):
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for c in e.children:  # children use direct children, descendants uses all
            if c.name in entry:
                entry[c.name] = str(c.string)
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = str(c.string)
        entry_list.append(list(entry.values()))
        
            
# recursively find gz html files from a directory address
def search_dir(path):    
    # separate the subdirectories and html files 
    # (help maintain sequential order of insertion)
    gz_paths = []
    for f in os.scandir(path):
        if os.path.splitext(f.path)[-1].lower() == ".gz":
            gz_paths.append(f.path)
    
    return gz_paths


def list_to_dataframe(markup_list, tags):
    return pd.DataFrame(markup_list, columns=tags)

In [41]:
# load topics into dataframe
def load_topics(path):
    topics_list = []
    
    parse_markup(open_markup_file(path, gz=False, xml=True), 
                    topics_list, find_tag="event", tag_list=topic_tags)
    
    df = list_to_dataframe(topics_list, topic_tags)
    df['id'] = pd.to_numeric(df['id'])
    return df

topics = load_topics(topics_path)

In [42]:
print("Topics loaded successfuly")
print(topics.head(4))

Topics loaded successfuly
   id                                title  \
0   1      2012 Buenos Aires Rail Disaster   
1   2  2012 Pakistan garment factory fires   
2   3                 2012 Aurora shooting   
3   4       Wisconsin Sikh temple shooting   

                                         description       start         end  \
0  http://en.wikipedia.org/wiki/2012_Buenos_Aires...  1329910380  1330774380   
1  http://en.wikipedia.org/wiki/2012_Pakistan_gar...  1347368400  1348232400   
2  http://en.wikipedia.org/wiki/2012_Aurora_shooting  1342766280  1343630280   
3  http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...  1344180300  1345044300   

                      query      type  
0  buenos aires train crash  accident  
1     pakistan factory fire  accident  
2         colorado shooting  shooting  
3      sikh temple shooting  shooting  


In [181]:
# load all formatted gzipped html files into dataframe
def load_corpus(path):
    #corpus_list = []
    df = pd.DataFrame(columns=doc_tags)
    for topic_id in topics['id'].to_numpy():
        print("Loading topic " + str(topic_id) + "...")
        topic_list = []
        id_path = corpus_path + "/" + str(topic_id) + "/"  # every topic id correlates to subfolder named after it
        gz_paths = search_dir(id_path)
        for gz_path in tqdm(gz_paths, position=0, leave=True):
            parse_markup(open_markup_file(gz_path, verbose=False),
                            topic_list, topic_id=topic_id)
        topic_df = list_to_dataframe(topic_list, doc_tags)
        df = df.append(topic_df)
    df['epoch'] = pd.to_numeric(df['epoch'])
    return df

corpus = load_corpus(corpus_path)
print("Corpus loaded Successfully")

  1%|          | 2/241 [00:00<00:15, 15.79it/s]

Loading topic 1...


 98%|█████████▊| 236/241 [00:10<00:00, 31.99it/s]
100%|██████████| 241/241 [00:10<00:00, 23.42it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Loading topic 2...


100%|██████████| 241/241 [04:28<00:00,  1.11s/it]
  0%|          | 1/241 [00:00<00:28,  8.31it/s]

Loading topic 3...


100%|██████████| 241/241 [02:05<00:00,  1.92it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Loading topic 4...


100%|██████████| 241/241 [02:47<00:00,  1.44it/s]
  0%|          | 1/241 [00:00<00:24,  9.68it/s]

Loading topic 5...


100%|██████████| 241/241 [01:19<00:00,  3.04it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Loading topic 6...


100%|██████████| 241/241 [01:00<00:00,  4.01it/s]
0it [00:00, ?it/s]
  0%|          | 0/241 [00:00<?, ?it/s]

Loading topic 7...
Loading topic 8...


100%|██████████| 241/241 [00:21<00:00, 11.22it/s]
  0%|          | 1/241 [00:00<00:41,  5.85it/s]

Loading topic 9...


100%|██████████| 241/241 [00:08<00:00, 27.56it/s]
  0%|          | 1/241 [00:00<00:33,  7.09it/s]

Loading topic 10...


100%|██████████| 241/241 [01:20<00:00,  2.98it/s]

Corpus loaded Successfully





In [67]:
print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
print(corpus.head(4))
# there is an error in the dataset that article at 1 is misplaced in topic 1

Corpus loaded succesfully: 12261 documents loaded.
  topic_id                                     streamid  \
0        1  1330269540-995ed81eafa60498872335da7dce1386   
1        1  1330268520-f42a863b58b2cc53cc716953c40f6065   
2        1  1330270020-e47e013ec518f5fdd253ce28231f509f   
3        1  1330268700-8078290575c82c8dd0e4e99370447bd2   

                              docid     yyyymmddhh kbastream  \
0  995ed81eafa60498872335da7dce1386  2012-02-26-15      news   
1  f42a863b58b2cc53cc716953c40f6065  2012-02-26-15      news   
2  e47e013ec518f5fdd253ce28231f509f  2012-02-26-15      news   
3  8078290575c82c8dd0e4e99370447bd2  2012-02-26-15      news   

                          zulu       epoch  \
0  2012-02-26T15:19:00.000000Z  1330269540   
1  2012-02-26T15:02:00.000000Z  1330268520   
2  2012-02-26T15:27:00.000000Z  1330270020   
3  2012-02-26T15:05:00.000000Z  1330268700   

                                               title  \
0  US says it's steadfast in rebuilding Afgha

In [78]:
#test_file_df = list_to_dataframe(parse_markup(open_markup_file(test_file_addr)), doc_tags)
#print(corpus[corpus['docid'] == "1329910380-3afda7882974e306bc75176f5ce37f3e"])
#print(corpus[corpus['yyyymmddhh'] == "2012-02-22-13"].head(5))
print(corpus.iloc[1])

topic_id                                                      1
streamid            1330268520-f42a863b58b2cc53cc716953c40f6065
docid                          f42a863b58b2cc53cc716953c40f6065
yyyymmddhh                                        2012-02-26-15
kbastream                                                  news
zulu                                2012-02-26T15:02:00.000000Z
epoch                                                1330268520
title         Argentina Train Crash: Driver Blames Faulty Br...
text          \nArgentina Train Crash: Driver Blames Faulty ...
url           http://www.thisdaylive.com/articles/argentina-...
Name: 1, dtype: object


In [68]:
# duplicates are updates to the page
find_nug = corpus[corpus['streamid'] == "1329915660-47ed792a77d798dda8697654e8fcbb43"]
# 1329915300-46c9b2db03fbaf7d2a903bbfa7ff3c93-3
# duplicate found when -3 taken away
dup_nug = corpus[corpus['streamid'] == "1329915300-46c9b2db03fbaf7d2a903bbfa7ff3c93"]
print(corpus[corpus['docid'] == "47ed792a77d798dda8697654e8fcbb43"])
print(find_nug)
print(dup_nug)
# print(dup_nug['text'])

    topic_id                                     streamid  \
116        1  1329915660-47ed792a77d798dda8697654e8fcbb43   

                                docid     yyyymmddhh kbastream  \
116  47ed792a77d798dda8697654e8fcbb43  2012-02-22-13      news   

                            zulu       epoch  \
116  2012-02-22T13:01:00.000000Z  1329915660   

                                                 title  \
116  Argentine train slams into station, killing 49...   

                                                  text  \
116  \nArgentine train slams into station, killing ...   

                                                   url  
116  http://www.seattlepi.com/news/article/Argentin...  
    topic_id                                     streamid  \
116        1  1329915660-47ed792a77d798dda8697654e8fcbb43   

                                docid     yyyymmddhh kbastream  \
116  47ed792a77d798dda8697654e8fcbb43  2012-02-22-13      news   

                            zulu       epoc

## Preprocessing

- Topic Modelling needs better preprocessing (stop words/lemmas etc.)
    - stop words
    - lemmatization (stemming is faster but is rule-based with more false transformations)
    - special char removal
- Could try removing junk at top of docs through REs/spacy

In [9]:
## IMPORT DEPENDENCIES

import spacy

print("preprocessing dependencies import successful")

preprocessing dependencies import successful


In [10]:
nlp = spacy.load("en_core_web_sm")  # try experimenting disabling parts of spacy pipeline see if .sents still works

nlp.add_pipe(nlp.create_pipe('sentencizer'), before="parser")
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')

('parser', <spacy.pipeline.pipes.DependencyParser at 0x7fa3718d21c8>)

In [11]:
test_docs = corpus.loc[0:3,:]  # work on just top 3 for now

# in data frame, split sentences into list by the newline delimiter
#test_docs['text'] = test_docs['text'].map(lambda x: x.splitlines())

# map the non-preprocessed string list to a preprocessed string list

#@Tokenize
def spacy_tokenize(string):
    tokens = list()
    doc = nlp(string)
    for token in doc:
        if not token.is_stop:
            tokens.append(token)
    return tokens

#@Normalize
def normalize(tokens):
    normalized_tokens = list()
    for token in tokens:
        normalized = token.text.lower().strip()
        if ((token.is_alpha or token.is_digit)):
            normalized_tokens.append(normalized)
    return normalized_tokens

#@Tokenize and normalize
def tokenize_normalize(string):
    return normalize(spacy_tokenize(string))

# test_prep = []
# for doc in test_docs['text']:
#     d = []
#     for sent in doc:
#         d.append(tokenize_normalize(sent))
#     test_prep.append(d)
        
print("cell finished")

cell finished


In [12]:
#print(test_prep[0])

## Retrieve Word and Sentence Level Embeddings

In [13]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
#sent_model = AutoModel.from_pretrained('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')
sent_tokenizer = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
#word_model = AutoModel.from_pretrained('distilbert-base-uncased')
word_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

100%|██████████| 245M/245M [00:05<00:00, 43.5MB/s] 


HBox(children=(IntProgress(value=0, description='Downloading', max=442, style=ProgressStyle(description_width=…




HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




## Topic Modelling

- LDA uses K-means clustering
- HDA learns num topics automatically (Bayesian non-parametric)

In [14]:
# word level topic modelling
# needs better preprocessing (remove stopwords/lemmitization etc)
# maybe add REs/other preprocessing remove uninformative junk at top of docs

import gensim
from gensim import corpora

print("loaded dependencies")

loaded dependencies


In [15]:
class TopicModeller: 
    def __init__(self):
        self.model = None
        self.corpus_dict = None
        self.weighted_tokens = None
        self.print_topics = None
        
    def weigh_tokens(self, texts, method="bow"):
        """Perform token weighting scheme on text and return with dict"""
        def create_dictionary(texts):
            """Create a gensim dictionary of index-word mappings"""
            return corpora.Dictionary(texts)
    
        flat_texts = [token for sent in texts for token in sent]  # should be fast
        self.corpus_dict = create_dictionary(flat_texts)
        if method == "bow":
            self.weighted_tokens = [self.corpus_dict.doc2bow(text) for text in flat_texts]
        else:
            raise Exception("Incorrect method parameter")
            
    def model_topics(self, method="lda", num_topics=10):
        if method == "lda":
    #         model = gensim.models.ldamodel.LdaModel(weighted_tokens, num_topics=NUM_TOPICS, 
    #                                                 id2word=corpus_dict, passes=15)
            self.model = gensim.models.ldamulticore.LdaMulticore(self.weighted_tokens, num_topics=num_topics, 
                                                    id2word=self.corpus_dict, passes=15)
        else:
            raise Exception("Incorrect method parameter")
            
        self.print_topics = self.model.print_topics()
        return self.print_topics
print("loaded cell")

loaded cell


In [16]:
# topic_model = TopicModeller()
# topic_model.weigh_tokens(test_prep)
# print(topic_model.model_topics())

## Sentence Level Clustering

In [17]:
# first get sentences which are nearest neighbors to the identified topics
# https://scikit-learn.org/stable/modules/neighbors.html
# https://stackoverflow.com/questions/60996584/bert-embedding-for-semantic-similarity
# https://stackoverflow.com/questions/59865719/how-to-find-the-closest-word-to-a-vector-using-bert
# https://gist.github.com/avidale/c6b19687d333655da483421880441950


# then compare sentence results from pure extractive summariser maybe?

from sklearn.neighbors import KDTree
#import mxnet as mx
from bert_embedding import BertEmbedding

# ctx = mx.gpu(0)
# bert = BertEmbedding(ctx=ctx)
bert_emb = BertEmbedding()

Vocab file is not found. Downloading.
Downloading /root/.mxnet/models/book_corpus_wiki_en_uncased-a6607397.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/dataset/vocab/book_corpus_wiki_en_uncased-a6607397.zip...
Downloading /root/.mxnet/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip from https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/gluon/models/bert_12_768_12_book_corpus_wiki_en_uncased-75cc780f.zip...


In [18]:
"""
Trying code from here
https://gist.github.com/avidale/c6b19687d333655da483421880441950

Preprocess embeddings in a formatted way as such can track sentences, words, embeddings

do this, then pass the LDA topics into the query
""" 

class EmbeddingHandler:
    def __init__(self, sentences, model):
        self.sentences = sentences
        self.model = model
        
    def generate_embeddings(self):
        result = self.model(self.sentences)
#         result = list()
#         for sent in self.sentences:
#             result.append(self.model.encode(sent, is_pretokenized=True))
#         #result = self.model.encode(self.sentences, is_pretokenized=True, show_progress_bar=True)
        #print(result)
        
        self.sent_ids = []
        self.token_ids = []
        self.tokens = []
        embeddings = []
        for i, (toks, embs) in enumerate(tqdm(result)):
            for j, (tok, emb) in enumerate(zip(toks, embs)):
                self.sent_ids.append(i)
                self.token_ids.append(j)
                self.tokens.append(tok)
                embeddings.append(emb)
        embeddings = np.stack(embeddings)
        # we normalize embeddings, so that euclidian distance is equivalent to cosine distance
        self.normed_embeddings = (embeddings.T / (embeddings**2).sum(axis=1) ** 0.5).T
        
    def generate_sent_embeddings(self):
        """test sent vs word embeddings"""
        # use sentence-transformers embeddings
        result = self.model.encode(self.sentences)
        self.sent_ids = []
        self.tokens = []
        embeddings = []
        for i, (tok, emb) in enumerate(tqdm(zip(self.sentences,result))):
            self.sent_ids.append(i)
            self.tokens.append(tok)
            embeddings.append(emb)
        embeddings = np.stack(embeddings)
        # we normalize embeddings, so that euclidian distance is equivalent to cosine distance
        self.normed_embeddings = (embeddings.T / (embeddings**2).sum(axis=1) ** 0.5).T
        
    def create_comparitor(self):
        # this takes some time
        self.indexer = KDTree(self.normed_embeddings)
        print("created KDTree")
    
    def query(self, query_sent, query_word, k=10, filter_same_word=False):
        toks, embs = self.model([query_sent])[0]

        found = False
        for tok, emb in zip(toks, embs):
            if tok == query_word:
                found = True
                break
        if not found:
            raise ValueError('The query word {} is not a single token in sentence {}'.format(query_word, toks))
        emb = emb / sum(emb**2)**0.5

        if filter_same_word:
            initial_k = max(k, 100)
        else:
            initial_k = k
        di, idx = self.indexer.query(emb.reshape(1, -1), k=initial_k)  # this is returning our neighbours
        distances = []
        neighbors = []
        contexts = []
        # this is filtering for word matching
        for i, index in enumerate(idx.ravel()):
            token = self.tokens[index]
            if filter_same_word and (query_word in token or token in query_word):  # take this away
                continue
            distances.append(di.ravel()[i])
            neighbors.append(token)
            contexts.append(self.sentences[self.sent_ids[index]])
            if len(distances) == k:
                break
        return distances, neighbors, contexts
    
    def topic_neighbors(self, topic_word, k=10):
        # get average embedding of topic word
        # maybe instead return context sentence that is closest to averaged embedding?
        # that way can use context to get right meaning
        topic_emb = self.avg_embedding(self.retrieve_embeddings(topic_word))
        
        # get neighbors
        # do I need reshape?
        di, idx = self.indexer.query(topic_emb.reshape(1,-1), k=k)
        distances = []
        neighbors = []
        contexts = []
        for i, index in enumerate(idx.ravel()):
            token = self.tokens[index]
            distances.append(di.ravel()[i])
            neighbors.append(token)
            contexts.append(self.sentences[self.sent_ids[index]])
        return distances, neighbors, contexts
        
        
    def retrieve_embeddings(self, token):
        idxs = []
        for i, t in enumerate(self.tokens):
            if t == token:
                idxs.append(i)
            elif token in t:  # sent-embeddings temp workaround
                idxs.append(i)
        embs = []
        for i in idxs:
            embs.append(self.normed_embeddings[i])
        return embs
    
    def avg_embedding(self, emb_list):
        return np.mean(emb_list, axis=0)

In [19]:
# # test just get sentences
# # richard said sents are actually split by \n
# def split_sentences(docs):
#     """Tokenize sentences into lists of word characters"""
#     doc_sents = []
#     for doc in nlp.pipe(docs):
#         #sents.extend([sent.text for sent in doc.sents])
#         for sent in doc.sents:
#             doc_sents.append(sent.text)
#     return doc_sents

# def split_newline(docs):
#     sents = []
#     for doc in docs:
#         sents.extend(doc.splitlines())
#     return sents

# emb_handler_corp = split_newline(test_docs)

In [20]:
emb_handler = EmbeddingHandler(emb_handler_corp, bert_emb)  # [0] index taking first doco
emb_handler.generate_embeddings()
emb_handler.create_comparitor()

NameError: name 'emb_handler_corp' is not defined

In [None]:
sent_emb_h = EmbeddingHandler(emb_handler_corp, sent_tokenizer)
sent_emb_h.generate_sent_embeddings()
sent_emb_h.create_comparitor()

In [None]:
dist, neigh, cont = emb_handler.topic_neighbors("and")
for d, w, c in zip(dist, neigh, cont):
    print('{} {}  {}'.format(w, d, c.strip()))
    print("")
#avg_emb = emb_handler.avg_embedding(ret_emb)

In [None]:
dist2, neigh2, cont2 = sent_emb_h.topic_neighbors("and")
for d, w in zip(dist2, neigh2):
    print('{} {}'.format(w, d))
    print("")

## Summarization

In [26]:
from summarizer import Summarizer
#from summarizer.coreference_handler import CoreferenceHandler
#co_handler = CoreferenceHandler(greedyness=0.4)
#sum_model = Summarizer(sentence_handler=co_handler)
sum_model = Summarizer()
print("loaded summarisation model")

HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…


loaded summarisation model


In [None]:
# for processing massive strings
#!jupyter notebook --NotebookApp.iopub_data_rate_limit=1.0e10

In [137]:
"""
ValueError: [E088] Text of length 3496277 exceeds maximum of 1000000. 
The v2.x parser and NER models require roughly 1GB of temporary memory per 100,000 characters in the input. 
This means long texts may cause memory allocation errors. 
If you're not using the parser or NER, it's probably safe to increase the `nlp.max_length` limit. 
The limit is in number of characters, so you can check whether your inputs are too long by checking `len(text)`.
"""

class SummarizationHandler:
    def __init__(self, model):
        self.model = model
        
    def summarize(self, texts, max_len=500000):
        def sum_texts(texts_list, ratio=None):
            sums = []
            print("Summarising text in " + str(len(texts_list)) + " pieces.")
            for text in tqdm(texts_list):
                if ratio is None:
                    sums.append(self.model(text))
                else:
                    sums.append(self.model(text, ratio=ratio))
            return ". ".join(sums)
        
        total_len = self.total_length(texts)
        split_texts, size_split = self.split_texts(texts, total_len, max_len)
        split_ratio = size_split / total_len
        
        if split_ratio >= 1:
            cur_sum = sum_texts(split_texts)
        else:
            sums = sum_texts(split_texts, split_ratio)
            cur_sum = ". ".join(sums)
            cur_sum = sum_texts([cur_sum])
        return cur_sum
        
        
    def split_texts(self, texts, total_len, max_len):
        """Split texts into list of strings under max_len"""
        num_split, size_split = self.optimal_split(total_len, max_len)
        splits = []
        cur_split = ""
        i = 1
        for text in texts:
            if i == num_split:
                # just add to last no check
                cur_split += text
            else:
                if (len(cur_split) + len(text) > size_split):
                    splits.append(cur_split)
                    cur_split = text
                    i += 1
                else:
                    cur_split += text
        splits.append(cur_split)
        return splits, size_split
            
    def optimal_split(self, total_len, max_len):
        """Find even split of text under max_len"""
        under_len = int(max_len * 0.95)  # use slightly under max_len for safety
        cur_div = 1
        cur_size = total_len / cur_div
        while (cur_size > under_len):
            cur_div += 1
            cur_size = total_len / cur_div
        return cur_div, cur_size
        
    
    def total_length(self,texts):
        total = 0
        for t in texts:
            total += len(t)
        return total



In [138]:
sum_handler = SummarizationHandler(sum_model)

test_topic = corpus.loc[corpus['topic_id'] == 1]['text'][0:10]
test_topic_sum = sum_handler.summarize(test_topic)

print(test_topic_sum)
print("complete")



  0%|          | 0/1 [00:00<?, ?it/s][A[A

Summarising text in 1 pieces.




100%|██████████| 1/1 [00:40<00:00, 40.62s/it][A[A

US says it's steadfast in rebuilding Afghanistan - El Paso Times Mobile e-Edition Today’s Print Ads Newsletters Customer Service Subscribe This Site Web Search powered by YAHOO ! We've got to create a situation in which al-Qaida is not coming back." The Pentagon on Sunday identified Air Force Lt. John D.
Loftis as one of the service members killed in the ministry incident. "It's an extraordinary admission of failure for us to establish the relationships that you'd have to have for a successful transition to the Afghan military and Afghan security leadership," Romney said. "So they are very much in this fight trying to protect us." This material may not be published, broadcast, rewritten or redistributed. Investigators said they would check the control room recordings. Driver, Marcos Cordoba is being investigated on suspicion of what Argentine law calls "guilty damage without an attempt to cause harm". According to Noticias Argentinas news agency, Cordoba told investigators he had alert




## Evaluation Metrics

### Reading 'Nugget' Values

In [176]:
nugget_dir = "/nfs/TemporalSummarization/ts13/results"
updates_sampled_path = nugget_dir + "/updates_sampled.tsv"
nuggets_path = nugget_dir + "/nuggets.tsv"
nug_matches_path = nugget_dir + "/matches.tsv"

In [417]:
import re

def find_duplicates(df):
    seen = set()
    seen_twice = set()
    for docid in df['docid']:
        if docid not in seen:
            seen.add(docid)
        else:
            seen_twice.add(docid)
    return seen_twice

def create_update_df():
    """Data Frame containing information about docs which have updates/multiple instances in corpus"""
    def create_entry(row, col_tags):
        entry = {}
        for col in col_tags:
            entry[col] = row[col]
        return entry
    
    col_tags = ['docid', 'streamid', 'epoch', 'yyyymmddhh', 'zulu']
    entry_list = []
    dups = find_duplicates(corpus)
    for docid in tqdm(dups, position=0, leave=True):
        d = corpus[corpus['docid'] == docid]
        for index, row in d.iterrows():
            entry = create_entry(row, col_tags)
            entry_list.append(entry)
             
    update_df = pd.DataFrame(entry_list)
    update_df = update_df.set_index(col_tags)
    return update_df
                    
                
def create_nugget_df():
    """Dataframe containing nugget data and its appearances in corpus"""
    def create_entry(row, reg_cols, multi_col_vals=None):
        entry_dict = {}
        for col in reg_cols:
            entry_dict[col] = row[col]
        if multi_cols is not None:
            for k,v in multi_col_vals.items():
                entry_dict[k] = v
        return entry_dict
        
    nuggets_tsv = pd.read_csv(nuggets_path, "\t")
    entry_list = []
    reg_cols = ['query_id', 'nugget_id', 'importance', 'nugget_len', 'nugget_text']
    multi_cols = ['docid', 'streamid', 'epoch', 'yyyymmddhh']  # multiindex cols
    num_cols = ['query_id', 'importance', 'nugget_len', 'epoch']
    
    pbar = tqdm(total=len(nuggets_tsv), position=0, leave=True)
    for index, row in nuggets_tsv.iterrows():
        # find where nugget appears in text
        nug_text = row['nugget_text']
        topic_id = 0
        try:
            topic_id = int(row['query_id'])  # make sure pattern match in correct topic
        except ValueError:
            pbar.update()
            continue  # topic_id is unknown string in tsv file, e.g. "TS13.07"
        appears = corpus[corpus['topic_id'] == topic_id]
        appears = appears[appears['text'].str.contains(re.escape(nug_text))]  # make sure no accidental regex pattern
        
        # gather information on docs it appears in
        dups = find_duplicates(appears)  # get docids where nugget appears
        for docid in dups:
            upd = appears[appears['docid'] == docid]  # get docs with this docid
            for i, r in upd.iterrows():  # gather info on each doc with this docid (e.g. streamid, epoch etc.)
                multi_col_vals = {}
                for multi_col in multi_cols:
                    multi_col_vals[multi_col] = r[multi_col]
                entry = create_entry(row, reg_cols, multi_col_vals=multi_col_vals)
                entry_list.append(entry)
        pbar.update()
    pbar.close()
    
    # form multi-index nugget dataframe
    reg_cols.extend(multi_cols)  # get new multiindex order
#     index = pd.MultiIndex.from_frame(pd.DataFrame(entry_list))
#     nugget_df = pd.DataFrame(entry_list, index=index)
    nugget_df = pd.DataFrame(entry_list)
    #nugget_df = pd.DataFrame(entry_list)
#     reg_cols.extend(multi_cols)  # get new multiindex order
    #nugget_df = nugget_df.set_index(reg_cols)
#     for col in reg_cols:
#         if col == "query_id":
#             col = "topic_id"
#     nugget_df.rename(reg_cols, inplace=True)
    nugget_df[num_cols] = nugget_df[num_cols].apply(pd.to_numeric, errors='coerce', axis=1)  # convert appropriate cols to numerical values
    nugget_df.rename(columns={'query_id':'topic_id'}, inplace=True)  # topic_id matches other dataframes
    return nugget_df

In [418]:
nugget_df = create_nugget_df()

100%|██████████| 1366/1366 [00:39<00:00, 34.96it/s] 


In [422]:
display(nugget_df[0:5])

Unnamed: 0,topic_id,nugget_id,importance,nugget_len,nugget_text,docid,streamid,epoch,yyyymmddhh
0,1,VMTS13.01.052,3,2,Hundreds injured,dd95d5dbbff443c3ddae4e34a5d2e9c1,1330041420-dd95d5dbbff443c3ddae4e34a5d2e9c1,1330041420,2012-02-23-23
1,1,VMTS13.01.052,3,2,Hundreds injured,dd95d5dbbff443c3ddae4e34a5d2e9c1,1330041420-dd95d5dbbff443c3ddae4e34a5d2e9c1,1330041420,2012-02-23-23
2,1,VMTS13.01.054,1,3,"February 22, 2012",ddd856e0a350c52b7c078c9bcdd609d9,1329930660-ddd856e0a350c52b7c078c9bcdd609d9,1329930660,2012-02-22-17
3,1,VMTS13.01.054,1,3,"February 22, 2012",ddd856e0a350c52b7c078c9bcdd609d9,1329930660-ddd856e0a350c52b7c078c9bcdd609d9,1329930660,2012-02-22-17
4,1,VMTS13.01.054,1,3,"February 22, 2012",ecda22bcfc10da137b49f0089bd5d7f5,1329916140-ecda22bcfc10da137b49f0089bd5d7f5,1329916140,2012-02-22-13


In [342]:
update_df = create_update_df()

100%|██████████| 1245/1245 [00:02<00:00, 586.81it/s]


In [351]:
display(update_df[0:5])

docid,streamid,epoch,yyyymmddhh,zulu
1ac1331e640f51ce5ec082c2d4645c1c,1347511393-1ac1331e640f51ce5ec082c2d4645c1c,1347511393,2012-09-13-04,2012-09-13T04:43:13.0Z
1ac1331e640f51ce5ec082c2d4645c1c,1347511394-1ac1331e640f51ce5ec082c2d4645c1c,1347511394,2012-09-13-04,2012-09-13T04:43:14.0Z
cc4bfe4c92b325f18a38601c9883dfd1,1353877843-cc4bfe4c92b325f18a38601c9883dfd1,1353877843,2012-11-25-21,2012-11-25T21:10:43.0Z
cc4bfe4c92b325f18a38601c9883dfd1,1353877262-cc4bfe4c92b325f18a38601c9883dfd1,1353877262,2012-11-25-21,2012-11-25T21:01:02.0Z
cc4bfe4c92b325f18a38601c9883dfd1,1353877279-cc4bfe4c92b325f18a38601c9883dfd1,1353877279,2012-11-25-21,2012-11-25T21:01:19.0Z


In [425]:
# select random selection of streams with nugs to summarise
test_nug = nugget_df[nugget_df['topic_id'] == 1].sample(n=10)['streamid']
test_nug = corpus[corpus['streamid'].isin(test_nug)]
test_nug_sum = sum_handler.summarize(test_nug['text'])
print(test_nug_sum)



  0%|          | 0/1 [00:00<?, ?it/s][A[A

Summarising text in 1 pieces.




100%|██████████| 1/1 [00:16<00:00, 16.53s/it][A[A

Five killed in Afghan protests over Quran burning – The Express Tribune RSS Today's Paper Subscribe beta 2.0 High: 28 ° C Low : 19 ° C Home Pakistan Business World Sports Life &amp ; Style Multimedia Opinion Magazine Blogs Jobs Classifieds Alerts Five killed in Afghan protests over Quran burning At least five Afghan ­s were shot dead and dozens wounde ­ d in clashe­s betwee ­ n police and demons­trator­s. By AFP Published: February 22, 2012 Afghan policemen march towards protesters during a protest near a U.S. military base in Kabul February 22, 2012. PHOTO: REUTERS KABUL: At least five Afghans were shot dead and dozens wounded Wednesday in clashes between police and demonstrators protesting over the burning of the Holy Quran at a US -run military base, officials said. In Kabul , hundreds of people poured onto the Jalalabad road, throwing stones at US military base Camp Phoenix , where troops guarding the base fired into the air and black smoke from burning tyres rose, an AFP photograp




In [435]:
# test compare the topic 1, 0:10 slice

def nugget_rows(streamids):
    nug_rows = nugget_df[nugget_df['streamid'].isin(streamids)]
    nug_rows = nug_rows.drop_duplicates('nugget_id')
    return nug_rows

def nugget_score(nugget_rows, summary):
    contains = {}
    for index, row in nugget_rows.iterrows():
        if row['nugget_text'] in summary:
            d = {}
            d['importance'] = row['importance']
            d['nugget_text'] = row['nugget_text']
            contains[row['nugget_id']] = d
    score = 0
    print_str = ""
    for k,v in contains.items():
        print_str += k
        print_str += "(" + str(v['importance']) + "): " + str(v['nugget_text'])
        print_str += "\n"
        score += v['importance']
    total = sum(list(nugget_rows['importance']))
    print("Score: " + str(score) + " out of " + str(total))
    print(print_str)

nugget_score(nugget_rows(test_nug['streamid']), test_nug_sum)

Score: 2 out of 12
VMTS13.01.054(1): February 22, 2012
VMTS13.01.080(1): 55 dead



In [None]:
class NuggetReport:
    """Class that generates text of nuggets to be used for comparison within given bounds"""
    def get_report():
        

## Database Interaction

In [None]:
# import sqlite3

# db_dir = '/nfs/proj-repo/AAARG-dissertation'
# db_name = 'sumresults.db'
# db_path = db_dir + '/' + db_name

# conn = sqlite3.connect(db_path)  # creates db if doesn't exist
# c = conn.cursor()  # allows send commands to db

In [None]:
# c.execute("""CREATE TABLE results (
#     topic_id integer,
#     summary text
# )""")