## Loading Data Set

In [1]:
## IMPORT DEPENDENCIES

from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import OrderedDict
print ("loading data set dependencies successful")

loading data set dependencies successful


In [2]:
## SET FILE META VARIABLES

corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
topics_path = corpus_path + "/test-topics.xml"
doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
test_file_addr = corpus_path + "/1/2012-02-22-15.gz"

In [3]:
# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    markup = None
    f = None
    
    if verbose:
        print(addr)

    if gz:
        f = gzip.open(addr)
    else:
        f = open(addr)
        
    if xml == False:
        markup = bs(f)  # open as html
    else:
        markup = bs(f, "xml")
        
    f.close()
    return markup


# parse markup and return 2D list [entry:tags]
def parse_markup(markup, entry_list, find_tag="doc", tag_list=doc_tags, topic_id=None):
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for c in e.children:  # children use direct children, descendants uses all
            if c.name in entry:
                entry[c.name] = c.string
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = str(c.string)
        entry_list.append(list(entry.values()))
        
            
# recursively find gz html files from a directory address
def search_dir(path):    
    # separate the subdirectories and html files 
    # (help maintain sequential order of insertion)
    gz_paths = []
    for f in os.scandir(path):
        if os.path.splitext(f.path)[-1].lower() == ".gz":
            gz_paths.append(f.path)
    
    return gz_paths


def list_to_dataframe(markup_list, tags):
    return pd.DataFrame(markup_list, columns=tags)

In [4]:
# load topics into dataframe
def load_topics(path):
    topics_list = []
    
    parse_markup(open_markup_file(path, gz=False, xml=True), 
                    topics_list, find_tag="event", tag_list=topic_tags)
    
    
    return  list_to_dataframe(topics_list, topic_tags)

topics = load_topics(topics_path)

In [5]:
print("Topics loaded successfuly")
print(topics.head(4))

Topics loaded successfuly
  id                                title  \
0  1      2012 Buenos Aires Rail Disaster   
1  2  2012 Pakistan garment factory fires   
2  3                 2012 Aurora shooting   
3  4       Wisconsin Sikh temple shooting   

                                         description       start         end  \
0  http://en.wikipedia.org/wiki/2012_Buenos_Aires...  1329910380  1330774380   
1  http://en.wikipedia.org/wiki/2012_Pakistan_gar...  1347368400  1348232400   
2  http://en.wikipedia.org/wiki/2012_Aurora_shooting  1342766280  1343630280   
3  http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...  1344180300  1345044300   

                      query      type  
0  buenos aires train crash  accident  
1     pakistan factory fire  accident  
2         colorado shooting  shooting  
3      sikh temple shooting  shooting  


In [6]:
# load all formatted gzipped html files into dataframe
def load_corpus(path):
    corpus_list = []
    gz_paths = []
    for topic_id in topics['id'].to_numpy():
        id_path = corpus_path + "/" + topic_id + "/"  # every topic id correlates to subfolder named after it
        gz_paths = search_dir(id_path)
    for gz_path in tqdm(gz_paths, position=0, leave=True):
        parse_markup(open_markup_file(gz_path, verbose=False),
                        corpus_list, topic_id=topic_id)
    return list_to_dataframe(corpus_list, doc_tags)

corpus = load_corpus(corpus_path)
#print("Corpus loaded Successfully")

100%|██████████| 241/241 [01:20<00:00,  3.00it/s]


In [7]:
print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
print(corpus.head(4))

Corpus loaded succesfully: 1578 documents loaded.
  topic_id                                     streamid  \
0       10  1354113657-a4417f055ea5ae84207a4edb4dad881b   
1       10  1354112039-110cc86ea7a8a1b58306dfade5b300ec   
2       10  1354114192-a4417f055ea5ae84207a4edb4dad881b   
3       10  1354114426-6c8d58d994c0e3243ee8dca8f34516a4   

                              docid     yyyymmddhh        kbastream  \
0  a4417f055ea5ae84207a4edb4dad881b  2012-11-28-14  MAINSTREAM_NEWS   
1  110cc86ea7a8a1b58306dfade5b300ec  2012-11-28-14  MAINSTREAM_NEWS   
2  a4417f055ea5ae84207a4edb4dad881b  2012-11-28-14  MAINSTREAM_NEWS   
3  6c8d58d994c0e3243ee8dca8f34516a4  2012-11-28-14           WEBLOG   

                     zulu       epoch  \
0  2012-11-28T14:40:57.0Z  1354113657   
1  2012-11-28T14:13:59.0Z  1354112039   
2  2012-11-28T14:49:52.0Z  1354114192   
3  2012-11-28T14:53:46.0Z  1354114426   

                                               title  \
0  Morning Briefing: Support grows f

In [8]:
#test_file_df = list_to_dataframe(parse_markup(open_markup_file(test_file_addr)), doc_tags)

## Preprocessing

- Topic Modelling needs better preprocessing (stop words/lemmas etc.)
    - stop words
    - lemmatization (stemming is faster but is rule-based with more false transformations)
    - special char removal
- Could try removing junk at top of docs through REs/spacy

In [9]:
## IMPORT DEPENDENCIES

import spacy

print("preprocessing dependencies import successful")

preprocessing dependencies import successful


In [10]:
nlp = spacy.load("en_core_web_sm")  # try experimenting disabling parts of spacy pipeline see if .sents still works

nlp.add_pipe(nlp.create_pipe('sentencizer'), before="parser")
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')

In [11]:
def get_sentences_as_word_lists(docs):
    """Tokenize sentences into lists of word characters"""
    doc_sents = []
    for doc in nlp.pipe(docs):
        #sents.extend([sent.text for sent in doc.sents])
        sents = []
        word_count = 0
        for sent in doc.sents:
            words = []
            if (len(sent) + word_count > 512): # model takes maximum 512 length sequences (need a workaround)
                break
            for token in sent:
                words.append(token.text)
                word_count += 1
            sents.append(words)
        doc_sents.append(sents)
    return doc_sents

## Retrieve Word and Sentence Level Embeddings

In [12]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
#sent_model = AutoModel.from_pretrained('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')
sent_tokenizer = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
#word_model = AutoModel.from_pretrained('distilbert-base-uncased')
word_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

In [13]:
# take small portion of corpus for testing currently
test_docs = corpus['text'].iloc[0:3]

def get_word_embeddings(model, sentence_word_list):
    """Transform sentences into lists of word-level embeddings"""
    word_embeddings = []
    for sentence in sentence_word_list:
        # input_ids dumps 'attention_mask part of dict'
        word_embeddings.append(model(sentence, is_pretokenized=True)['input_ids'])
    return word_embeddings  

def word_to_sentence_embeddings(model, word_sentence_embeddings):
    """Transform lists of word embeddings split by sentence into np array of sentence embeddings"""
    return model.encode(word_sentence_embeddings, is_pretokenized=True)

test_sent_word_lists = get_sentences_as_word_lists(test_docs)
test_word_emb = get_word_embeddings(word_tokenizer, test_sent_word_lists)
test_sent_emb = word_to_sentence_embeddings(sent_tokenizer, test_word_emb)

print("len word/sent emb: " + str(len(test_word_emb)) + "/" + str(len(test_sent_emb)))

len word/sent emb: 3/3


## Topic Modelling

- LDA uses K-means clustering
- HDA learns num topics automatically (Bayesian non-parametric)

In [14]:
# word level topic modelling
# needs better preprocessing (remove stopwords/lemmitization etc)
# maybe add REs/other preprocessing remove uninformative junk at top of docs

import gensim
from gensim import corpora

print("loaded dependencies")

loaded dependencies


### Token Weighting Factor

In [15]:
def weigh_tokens(texts, method="bow"):
    """Perform token weighting scheme on text and return with dict"""
    def create_dictionary(texts):
        """Create a gensim dictionary of index-word mappings"""
        return corpora.Dictionary(texts)
    
    flat_texts = [token for sent in texts for token in sent]  # should be fast
    corpus_dict = create_dictionary(flat_texts)
    weighed_tokens = []
    
    if method == "bow":
        return [corpus_dict.doc2bow(text) for text in flat_texts], corpus_dict
    else:
        raise Exception("Incorrect method parameter")

test_topic_corpus, test_corpus_dict = weigh_tokens(test_sent_word_lists)

print("end of cell")

end of cell


### Topic Classifier

In [16]:
NUM_TOPICS = 5

def model_topics(weighted_tokens, corpus_dict, method="lda"):
    if method == "lda":
#         model = gensim.models.ldamodel.LdaModel(weighted_tokens, num_topics=NUM_TOPICS, 
#                                                 id2word=corpus_dict, passes=15)
        model = gensim.models.ldamulticore.LdaMulticore(weighted_tokens, num_topics=NUM_TOPICS, 
                                                id2word=corpus_dict, passes=15)
        return model
    else:
        raise Exception("Incorrect method parameter")
        
test_lda_model = model_topics(test_topic_corpus, test_corpus_dict)

print_test_lda_topics = test_lda_model.print_topics(NUM_TOPICS)
print("LDA Topics Found: ")
for topic in print_test_lda_topics:
    print(topic)

print("end of cell")

LDA Topics Found: 
(0, '0.032*"the" + 0.024*"in" + 0.023*"a" + 0.022*"to" + 0.022*"of" + 0.019*"," + 0.019*"." + 0.019*"Abbas" + 0.014*"\n" + 0.012*"President"')
(1, '0.020*"to" + 0.020*"national" + 0.011*"is" + 0.011*"revolution" + 0.011*"any" + 0.011*"measures" + 0.011*"take" + 0.011*"The" + 0.011*"president" + 0.011*"."')
(2, '0.028*"the" + 0.027*"and" + 0.027*"." + 0.023*"\n" + 0.020*"The" + 0.019*"to" + 0.018*"News" + 0.016*":" + 0.016*"Search" + 0.016*"-"')
(3, '0.034*"," + 0.025*"\n" + 0.023*"." + 0.021*"the" + 0.018*"by" + 0.018*"for" + 0.016*"to" + 0.014*"on" + 0.014*"bid" + 0.014*"and"')
(4, '0.043*"the" + 0.043*""" + 0.034*"to" + 0.026*"\n" + 0.020*"." + 0.020*"of" + 0.020*"a" + 0.020*"that" + 0.016*"-" + 0.016*"an"')
end of cell


## Sentence Level Clustering

In [87]:
# first get sentences which are nearest neighbors to the identified topics
# https://scikit-learn.org/stable/modules/neighbors.html
# https://stackoverflow.com/questions/60996584/bert-embedding-for-semantic-similarity
# https://stackoverflow.com/questions/59865719/how-to-find-the-closest-word-to-a-vector-using-bert
# https://gist.github.com/avidale/c6b19687d333655da483421880441950


# then compare sentence results from pure extractive summariser maybe?

from sklearn.neighbors import KDTree
#import mxnet as mx
from bert_embedding import BertEmbedding

# ctx = mx.gpu(0)
# bert = BertEmbedding(ctx=ctx)
bert_emb = BertEmbedding()

In [88]:
"""
Trying code from here
https://gist.github.com/avidale/c6b19687d333655da483421880441950

Preprocess embeddings in a formatted way as such can track sentences, words, embeddings

do this, then pass the LDA topics into the query
""" 

class EmbeddingHandler:
    def __init__(self, sentences, model):
        self.sentences = sentences
        self.model = model
        
    def generate_embeddings(self):
        result = self.model(self.sentences)
#         result = list()
#         for sent in self.sentences:
#             result.append(self.model.encode(sent, is_pretokenized=True))
#         #result = self.model.encode(self.sentences, is_pretokenized=True, show_progress_bar=True)
        #print(result)
        
        self.sent_ids = []
        self.token_ids = []
        self.tokens = []
        embeddings = []
        for i, (toks, embs) in enumerate(tqdm(result)):
            for j, (tok, emb) in enumerate(zip(toks, embs)):
                self.sent_ids.append(i)
                self.token_ids.append(j)
                self.tokens.append(tok)
                embeddings.append(emb)
        embeddings = np.stack(embeddings)
        # we normalize embeddings, so that euclidian distance is equivalent to cosine distance
        self.normed_embeddings = (embeddings.T / (embeddings**2).sum(axis=1) ** 0.5).T
        
    def generate_sent_embeddings(self):
        """test sent vs word embeddings"""
        # use sentence-transformers embeddings
        result = self.model.encode(self.sentences)
        self.sent_ids = []
        self.tokens = []
        embeddings = []
        for i, (tok, emb) in enumerate(tqdm(zip(self.sentences,result))):
            self.sent_ids.append(i)
            self.tokens.append(tok)
            embeddings.append(emb)
        embeddings = np.stack(embeddings)
        # we normalize embeddings, so that euclidian distance is equivalent to cosine distance
        self.normed_embeddings = (embeddings.T / (embeddings**2).sum(axis=1) ** 0.5).T
        
    def create_comparitor(self):
        # this takes some time
        self.indexer = KDTree(self.normed_embeddings)
        print("created KDTree")
    
    def query(self, query_sent, query_word, k=10, filter_same_word=False):
        toks, embs = self.model([query_sent])[0]

        found = False
        for tok, emb in zip(toks, embs):
            if tok == query_word:
                found = True
                break
        if not found:
            raise ValueError('The query word {} is not a single token in sentence {}'.format(query_word, toks))
        emb = emb / sum(emb**2)**0.5

        if filter_same_word:
            initial_k = max(k, 100)
        else:
            initial_k = k
        di, idx = self.indexer.query(emb.reshape(1, -1), k=initial_k)  # this is returning our neighbours
        distances = []
        neighbors = []
        contexts = []
        # this is filtering for word matching
        for i, index in enumerate(idx.ravel()):
            token = self.tokens[index]
            if filter_same_word and (query_word in token or token in query_word):  # take this away
                continue
            distances.append(di.ravel()[i])
            neighbors.append(token)
            contexts.append(self.sentences[self.sent_ids[index]])
            if len(distances) == k:
                break
        return distances, neighbors, contexts
    
    def topic_neighbors(self, topic_word, k=10):
        # get average embedding of topic word
        # maybe instead return context sentence that is closest to averaged embedding?
        # that way can use context to get right meaning
        topic_emb = self.avg_embedding(self.retrieve_embeddings(topic_word))
        
        # get neighbors
        # do I need reshape?
        di, idx = self.indexer.query(topic_emb.reshape(1,-1), k=k)
        distances = []
        neighbors = []
        contexts = []
        for i, index in enumerate(idx.ravel()):
            token = self.tokens[index]
            distances.append(di.ravel()[i])
            neighbors.append(token)
            contexts.append(self.sentences[self.sent_ids[index]])
        return distances, neighbors, contexts
        
        
    def retrieve_embeddings(self, token):
        idxs = []
        for i, t in enumerate(self.tokens):
            if t == token:
                idxs.append(i)
            elif token in t:  # sent-embeddings temp workaround
                idxs.append(i)
        embs = []
        for i in idxs:
            embs.append(self.normed_embeddings[i])
        return embs
    
    def avg_embedding(self, emb_list):
        return np.mean(emb_list, axis=0)

In [89]:
# test just get sentences
# richard said sents are actually split by \n
def split_sentences(docs):
    """Tokenize sentences into lists of word characters"""
    doc_sents = []
    for doc in nlp.pipe(docs):
        #sents.extend([sent.text for sent in doc.sents])
        for sent in doc.sents:
            doc_sents.append(sent.text)
    return doc_sents

def split_newline(docs):
    sents = []
    for doc in docs:
        sents.extend(doc.splitlines())
    return sents

emb_handler_corp = split_newline(test_docs)

In [90]:
emb_handler = EmbeddingHandler(emb_handler_corp, bert_emb)  # [0] index taking first doco
emb_handler.generate_embeddings()
emb_handler.create_comparitor()

100%|██████████| 127/127 [00:00<00:00, 130654.06it/s]

created KDTree





In [91]:
sent_emb_h = EmbeddingHandler(emb_handler_corp, sent_tokenizer)
sent_emb_h.generate_sent_embeddings()
sent_emb_h.create_comparitor()

127it [00:00, 535891.96it/s]

created KDTree





In [92]:
dist, neigh, cont = emb_handler.topic_neighbors("and")
for d, w, c in zip(dist, neigh, cont):
    print('{} {}  {}'.format(w, d, c.strip()))
    print("")
#avg_emb = emb_handler.avg_embedding(ret_emb)

and 0.634126151881698  The Swiss and Danes join growing list of European countries supporting the bid, including France , Norway and Spain .

and 0.634126151881698  The Swiss and Danes join growing list of European countries supporting the bid, including France , Norway and Spain .

and 0.63657503617046  The Swiss and Danes join growing list of European countries supporting the bid, including France , Norway and Spain .

and 0.63657503617046  The Swiss and Danes join growing list of European countries supporting the bid, including France , Norway and Spain .

and 0.6419069099513167  The Muslim Brotherhood and other Islamist groups have called a rally for Saturday in support of Mr Mursi .

and 0.6535914160403863  UK and Ireland Egypt protests continue in crisis over 44 mins ago International Business Times UK Cairo Protesters Condemn Mursi Power Grab [VIDEO + PHOTOS] 2 hrs ago Sky News Egypt: Major Protest Against President Morsi 13 hrs ago BusinessWeek Egypt Anti-Mursi Protesters Test 

In [97]:
dist2, neigh2, cont2 = sent_emb_h.topic_neighbors("and")
for d, w in zip(dist2, neigh2):
    print('{} {}'.format(w, d))
    print("")

BBC News - Egypt crisis: Appeals courts launch anti-Mursi strike action Accessibility links Skip to content Skip to local navigation Accessibility Help bbc.co. uk navigation News Sport Weather Travel Future TV Radio More … Search term: Middle East Home US &amp ; Canada Latin America UK Africa Asia Europe Mid-East Business Health Sci / Environment Tech Entertainment Video 28 November 2012 Last updated at 09:04 ET Share this page Delicious Digg Facebook reddit StumbleUpon Twitter Email Print Egypt crisis: Appeals courts launch anti-Mursi strike action Protests and clashes have continued Continue reading the main story Egypt changing Mursi's gamble Who holds the power? 0.7788612088764301

( MARKO DJURICA /REUTERS ) Morning Briefing: Support grows for bid by Palestinians for UN recognition Add to ... Stephen Northfield And Jill Mahoney The Globe and Mail Published Wednesday, Nov. 28 2012, 8:26 AM EST Last updated Wednesday, Nov. 28 2012, 9:28 AM EST Comments closed Print / License A A A su

## Summarization

In [94]:
from summarizer import Summarizer