## Installing Packages

In [2]:
#print("\nsuccessfully installed packages")

## Loading Data Set

In [73]:
## IMPORT DEPENDENCIES

from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import OrderedDict
print ("loading data set dependencies successful")

loading data set dependencies successful


In [74]:
## SET FILE META VARIABLES

corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
topics_path = corpus_path + "/test-topics.xml"
doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
test_file_addr = corpus_path + "/1/2012-02-22-15.gz"

In [5]:
# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    markup = None
    f = None
    
    if verbose:
        print(addr)

    if gz:
        f = gzip.open(addr)
    else:
        f = open(addr)
        
    if xml == False:
        markup = bs(f)  # open as html
    else:
        markup = bs(f, "xml")
        
    f.close()
    return markup


# parse markup and return 2D list [entry:tags]
def parse_markup(markup, entry_list, find_tag="doc", tag_list=doc_tags, topic_id=None):
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for c in e.children:  # children use direct children, descendants uses all
            if c.name in entry:
                entry[c.name] = c.string
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = str(c.string)
        entry_list.append(list(entry.values()))
        
            
# recursively find gz html files from a directory address
def search_dir(path):    
    # separate the subdirectories and html files 
    # (help maintain sequential order of insertion)
    gz_paths = []
    for f in os.scandir(path):
        if os.path.splitext(f.path)[-1].lower() == ".gz":
            gz_paths.append(f.path)
    
    return gz_paths


def list_to_dataframe(markup_list, tags):
    return pd.DataFrame(markup_list, columns=tags)

In [6]:
# load topics into dataframe
def load_topics(path):
    topics_list = []
    
    parse_markup(open_markup_file(path, gz=False, xml=True), 
                    topics_list, find_tag="event", tag_list=topic_tags)
    
    
    return  list_to_dataframe(topics_list, topic_tags)

topics = load_topics(topics_path)

In [7]:
print("Topics loaded successfuly")
print(topics.head(4))

Topics loaded successfuly
  id                                title  \
0  1      2012 Buenos Aires Rail Disaster   
1  2  2012 Pakistan garment factory fires   
2  3                 2012 Aurora shooting   
3  4       Wisconsin Sikh temple shooting   

                                         description       start         end  \
0  http://en.wikipedia.org/wiki/2012_Buenos_Aires...  1329910380  1330774380   
1  http://en.wikipedia.org/wiki/2012_Pakistan_gar...  1347368400  1348232400   
2  http://en.wikipedia.org/wiki/2012_Aurora_shooting  1342766280  1343630280   
3  http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...  1344180300  1345044300   

                      query      type  
0  buenos aires train crash  accident  
1     pakistan factory fire  accident  
2         colorado shooting  shooting  
3      sikh temple shooting  shooting  


In [8]:
# load all formatted gzipped html files into dataframe
def load_corpus(path):
    corpus_list = []
    gz_paths = []
    for topic_id in topics['id'].to_numpy():
        id_path = corpus_path + "/" + topic_id + "/"  # every topic id correlates to subfolder named after it
        gz_paths = search_dir(id_path)
    for gz_path in tqdm(gz_paths, position=0, leave=True):
        parse_markup(open_markup_file(gz_path, verbose=False),
                        corpus_list, topic_id=topic_id)
    return list_to_dataframe(corpus_list, doc_tags)

corpus = load_corpus(corpus_path)
#print("Corpus loaded Successfully")

100%|██████████| 241/241 [01:20<00:00,  3.00it/s]


In [9]:
print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
print(corpus.head(4))

Corpus loaded succesfully: 1578 documents loaded.
  topic_id                                     streamid  \
0       10  1354113657-a4417f055ea5ae84207a4edb4dad881b   
1       10  1354112039-110cc86ea7a8a1b58306dfade5b300ec   
2       10  1354114192-a4417f055ea5ae84207a4edb4dad881b   
3       10  1354114426-6c8d58d994c0e3243ee8dca8f34516a4   

                              docid     yyyymmddhh        kbastream  \
0  a4417f055ea5ae84207a4edb4dad881b  2012-11-28-14  MAINSTREAM_NEWS   
1  110cc86ea7a8a1b58306dfade5b300ec  2012-11-28-14  MAINSTREAM_NEWS   
2  a4417f055ea5ae84207a4edb4dad881b  2012-11-28-14  MAINSTREAM_NEWS   
3  6c8d58d994c0e3243ee8dca8f34516a4  2012-11-28-14           WEBLOG   

                     zulu       epoch  \
0  2012-11-28T14:40:57.0Z  1354113657   
1  2012-11-28T14:13:59.0Z  1354112039   
2  2012-11-28T14:49:52.0Z  1354114192   
3  2012-11-28T14:53:46.0Z  1354114426   

                                               title  \
0  Morning Briefing: Support grows f

In [10]:
#test_file_df = list_to_dataframe(parse_markup(open_markup_file(test_file_addr)), doc_tags)

## Preprocessing

In [None]:
## IMPORT DEPENDENCIES

import spacy

print("preprocessing dependencies import successful")

In [None]:
nlp = spacy.load("en_core_web_sm")  # try experimenting disabling parts of spacy pipeline see if .sents still works

nlp.add_pipe(nlp.create_pipe('sentencizer'), before="parser")
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')

In [96]:
def get_sentences_as_word_lists(docs):
    """Tokenize sentences into lists of word characters"""
    doc_sents = []
    for doc in nlp.pipe(docs):
        #sents.extend([sent.text for sent in doc.sents])
        sents = []
        word_count = 0
        for sent in doc.sents:
            words = []
            if (len(sent) + word_count > 512): # model takes maximum 512 length sequences (need a workaround)
                break
            for token in sent:
                words.append(token.text)
                word_count += 1
            sents.append(words)
        doc_sents.append(sents)
    return doc_sents

## Retrieve Word and Sentence Level Embeddings

In [102]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
#sent_model = AutoModel.from_pretrained('sentence-transformers/distilbert-base-nli-stsb-mean-tokens')
sent_tokenizer = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
#word_model = AutoModel.from_pretrained('distilbert-base-uncased')
word_tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

I1120 11:47:40.984783 140691828995904 SentenceTransformer.py:38] Load pretrained SentenceTransformer: distilbert-base-nli-stsb-mean-tokens
I1120 11:47:41.008171 140691828995904 SentenceTransformer.py:42] Did not find folder distilbert-base-nli-stsb-mean-tokens
I1120 11:47:41.008564 140691828995904 SentenceTransformer.py:48] Try to download model from server: https://sbert.net/models/distilbert-base-nli-stsb-mean-tokens.zip
I1120 11:47:41.009045 140691828995904 SentenceTransformer.py:99] Load SentenceTransformer from folder: /root/.cache/torch/sentence_transformers/sbert.net_models_distilbert-base-nli-stsb-mean-tokens
I1120 11:47:41.851731 140691828995904 SentenceTransformer.py:123] Use pytorch device: cuda


In [122]:
# get docs (currently select amount)
test_docs = corpus['text'].iloc[0:3]

def get_word_embeddings(model, sentence_word_list):
    """Transform sentences into lists of word-level embeddings"""
    word_embeddings = []
    for sentence in sentence_word_list:
        # input_ids dumps 'attention_mask part of dict'
        word_embeddings.append(model(sentence, is_pretokenized=True)['input_ids'])
    return word_embeddings  

def word_to_sentence_embeddings(model, word_sentence_embeddings):
    """Transform lists of word embeddings split by sentence into np array of sentence embeddings"""
    return model.encode(word_sentence_embeddings, is_pretokenized=True)

test_word_emb = get_word_embeddings(word_tokenizer, get_sentences_as_word_lists(test_docs))
test_sent_emb = word_to_sentence_embeddings(sent_tokenizer, test_word_emb)

print("len word/sent emb: " + str(len(test_word_emb)) + "/" + str(len(test_sent_emb)))

HBox(children=(IntProgress(value=0, description='Batches', max=1, style=ProgressStyle(description_width='initi…


len word/sent emb: 3/3


## Summarization

In [24]:
from summarizer import Summarizer