### Notes on current discoveries:

- All files in storage claim are accessed with the base address '/nfs/'
- Target files are HTML wrapped in .gz, use gzip to unzip
- The inner text of <doc> is stored as the None tag where tag.string != '\n' (is more than just a newline character
- .string of <doc>'s inner text parses inner <a> tags etc. into just their inner text

### Ideas for language model
- NLTK
- BERT

- Load data set -> preprocessing


## Installing Packages

In [9]:
# # !pip3 install spacy-langdetect
# # !pip3 install language-detector
# # !pip3 install symspellpy
# !pip3 install beautifulsoup4 lxml  # html/xml parser
# !pip3 install torch torchvision
# !pip3 install neuralcoref
# !pip3 install transformers
# !pip3 install sentence-transformers

# #!pip3 install -U spacy   # preprocessing
# #!python -m spacy download en_core_web_sm  # spacy model
# #!pip3 install numpy==1.18

# # !pip3 install spacy==2.1.3
# # !pip3 install transformers==2.2.2
# # !pip3 install neuralcoref

# #!pip3 install tensorflow --upgrade  # for extractive summerizar
# !pip3 install tensorflow
# #!pip3 install bert-extractive-summarizer

#!python -m spacy download en_core_web_md
print("\nsuccessfully installed packages")


successfully installed packages


## Loading Data Set

In [21]:
## IMPORT DEPENDENCIES

from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from collections import OrderedDict
print ("loading data set dependencies successful")

loading data set dependencies successful


In [11]:
## SET FILE META VARIABLES

corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
topics_path = corpus_path + "/test-topics.xml"
doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
test_file_addr = corpus_path + "/1/2012-02-22-15.gz"

In [12]:
# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    markup = None
    f = None
    
    if verbose:
        print(addr)

    if gz:
        f = gzip.open(addr)
    else:
        f = open(addr)
        
    if xml == False:
        markup = bs(f)  # open as html
    else:
        markup = bs(f, "xml")
        
    f.close()
    return markup


# parse markup and return 2D list [entry:tags]
def parse_markup(markup, entry_list, find_tag="doc", tag_list=doc_tags, topic_id=None):
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for c in e.children:  # children use direct children, descendants uses all
            if c.name in entry:
                entry[c.name] = c.string
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = c.string
        entry_list.append(list(entry.values()))
        
            
# recursively find gz html files from a directory address
def search_dir(path):    
    # separate the subdirectories and html files 
    # (help maintain sequential order of insertion)
    gz_paths = []
    for f in os.scandir(path):
        if os.path.splitext(f.path)[-1].lower() == ".gz":
            gz_paths.append(f.path)
    
    return gz_paths


def list_to_dataframe(markup_list, tags):
    return pd.DataFrame(markup_list, columns=tags)

In [13]:
# load topics into dataframe
def load_topics(path):
    topics_list = []
    
    parse_markup(open_markup_file(path, gz=False, xml=True), 
                    topics_list, find_tag="event", tag_list=topic_tags)
    
    
    return  list_to_dataframe(topics_list, topic_tags)

topics = load_topics(topics_path)

In [16]:
print("Topics loaded successfuly")
print(topics.head(4))

Topics loaded successfuly
  id                                title  \
0  1      2012 Buenos Aires Rail Disaster   
1  2  2012 Pakistan garment factory fires   
2  3                 2012 Aurora shooting   
3  4       Wisconsin Sikh temple shooting   

                                         description       start         end  \
0  http://en.wikipedia.org/wiki/2012_Buenos_Aires...  1329910380  1330774380   
1  http://en.wikipedia.org/wiki/2012_Pakistan_gar...  1347368400  1348232400   
2  http://en.wikipedia.org/wiki/2012_Aurora_shooting  1342766280  1343630280   
3  http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...  1344180300  1345044300   

                      query      type  
0  buenos aires train crash  accident  
1     pakistan factory fire  accident  
2         colorado shooting  shooting  
3      sikh temple shooting  shooting  


In [22]:
# load all formatted gzipped html files into dataframe
def load_corpus(path):
    corpus_list = []
    gz_paths = []
    for topic_id in topics['id'].to_numpy():
        id_path = corpus_path + "/" + topic_id + "/"  # every topic id correlates to subfolder named after it
        gz_paths = search_dir(id_path)
    for gz_path in tqdm(gz_paths, position=0, leave=True):
        parse_markup(open_markup_file(gz_path, verbose=False),
                        corpus_list, topic_id=topic_id)
    return list_to_dataframe(corpus_list, doc_tags)

corpus = load_corpus(corpus_path)
#print("Corpus loaded Successfully")

100%|██████████| 241/241 [01:19<00:00,  3.04it/s]


In [23]:
print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
print(corpus.head(4))

Corpus loaded succesfully: 1578 documents loaded.
  topic_id                                     streamid  \
0       10  1354113657-a4417f055ea5ae84207a4edb4dad881b   
1       10  1354112039-110cc86ea7a8a1b58306dfade5b300ec   
2       10  1354114192-a4417f055ea5ae84207a4edb4dad881b   
3       10  1354114426-6c8d58d994c0e3243ee8dca8f34516a4   

                              docid     yyyymmddhh        kbastream  \
0  a4417f055ea5ae84207a4edb4dad881b  2012-11-28-14  MAINSTREAM_NEWS   
1  110cc86ea7a8a1b58306dfade5b300ec  2012-11-28-14  MAINSTREAM_NEWS   
2  a4417f055ea5ae84207a4edb4dad881b  2012-11-28-14  MAINSTREAM_NEWS   
3  6c8d58d994c0e3243ee8dca8f34516a4  2012-11-28-14           WEBLOG   

                     zulu       epoch  \
0  2012-11-28T14:40:57.0Z  1354113657   
1  2012-11-28T14:13:59.0Z  1354112039   
2  2012-11-28T14:49:52.0Z  1354114192   
3  2012-11-28T14:53:46.0Z  1354114426   

                                               title  \
0  Morning Briefing: Support grows f

In [None]:
#test_file_df = list_to_dataframe(parse_markup(open_markup_file(test_file_addr)), doc_tags)

## Preprocessing

### Preprocessing Notes

####Links
-  https://stackoverflow.com/questions/54938815/data-preprocessing-for-nlp-pre-training-models-e-g-elmo-bert
- https://www.activestate.com/blog/natural-language-processing-nltk-vs-spacy/

####Notes
- Thinking of using spacy as it provides faster simpler preprocessing options


In [None]:
## IMPORT DEPENDENCIES

# import re
# import matplotlib.pyplot as plt
# from nltk.corpus import stopwords 

# import spacy

# #from stop_words import get_stop_words
# ##from nltk.stem.porter import PorterStemmer
# import re
# #import nltk
#from nltk.tokenize import word_tokenize
#from language_detector import detect_language

#import pkg_resources
#from symspellpy import SymSpell, Verbosity

print("preprocessing dependencies import successful")

In [None]:
#nlp = spacy.load("en_core_web_sm")  # en_core_web_lg nearly 80x larger, marginal differences

# was in TaD solution
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')

In [None]:
# ## LOAD SPELL CHECKER

# sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
# dictionary_path = pkg_resources.resource_filename(
#     "symspellpy", "frequency_dictionary_en_82_765.txt")
# if sym_spell.word_count:
#     pass
# else:
#     sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

## SENTENCE LEVEL PREPROCESSING

In [None]:
# #@Tokenize
# def spacy_tokenize(string):
#   tokens = list()
#   doc = nlp(string)
#   for token in doc:
#     tokens.append(token)
#   return tokens

# #@Normalize
# def normalize(tokens):
#   normalized_tokens = list()
#   for token in tokens:
#     normalized = token.text.lower().strip()
#     if ((token.is_alpha or token.is_digit)):
#       normalized_tokens.append(normalized)
#   return normalized_tokens
#   return normalized_tokens

# #@Tokenize and normalize
# def tokenize_normalize(string):
#   return normalize(spacy_tokenize(string))

In [None]:
# print(tokenize_normalize("this string right here is an example, I'm just testing what spacy does"))

## Vectorization

### Sentence Level Embeddings

In [32]:
from sentence_transformers import SentenceTransformer

ModuleNotFoundError: No module named 'sentence_transformers'

In [30]:
sent_model = SentenceTransformer('distilbert-base-nli-mean-tokens')

test_sentences = corpus['text'].iloc(0).item()

test_sentence_embeddings = sent_model.encode(test_sentences)

for sentence, embedding in zip(sentences, test_sentence_embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding)
    print("")

NameError: name 'SentenceTransformer' is not defined

## Summarization
### bert-extractive-summarizer is causing import/dependecy issues

In [24]:
from summarizer import Summarizer

In [26]:
sum_model = Summarizer()
print("loaded model")
# test_sum_text = str(corpus['text'].iloc(0))
# test_sum = sum_model(test_sum_text, min_length=10)
# print("".join(test_sum))

I1110 15:30:34.615583 140009003521856 configuration_utils.py:160] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-large-uncased-config.json from cache at /root/.cache/torch/transformers/6dfaed860471b03ab5b9acb6153bea82b6632fb9bbe514d3fff050fe1319ee6d.788fed32bb8481a9b15ce726d41c53d5d5066b04c667e34ce3a7a3826d1573d8
I1110 15:30:34.616859 140009003521856 configuration_utils.py:177] Model config {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "is_decoder": false,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "num_labels": 2,
  "output_attentions": false,
  "output_hidden_states": true,
  "output_past": true,
  "pad_token_id": 0,
  "pruned_heads": {},
  "torc

loaded model
<pandas.core.indexing._iLocIndexer object at 0x7f551345cef8>


In [28]:
#test_sum_text = corpus['text'].iloc(0)
test_sum_text = "This repo is the generalization of the lecture-summarizer repo. This tool utilizes the HuggingFace Pytorch transformers library to run extractive summarizations. This works by first embedding the sentences, then running a clustering algorithm, finding the sentences that are closest to the cluster's centroids. This library also uses coreference techniques, utilizing the https://github.com/huggingface/neuralcoref library to resolve words in summaries that need more context. The greedyness of the neuralcoref library can be tweaked in the CoreferenceHandler class."
test_sum = sum_model(test_sum_text, min_length=1)
print("".join(test_sum))

This repo is the generalization of the lecture-summarizer repo. The greedyness of the neuralcoref library can be tweaked in the CoreferenceHandler class.


In [None]:
#pip list

In [None]:
# from transformers import AutoTokenizer, AutoModelForMaskedLM

# tokenizer = AutoTokenizer.from_pretrained("bert-large-uncased")
# model = AutoModelForMaskedLM.from_pretrained("bert-large-uncased")