### Notes on current discoveries:

- All files in storage claim are accessed with the base address '/nfs/'
- Target files are HTML wrapped in .gz, use gzip to unzip
- The inner text of <doc> is stored as the None tag where tag.string != '\n' (is more than just a newline character
- .string of <doc>'s inner text parses inner <a> tags etc. into just their inner text

### Ideas for language model
- NLTK
- BERT



In [28]:
!!pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import os
from collections import OrderedDict

In [31]:
corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
topics_path = corpus_path + "/test-topics.xml"
doc_tags = ['streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
test_file_addr = "/1/2012-02-22-15.gz"

In [34]:
# The inner text of <doc> is stored as the None tag where tag.string != '\n' (is more than just a newline character
# .string of <doc>'s inner text parses inner <a> tags etc. into just their inner text

# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    markup = None
    f = None
    
    if verbose:
        print(addr)

    if gz:
        f = gzip.open(addr)
    else:
        f = open(addr)
        
    if xml == False:
        markup = bs(f)  # open as html
    else:
        markup = bs(f, "xml")
        
    f.close()
    return markup


# parse markup and return 2D list [entry:tags]
def parse_markup(markup, entry_list, find_tag="doc", tag_list=doc_tags):
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        for c in e.children:  # children use direct children, descendants uses all
            if c.name in entry:
                entry[c.name] = c.string
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = c.string
#             elif c.name is not None:
#                 print("Entry has unexpected field: " + str(c.name))
#                 print("parent field: " + c.parent.name)
#                 print("field content: " + str(c))
        
        missing_fields = [k for (k,v) in entry.items() if v == None]
        for m in missing_fields:
            print("Entry is missing field: " + m)
        entry_list.append(list(entry.values()))
        
            


# recursively find gz html files from a directory address
def search_dir(path, gz_paths):    
    # separate the subdirectories and html files 
    # (help maintain sequential order of insertion)
    subdirs = []
#     dir_gz_paths = []
    for f in os.scandir(path):
        if f.is_dir():
            subdirs.append(f.path)
        elif os.path.splitext(f.path)[-1].lower() == ".gz":
            gz_paths.append(f.path)
    
    # search subdirs
    for sd in subdirs:
        search_dir(sd, gz_paths)
#     gz_paths.append(dir_gz_paths) # add found gz files after sub dirs searched


def list_to_dataframe(markup_list, tags):
    return pd.DataFrame(markup_list, columns=tags)

# load topics into dataframe
def load_topics(path):
    topics_list = []
    
    parse_markup(open_markup_file(path, gz=False, xml=True), 
                    topics_list, find_tag="event", tag_list=topic_tags)
    
    
    return  list_to_dataframe(topics_list, topic_tags)

# load all formatted gzipped html files into dataframe
def load_corpus(path):
    # collect all html gz files from path and all sub dirs
    gz_paths = []
    search_dir(path, gz_paths)
    
    # process all documents into formatted list
    print("Opening files...")
    corpus_list = []
    for p in gz_paths:
        parse_markup(open_markup_file(p, show_addr=True), corpus_list)
    
    # return and convert list into DataFrame
    return pd.DataFrame(corpus_list, columns=doc_tags)

In [38]:
# test_parse = []
# parse_markup(open_markup_file(corpus_path + test_file_addr, verbose=True), test_parse)
# print(test_parse)

In [24]:
topics_test = load_topics(topics_path)

In [25]:
print(topics_test.head(5))

  id                                title  \
0  1      2012 Buenos Aires Rail Disaster   
1  2  2012 Pakistan garment factory fires   
2  3                 2012 Aurora shooting   
3  4       Wisconsin Sikh temple shooting   
4  5               Hurricane Isaac (2012)   

                                         description       start         end  \
0  http://en.wikipedia.org/wiki/2012_Buenos_Aires...  1329910380  1330774380   
1  http://en.wikipedia.org/wiki/2012_Pakistan_gar...  1347368400  1348232400   
2  http://en.wikipedia.org/wiki/2012_Aurora_shooting  1342766280  1343630280   
3  http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...  1344180300  1345044300   
4  http://en.wikipedia.org/wiki/Hurricane_Isaac_(...  1346170800  1347034800   

                      query      type  
0  buenos aires train crash  accident  
1     pakistan factory fire  accident  
2         colorado shooting  shooting  
3      sikh temple shooting  shooting  
4           hurricane isaac     storm  


In [30]:
# load corpus as DataFrame
corpus = load_corpus(corpus_path)
print("Corpus loaded into DataFrame")

Opening files...
/nfs/trects-kba2014-filtered/9/2012-11-10-17.gz
/nfs/trects-kba2014-filtered/9/2012-11-14-16.gz
/nfs/trects-kba2014-filtered/9/2012-11-15-01.gz
/nfs/trects-kba2014-filtered/9/2012-11-12-17.gz
/nfs/trects-kba2014-filtered/9/2012-11-12-18.gz
/nfs/trects-kba2014-filtered/9/2012-11-10-23.gz
/nfs/trects-kba2014-filtered/9/2012-11-15-04.gz
/nfs/trects-kba2014-filtered/9/2012-11-14-23.gz
/nfs/trects-kba2014-filtered/9/2012-11-08-16.gz
/nfs/trects-kba2014-filtered/9/2012-11-15-07.gz
/nfs/trects-kba2014-filtered/9/2012-11-08-03.gz
/nfs/trects-kba2014-filtered/9/2012-11-15-22.gz
/nfs/trects-kba2014-filtered/9/2012-11-15-18.gz
/nfs/trects-kba2014-filtered/9/2012-11-17-08.gz
/nfs/trects-kba2014-filtered/9/2012-11-09-04.gz
/nfs/trects-kba2014-filtered/9/2012-11-14-22.gz
/nfs/trects-kba2014-filtered/9/2012-11-13-06.gz
/nfs/trects-kba2014-filtered/9/2012-11-08-06.gz
/nfs/trects-kba2014-filtered/9/2012-11-09-06.gz
/nfs/trects-kba2014-filtered/9/2012-11-08-22.gz
/nfs/trects-kba2014-fil

KeyboardInterrupt: 

In [None]:
#print(corpus.head(10))

In [None]:
#print(corpus.iloc[6,:])