### Notes on current discoveries:

- All files in storage claim are accessed with the base address '/nfs/'
- Target files are HTML wrapped in .gz, use gzip to unzip
- The inner text of <doc> is stored as the None tag where tag.string != '\n' (is more than just a newline character
- .string of <doc>'s inner text parses inner <a> tags etc. into just their inner text

### Ideas for language model
- NLTK
- BERT



In [55]:
!!pip install beautifulsoup4 lxml
from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
import os
from collections import OrderedDict

In [75]:
corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
topics_path = corpus_path + "/test-topics.xml"
doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
test_file_addr = "/1/2012-02-22-15.gz"

In [None]:
# The inner text of <doc> is stored as the None tag where tag.string != '\n' (is more than just a newline character
# .string of <doc>'s inner text parses inner <a> tags etc. into just their inner text

# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    markup = None
    f = None
    
    if verbose:
        print(addr)

    if gz:
        f = gzip.open(addr)
    else:
        f = open(addr)
        
    if xml == False:
        markup = bs(f)  # open as html
    else:
        markup = bs(f, "xml")
        
    f.close()
    return markup


# parse markup and return 2D list [entry:tags]
def parse_markup(markup, entry_list, find_tag="doc", tag_list=doc_tags, topic_id=None):
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for c in e.children:  # children use direct children, descendants uses all
            if c.name in entry:
                entry[c.name] = c.string
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = c.string
#             elif c.name is not None:
#                 print("Entry has unexpected field: " + str(c.name))
#                 print("parent field: " + c.parent.name)
#                 print("field content: " + str(c))
        
        missing_fields = [k for (k,v) in entry.items() if v == None]
        for m in missing_fields:
            print("Entry is missing field: " + m)
        entry_list.append(list(entry.values()))
        
            
# recursively find gz html files from a directory address
def search_dir(path):    
    # separate the subdirectories and html files 
    # (help maintain sequential order of insertion)
#     subdirs = []
    gz_paths = []
    for f in os.scandir(path):
#         if f.is_dir():
#             subdirs.append(f.path)
        if os.path.splitext(f.path)[-1].lower() == ".gz":
            gz_paths.append(f.path)
    
#     # search subdirs
#     for sd in subdirs:
#         search_dir(sd, gz_paths)
    return gz_paths


def list_to_dataframe(markup_list, tags):
    return pd.DataFrame(markup_list, columns=tags)

In [None]:
# load topics into dataframe
def load_topics(path):
    topics_list = []
    
    parse_markup(open_markup_file(path, gz=False, xml=True), 
                    topics_list, find_tag="event", tag_list=topic_tags)
    
    
    return  list_to_dataframe(topics_list, topic_tags)

topics = load_topics(topics_path)

In [None]:
print("Topics loaded successfuly")
print(topics.head(4))

In [None]:
# load all formatted gzipped html files into dataframe
def load_corpus(path):
    corpus_list = []
    for topic_id in topics['id'].to_numpy():
        id_path = corpus_path + "/" + topic_id + "/"  # every topic id correlates to subfolder named after it
        gz_paths = search_dir(id_path)
        for gz_path in gz_paths:
            parse_markup(open_markup_file(gz_path, verbose=True),
                            corpus_list,topic_id=topic_id)
    return list_to_dataframe(corpus_list, doc_tags)

corpus = load_corpus(corpus_path)
#print("Corpus loaded Successfully")

/nfs/trects-kba2014-filtered/1/2012-02-27-00.gz
/nfs/trects-kba2014-filtered/1/2012-02-26-15.gz
/nfs/trects-kba2014-filtered/1/2012-02-26-17.gz
/nfs/trects-kba2014-filtered/1/2012-02-24-15.gz
/nfs/trects-kba2014-filtered/1/2012-02-28-11.gz
/nfs/trects-kba2014-filtered/1/2012-02-24-17.gz
/nfs/trects-kba2014-filtered/1/2012-02-23-20.gz
/nfs/trects-kba2014-filtered/1/2012-02-23-05.gz
/nfs/trects-kba2014-filtered/1/2012-02-24-01.gz
/nfs/trects-kba2014-filtered/1/2012-02-26-12.gz
/nfs/trects-kba2014-filtered/1/2012-02-25-16.gz
/nfs/trects-kba2014-filtered/1/2012-02-28-08.gz
/nfs/trects-kba2014-filtered/1/2012-02-26-19.gz
/nfs/trects-kba2014-filtered/1/2012-02-28-05.gz
/nfs/trects-kba2014-filtered/1/2012-02-29-14.gz
/nfs/trects-kba2014-filtered/1/2012-02-26-07.gz
/nfs/trects-kba2014-filtered/1/2012-03-02-14.gz
/nfs/trects-kba2014-filtered/1/2012-02-25-10.gz
/nfs/trects-kba2014-filtered/1/2012-02-29-20.gz
/nfs/trects-kba2014-filtered/1/2012-03-01-01.gz
/nfs/trects-kba2014-filtered/1/2012-02-2

/nfs/trects-kba2014-filtered/1/2012-02-29-17.gz
/nfs/trects-kba2014-filtered/1/2012-02-23-22.gz
/nfs/trects-kba2014-filtered/1/2012-02-26-11.gz
/nfs/trects-kba2014-filtered/1/2012-02-29-22.gz
/nfs/trects-kba2014-filtered/1/2012-02-29-09.gz
/nfs/trects-kba2014-filtered/1/2012-03-01-08.gz
/nfs/trects-kba2014-filtered/1/2012-02-27-22.gz
/nfs/trects-kba2014-filtered/1/2012-02-26-00.gz
/nfs/trects-kba2014-filtered/1/2012-02-29-13.gz
/nfs/trects-kba2014-filtered/1/2012-02-23-04.gz
/nfs/trects-kba2014-filtered/1/2012-02-25-07.gz
/nfs/trects-kba2014-filtered/1/2012-02-28-15.gz
/nfs/trects-kba2014-filtered/1/2012-02-28-17.gz
/nfs/trects-kba2014-filtered/1/2012-02-23-17.gz
/nfs/trects-kba2014-filtered/1/2012-02-24-10.gz
/nfs/trects-kba2014-filtered/1/2012-02-23-16.gz
/nfs/trects-kba2014-filtered/1/2012-02-28-16.gz
/nfs/trects-kba2014-filtered/1/2012-02-29-04.gz
/nfs/trects-kba2014-filtered/1/2012-02-27-12.gz
/nfs/trects-kba2014-filtered/1/2012-03-01-19.gz
/nfs/trects-kba2014-filtered/1/2012-02-2

/nfs/trects-kba2014-filtered/2/2012-09-16-05.gz
/nfs/trects-kba2014-filtered/2/2012-09-14-04.gz
/nfs/trects-kba2014-filtered/2/2012-09-18-17.gz
/nfs/trects-kba2014-filtered/2/2012-09-15-06.gz
/nfs/trects-kba2014-filtered/2/2012-09-17-11.gz
/nfs/trects-kba2014-filtered/2/2012-09-14-11.gz
/nfs/trects-kba2014-filtered/2/2012-09-11-15.gz
/nfs/trects-kba2014-filtered/2/2012-09-17-01.gz
/nfs/trects-kba2014-filtered/2/2012-09-13-16.gz
/nfs/trects-kba2014-filtered/2/2012-09-17-14.gz
/nfs/trects-kba2014-filtered/2/2012-09-13-18.gz
/nfs/trects-kba2014-filtered/2/2012-09-13-23.gz
/nfs/trects-kba2014-filtered/2/2012-09-17-03.gz
/nfs/trects-kba2014-filtered/2/2012-09-15-01.gz
/nfs/trects-kba2014-filtered/2/2012-09-14-09.gz
/nfs/trects-kba2014-filtered/2/2012-09-19-00.gz
/nfs/trects-kba2014-filtered/2/2012-09-20-07.gz
/nfs/trects-kba2014-filtered/2/2012-09-19-04.gz
/nfs/trects-kba2014-filtered/2/2012-09-19-07.gz
/nfs/trects-kba2014-filtered/2/2012-09-14-22.gz
/nfs/trects-kba2014-filtered/2/2012-09-1

/nfs/trects-kba2014-filtered/3/2012-07-24-05.gz
/nfs/trects-kba2014-filtered/3/2012-07-25-16.gz
/nfs/trects-kba2014-filtered/3/2012-07-29-05.gz
/nfs/trects-kba2014-filtered/3/2012-07-28-06.gz
/nfs/trects-kba2014-filtered/3/2012-07-23-13.gz
/nfs/trects-kba2014-filtered/3/2012-07-25-02.gz
/nfs/trects-kba2014-filtered/3/2012-07-21-03.gz
/nfs/trects-kba2014-filtered/3/2012-07-20-20.gz
/nfs/trects-kba2014-filtered/3/2012-07-25-19.gz
/nfs/trects-kba2014-filtered/3/2012-07-29-16.gz
/nfs/trects-kba2014-filtered/3/2012-07-24-20.gz
/nfs/trects-kba2014-filtered/3/2012-07-23-18.gz
/nfs/trects-kba2014-filtered/3/2012-07-23-22.gz
/nfs/trects-kba2014-filtered/3/2012-07-23-17.gz
/nfs/trects-kba2014-filtered/3/2012-07-20-10.gz
/nfs/trects-kba2014-filtered/3/2012-07-22-23.gz
/nfs/trects-kba2014-filtered/3/2012-07-27-09.gz
/nfs/trects-kba2014-filtered/3/2012-07-29-01.gz
/nfs/trects-kba2014-filtered/3/2012-07-21-04.gz
/nfs/trects-kba2014-filtered/3/2012-07-27-17.gz
/nfs/trects-kba2014-filtered/3/2012-07-2

/nfs/trects-kba2014-filtered/3/2012-07-22-01.gz
/nfs/trects-kba2014-filtered/3/2012-07-20-08.gz
/nfs/trects-kba2014-filtered/3/2012-07-29-03.gz
/nfs/trects-kba2014-filtered/3/2012-07-27-19.gz
/nfs/trects-kba2014-filtered/3/2012-07-20-16.gz
/nfs/trects-kba2014-filtered/3/2012-07-21-16.gz
/nfs/trects-kba2014-filtered/3/2012-07-28-14.gz
/nfs/trects-kba2014-filtered/3/2012-07-27-03.gz
/nfs/trects-kba2014-filtered/3/2012-07-25-08.gz
/nfs/trects-kba2014-filtered/3/2012-07-29-17.gz
/nfs/trects-kba2014-filtered/3/2012-07-28-19.gz
/nfs/trects-kba2014-filtered/3/2012-07-22-07.gz
/nfs/trects-kba2014-filtered/3/2012-07-23-21.gz
/nfs/trects-kba2014-filtered/3/2012-07-29-07.gz
/nfs/trects-kba2014-filtered/3/2012-07-29-18.gz
/nfs/trects-kba2014-filtered/3/2012-07-28-22.gz
/nfs/trects-kba2014-filtered/3/2012-07-28-08.gz
/nfs/trects-kba2014-filtered/3/2012-07-25-12.gz
/nfs/trects-kba2014-filtered/3/2012-07-29-00.gz
/nfs/trects-kba2014-filtered/3/2012-07-27-04.gz
/nfs/trects-kba2014-filtered/4/2012-08-0

/nfs/trects-kba2014-filtered/4/2012-08-08-22.gz
/nfs/trects-kba2014-filtered/4/2012-08-08-08.gz
/nfs/trects-kba2014-filtered/4/2012-08-14-20.gz
/nfs/trects-kba2014-filtered/4/2012-08-11-16.gz
/nfs/trects-kba2014-filtered/4/2012-08-08-01.gz
/nfs/trects-kba2014-filtered/4/2012-08-08-16.gz
/nfs/trects-kba2014-filtered/4/2012-08-09-08.gz
/nfs/trects-kba2014-filtered/4/2012-08-06-18.gz
/nfs/trects-kba2014-filtered/4/2012-08-07-23.gz
/nfs/trects-kba2014-filtered/4/2012-08-10-05.gz
/nfs/trects-kba2014-filtered/4/2012-08-11-13.gz
/nfs/trects-kba2014-filtered/4/2012-08-05-16.gz
/nfs/trects-kba2014-filtered/4/2012-08-15-08.gz
/nfs/trects-kba2014-filtered/4/2012-08-09-12.gz
/nfs/trects-kba2014-filtered/4/2012-08-08-02.gz
/nfs/trects-kba2014-filtered/4/2012-08-09-10.gz
/nfs/trects-kba2014-filtered/4/2012-08-11-18.gz
/nfs/trects-kba2014-filtered/4/2012-08-13-07.gz
/nfs/trects-kba2014-filtered/4/2012-08-14-19.gz
/nfs/trects-kba2014-filtered/4/2012-08-10-13.gz
/nfs/trects-kba2014-filtered/4/2012-08-1

/nfs/trects-kba2014-filtered/5/2012-08-29-13.gz
/nfs/trects-kba2014-filtered/5/2012-09-01-03.gz
/nfs/trects-kba2014-filtered/5/2012-09-01-05.gz
/nfs/trects-kba2014-filtered/5/2012-09-05-08.gz
/nfs/trects-kba2014-filtered/5/2012-08-31-10.gz
/nfs/trects-kba2014-filtered/5/2012-08-30-22.gz
/nfs/trects-kba2014-filtered/5/2012-09-03-10.gz
/nfs/trects-kba2014-filtered/5/2012-09-03-17.gz
/nfs/trects-kba2014-filtered/5/2012-09-03-08.gz
/nfs/trects-kba2014-filtered/5/2012-09-04-10.gz
/nfs/trects-kba2014-filtered/5/2012-09-05-06.gz
/nfs/trects-kba2014-filtered/5/2012-08-30-10.gz
/nfs/trects-kba2014-filtered/5/2012-09-01-17.gz
/nfs/trects-kba2014-filtered/5/2012-09-06-06.gz
/nfs/trects-kba2014-filtered/5/2012-09-06-19.gz
/nfs/trects-kba2014-filtered/5/2012-09-05-02.gz
/nfs/trects-kba2014-filtered/5/2012-09-03-15.gz
/nfs/trects-kba2014-filtered/5/2012-09-05-16.gz
/nfs/trects-kba2014-filtered/5/2012-09-03-09.gz
/nfs/trects-kba2014-filtered/5/2012-08-28-17.gz
/nfs/trects-kba2014-filtered/5/2012-08-3

/nfs/trects-kba2014-filtered/6/2012-11-01-09.gz
/nfs/trects-kba2014-filtered/6/2012-10-28-16.gz
/nfs/trects-kba2014-filtered/6/2012-10-29-00.gz
/nfs/trects-kba2014-filtered/6/2012-10-28-07.gz
/nfs/trects-kba2014-filtered/6/2012-10-28-08.gz
/nfs/trects-kba2014-filtered/6/2012-10-29-18.gz
/nfs/trects-kba2014-filtered/6/2012-10-30-16.gz
/nfs/trects-kba2014-filtered/6/2012-10-30-21.gz
/nfs/trects-kba2014-filtered/6/2012-10-30-05.gz
/nfs/trects-kba2014-filtered/6/2012-11-02-19.gz
/nfs/trects-kba2014-filtered/6/2012-10-26-05.gz
/nfs/trects-kba2014-filtered/6/2012-10-24-18.gz
/nfs/trects-kba2014-filtered/6/2012-11-02-06.gz
/nfs/trects-kba2014-filtered/6/2012-10-28-01.gz
/nfs/trects-kba2014-filtered/6/2012-10-27-22.gz
/nfs/trects-kba2014-filtered/6/2012-10-28-20.gz
/nfs/trects-kba2014-filtered/6/2012-10-27-19.gz
/nfs/trects-kba2014-filtered/6/2012-11-03-04.gz
/nfs/trects-kba2014-filtered/6/2012-10-25-18.gz
/nfs/trects-kba2014-filtered/6/2012-10-26-22.gz
/nfs/trects-kba2014-filtered/6/2012-10-2

/nfs/trects-kba2014-filtered/6/2012-10-31-15.gz
/nfs/trects-kba2014-filtered/6/2012-10-24-20.gz
/nfs/trects-kba2014-filtered/6/2012-11-03-01.gz
/nfs/trects-kba2014-filtered/6/2012-11-01-20.gz
/nfs/trects-kba2014-filtered/6/2012-11-01-19.gz
/nfs/trects-kba2014-filtered/6/2012-11-02-14.gz
/nfs/trects-kba2014-filtered/6/2012-10-30-00.gz
/nfs/trects-kba2014-filtered/6/2012-10-30-23.gz
/nfs/trects-kba2014-filtered/6/2012-10-31-22.gz
/nfs/trects-kba2014-filtered/6/2012-11-03-12.gz
/nfs/trects-kba2014-filtered/6/2012-10-25-07.gz
/nfs/trects-kba2014-filtered/6/2012-10-26-14.gz
/nfs/trects-kba2014-filtered/6/2012-11-02-08.gz
/nfs/trects-kba2014-filtered/6/2012-11-02-20.gz
/nfs/trects-kba2014-filtered/6/2012-10-26-12.gz
/nfs/trects-kba2014-filtered/6/2012-10-28-17.gz
/nfs/trects-kba2014-filtered/6/2012-10-30-13.gz
/nfs/trects-kba2014-filtered/6/2012-10-31-12.gz
/nfs/trects-kba2014-filtered/6/2012-11-01-14.gz
/nfs/trects-kba2014-filtered/6/2012-11-02-05.gz
/nfs/trects-kba2014-filtered/6/2012-10-2

/nfs/trects-kba2014-filtered/8/2012-12-02-01.gz
/nfs/trects-kba2014-filtered/8/2012-12-03-06.gz
/nfs/trects-kba2014-filtered/8/2012-12-09-09.gz
/nfs/trects-kba2014-filtered/8/2012-12-08-08.gz
/nfs/trects-kba2014-filtered/8/2012-12-02-05.gz
/nfs/trects-kba2014-filtered/8/2012-12-09-23.gz
/nfs/trects-kba2014-filtered/8/2012-12-05-06.gz
/nfs/trects-kba2014-filtered/8/2012-12-08-20.gz
/nfs/trects-kba2014-filtered/8/2012-12-06-08.gz
/nfs/trects-kba2014-filtered/8/2012-12-04-02.gz
/nfs/trects-kba2014-filtered/8/2012-12-09-20.gz
/nfs/trects-kba2014-filtered/8/2012-12-09-02.gz
/nfs/trects-kba2014-filtered/8/2012-12-08-03.gz
/nfs/trects-kba2014-filtered/8/2012-12-04-07.gz
/nfs/trects-kba2014-filtered/8/2012-12-03-08.gz
/nfs/trects-kba2014-filtered/8/2012-12-04-06.gz
/nfs/trects-kba2014-filtered/8/2012-12-05-23.gz
/nfs/trects-kba2014-filtered/8/2012-12-08-16.gz
/nfs/trects-kba2014-filtered/8/2012-12-03-21.gz
/nfs/trects-kba2014-filtered/8/2012-12-04-15.gz
/nfs/trects-kba2014-filtered/8/2012-12-0

/nfs/trects-kba2014-filtered/9/2012-11-11-07.gz
/nfs/trects-kba2014-filtered/9/2012-11-12-21.gz
/nfs/trects-kba2014-filtered/9/2012-11-11-04.gz
/nfs/trects-kba2014-filtered/9/2012-11-13-01.gz
/nfs/trects-kba2014-filtered/9/2012-11-16-02.gz
/nfs/trects-kba2014-filtered/9/2012-11-10-08.gz
/nfs/trects-kba2014-filtered/9/2012-11-12-19.gz
/nfs/trects-kba2014-filtered/9/2012-11-16-11.gz
/nfs/trects-kba2014-filtered/9/2012-11-15-15.gz
/nfs/trects-kba2014-filtered/9/2012-11-12-00.gz
/nfs/trects-kba2014-filtered/9/2012-11-15-20.gz
/nfs/trects-kba2014-filtered/9/2012-11-14-10.gz
/nfs/trects-kba2014-filtered/9/2012-11-11-03.gz
/nfs/trects-kba2014-filtered/9/2012-11-12-07.gz
/nfs/trects-kba2014-filtered/9/2012-11-16-12.gz
/nfs/trects-kba2014-filtered/9/2012-11-16-13.gz
/nfs/trects-kba2014-filtered/9/2012-11-12-11.gz
/nfs/trects-kba2014-filtered/9/2012-11-08-17.gz
/nfs/trects-kba2014-filtered/9/2012-11-15-21.gz
/nfs/trects-kba2014-filtered/9/2012-11-08-09.gz
/nfs/trects-kba2014-filtered/9/2012-11-1

/nfs/trects-kba2014-filtered/9/2012-11-12-05.gz
/nfs/trects-kba2014-filtered/9/2012-11-12-02.gz
/nfs/trects-kba2014-filtered/9/2012-11-17-16.gz
/nfs/trects-kba2014-filtered/9/2012-11-07-17.gz
/nfs/trects-kba2014-filtered/9/2012-11-11-01.gz
/nfs/trects-kba2014-filtered/10/2012-11-28-14.gz
/nfs/trects-kba2014-filtered/10/2012-11-30-15.gz
/nfs/trects-kba2014-filtered/10/2012-11-28-19.gz
/nfs/trects-kba2014-filtered/10/2012-11-28-08.gz
/nfs/trects-kba2014-filtered/10/2012-11-21-18.gz
/nfs/trects-kba2014-filtered/10/2012-11-27-22.gz
/nfs/trects-kba2014-filtered/10/2012-11-24-21.gz
/nfs/trects-kba2014-filtered/10/2012-11-25-12.gz
/nfs/trects-kba2014-filtered/10/2012-11-22-08.gz
/nfs/trects-kba2014-filtered/10/2012-11-24-23.gz
/nfs/trects-kba2014-filtered/10/2012-11-26-19.gz
/nfs/trects-kba2014-filtered/10/2012-11-29-10.gz
/nfs/trects-kba2014-filtered/10/2012-11-24-20.gz
/nfs/trects-kba2014-filtered/10/2012-11-27-03.gz
/nfs/trects-kba2014-filtered/10/2012-11-23-10.gz
/nfs/trects-kba2014-filte

In [2]:
print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
print(corpus.head(4))

NameError: name 'corpus' is not defined

In [None]:
#print(corpus.iloc[6,:])