### Notes on current discoveries:

- All files in storage claim are accessed with the base address '/nfs/'
- Target files are HTML wrapped in .gz, use gzip to unzip
- The inner text of <doc> is stored as the None tag where tag.string != '\n' (is more than just a newline character
- .string of <doc>'s inner text parses inner <a> tags etc. into just their inner text

### Ideas for language model
- NLTK
- BERT

- Load data set -> preprocessing


## Installing Packages

In [1]:
!pip install spacy-langdetect
!pip install language-detector
!pip install symspellpy
!pip install sentence-transformers
!pip install beautifulsoup4 lxml  # html/xml parser
print("successfully installed packages")

Collecting spacy-langdetect
  Downloading spacy_langdetect-0.1.2-py3-none-any.whl (5.0 kB)
Collecting langdetect==1.0.7
  Downloading langdetect-1.0.7.zip (998 kB)
[K     |████████████████████████████████| 998 kB 3.3 MB/s eta 0:00:01
[?25hCollecting pytest
  Downloading pytest-6.1.2-py3-none-any.whl (272 kB)
[K     |████████████████████████████████| 272 kB 15.5 MB/s eta 0:00:01
Collecting toml
  Downloading toml-0.10.2-py2.py3-none-any.whl (16 kB)
Collecting packaging
  Downloading packaging-20.4-py2.py3-none-any.whl (37 kB)
Collecting importlib-metadata>=0.12; python_version < "3.8"
  Downloading importlib_metadata-2.0.0-py2.py3-none-any.whl (31 kB)
Collecting iniconfig
  Downloading iniconfig-1.1.1-py2.py3-none-any.whl (5.0 kB)
Collecting py>=1.8.2
  Downloading py-1.9.0-py2.py3-none-any.whl (99 kB)
[K     |████████████████████████████████| 99 kB 14.0 MB/s eta 0:00:01
Collecting pluggy<1.0,>=0.12
  Downloading pluggy-0.13.1-py2.py3-none-any.whl (18 kB)
Collecting zipp>=0.5
  Down

Collecting beautifulsoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 3.2 MB/s eta 0:00:01
[?25hCollecting lxml
  Downloading lxml-4.6.1-cp36-cp36m-manylinux1_x86_64.whl (5.5 MB)
[K     |████████████████████████████████| 5.5 MB 14.8 MB/s eta 0:00:01 eta 0:00:01MB 14.8 MB/s eta 0:00:01
[?25hCollecting soupsieve>1.2; python_version >= "3.0"
  Downloading soupsieve-2.0.1-py3-none-any.whl (32 kB)
Installing collected packages: soupsieve, beautifulsoup4, lxml
Successfully installed beautifulsoup4-4.9.3 lxml-4.6.1 soupsieve-2.0.1
successfully installed packages


## Loading Data Set

In [23]:
## IMPORT DEPENDENCIES

from bs4 import BeautifulSoup as bs
import gzip
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook
import os
from collections import OrderedDict
print ("loading data set dependencies successful")

loading data set dependencies successful


In [4]:
## SET FILE META VARIABLES

corpus_path = "/nfs/trects-kba2014-filtered" # directory of corpus of gzipped html files
topics_path = corpus_path + "/test-topics.xml"
doc_tags = ['topic_id','streamid', 'docid', 'yyyymmddhh', 'kbastream', 'zulu', 'epoch', 'title', 'text', 'url'] # doc fields
topic_tags = ['id', 'title', 'description', 'start','end','query','type'] # topic fields
#test_file_addr = "/1/2012-02-22-15.gz"

In [29]:
# open and get beautifulsoup object from markup file
def open_markup_file(addr, gz=True, xml=False, verbose=False):
    markup = None
    f = None
    
    if verbose:
        print(addr)

    if gz:
        f = gzip.open(addr)
    else:
        f = open(addr)
        
    if xml == False:
        markup = bs(f)  # open as html
    else:
        markup = bs(f, "xml")
        
    f.close()
    return markup


# parse markup and return 2D list [entry:tags]
def parse_markup(markup, entry_list, find_tag="doc", tag_list=doc_tags, topic_id=None):
    for e in markup.find_all(find_tag):
        entry = OrderedDict.fromkeys(tag_list)
        if topic_id is not None:
            entry['topic_id'] = topic_id
        for c in e.children:  # children use direct children, descendants uses all
            if c.name in entry:
                entry[c.name] = c.string
            elif c.name is None and c.string != '\n':  # inner body of <doc> tag
                entry['text'] = c.string
#             elif c.name is not None:
#                 print("Entry has unexpected field: " + str(c.name))
#                 print("parent field: " + c.parent.name)
#                 print("field content: " + str(c))
        
#         missing_fields = [k for (k,v) in entry.items() if v == None]
#         for m in missing_fields:
#             print("Entry is missing field: " + m)
#             try:
#                 for (k,v) in entry.items():
#                     print(k,v)
#             except:
#                 print("could not print entry dictionary")
        entry_list.append(list(entry.values()))
        
            
# recursively find gz html files from a directory address
def search_dir(path):    
    # separate the subdirectories and html files 
    # (help maintain sequential order of insertion)
#     subdirs = []
    gz_paths = []
    for f in os.scandir(path):
#         if f.is_dir():
#             subdirs.append(f.path)
        if os.path.splitext(f.path)[-1].lower() == ".gz":
            gz_paths.append(f.path)
    
#     # search subdirs
#     for sd in subdirs:
#         search_dir(sd, gz_paths)
    return gz_paths


def list_to_dataframe(markup_list, tags):
    return pd.DataFrame(markup_list, columns=tags)

In [6]:
# load topics into dataframe
def load_topics(path):
    topics_list = []
    
    parse_markup(open_markup_file(path, gz=False, xml=True), 
                    topics_list, find_tag="event", tag_list=topic_tags)
    
    
    return  list_to_dataframe(topics_list, topic_tags)

topics = load_topics(topics_path)

In [7]:
print("Topics loaded successfuly")
print(topics.head(4))

Topics loaded successfuly
  id                                title  \
0  1      2012 Buenos Aires Rail Disaster   
1  2  2012 Pakistan garment factory fires   
2  3                 2012 Aurora shooting   
3  4       Wisconsin Sikh temple shooting   

                                         description       start         end  \
0  http://en.wikipedia.org/wiki/2012_Buenos_Aires...  1329910380  1330774380   
1  http://en.wikipedia.org/wiki/2012_Pakistan_gar...  1347368400  1348232400   
2  http://en.wikipedia.org/wiki/2012_Aurora_shooting  1342766280  1343630280   
3  http://en.wikipedia.org/wiki/Wisconsin_Sikh_te...  1344180300  1345044300   

                      query      type  
0  buenos aires train crash  accident  
1     pakistan factory fire  accident  
2         colorado shooting  shooting  
3      sikh temple shooting  shooting  


In [30]:
# load all formatted gzipped html files into dataframe
def load_corpus(path):
    corpus_list = []
    gz_paths = []
    for topic_id in topics['id'].to_numpy():
        id_path = corpus_path + "/" + topic_id + "/"  # every topic id correlates to subfolder named after it
        gz_paths = search_dir(id_path)
    for gz_path in tqdm(gz_paths, position=0, leave=True):
        parse_markup(open_markup_file(gz_path, verbose=False),
                        corpus_list, topic_id=topic_id)
#     for topic_id in topics['id'].to_numpy():
#         id_path = corpus_path + "/" + topic_id + "/"  # every topic id correlates to subfolder named after it
#         gz_paths = search_dir(id_path)
#         for gz_path in gz_paths:
#             parse_markup(open_markup_file(gz_path, verbose=True),
#                             corpus_list, topic_id=topic_id)
    return list_to_dataframe(corpus_list, doc_tags)

corpus = load_corpus(corpus_path)
#print("Corpus loaded Successfully")

 99%|█████████▉| 239/241 [01:24<00:00,  4.81it/s]




[A[A[A[A[A

[A[A


[A[A[A
[A



100%|██████████| 241/241 [01:24<00:00,  2.86it/s]


In [21]:
print("Corpus loaded succesfully: " + str(len(corpus)) + " documents loaded.")
print(corpus.head(4))

Corpus loaded succesfully: 1578 documents loaded.
  topic_id                                     streamid  \
0       10  1354113657-a4417f055ea5ae84207a4edb4dad881b   
1       10  1354112039-110cc86ea7a8a1b58306dfade5b300ec   
2       10  1354114192-a4417f055ea5ae84207a4edb4dad881b   
3       10  1354114426-6c8d58d994c0e3243ee8dca8f34516a4   

                              docid     yyyymmddhh        kbastream  \
0  a4417f055ea5ae84207a4edb4dad881b  2012-11-28-14  MAINSTREAM_NEWS   
1  110cc86ea7a8a1b58306dfade5b300ec  2012-11-28-14  MAINSTREAM_NEWS   
2  a4417f055ea5ae84207a4edb4dad881b  2012-11-28-14  MAINSTREAM_NEWS   
3  6c8d58d994c0e3243ee8dca8f34516a4  2012-11-28-14           WEBLOG   

                     zulu       epoch  \
0  2012-11-28T14:40:57.0Z  1354113657   
1  2012-11-28T14:13:59.0Z  1354112039   
2  2012-11-28T14:49:52.0Z  1354114192   
3  2012-11-28T14:53:46.0Z  1354114426   

                                               title  \
0  Morning Briefing: Support grows f

## Preprocessing

In [33]:
## IMPORT DEPENDENCIES

from nltk.corpus import wordnet
import re
import matplotlib.pyplot as plt
from nltk.corpus import stopwords 

#from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
import re
import nltk
from nltk.tokenize import word_tokenize
from language_detector import detect_language

import pkg_resources
from symspellpy import SymSpell, Verbosity

print("preprocessing dependencies import successful")

preprocessing dependencies import successful


In [34]:
## SPELL CHECKER

sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
if sym_spell.word_count:
    pass
else:
    sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
