# Building the Corpus

In [1]:
# Enable autoreload so edits to imported project modules are picked up live
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook
%matplotlib inline

In [2]:
# Setting Snorkel DB location
import getpass
import os
import sys
from urllib.parse import quote_plus

# For PostgreSQL
# Credentials come from the standard libpq environment variables instead of
# being committed with the notebook; the password is prompted for when
# PGPASSWORD is not set.
postgres_user = os.environ.get('PGUSER', 'jdunnmon')
postgres_password = os.environ.get('PGPASSWORD')
if postgres_password is None:
    postgres_password = getpass.getpass('PostgreSQL password for {0}: '.format(postgres_user))
postgres_host = os.environ.get('PGHOST', 'localhost')
postgres_port = os.environ.get('PGPORT', '5432')

# User and password are percent-encoded so special characters cannot break the URL
postgres_location = 'postgresql://{0}:{1}@{2}:{3}'.format(
    quote_plus(postgres_user), quote_plus(postgres_password), postgres_host, postgres_port)
postgres_db_name = 'memex_db_snorkel'

# Join with an explicit '/': os.path.join is for filesystem paths and would
# use the platform separator rather than a URL separator.
# SNORKELDB must be set before snorkel is imported below.
os.environ['SNORKELDB'] = '{0}/{1}'.format(postgres_location, postgres_db_name)

# Adding path above for utils (guarded so re-running the cell does not add duplicates)
if '..' not in sys.path:
    sys.path.append('..')
# For SQLite
#db_location = '.'
#db_name = "snorkel_memex.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

In [13]:
from snorkel_utils import MemexTSVDocPreprocessor, MEMEXJsonLGZIPPreprocessor, retrieve_all_files

# Root directory of the MEMEX source data
data_loc = '../../../data/data_sample'

# Keep only the gzip-compressed files, then restrict this run to the first one
path_list = [p for p in retrieve_all_files(data_loc) if p.endswith('gz')][0:1]

# Cap on the number of documents to ingest
max_docs = 1000

# Degree of parallelism passed to the Snorkel apply() calls below
parallelism = 8

# Document preprocessor over the selected files
doc_preprocessor = MEMEXJsonLGZIPPreprocessor(
    data_loc,
    file_list=path_list,
    encoding='utf-8',
    lines_per_entry=1,
    max_docs=max_docs,
)

In [13]:
import gzip
import json
import pandas as pd
from bs4 import BeautifulSoup
import re

def _lines_per_n(f, n):
        for line in f:
            yield ''.join(chain([line], islice(f, n - 1)))
        
def _read_content_file(fl):
    json_lst = []
    if fl.endswith('gz'):
        with gzip.GzipFile(fl, 'r') as fin: 
            f = fin.read()
        for chunk in f.splitlines():
            jfile = json.loads(chunk)
            json_lst.append(jfile)

    elif fl.endswith('jsonl'):
        with open(fl) as f:
            for chunk in _lines_per_n(f, self.lines_per_entry):
                jfile = json.loads(chunk)
                json_lst.append(jfile)
    else:
        print('Unrecognized file type!')

    json_pd = pd.DataFrame(json_lst)
    #json_pd = pd.DataFrame(json_lst).dropna()
    return json_pd

def parse_file(file_name):
    """Extract the visible text of every document stored in ``file_name``.

    The file is loaded with ``_read_content_file``; each row's ``raw_content``
    is parsed as HTML with the lxml parser, text nodes rejected by
    ``_cleaner`` are dropped, and the remaining nodes are reduced to ASCII
    with ``_strip_special`` and joined with single spaces.

    Args:
        file_name: Path of the content file to parse.

    Returns:
        list: ``(name, text)`` tuples, where ``name`` is the row's ``url``
        and ``text`` the extracted text. Rows whose ``raw_content`` is not a
        string are skipped. The list is empty when the file has no
        ``raw_content`` column (a message is printed in that case).
    """
    df = _read_content_file(file_name)
    if 'raw_content' not in df.keys():
        print('File with no raw content!')
        return []

    parsed = []
    for _, row in df.iterrows():
        name = row.url
        raw_html = row.raw_content
        # Entries that lack raw_content show up as NaN in the frame; there is
        # nothing to parse for them.
        if not isinstance(raw_html, (str, bytes)):
            continue
        html = BeautifulSoup(raw_html, 'lxml')
        pieces = []
        for node in filter(_cleaner, html.findAll(text=True)):
            if node == '\n':
                continue
            piece = _strip_special(node)
            # _strip_special returns ASCII bytes; decode rather than str() so
            # the text is not wrapped in a b'...' repr.
            if isinstance(piece, bytes):
                piece = piece.decode('ascii')
            pieces.append(str(piece))
        parsed.append((name, ' '.join(pieces)))
    return parsed

def _cleaner(s):
    if s.parent.name in ['style', 'script', '[document]', 'head', 'title']:
        return False
    elif re.match('<!--.*-->', str(s)):
        return False
    return True

def _strip_special(s):
    return (''.join(c for c in s if ord(c) < 128)).encode('ascii', 'ignore')

In [19]:
# Load the first selected file into a DataFrame for interactive inspection
a = _read_content_file(path_list[0])

In [None]:
# Run parse_file on the first selected file
parse_file(path_list[0])

In [21]:
from bs4 import BeautifulSoup
# Parse the raw HTML of the first loaded entry with the lxml parser
c = BeautifulSoup(a.iloc[0]['raw_content'],'lxml')

In [22]:
# Display the parsed document
c

<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8"/>
<title>Live Escort Reviews - 917-891-5995 - Get the Party Started *** Ring in the NEW YEAR with a BRUNETTE HOTTIE ** Available ALL NIGHT LONG - 24</title>
<script type="text/javascript">
//<![CDATA[
try{if (!window.CloudFlare) {var CloudFlare=[{verbose:0,p:0,byc:0,owlid:"cf",bag2:1,mirage2:0,oracle:0,paths:{cloudflare:"/cdn-cgi/nexp/dok2v=1613a3a185/"},atok:"093a2aa7472f102b67e5c93a9603fd2a",petok:"59e5637babc9427bfc627a8f0ae9a4286f0d5649-1420081702-1800",zone:"liveescortreviews.com",rocket:"0",apps:{"ga_key":{"ua":"UA-38179730-1","ga_bs":"2"}}}];!function(a,b){a=document.createElement("script"),b=document.getElementsByTagName("script")[0],a.async=!0,a.src="//ajax.cloudflare.com/cdn-cgi/nexp/dok2v=919620257c/cloudflare.min.js",b.parentNode.insertBefore(a,b)}()}}catch(e){};
//]]>
</script>
<link href="http://liveescortreviews.com/assets/graphics/favicon.ico" rel="icon" type="image/x-icon"/>
<link href="http://liveescortreviews.com/as

In [11]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
# The preprocessor is materialized into a list before being handed to apply();
# %time reports how long the parse takes.
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(list(doc_preprocessor), parallelism=parallelism)

Clearing existing...
Running UDF...
CPU times: user 7.23 s, sys: 1.31 s, total: 8.54 s
Wall time: 16.7 s


In [12]:
from snorkel.models import Document, Sentence

# Report how many documents and sentences are currently stored in the database
for _label, _model_cls in (("Documents:", Document), ("Sentences:", Sentence)):
    print(_label, session.query(_model_cls).count())

Documents: 100
Sentences: 1447


In [6]:
from snorkel_utils import create_test_train_splits

# Getting all documents parsed by Snorkel
# (ordered by name so the list handed to the splitter has a stable order)
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
# NOTE(review): 'location' is passed straight to the helper — presumably the
# field the splits are built around; confirm in snorkel_utils.
%time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'location')

Malformatted JSON Entry!
Train: 79325 Docs, 425966 Sentences
Dev: 9916 Docs, 74223 Sentences
Test: 9916 Docs, 72434 Sentences
CPU times: user 2min 45s, sys: 12.2 s, total: 2min 57s
Wall time: 5min 28s


In [6]:
from snorkel.models import Candidate, candidate_subclass

# Designing candidate subclasses
# Creates the candidate class used by the extraction and labeling cells below;
# it is built with the name 'Location' and a single 'location' field.
LocationExtraction = candidate_subclass('Location', ['location'])

In [6]:
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.matchers import LocationMatcher

# Candidate space: n-grams with n_max=7, filtered by a location matcher that
# is configured with longest_match_only=True
location_ngrams = Ngrams(n_max=7)
location_matcher = LocationMatcher(longest_match_only=True)

# Extractor that produces LocationExtraction candidates from the pair above
cand_extractor = CandidateExtractor(
    LocationExtraction,
    [location_ngrams],
    [location_matcher],
)

In [9]:
# Splits are numbered by position in the list: 0 = train, 1 = dev, 2 = test
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    # Extract candidates for this split, then report how many are stored for it
    %time cand_extractor.apply(sents, split=k, parallelism=parallelism)
    print("Number of candidates:", session.query(LocationExtraction).filter(LocationExtraction.split == k).count())

Clearing existing...
Running UDF...
CPU times: user 1min 5s, sys: 22.6 s, total: 1min 27s
Wall time: 1min 40s
Number of candidates: 16159
Clearing existing...
Running UDF...
CPU times: user 10.4 s, sys: 16.5 s, total: 26.8 s
Wall time: 32 s
Number of candidates: 3981
Clearing existing...
Running UDF...


Process CandidateExtractorUDF-298:
Traceback (most recent call last):
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py36torch/lib/python3.6/site-packages/sqlalchemy/engine/base.py", line 2158, in _wrap_pool_connect
    return fn()
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py36torch/lib/python3.6/site-packages/sqlalchemy/pool.py", line 403, in connect
    return _ConnectionFairy._checkout(self)
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py36torch/lib/python3.6/site-packages/sqlalchemy/pool.py", line 788, in _checkout
    fairy = _ConnectionRecord.checkout(pool)
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py36torch/lib/python3.6/site-packages/sqlalchemy/pool.py", line 532, in checkout
    rec = pool._do_get()
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py36torch/lib/python3.6/site-packages/sqlalchemy/pool.py", line 1193, in _do_get
    self._dec_overflow()
  File "/lfs/local/0/jdunnmon/repos/anaconda3/envs/py36torch/lib/python3.6/site-packages/sqlalche

CPU times: user 10.9 s, sys: 15.4 s, total: 26.3 s
Wall time: 56.3 s
Number of candidates: 4172


In [7]:
# Load every candidate stored for split 2 (the test split in the extraction loop)
cands_test = session.query(LocationExtraction).filter(LocationExtraction.split == 2).all()

In [8]:
from snorkel_utils import get_gold_labels_from_meta

# Adding dev gold labels
%time missed = get_gold_labels_from_meta(session, LocationExtraction, 'location', 1, annotator='gold')

# Adding test gold labels
%time missed = get_gold_labels_from_meta(session, LocationExtraction, 'location', 2, annotator='gold')

Loading 3981 candidate labels

AnnotatorLabels created: 3981
CPU times: user 35.9 s, sys: 1.27 s, total: 37.1 s
Wall time: 45.9 s
Loading 4172 candidate labels

AnnotatorLabels created: 4172
CPU times: user 37.2 s, sys: 1.12 s, total: 38.3 s
Wall time: 47.4 s


In [22]:
from snorkel.annotations import load_gold_labels

# Load the labels stored under the 'gold' annotator for split 1 (dev) and split 2 (test)
L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)