# Building the Corpus

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
# Setting Snorkel DB location
import os
import sys

#For PostgreSQL
#postgres_location = 'postgresql://jdunnmon:123@localhost:5432'
postgres_location = 'postgresql://saeideh:123@localhost:5432'
#postgres_db_name = 'memex_db_snorkel_large'
#postgres_db_name = 'memex_snorkel_db_extracted_text_10K'
#postgres_db_name = 'memex_snorkel_db_extracted_text_150K'
#postgres_db_name = 'memex_db_snorkel_tsv_1M'
postgres_db_name = 'memex_db_saeideh_10k_test'
os.environ['SNORKELDB'] = os.path.join(postgres_location,postgres_db_name)

# Adding path above for utils
sys.path.append('..')

# For SQLite
#db_location = '.'
#db_name = "snorkel_memex.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

In [5]:
# Set data source: options are content.tsv, memex_jsons
data_source = 'content.tsv'

# Setting max number of docs to ingest
max_docs = 10000

## creating a preprocessor based on the files

In [6]:
from snorkel_utils import MemexTSVDocPreprocessor, MEMEXJsonLGZIPPreprocessor, ESTSVDocPreprocessor, retrieve_all_files

if data_source == 'content.tsv':
    data_loc = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/data_sample'
    
    # Setting path to MEMEX source data
    file_path = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/content.tsv'

    # Setting path to unique URL MEMEX source data
    file_path_unique = '/dfs/scratch1/jdunnmon/data/memex-data/gold_labels/content_unique.tsv'
 

    # Initializing document preprocessor
    doc_preprocessor = MemexTSVDocPreprocessor(
        path=file_path_unique,
        max_docs=max_docs,
        verbose=False,
        clean_docs=True
    )
    
elif data_source == 'es':
    # Setting path to MEMEX source data
    file_path_unique = '/dfs/scratch1/jdunnmon/data/memex-data/es/es_locations.tsv'
    
        # Initializing document preprocessor
    doc_preprocessor = ESTSVDocPreprocessor(
        path=file_path_unique,
        max_docs=max_docs,
        verbose=False,
        clean_docs=True
    )

elif data_source == 'memex_jsons':
    # Location on raiders
    data_loc = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/data_sample'

    # Getting all file paths
    path_list = retrieve_all_files(data_loc)

    # Applying arbitrary conditions to file path list
    path_list = [a for a in path_list if a.endswith('gz')]

    # Preprocessing documents from path_list
    # Set "content field" to "extracted_text" to use extracted text as raw content
    doc_preprocessor = MEMEXJsonLGZIPPreprocessor(data_loc,\
                                    file_list=path_list,encoding='utf-8', max_docs=max_docs, verbose=False, content_field='extracted_text')
else:
    raise ValueError('Invalid data source!')


In [5]:
#a = doc_preprocessor._read_content_file(path_list[1])

In [6]:
#b = a[a['content_type'] == 'text/html; charset=UTF-8']
#s =a['extracted_text'][801].replace('\n',' ').replace('\t',' ')
#" ".join(s.split())

In [6]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(list(doc_preprocessor), parallelism=8, verbose=False)

Clearing existing...
Running UDF...
CPU times: user 2.14 s, sys: 392 ms, total: 2.53 s
Wall time: 24.8 s


In [3]:
from snorkel.models import Document, Sentence

# Printing number of docs/sentences
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

Documents: 10000
Sentences: 45381


In [13]:
import pickle

# Importing gold label dict
with open('/lfs/local/0/jdunnmon/data/memex-data/gold_labels/gold_loc.pickle', 'rb') as handle:
    gold_dict = pickle.load(handle)

In [8]:
from snorkel_utils import create_test_train_splits

# Getting all documents parsed by Snorkel
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits
%time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'location', gold_dict=None, dev_frac=0.01, test_frac=0.01,)

Train: 9800 Docs, 44248 Sentences
Dev: 100 Docs, 640 Sentences
Test: 100 Docs, 493 Sentences
CPU times: user 13.3 s, sys: 940 ms, total: 14.3 s
Wall time: 22.1 s


In [9]:
from snorkel.models import Candidate, candidate_subclass

# Designing candidate subclasses
LocationExtraction = candidate_subclass('Location', ['location'])

In [10]:
from snorkel.candidates import Ngrams
from snorkel.candidates import CandidateExtractor
#from snorkel.matchers import LocationMatcher
from snorkel_utils import get_location_matcher, get_candidate_filter, CandidateExtractorFilter, LocationMatcher

# Defining ngrams and matcher for candidate extractor
location_ngrams   = Ngrams(n_max=3)
#location_matcher  = get_location_matcher()
location_matcher = LocationMatcher(longest_match_only=True)
#candidate_filter =  get_candidate_filter()
#cand_extractor = CandidateExtractorFilter(LocationExtraction,[location_ngrams],[location_matcher],candidate_filter=candidate_filter)
cand_extractor    = CandidateExtractor(LocationExtraction, [location_ngrams], [location_matcher])

In [11]:
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    %time cand_extractor.apply(sents, split=k, parallelism=8)
    print("Number of candidates:", session.query(LocationExtraction).filter(LocationExtraction.split == k).count())

Clearing existing...
Running UDF...
CPU times: user 5.26 s, sys: 1.39 s, total: 6.66 s
Wall time: 11.7 s
Number of candidates: 3785
Clearing existing...
Running UDF...
CPU times: user 120 ms, sys: 316 ms, total: 436 ms
Wall time: 3.51 s
Number of candidates: 43
Clearing existing...
Running UDF...
CPU times: user 100 ms, sys: 292 ms, total: 392 ms
Wall time: 3.48 s
Number of candidates: 58


In [19]:
from snorkel_utils import get_gold_labels_from_meta

# Adding dev gold labels using dictionary
%time missed_dev = get_gold_labels_from_meta(session, LocationExtraction, 'location', 1, annotator='gold', gold_dict = None)

# Adding test gold labels using dictionary
%time missed_test = get_gold_labels_from_meta(session, LocationExtraction, 'location', 2, annotator='gold', gold_dict= None)

Loading 43 candidate labels

AnnotatorLabels created: 43
CPU times: user 812 ms, sys: 40 ms, total: 852 ms
Wall time: 902 ms
Loading 58 candidate labels

AnnotatorLabels created: 58
CPU times: user 1.11 s, sys: 76 ms, total: 1.19 s
Wall time: 1.26 s


In [18]:
from snorkel_utils import remove_gold_labels
#Remove gold labels if you want -- uncomment!
#remove_gold_labels(session)

In [20]:
# Checking percent of gold labels that are positive
from snorkel_utils import check_gold_perc
perc_pos = check_gold_perc(session)

Percent Positive: 0.06


## SANDBOX (area for testing)

In [22]:
a = train_docs[6]

In [23]:
a.sentences

[Sentence(Document http://920-850-6302.escortphonelist.com/im-here-i-hope-your-near-3225785.html,0,b'"Fargo.'),
 Sentence(Document http://920-850-6302.escortphonelist.com/im-here-i-hope-your-near-3225785.html,1,b'920-850-6302'),
 Sentence(Document http://920-850-6302.escortphonelist.com/im-here-i-hope-your-near-3225785.html,2,b"I'm back"),
 Sentence(Document http://920-850-6302.escortphonelist.com/im-here-i-hope-your-near-3225785.html,3,b'and I need to finish what I started,'),
 Sentence(Document http://920-850-6302.escortphonelist.com/im-here-i-hope-your-near-3225785.html,4,b'clean comfortable incalls'),
 Sentence(Document http://920-850-6302.escortphonelist.com/im-here-i-hope-your-near-3225785.html,5,b"I'm hot sexy and ready for you."),
 Sentence(Document http://920-850-6302.escortphonelist.com/im-here-i-hope-your-near-3225785.html,6,b"5'5 brown eyes and a body to die for"),
 Sentence(Document http://920-850-6302.escortphonelist.com/im-here-i-hope-your-near-3225785.html,7,b"I'm looki

In [17]:
from snorkel.annotations import load_gold_labels

L_gold_dev = load_gold_labels(session, annotator_name='gold', split=1)
L_gold_test = load_gold_labels(session, annotator_name='gold', split=2)

In [18]:
train_sents[6]

Sentence(Document http://liveescortreviews.com/ad/sanjose/408-621-7949/1/199206,1,b'Ph@t @$$')

In [13]:
doc = docs[1002]
dict_string = doc.meta['extractions'].replace('|','').strip('\n').strip('b').replace('""','"').replace('\\"',"\\").replace('\\','\\\\')
#dict_string = dict_string[1:-1]
print(dict_string)
import json
a = json.loads(dict_string)

"{'location': 'Lisboa, Portugal',\\n 'title': '? ? ? ? ? ? ? ? VIOLET ? ? ? ? ? ? ? ? - Lisboa acompanhantes - '\\n          'backpage.com'}"


JSONDecodeError: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)

In [30]:
a = eval(doc.meta['extractions'])

In [38]:
doc.meta['extractions']

'b"{\'location\': \'Lisboa, Portugal\',\\n \'title\': \'? ? ? ? ? ? ? ? VIOLET ? ? ? ? ? ? ? ? - Lisboa acompanhantes - \'\\n          \'backpage.com\'}"\r\n'

In [39]:
"\'"

"'"