# Building the Corpus

In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
# Setting Snorkel DB location
import os
import sys

# For PostgreSQL
# NOTE(review): the fallback below embeds a username and password in the
# notebook. Export SNORKEL_POSTGRES_LOCATION to supply the server URL from the
# environment instead; the literal is kept only so existing setups keep working.
postgres_location = os.environ.get(
    'SNORKEL_POSTGRES_LOCATION',
    'postgresql://jdunnmon:123@localhost:5432',
)
#postgres_db_name = 'memex_db_snorkel_large'
#postgres_db_name = 'memex_snorkel_db_extracted_text_10K'
#postgres_db_name = 'memex_snorkel_db_extracted_text_150K'
#postgres_db_name = 'memex_db_snorkel_tsv_1M'
postgres_db_name = 'es_locs_small'

# A connection URL always uses '/', so join explicitly instead of relying on
# os.path.join, which inserts the platform's path separator.
os.environ['SNORKELDB'] = '/'.join([postgres_location.rstrip('/'), postgres_db_name])

# Adding path above for utils (guarded so re-running the cell does not add
# duplicate entries to sys.path)
if '..' not in sys.path:
    sys.path.append('..')

# For SQLite
#db_location = '.'
#db_name = "snorkel_memex.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
# SNORKELDB is set before snorkel is imported -- presumably snorkel reads it at
# import time; keep this ordering unless verified otherwise.
from snorkel import SnorkelSession
session = SnorkelSession()

In [4]:
# Set data source: options are 'content.tsv', 'es', 'memex_jsons'
# (the preprocessor-selection cell raises ValueError for any other value)
data_source = 'es'

# Setting max number of docs to ingest (passed as max_docs to the document preprocessor)
max_docs = 10000

In [5]:
from snorkel_utils import (
    ESTSVDocPreprocessor,
    MEMEXJsonLGZIPPreprocessor,
    MemexTSVDocPreprocessor,
    retrieve_all_files,
)

# Build the document preprocessor that matches the configured data_source.
if data_source == 'content.tsv':
    data_loc = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/data_sample'

    # MEMEX source data (assigned for reference; not read by the preprocessor below)
    file_path = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/content.tsv'

    # Unique-URL MEMEX source data -- this is the file that gets ingested
    file_path_unique = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/content_unique.tsv'

    doc_preprocessor = MemexTSVDocPreprocessor(
        path=file_path_unique, max_docs=max_docs, verbose=False, clean_docs=True
    )

elif data_source == 'es':
    # MEMEX source data for the 'es' source (previous location kept for reference)
    #file_path_unique = '/dfs/scratch1/jdunnmon/data/memex-data/es/es_locations.tsv'
    file_path_unique = '/lfs/local/0/jdunnmon/chtap/extractors/src/elasticsearch_preprocessing/output_location.tsv'

    doc_preprocessor = ESTSVDocPreprocessor(
        path=file_path_unique, max_docs=max_docs, verbose=False, clean_docs=True
    )

elif data_source == 'memex_jsons':
    # Location on raiders
    data_loc = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/data_sample'

    # Collect every file path under data_loc, keeping only those ending in 'gz'
    path_list = [p for p in retrieve_all_files(data_loc) if p.endswith('gz')]

    # content_field='extracted_text' selects the extracted text as raw content
    doc_preprocessor = MEMEXJsonLGZIPPreprocessor(
        data_loc,
        file_list=path_list,
        encoding='utf-8',
        max_docs=max_docs,
        verbose=False,
        content_field='extracted_text',
    )

else:
    raise ValueError('Invalid data source!')


  from ._conv import register_converters as _register_converters


In [5]:
#a = doc_preprocessor._read_content_file(path_list[1])

In [6]:
#b = a[a['content_type'] == 'text/html; charset=UTF-8']
#s =a['extracted_text'][801].replace('\n',' ').replace('\t',' ')
#" ".join(s.split())

In [5]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser, using Spacy as the underlying parser.
# list(doc_preprocessor) materializes everything the preprocessor yields before
# it is handed to apply(); %time reports the CPU and wall time of the whole call.
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(list(doc_preprocessor), parallelism=8, verbose=False)

Clearing existing...
Running UDF...
CPU times: user 22.5 s, sys: 1.18 s, total: 23.6 s
Wall time: 10min


In [6]:
from snorkel.models import Document, Sentence

# Report how many rows of each parsed type the session currently holds
for label, model in (("Documents:", Document), ("Sentences:", Sentence)):
    print(label, session.query(model).count())

Documents: 7508
Sentences: 104986


In [6]:
import pickle

# Read the pickled gold label dict from disk.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# point this at a file from a trusted source.
gold_path = '/lfs/local/0/jdunnmon/data/memex-data/gold_labels/gold_loc.pickle'
with open(gold_path, 'rb') as gold_file:
    gold_dict = pickle.load(gold_file)

In [7]:
from snorkel_utils import create_test_train_splits

# Getting all documents parsed by Snorkel, ordered by name so the list order
# is the same on every run
docs = session.query(Document).order_by(Document.name).all()

# Creating train, test, dev splits for the 'location' field; dev_frac and
# test_frac are both 0.01.
# NOTE(review): gold_dict=None is passed here even though a gold_dict is loaded
# from a pickle earlier in the notebook -- confirm that this is intentional.
%time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'location', gold_dict=None, dev_frac=0.01, test_frac=0.01,)

Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Malformatted JSON Entry!
Train: 7358 Docs, 102888 Sentences
Dev: 75 Docs, 976 Sentences
Test: 75 Docs, 1122 Sentences
CPU times: user 18.1 s, sys: 1.08 s, total: 19.2 s
Wall time: 26.8 s


In [9]:
from snorkel.models import Candidate, candidate_subclass
# Designing candidate subclasses: LocationExtraction is created by
# candidate_subclass with class name 'Location' and the field list ['location']
LocationExtraction = candidate_subclass('Location', ['location'])

In [10]:
from snorkel.candidates import CandidateExtractor, Ngrams
#from snorkel.matchers import LocationMatcher
from snorkel_utils import (
    CandidateExtractorFilter,
    LocationMatcher,
    get_candidate_filter,
    get_location_matcher,
)

# Candidate spans: n-grams generated with n_max=3
location_ngrams = Ngrams(n_max=3)

# Matcher taken from snorkel_utils, configured with longest_match_only=True
# (alternative kept for reference)
#location_matcher  = get_location_matcher()
location_matcher = LocationMatcher(longest_match_only=True)

# Plain extractor; the filtered variant is kept below for reference
#candidate_filter =  get_candidate_filter()
#cand_extractor = CandidateExtractorFilter(LocationExtraction,[location_ngrams],[location_matcher],candidate_filter=candidate_filter)
cand_extractor = CandidateExtractor(
    LocationExtraction,
    [location_ngrams],
    [location_matcher],
)

In [10]:
# Run candidate extraction once per split. enumerate assigns the split ids
# 0 = train, 1 = dev, 2 = test; later cells select candidates by these ids.
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    %time cand_extractor.apply(sents, split=k, parallelism=8)
    # Count the LocationExtraction candidates stored for this split
    print("Number of candidates:", session.query(LocationExtraction).filter(LocationExtraction.split == k).count())

Clearing existing...
Running UDF...
CPU times: user 13.4 s, sys: 1.5 s, total: 14.9 s
Wall time: 20.4 s
Number of candidates: 7202
Clearing existing...
Running UDF...
CPU times: user 164 ms, sys: 232 ms, total: 396 ms
Wall time: 3.49 s
Number of candidates: 53
Clearing existing...
Running UDF...
CPU times: user 208 ms, sys: 248 ms, total: 456 ms
Wall time: 3.55 s
Number of candidates: 93


In [11]:
from snorkel_utils import get_gold_labels_from_meta

# Adding dev gold labels (split 1) under annotator 'gold'.
# NOTE(review): gold_dict=None is passed, so no gold dictionary is supplied;
# presumably the helper reads labels from document metadata, as its name
# suggests -- verify against snorkel_utils.
%time missed_dev = get_gold_labels_from_meta(session, LocationExtraction, 'location', 1, annotator='gold', gold_dict=None)

# Adding test gold labels (split 2) with the same settings
%time missed_test = get_gold_labels_from_meta(session, LocationExtraction, 'location', 2, annotator='gold', gold_dict=None)

Loading 53 candidate labels

AnnotatorLabels created: 53
CPU times: user 2.01 s, sys: 152 ms, total: 2.16 s
Wall time: 2.19 s
Loading 93 candidate labels

AnnotatorLabels created: 93
CPU times: user 1.8 s, sys: 92 ms, total: 1.89 s
Wall time: 1.99 s


In [23]:
from snorkel_utils import remove_gold_labels
# Remove gold labels if you want -- uncomment!
#remove_gold_labels(session)

In [12]:
# Checking percent of gold labels that are positive; the helper prints a
# "Percent Positive" line and its return value is kept in perc_pos
from snorkel_utils import check_gold_perc
perc_pos = check_gold_perc(session)

Percent Positive: 0.15


## SANDBOX (area for testing)

In [13]:
# Pick one training document (index 6) for inspection.
# NOTE(review): the name `a` is rebound to unrelated values later in this sandbox.
a = train_docs[6]

In [14]:
# Display the sentences attached to the document selected above
a.sentences

[Sentence(Document b'44FB22984AAED962B4E98E08E1A2861D58E1127760A2F4017593BABE75F0238D',0,b'b\'"Sou uma mulher encantadora, culta, meiga, carinhosa, bom nvel, boa conversa, experiente.'),
 Sentence(Document b'44FB22984AAED962B4E98E08E1A2861D58E1127760A2F4017593BABE75F0238D',1,b'Convivo com homens de'),
 Sentence(Document b'44FB22984AAED962B4E98E08E1A2861D58E1127760A2F4017593BABE75F0238D',2,b'bom gosto'),
 Sentence(Document b'44FB22984AAED962B4E98E08E1A2861D58E1127760A2F4017593BABE75F0238D',3,b'e que gostem de estar'),
 Sentence(Document b'44FB22984AAED962B4E98E08E1A2861D58E1127760A2F4017593BABE75F0238D',4,b'em boa companhia ao lado de uma mulher bonita, bem cuidada, charmosa, sensual e sem preconceitos.'),
 Sentence(Document b'44FB22984AAED962B4E98E08E1A2861D58E1127760A2F4017593BABE75F0238D',5,b'Tenho um corpo bonito e bem cuidado, seios durinhos, bumbum grande, pele cheirosa e macia, cabelos castanhos.'),
 Sentence(Document b'44FB22984AAED962B4E98E08E1A2861D58E1127760A2F4017593BABE75F0

In [17]:
from snorkel.annotations import load_gold_labels

# Load the 'gold' annotator's labels for split 1 (dev) and split 2 (test)
L_gold_dev, L_gold_test = [
    load_gold_labels(session, annotator_name='gold', split=split_id)
    for split_id in (1, 2)
]

In [18]:
# Display a single training sentence (index 6) as a spot check
train_sents[6]

Sentence(Document http://liveescortreviews.com/ad/sanjose/408-621-7949/1/199206,1,b'Ph@t @$$')

In [15]:
# Sandbox: turn the stringified 'extractions' metadata of one document into a dict.
# Judging by the raw value displayed elsewhere in this sandbox, the field holds
# the text of a bytes literal wrapping a pretty-printed dict, followed by CRLF.
doc = docs[5]
# The chain runs in this order: remove literal backslash-n sequences, turn every
# single quote into a double quote, drop '|', strip newline / CR / 'b' / '"'
# characters from the ends, collapse doubled double-quotes, then rewrite
# backslash sequences (presumably so that json.loads accepts the text).
# NOTE(review): replacing every ' with " yields unbalanced quotes whenever a
# value itself contains an apostrophe or is quoted with double quotes.
dict_string = doc.meta['extractions'].replace('\\n','').replace('\'','"').replace('|','').strip('\n').strip('\r').strip('b').strip('"').replace('""','"').replace('\\"',"\\").replace('\\','\\\\')
# Collapse runs of whitespace to one space, then remove the '" "' sequence left
# between adjacent string fragments so they read as a single value
dict_string = " ".join(dict_string.split()).replace('" "','')
#dict_string = dict_string[1:-1]
print(dict_string)
import json
# NOTE(review): rebinds `a`, which held a document earlier in this sandbox
a = json.loads(str(dict_string))

{"ethnicity": "rican", "location": "Df, Mexico", "title": "Katya, flaquita sexy, real y cogelona. Df, 5522139760 - Df anuncios clasificados de escorts, modelos, edecanes, acompaantes - escorts - backpage.mx"}


In [97]:
# Lighter cleanup of the same field: remove literal backslash-n sequences, then
# strip newline, 'b' and CR characters from the ends (rebinds `a` to a str)
a = doc.meta['extractions'].replace('\\n','').strip('\n').strip('b').strip('\r')

In [73]:
# str.replace is called on the result of json.loads, so this only works when
# `a` decodes to a JSON string rather than an object; single quotes in that
# string are then swapped for double quotes
c = json.loads(a).replace("'",'"')

In [74]:
# Display the quote-normalized string
c

'{"age": "29", "location": "Pretoria, South Africa", "post_date": "2015-11-28", "title": "Kinky Martin - Sophisticated, Naughty, Well Hung African Hunk "          "Playmate for Women - Pretoria male escorts - backpage.com"}'

In [83]:
# Display the raw, unprocessed 'extractions' metadata string of the current doc
doc.meta['extractions']

'b"{\'location\': \'Df, Mexico\',\\n \'title\': \'ALEXXANDRA  Nena Super Coqueta y Atractiva !! Rico ORAL Natural y \'\\n          \'Polaco.. - Df anuncios clasificados erticos y para adultos - \'\\n          \'erticos - backpage.mx\'}"\r\n'

In [80]:
# Display the metadata split on ':' after stripping 'b', newline and CR
# characters from its ends
doc.meta['extractions'].strip('b').strip('\n').strip('\r').split(':')

['"{\'age\'',
 " '29',\\n 'location'",
 " 'Pretoria, South Africa',\\n 'post_date'",
 " '2015-11-28',\\n 'title'",
 ' \'Kinky Martin - Sophisticated, Naughty, Well Hung African Hunk \'\\n          \'Playmate for Women - Pretoria male escorts - backpage.com\'}"']

In [103]:
# Sanity check: well-formed JSON text parses directly into a dict.
# The closing parenthesis of the call was missing, which made this cell a
# SyntaxError; the result is also named so it can be inspected afterwards.
parsed_example = json.loads('{"location": "Brussel, Belgium", "title": "Zoe infirmiere - Brussel escortes - backpage.com"}')
parsed_example

{'location': 'Brussel, Belgium',
 'title': 'Zoe infirmiere - Brussel escortes - backpage.com'}

In [120]:
# Display dict_string with whitespace runs collapsed and the '" "' sequence
# between adjacent string fragments removed (dict_string itself is not rebound)
" ".join(dict_string.split()).replace('" "','')

'{"location": "Guadalajara, Mexico", "phone": "(331) 768-3420", "title": "riquisimo servio escort de primer nivel - Guadalajara anuncios clasificados de escorts, modelos, edecanes, acompaantes - escorts - backpage.mx"}'