# Step 1: Build the Dataset

The first thing to do is ensure that modules are auto-reloaded at runtime to allow for development in other files.

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

We then set the Snorkel database location and start and connect to it.  By default, we use a PosgreSQL database backend, which can be created using `createdb DB_NAME` once psql is installed.  Note that Snorkel does *not* currently support parallel database processing with a SQLite backend.

In [2]:
# Setting Snorkel DB location
import os
import sys

import random
import numpy as np

#For networked PostgreSQL
postgres_location = 'postgresql://saeideh:123@localhost:5432'
postgres_db_name = 'sse_250K_price_test1'
os.environ['SNORKELDB'] = os.path.join(postgres_location,postgres_db_name)

#For local PostgreSQL
#os.environ['SNORKELDB'] = 'postgres:///es_locs_small'

# Adding path above for utils
sys.path.append('../utils')

# For SQLite
#db_location = '.'
#db_name = "es_locs_small.db"
#os.environ['SNORKELDB'] = '{0}:///{1}/{2}'.format("sqlite", db_location, db_name)

# Start Snorkel session
from snorkel import SnorkelSession
session = SnorkelSession()

# Setting random seed
seed = 1701
random.seed(seed)
np.random.seed(seed)

We now set the document preprocessor to read raw data into the Snorkel database.  There exist three possible data source options: JSONL files from the MEMEX project (option: `memex_jsons`), a raw tsv file of extractions from the memex project `content.tsv` (option: `content.tsv`), and tsvs with a similar format to `content.tsv` drawn from an Elasticsearch index of the data (option: `es`).  `max_docs` controls the number of documents read by the preprocessor, and `data_source` sets the location of the data.  For MEMEX json source, this should be a directory, while in all other cases it should be a tsv file.

In [53]:
# from dataset_utils import set_preprocessor, combine_dedupe

# # Set data source: options are 'content.tsv', 'memex_jsons', 'es'
# data_source = 'es'

# # Setting max number of docs to ingest
# max_docs = 1000

# # Setting location of data source

# # For ES:
# data_loc ="/dfs/scratch0/jdunnmon/data/memex-data/tsvs/price/output_all/output_all_shard_00.tsv"
# # Optional: add tsv with additional documents to create combined tsv without duplicates
# #data_all_loc = '/dfs/scratch1/jdunnmon/data/memex-data/es/output_all.tsv'
# #data_loc = combine_dedupe(data_loc, data_all_loc, '/dfs/scratch1/jdunnmon/data/memex-data/es/combined_phone_1M.tsv')

# # Setting preprocessor
# doc_preprocessor = set_preprocessor(data_source, data_loc, max_docs=max_docs, verbose=True,
#                                    clean_docs=True, content_field=['raw_content'])

Now, we execute the preprocessor.  Parallelism can be changed using the `parallelism` flag.  Note that we use the Spacy parser rather than CoreNLP, as this tends to give superior results.

In [None]:
from snorkel.parser import CorpusParser
from snorkel.parser.spacy_parser import Spacy

# Applying corpus parser
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(list(doc_preprocessor), parallelism=16, verbose=True)

Checking the number of parsed documents and sentences in the database.

In [3]:
from snorkel.models import Document, Sentence

# Printing number of docs/sentences
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

Documents: 247439
Sentences: 4891586


Separating into train, dev, and test sets

In [4]:
from dataset_utils import create_test_train_splits
from random import shuffle
docs = session.query(Document).order_by(Document.name).all()# load docs
#print (len(docs))
docs_sample = docs[:500]
#docs[0].sentences
shuffle(docs_sample)  # shuffle order

train_sents = set()
dev_sents = set()
test_sents = set()
numDocs = len(docs_sample)

splits = (0.9, 0.95) 
###
for i, doc in enumerate(docs_sample):
    #print (i)
    for s in doc.sentences:
      
        if i < splits[0] * numDocs:
            train_sents.add(s)
        elif i < splits[1] * numDocs:
            dev_sents.add(s)
        else:
            test_sents.add(s)

print('Training size: \t{},\
      \nDev size:     \t{},\
      \nTest size:    \t{}'.format(len(train_sents), len(dev_sents), len(test_sents)))
# # Getting all documents parsed by Snorkel
# docs = session.query(Document).order_by(Document.name).all()

# # Creating train, test, dev splits
# %time train_docs, dev_docs, test_docs, train_sents, dev_sents, test_sents = create_test_train_splits(docs, 'price', gold_dict=None, dev_frac=0.01, test_frac=0.01, hand_label=True)

Training size: 	8644,      
Dev size:     	660,      
Test size:    	488


Create candidate extractor.

## Candidate Extractons and Filters

### Defining Matchers

In [6]:
#price_regex = re.compile(r'[0-9]+[\.\,]?[0-9]*[:blank:]*\$')
# price_matcher_1= RegexMatchSpan(rgx =r'[0-9]+[\.\,]?[0-9]*[:blank:]*\$', longest_match_only = True)
# price_matcher_2= RegexMatchSpan(rgx =r'$\[0-9]+[\.\,]?[0-9]*[:blank:]', longest_match_only = True)
# price_matcher_3 = RegexMatchSpan(rgx =r'(\d+\.\d{1,2})')
# price_matcher_4 = RegexMatchSpan(rgx = r"[-+]?\d*\.\d+|\d+")#ur'([£$€])(\d+(?:\.\d{2})?)'
#price_matcher_5 = RegexMatchSpan(rgx = r'([£$€])(\d+(?:\.\d{2})?)')
####Define a relation's ContextSpaces


from snorkel.matchers import *
from snorkel.candidates import Ngrams, CandidateExtractor
import re
number_matcher = NumberMatcher(longest_match_only=True) 

#price_matcher= RegexMatchSpan(rgx =u'(\d+).(\d+)', longest_match_only = True)
price_matcher_1_=RegexMatchSpan(rgx = r"^\$+\d+(\d{1,2})?[.]\d{1,2}$")
price_matcher_2_=RegexMatchSpan(rgx = r"^\d+(\d{1,2})?[.]\d{1,2}$")
price_matcher_3_=RegexMatchSpan(rgx = r"^\$\d+(\d{1,2})?$")
price_matcher_4_=RegexMatchSpan(rgx = r"^\d+(\d{1,2})?$")

price_matcher = Union(price_matcher_1_, price_matcher_2_,price_matcher_3_,price_matcher_4_)#,price_matcher_5)
#price_ngrams = OmniNgrams(n_max=6, split_tokens=[])


### Defining Filters

In [15]:
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text)

def ngrams_halfhour_price_filter(cand):
    #lst = [,'hr','hour']
    price_terms =["hr","/hr","hour",'$hour,"hourly", "h", "$h","/hour']
    #time_term = [15]
    #$hh'
    half_terms = ["half", "hh",'$hh','hlf','hhr',"halfhour"]
    right_tokens =list(get_right_tokens(cand, window=2))
    left_tokens = list(get_left_tokens(cand, window=2))
    
    
    if cand[0].get_span()[-1] in ['5','0']:
        if len(list(set(half_terms) & set(right_tokens)))>0 or len(list(set(half_terms) & set(left_tokens)))>0:
            if len(list(set(price_terms) & set(right_tokens)))==0:
                if len(list(set(price_terms) & set(left_tokens)))==0:
                    return True
    
#     return False
# def ngrams_hourly_price_filter(cand):
#     #lst = [,'hr','hour']
#     price_terms =["hr","hour","/hr",'hour','$hour',"hourly",'h',"$h","/hour"]
#     #time_term = [15]
#     #$hh'
#     half_terms = ["half", "hh",'$hh','hlf','hhr',"halfhour"]
#     right_tokens =list(get_right_tokens(cand, window=2))
#     left_tokens = list(get_left_tokens(cand, window=2))
    
    
#     if cand[0].get_span()[-1] in ['5','0']:
#         if len(list(set(price_terms) & set(right_tokens)))>0 or len(list(set(price_terms) & set(left_tokens)))>0:
#             if len(list(set(half_terms) & set(right_tokens)))==0:
#                 if len(list(set(half_terms) & set(left_tokens)))==0:
#                     return True
    
#     return False
# def right_ngrams_price_filter(cand):
#     #lst = [,'hr','hour']
#     price_terms =["minutes","price","hr","hour","min","/hr","minutes$","mins",'hhr',
#             'roses', 'hlf', 'fh', 'hh',
#             'hour', '$hour', 'donation', '$hh', 'reg', 'qk', 'min$'
#             , 'prices', 'hourly', 'qh', '$h', 'nonnegotiable', 'half', 'varies', 'stays',"cash",'/hour',"diamonds"]

#     right_tokens =list(get_right_tokens(cand[0], window=4))
# #     if len(list(set(price_terms) & set(right_tokens)))>0:
# #         print (cand)
# #         #return cand
# #         return True
# #     else:
# #         return False
#     for c in right_tokens:
#         if c in price_terms:
#             return True
#         else:
#             return False
    
   
   
##if "half" not in cand_right_negrams:
#         for token in cand_right_negrams:
#             if token in lst:
#                 return cand
#     else:
#         return False
            
        
            
#candidate_filter= ngrams_hourly_price_filter

NameError: name 'ngrams_hourly_price_filter' is not defined

In [16]:
from snorkel.candidates import Ngrams
from snorkel.candidates import CandidateExtractor
from dataset_utils import create_candidate_class,CandidateExtractorFilter

# Setting extraction type -- should be a subfield in your data source extractions field!
extraction_type = 'price'

# Creating candidate class
candidate_class, candidate_class_name = create_candidate_class(extraction_type)

# Defining ngrams for candidates
ngrams = Ngrams(n_max=5)

# Uand matcher for candidate extractor
matcher = price_matcher
#cand_extractor = CandidateExtractor(candidate_class ,[ngrams],[matcher])#,candidate_filter=None)

In [17]:

cand_extractor = CandidateExtractorFilter(candidate_class ,[ngrams],[matcher])#,candidate_filter=candidate_filter)

Applying candidate extractor to each split (train, dev, test)

In [9]:
# Applying candidate extractor to each split
for k, sents in enumerate([train_sents, dev_sents, test_sents]):
    %time cand_extractor.apply(sents, split=k, parallelism=1)
    print("Number of candidates:", session.query(candidate_class).filter(candidate_class.split == k).count())

Clearing existing...
Running UDF...

CPU times: user 18.1 s, sys: 896 ms, total: 19 s
Wall time: 24.9 s
Number of candidates: 5314
Clearing existing...
Running UDF...

CPU times: user 1.53 s, sys: 84 ms, total: 1.61 s
Wall time: 1.9 s
Number of candidates: 326
Clearing existing...
Running UDF...

CPU times: user 1.44 s, sys: 84 ms, total: 1.53 s
Wall time: 1.86 s
Number of candidates: 302


### candidate investigations

In [10]:
train_cand = session.query(candidate_class).filter(candidate_class.split == 0).all()

In [13]:
def ngrams_halfhour_price_filter(cand):
    #lst = [,'hr','hour']
    price_terms =["hr","/hr","hour",'$hour,"hourly", "h", "$h","/hour']
    #time_term = [15]
    #$hh'
    half_terms = ["half", "hh",'$hh','hlf','hhr',"halfhour"]
    right_tokens =list(get_right_tokens(cand, window=2))
    left_tokens = list(get_left_tokens(cand, window=2))
    
    
    if cand[0].get_span()[-1] in ['5','0']:
        if len(list(set(half_terms) & set(right_tokens)))>0 or len(list(set(half_terms) & set(left_tokens)))>0:
            if len(list(set(price_terms) & set(right_tokens)))==0:
                if len(list(set(price_terms) & set(left_tokens)))==0:
                    return True
    

In [18]:
train_cands = session.query(candidate_class).filter(candidate_class.split == 0).all()
print(len(train_cands))
train_dict = {}
for i, c in enumerate(train_cands):
    doc = c[0].sentence.document.name
    price = c[0].get_span().lower().split()
    text = c[0].get_parent().text.lower()
    #print (doc)
    r = ngrams_halfhour_price_filter(c)
    internal_dict = {}
    if r:
        
        #internal_dict["id"]= doc
        internal_dict["price/hour"]=price
        internal_dict["text"]= text
        train_dict[doc] = internal_dict


# for i, c in enumerate(train_cand):
#     name = c[0].get_span().lower().split()
#     name_1 = c[0].get_parent().text.lower()
#     print ("canidate:",name)
#     print(name_1)

5314


In [20]:
print(len(list(train_dict.keys())))
train_dict

22


{"b'0075B4FC8D25FD95FFE5788F142BD9CD3BE10AFA1AFEA883D2BB6D179979F814'": {'price/hour': ['80'],
  'text': 'url maryland backpage femaleescorts 5oqv b ty full bbw 80 hh e get drained 17551672.'},
 "b'002162F17D43B54820A34F077974DBFD7211E7618510417A503310AD5260DD11'": {'price/hour': ['10'],
  'text': '- michelles magic body rubs.$80 hh delight 10 am - 7 pm - 47'},
 "b'005FDC2C41D375B514448469AA8D138D78727373E4A0CCBC8CB4C93F6220B5B6'": {'price/hour': ['60'],
  'text': 'url washingtondc backpage bodyrubs special 40 hh 60 1hr forever wellness 21407858.'},
 "b'0088E6A27F9FCC5DE8D1369B097589649A922CDE0E4665E1B96FDF5B11D50CEC'": {'price/hour': ['$80'],
  'text': 'hi iam ready show you a good time iam available 24-7 my hh is $80 and my hr is $120 - 26'},
 "b'0006C58C1E38399E623B2DA48B5C3AEA2346AA74EF557C98EECC33B402C35D62'": {'price/hour': ['$80'],
  'text': 'hi iam ready show you a good time iam available 24-7 my hh is $80 and my hr is $120 - 26'},
 "b'004FBFA990D5F28DC377CCB97FDC2BAD7082B41A4C

In [21]:
dev_cands = session.query(candidate_class).filter(candidate_class.split == 1).all()
print(len(dev_cands))
dev_dict = {}
for i, c in enumerate(dev_cands):
    price = c[0].get_span().lower().split()
    text = c[0].get_parent().text.lower()
    #print (doc)
    r = ngrams_halfhour_price_filter(c)
    internal_dict = {}
    if r:
        
        internal_dict["id"]= doc
        internal_dict["price/hour"]=price
        internal_dict["text"]= text
        dev_dict[doc] = internal_dict



326


In [22]:
print(len(list(dev_dict.keys())))
dev_dict

1


{"b'000933C0D6D4007511B1473E84CF086EF23CB1181AD09182572E8796A2749434'": {'id': "b'000933C0D6D4007511B1473E84CF086EF23CB1181AD09182572E8796A2749434'",
  'price/hour': ['150'],
  'text': 'slim, exotic, major hottie clean, outgoing, fun, sweet independent, professional hygiene is a must, must be clean be respectful when contacting me always text before you call no law enforcement no black men no pimps or thugs donation is for my time only 150 hhr 200 hr || do not negotiate || 615-810-0510.'}}

In [24]:
test_cands = session.query(candidate_class).filter(candidate_class.split == 2).all()
print(len(test_cands))
test_dict = {}
for i, c in enumerate(test_cands):
    
    price = c[0].get_span().lower().split()
    print(price)
    text = c[0].get_parent().text.lower()
    print(text)
    #print (doc)
    r = ngrams_halfhour_price_filter(c)
    internal_dict = {}
    if r:
        
        internal_dict["id"]= doc
        internal_dict["price/hour"]=price
        internal_dict["text"]= text
        test_dict[doc] = internal_dict



302
['35']
man gets 35 years for impersonating cop to get free sex from hooker
['2017']
tacoma.backpage.com is an interactive computer service that enables access by multiple users and should not be treated as the publisher or speaker of any information provided by another information content provider. 2017
['7541168']
title 423-888-8141 chattanooga tennessee escort 7541168
['30']
college park, maryland prostitution sting: 30 arrested
['160']
hello gentleman. my name is natalie. im a 25 year old young lady. i have an exotic lineage with my mother being half korean and half black and my father hailing from the dominican republic.im interested in meeting respectful generous gentlemen who just want to kickback and spend some quality time with a beautiful young woman. *always clean. *always well dressed. *57 160 lbs with all natural 36f breasts.. really. natalie 786-592-4019 *explicit talk ends call.**. see my menu of services on my profile before calling...
['25']
hello gentleman. my name

['25']
doing in calls and out calls 330- 734- 9971 - 25
['22']
- sexy beauty desiree 1000% real - 22
['9971']
doing in calls and out calls 330- 734- 9971 - 25
['25']
doing in calls and out calls 330- 734- 9971 - 25
['22']
- a l l o f t h e a b o v e - 22
['23']
doin outcalls 330- 734- 9971 - 23
['9971']
doin outcalls 330- 734- 9971 - 23
['22']
- n.e.w vanessa mind skills truly a mans best friend the real trophy - 22
['9971']
doing in calls 330- 734- 9971 - 25
['25']
doing in calls 330- 734- 9971 - 25
['20']
- new mixed caramel beauty bo0bies & bo0ty $80 hh available now. - 20
['$80']
- new mixed caramel beauty bo0bies & bo0ty $80 hh available now. - 20
['25']
in calls only for grown sexy serious men - 25
['21']
- 60sexy freak vbe 24/7come let me bounce this ass & - 21
['25']
in calls and out calls for grown serious men - 25
['20']
- earlybird now. [azzzzzzz clapping fun] new pics[curvy big booty] % real call - 20
['21']
you will me. gorgeously sexy latin - 21
['20']
- n os euctive & ai

['20']
- new funsized ebony bombshell super freaky open minded & kinky the best new - 20
['5960']
1 866 969 5960 hott lesbian strippers delivered 2 your upstate...
['2']
1 866 969 5960 hott lesbian strippers delivered 2 your upstate...
['1']
1 866 969 5960 hott lesbian strippers delivered 2 your upstate...
['866']
1 866 969 5960 hott lesbian strippers delivered 2 your upstate...
['969']
1 866 969 5960 hott lesbian strippers delivered 2 your upstate...
['20']
- ebony bombshell model can u handle me ur personal porn - 20
['$245']
$245 no time limit female & male strippers 800-788-1785 - 21
['21']
$245 no time limit female & male strippers 800-788-1785 - 21
['250']
var ad_idzone = 2167013,ad_width = 250,ad_height = 250
['250']
var ad_idzone = 2166993,ad_width = 250,ad_height = 250
['20']
cute petite cocoa complected goddess - 20
['40']
title mature european blonde --naturally busty 40 dd curvy thick voluptuous escort massage fetishes toys - toronto body rubs - backpage ca
['50']
mature eu

In [25]:
print(len(list(test_dict.keys())))
test_dict

1


{"b'000933C0D6D4007511B1473E84CF086EF23CB1181AD09182572E8796A2749434'": {'id': "b'000933C0D6D4007511B1473E84CF086EF23CB1181AD09182572E8796A2749434'",
  'price/hour': ['$80'],
  'text': '- new mixed caramel beauty bo0bies & bo0ty $80 hh available now. - 20'}}

### Testing Functions

In [29]:
def right_ngrams_price_filter(cand):
    #lst = [,'hr','hour']
    price_terms =["hr","hour","/hr",'hour','$hour',"hourly","h", "$h","/hour"]
    #time_term = [15]
    #$hh'
    half_terms = ["half", "hh",'$hh','hlf','hhr',"halfhour"]
    right_tokens =list(get_right_tokens(cand, window=2))
    left_tokens = list(get_left_tokens(cand, window=2))
    
    
    if cand[0].get_span()[-1] in ['5','0']:
        if len(list(set(price_terms) & set(right_tokens)))>0 or len(list(set(price_terms) & set(left_tokens)))>0:
            if len(list(set(half_terms) & set(right_tokens)))==0:
                if len(list(set(half_terms) & set(left_tokens)))==0:
                    return True
    
    return False
#################################################################      
# def ngrams_halfhour_price_filter(cand):
#     #lst = [,'hr','hour']
#     price_terms =["hr","/hr","hour",'$hour,"hourly", "h", "$h","/hour']
#     #time_term = [15]
#     #$hh'
#     half_terms = ["half", "hh",'$hh','hlf','hhr',"halfhour"]
#     right_tokens =list(get_right_tokens(cand, window=2))
#     left_tokens = list(get_left_tokens(cand, window=2))
    
    
#     if cand[0].get_span()[-1] in ['5','0']:
#         if len(list(set(half_terms) & set(right_tokens)))>0 or len(list(set(half_terms) & set(left_tokens)))>0:
#             if len(list(set(price_terms) & set(right_tokens)))==0:
#                 if len(list(set(price_terms) & set(left_tokens)))==0:
#                     return True
    
#     return False
###############################################################      
 

In [36]:
train_cands = session.query(candidate_class).filter(candidate_class.split == 0).all()

from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text)
half_term = ["half", "hh",'$hh','hlf','hhr']
for i, c in enumerate(train_cand):
    name = c[0].get_span().lower().split()
    name_1 = c[0].get_parent().text.lower()
    #right_tokens =list(get_right_tokens(c[0], window=4))
    r = right_ngrams_price_filter(c)
    
    if r:
        left_tokens = list(get_left_tokens(c, window=2))
        
        right_tokens = list(get_right_tokens(c, window=2))
#         for h in half_term:
        if 'half' not in right_tokens:
        #if len(list(set(half_term) & set(left_tokens)))<0:
            print (c)
            print("candidate:",name)
            print(" ")
            print("get parent words:", name_1)
        
            print ( " left_tokens:",left_tokens)
            print ( " right_tokens:",right_tokens)
            print(" ")
            print(" ")
    #print (right_tokens)
        #print(r)
    
    

Price(Span("b'100'", sentence=4430698, chars=[99,101], words=[17,17]))
candidate: ['100']
 
get parent words: title arriving soon fetishes of all kinds not the law, non smoker and drug free,donations start at 100 hr - 102 - a sexy service com
 left_tokens: ['start', 'at']
 right_tokens: ['hr', '-']
 
 
Price(Span("b'100'", sentence=4430744, chars=[94,96], words=[15,15]))
candidate: ['100']
 
get parent words: arriving soon. fetishes of all kinds.not the law, non smoker and drug free,donations start at 100 hr - 102
 left_tokens: ['start', 'at']
 right_tokens: ['hr', '-']
 
 
Price(Span("b'$60'", sentence=2469227, chars=[27,29], words=[5,5]))
candidate: ['$60']
 
get parent words: special with cash one hour $60
 left_tokens: ['one', 'hour']
 right_tokens: []
 
 
Price(Span("b'200'", sentence=3730554, chars=[293,295], words=[49,49]))
candidate: ['200']
 
get parent words: hey guys i am available for companionship all night. i am100 percent real and ready to spend time with you. i onlysche

Price(Span("b'65'", sentence=5021289, chars=[43,44], words=[5,5]))
candidate: ['65']
 
get parent words: url calgary backpage ca therapeuticmassage 65 hour at elite massage_ 5 masseuse waiting for you 15560696.
 left_tokens: ['ca', 'therapeuticmassage']
 right_tokens: ['hour', 'at']
 
 
Price(Span("b'50'", sentence=1191820, chars=[15,16], words=[3,3]))
candidate: ['50']
 
get parent words: special only $ 50 per hour fancy beautiful asian spa hauppauge 631.626.0400
 left_tokens: ['only', '$']
 right_tokens: ['per', 'hour']
 
 
Price(Span("b'40'", sentence=4512812, chars=[41,42], words=[4,4]))
candidate: ['40']
 
get parent words: url colorado backpage therapeuticmassage 40 1 hr massage englewood on hampden and pearl new young girl 720 216 3708 17625435.
 left_tokens: ['backpage', 'therapeuticmassage']
 right_tokens: ['1', 'hr']
 
 
Price(Span("b'250'", sentence=3126572, chars=[425,427], words=[87,87]))
candidate: ['250']
 
get parent words: hello, my name is rachelle and i am visiting m

Add gold labels.

In [None]:
# from dataset_utils import get_gold_labels_from_meta

# # Adding dev gold labels using dictionary
# %time missed_dev = get_gold_labels_from_meta(session, candidate_class, extraction_type, 1, annotator='gold', gold_dict=None)

# # Adding test gold labels using dictionary
# %time missed_test = get_gold_labels_from_meta(session, candidate_class, extraction_type, 2, annotator='gold', gold_dict=None)

In [None]:
# # Checking percent of gold labels that are positive
# from dataset_utils import check_gold_perc
# perc_pos = check_gold_perc(session)

In [None]:
# from dataset_utils import remove_gold_labels
# # Remove gold labels if you want -- uncomment!
# #remove_gold_labels(session)