# Entity Extraction from Tables

This notebook demonstrates the full extraction and learning process for _entities_ in tables, using a data set of transistor spec sheets from which we extract minimum storage temperatures.

In [23]:
%load_ext autoreload
%autoreload 2

import numpy as np

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


### Extraction

In [24]:
load_pickle = False # with pickle ~15s; without pickle ~75s
save_pickle = False

corpus_loaded = False
import cPickle
if load_pickle:
    try:
        with open("data/hardware/hardware_corpus.pkl","r") as pkl:
            %time corpus = cPickle.load(pkl)
        corpus_loaded = True
        print "Corpus has been loaded."
    except:
        print "Corpus could not be loaded."
        print "Corpus will be parsed instead..."
if not corpus_loaded:
    from snorkel.parser import CorpusParser
    from snorkel.parser import HTMLParser
    from snorkel.parser import TableParser

    html_parser = HTMLParser(path='data/hardware/hardware_html/')
    table_parser = TableParser()

    cp = CorpusParser(html_parser, table_parser, max_docs=5)
    %time corpus = cp.parse_corpus(name='Hardware Corpus')
    print "Corpus has been parsed."

    if save_pickle:
        with open("data/hardware/hardware_corpus.pkl","w") as pkl:
            %time cPickle.dump(corpus, pkl)
            print "Corpus has been pickled."

CPU times: user 5.59 s, sys: 265 ms, total: 5.86 s
Wall time: 8.11 s
Corpus has been parsed.


In [25]:
# NOTE: this cell is intentionally disabled. When enabled it would add the
# parsed corpus to a SnorkelSession, commit it, and query it back by name.
# from snorkel import SnorkelSession
# session = SnorkelSession()
# session.add(corpus)
# session.commit()
# from snorkel.models import Corpus
# corpus = session.query(Corpus).filter(Corpus.name == 'Hardware Corpus').one()
# corpus

In [26]:
# NOTE(review): NumberMatcher is imported here but not used in this cell.
from snorkel.candidates import TableNgrams
from snorkel.matchers import NumberMatcher, RangeMatcher

# Define a candidate space
# n_max=2: presumably n-grams of up to two words -- confirm against TableNgrams.
ngrams = TableNgrams(n_max=2)

# Define a matcher
# Keeps only values within [-70, -50]; the notebook targets minimum storage
# temperatures, so these bounds are presumably degrees Celsius -- TODO confirm.
number_matcher = RangeMatcher(low=-70,high=-50)

In [27]:
# Combine the candidate space (`ngrams`) and the matcher (`number_matcher`)
# defined in the previous cell into an extractor, then run it over the
# tables of the corpus.
from snorkel.candidates import EntityExtractor
ce = EntityExtractor(ngrams, number_matcher)
%time candidates = ce.extract(corpus.get_tables(), name='all')
# Preview at most the first ten candidates, then report the total count.
for cand in candidates[:10]: 
    print cand
print "%s candidates extracted" % len(candidates)

CPU times: user 111 ms, sys: 3.32 ms, total: 115 ms
Wall time: 113 ms
Span("-55", context=None, chars=[0,2], words=[0,0])
Span("-50", context=None, chars=[0,2], words=[0,0])
Span("-50", context=None, chars=[0,2], words=[0,0])
Span("-50", context=None, chars=[4,6], words=[2,2])
Span("-50", context=None, chars=[4,6], words=[2,2])
Span("-50", context=None, chars=[4,6], words=[2,2])
Span("-55", context=None, chars=[0,2], words=[0,0])
7 candidates extracted


### Learning

First, load the gold labels and split the candidates into a training set and a held-out gold set.

In [11]:
from utils import collect_hardware_entity_gold
filename='data/hardware/gold_all.csv'
(gold_candidates, gold_labels) = collect_hardware_entity_gold(filename, 'stg_temp_min', candidates)
print "%s out of %s candidates have gold labels" % (len(gold_candidates), len(candidates))
print "%s out of %s labeled candidates have positive label" % (gold_labels.count(1), len(gold_candidates))

# Split into train and test set
training_candidates = []
n_half = len(candidates)/2
for idx, c in enumerate(candidates[:n_half]):
    if c in gold_candidates:
        gold_candidates.append(c)
        gold_labels.append(gold_labels[idx])
    else:
        training_candidates.append(c)
training_candidates.extend(candidates[n_half:])
gold_labels = np.array(gold_labels)
# print "Training set size: %s" % len(training_candidates)
# print "Gold set size: %s" % len(gold_candidates)
print "Positive labels in gold set: %s" % sum(gold_labels==1)
print "Negative labels in gold set: %s" % sum(gold_labels==-1)

98 gold annotations
59 out of 108 candidates have gold labels
51 out of 59 labeled candidates have positive label
Positive labels in gold set: 81
Negative labels in gold set: 12


Let's take a quick peek at the features:

In [12]:
from snorkel.features import TableNgramFeaturizer
featurizer = TableNgramFeaturizer()
featurizer.fit_transform(candidates)
for f in featurizer.get_features_by_candidate(candidates[0])[:10]: print f

Building feature index...
Extracting features...
0/1980
DDLIB_WORD_SEQ_[-55]
DDLIB_LEMMA_SEQ_[-55]
DDLIB_POS_SEQ_[CD]
DDLIB_DEP_SEQ_[ROOT]
DDLIB_W_LEFT_1_[_NUMBER]
DDLIB_W_LEFT_POS_1_[CD]
DDLIB_W_LEFT_2_[to _NUMBER]
DDLIB_W_LEFT_POS_2_[TO CD]
DDLIB_W_LEFT_3_[_NUMBER to _NUMBER]
DDLIB_W_LEFT_POS_3_[CD TO CD]


Define labeling functions:

In [13]:
def LF_to_range(m):
    """Return 1 if 'to' is among ``m.post_window('words')``, else 0 (no label)."""
    following_words = m.post_window('words')
    if 'to' in following_words:
        return 1
    return 0
def LF_tilde_range(m):
    """Return 1 if '~' is among ``m.post_window('words')``, else 0 (no label)."""
    following_words = m.post_window('words')
    if '~' in following_words:
        return 1
    return 0
def LF_storage(m):
    """Return 1 if 'storage' is among ``m.aligned_ngrams('words')``, else -1."""
    aligned_words = m.aligned_ngrams('words')
    if 'storage' in aligned_words:
        return 1
    else:
        return -1
def LF_tstg(m):
    """Return 1 if 'tstg' is among ``m.aligned_ngrams('words')``, else -1."""
    aligned_words = m.aligned_ngrams('words')
    if 'tstg' in aligned_words:
        return 1
    else:
        return -1
def LF_tj(m):
    """Return 1 if 'tj' is among ``m.aligned_ngrams('words')``, else -1."""
    aligned_words = m.aligned_ngrams('words')
    if 'tj' in aligned_words:
        return 1
    else:
        return -1
def LF_temperature(m):
    """Return 1 if 'temperature' is among ``m.aligned_ngrams('words')``, else -1."""
    aligned_words = m.aligned_ngrams('words')
    if 'temperature' in aligned_words:
        return 1
    else:
        return -1
def LF_celsius(m):
    """Return 1 if the token 'c' is among ``m.aligned_ngrams('words')``, else -1."""
    aligned_words = m.aligned_ngrams('words')
    if 'c' in aligned_words:
        return 1
    else:
        return -1
def LF_max(m):
    """Return 1 if 'max' is among ``m.aligned_ngrams('words')``, else 0 (no label)."""
    aligned_words = m.aligned_ngrams('words')
    if 'max' in aligned_words:
        return 1
    return 0
def LF_min(m):
    """Return 1 if 'min' is among ``m.aligned_ngrams('words')``, else 0 (no label)."""
    aligned_words = m.aligned_ngrams('words')
    if 'min' in aligned_words:
        return 1
    return 0

In [14]:
# All labeling functions defined above, collected for the training set.
LFs = [
    LF_to_range,
    LF_tilde_range,
    LF_storage,
    LF_tstg,
    LF_tj,
    LF_temperature,
    LF_celsius,
    LF_max,
    LF_min,
]

In [15]:
# Build the training set from the unlabeled training candidates and the
# labeling functions; a fresh TableNgramFeaturizer is passed as featurizer.
# NOTE(review): NgramFeaturizer is imported below but not used in this cell;
# TableNgramFeaturizer comes from the import in an earlier cell.
from snorkel.snorkel import TrainingSet
from snorkel.features import NgramFeaturizer

training_set = TrainingSet(training_candidates, LFs, featurizer=TableNgramFeaturizer())

Applying LFs...
Featurizing...
Building feature index...
Extracting features...
0/1774
LF Summary Statistics: 9 LFs applied to 74 candidates
------------------------------------------------------------
Coverage (candidates w/ > 0 labels):		97.30%
Overlap (candidates w/ > 1 labels):		100.00%
Conflict (candidates w/ conflicting labels):	55.41%


In [16]:
# Per-labeling-function statistics; display only the first five rows.
lf_stats = training_set.lf_stats()
lf_stats[:5]

Unnamed: 0,conflicts,coverage,j,overlaps
LF_to_range,0.162162,0.216216,0,0.216216
LF_tilde_range,0.094595,0.094595,1,0.094595
LF_storage,0.554054,1.0,2,1.0
LF_tstg,0.554054,1.0,3,1.0
LF_tj,0.554054,1.0,4,1.0


Now learn, baby, learn!

In [17]:
# Wrap the training set in a Learner that uses a logistic-regression model
# constructed with bias_term=True.
from snorkel.snorkel import Learner
from snorkel.learning import LogReg

learner = Learner(training_set, model=LogReg(bias_term=True))

In [18]:
# Splitting into CV and test set
n_half = len(gold_candidates)/2
test_candidates = gold_candidates[:n_half]
test_labels     = gold_labels[:n_half]
cv_candidates   = gold_candidates[n_half:]
cv_labels       = gold_labels[n_half:]

In [19]:
# Grid-search two learner parameters against the CV split:
# 'mu' over [1e-5, 1e-7] and 'lf_w0' over [1.0, 2.0] (four combinations).
from snorkel.learning_utils import GridSearch

gs       = GridSearch(learner, ['mu', 'lf_w0'], [[1e-5, 1e-7],[1.0,2.0]])
gs_stats = gs.fit(cv_candidates, cv_labels)

Testing mu = 1.00e-05, lf_w0 = 1.00e+00
Begin training for rate=0.01, mu=1e-05
	Learning epoch = 0	Gradient mag. = 0.097144
	Learning epoch = 250	Gradient mag. = 0.134018
	Learning epoch = 500	Gradient mag. = 0.164930
	Learning epoch = 750	Gradient mag. = 0.178923
Final gradient magnitude for rate=0.01, mu=1e-05: 0.191
Applying LFs...
Featurizing...
Testing mu = 1.00e-05, lf_w0 = 2.00e+00
Begin training for rate=0.01, mu=1e-05
	Learning epoch = 0	Gradient mag. = 0.176071
	Learning epoch = 250	Gradient mag. = 0.200011
	Learning epoch = 500	Gradient mag. = 0.208780
	Learning epoch = 750	Gradient mag. = 0.208816
Final gradient magnitude for rate=0.01, mu=1e-05: 0.209
Testing mu = 1.00e-07, lf_w0 = 1.00e+00
Begin training for rate=0.01, mu=1e-07
	Learning epoch = 0	Gradient mag. = 0.097144
	Learning epoch = 250	Gradient mag. = 0.134062
	Learning epoch = 500	Gradient mag. = 0.164978
	Learning epoch = 750	Gradient mag. = 0.178963
Final gradient magnitude for rate=0.01, mu=1e-07: 0.191
Testin

In [20]:
# Display the grid-search results returned by gs.fit.
gs_stats

Unnamed: 0,mu,lf_w0,Prec.,Rec.,F1
0,1e-05,1.0,0.944444,0.809524,0.871795
1,1e-05,2.0,0.944444,0.809524,0.871795
2,1e-07,1.0,0.944444,0.809524,0.871795
3,1e-07,2.0,0.944444,0.809524,0.871795


In [21]:
# Evaluate the learner on the held-out test half of the gold data.
learner.test(test_candidates, test_labels)

Applying LFs...
Featurizing...
Test set size:	46
----------------------------------------
Precision:	1.0
Recall:		0.923076923077
F1 Score:	0.96
----------------------------------------
TP: 36 | FP: 0 | TN: 7 | FN: 3




In [22]:
# Show feature statistics from the learner, limited by n_max=10.
learner.feature_stats(n_max=10)

Unnamed: 0,j,w
DDLIB_WORD_SEQ_[-50],1371,-0.317235
DDLIB_LEMMA_SEQ_[-50],947,-0.317235
TABLE_ROW_WORDS_v,380,-0.288601
TABLE_ROW_WORDS_voltage,1755,-0.211786
TABLE_ROW_WORDS_-lrb-,1272,-0.159091
TABLE_ROW_WORDS_-rrb-,837,-0.159091
TABLE_COL_WORDS_-5.0,1442,-0.153402
TABLE_ROW_WORDS_-10,1544,-0.147894
TABLE_ROW_WORDS_=,1245,-0.133727
TABLE_ROW_NUM_[2],1316,-0.126362


Tune in next time for relation extraction!