# HARDWARE

## Setup

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

"""
To change attributes:
1) Change ATTRIBUTE and you're good to go
"""
ATTRIBUTE = 'part'
COUNTER = '_scaling'
PARALLEL = 80
PARALLEL_EXTRACTION = 8
SCALE_SIZE = 10000

import os
import sys
sys.path.append(os.environ['SNORKELHOME'] + '/tutorials/tables/')
snorkel_postgres = os.environ['SNORKELDB'].startswith('postgres')
print snorkel_postgres

In [None]:
import os
if snorkel_postgres:
    os.environ['SNORKELDBNAME'] = ATTRIBUTE + str(COUNTER)
    print os.system("dropdb " + os.environ['SNORKELDBNAME'])
    print os.system("createdb " + os.environ['SNORKELDBNAME'])
    print "SNORKELDBNAME = %s" % os.environ['SNORKELDBNAME']

from snorkel import SnorkelSession
session = SnorkelSession()

## Parsing

In [None]:
import os
if snorkel_postgres:
    from snorkel.async_parser import parse_corpus, HTMLParser, AsyncOmniParser
    print "Starting async parse..."
    
    # PARSE TRAIN
    docs_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/symlinked_html/'
    pdf_path = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/symlinked_pdf/'
    doc_parser = HTMLParser()
    context_parser = AsyncOmniParser(blacklist=['style'], flatten=['span','br'], 
                                     tabular=True, lingual=True,
                                     visual=True, pdf_path=pdf_path)
    %time corpus = parse_corpus(session, 'Hardware Scale', docs_path,\
                                doc_parser, context_parser,\
                                max_docs=SCALE_SIZE, parallel=PARALLEL)

    print "%s contains %d documents" % (corpus, len(corpus))

### Timing Results

All parsing features set to True:

|    ATTRIBUTE | PARALLEL | PARALLEL_EXTRACTION | SCALE_SIZE |     RUNTIME |
| -----------: | -------: | ------------------: | ---------: | ----------: |
| stg_temp_min |       80 |                   8 |        1e2 |       32.2s |
| stg_temp_min |       80 |                   8 |        1e3 |  2min 13s |
| stg_temp_min |       80 |                   8 |        1e4 | 18min 41s |
| stg_temp_min |       80 |                   8 |        1e5 | 3h 41min 1s |
| stg_temp_min |       80 |                   8 |        1e6 |             |


Turning Lingual to False

Run 0: Parallel = 80, SCALE_SIZE = 1e4. Runtime = 11min 34s

Turning visual to False

Run 0: Parallel = 80, SCALE_SIZE = 1e4. Runtime = 21min 28s



## Candidate Extraction

In [None]:
from snorkel.models import candidate_subclass

session.commit()

# Part_Attr = candidate_subclass('Part_Attr', ['part','attr'])
Part = candidate_subclass('Part', ['part'])

from hardware_matchers import get_matcher

dict_path = os.environ['SNORKELHOME'] +\
    '/tutorials/tables/data/hardware/gold_raw/digikey_part_dictionary.csv'
part_matcher = get_matcher('part', dict_path)
# attr_matcher = get_matcher(ATTRIBUTE)

from hardware_spaces import get_space
    
part_ngrams = get_space('part')
# attr_ngrams = get_space(ATTRIBUTE)

from hardware_throttlers import get_throttler

throttler = get_throttler(ATTRIBUTE)
throttler

from snorkel.models import Corpus
from snorkel.candidates import CandidateExtractor
from snorkel.utils import get_ORM_instance
from snorkel.async_candidates import parallel_extract

# ce = CandidateExtractor(Part_Attr, 
#                         [part_ngrams, attr_ngrams], 
#                         [part_matcher, attr_matcher], 
#                         throttler=throttler)
ce = CandidateExtractor(Part, 
                        [part_ngrams], 
                        [part_matcher], 
                        throttler=throttler)

corpus_names = ['Hardware Scale']

for corpus_name in corpus_names:
    corpus = get_ORM_instance(Corpus, session, corpus_name)
    print "Extracting Candidates from %s" % corpus
    %time candidates = parallel_extract(session, ce, corpus, \
                                        corpus_name + ' Candidates', \
                                        parallel=PARALLEL_EXTRACTION)
    session.add(candidates)
    print "%s contains %d Candidates" % (candidates, len(candidates))
session.commit()

Using combined matcher.
Extracting Candidates from Corpus (Hardware Scale)
[====                                    ] 8%

### Timing Results

|    ATTRIBUTE | PARALLEL | PARALLEL_EXTRACTION | SCALE_SIZE |   RUNTIME |
| -----------: | -------: | ------------------: | ---------: | --------: |
| stg_temp_min |       80 |                   8 |        1e2 |     26s |
| stg_temp_min |       80 |                   8 |        1e3 | 37min 25s |
| stg_temp_min |       80 |                   8 |        1e4 |           |
| stg_temp_min |       80 |                   8 |        1e5 |           |
| stg_temp_min |       80 |                   8 |        1e6 |           |



## Featurization

In [None]:
from snorkel.models import CandidateSet
from snorkel.utils import get_ORM_instance

scale = get_ORM_instance(CandidateSet, session, 'Hardware Scale Candidates')

from snorkel.async_annotations import annotate
print "Starting async featurization..."
%time F_scale = annotate(scale, parallel=PARALLEL)

### Timing Results

|    ATTRIBUTE | PARALLEL | PARALLEL_EXTRACTION | SCALE_SIZE |   RUNTIME |
| -----------: | -------: | ------------------: | ---------: | --------: |
| stg_temp_min |       80 |                   8 |        1e2 |  10.7s |
| stg_temp_min |       80 |                   8 |        1e3 | 15min 32s |
| stg_temp_min |       80 |                   8 |        1e4 |           |
| stg_temp_min |       80 |                   8 |        1e5 |           |
| stg_temp_min |       80 |                   8 |        1e6 |           |



## Apply LFs

In [None]:
from hardware_lfs import get_lfs

LFs = get_lfs(ATTRIBUTE)

from snorkel.async_annotations import annotate
%time L_scale = annotate(scale, parallel=PARALLEL, lfs=LFs)

### Timing Results

|    ATTRIBUTE | PARALLEL | PARALLEL_EXTRACTION | SCALE_SIZE |   RUNTIME |
| -----------: | -------: | ------------------: | ---------: | --------: |
| stg_temp_min |       80 |                   8 |        1e2 |     8.19s |
| stg_temp_min |       80 |                   8 |        1e3 | 15min 21s |
| stg_temp_min |       80 |                   8 |        1e4 |           |
| stg_temp_min |       80 |                   8 |        1e5 |           |
| stg_temp_min |       80 |                   8 |        1e6 |           |


## Learning

In [None]:
from snorkel.learning import NaiveBayes

gen_model = NaiveBayes()
%time gen_model.train(L_scale, n_iter=2000, rate=1e-3, mu=1e-6)
scale_marginals = gen_model.marginals(L_scale)

In [None]:
from snorkel.learning import LogReg

disc_model = LogReg()
%time disc_model.train(F_scale, scale_marginals, n_iter=5000, rate=1e-4)

### Timing Results

|    ATTRIBUTE | PARALLEL | PARALLEL_EXTRACTION | SCALE_SIZE | GEN RUNTIME | DISC RUNTIME |
| -----------: | -------: | ------------------: | ---------: | ----------: | -----------: |
| stg_temp_min |       80 |                   8 |        1e2 |    4.4ms |        1.91s |
| stg_temp_min |       80 |                   8 |        1e3 |   9.3ms |    4.7s |
| stg_temp_min |       80 |                   8 |        1e4 |             |              |
| stg_temp_min |       80 |                   8 |        1e5 |             |              |
| stg_temp_min |       80 |                   8 |        1e6 |             |              |





## Inference

In [None]:
import os
from snorkel.models import CandidateSet
from hardware_utils import load_hardware_labels

data_sets = ['Scale']
gold_file = {}
gold_file['Scale'] = os.environ['SNORKELHOME'] + '/tutorials/tables/data/hardware/dev/hardware_dev_gold.csv'

for data_set in data_sets:
    candidate_set_name = 'Hardware %s Candidates' % data_set
    candidates = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name).one()
    label_set_name = 'Hardware %s Candidates -- Gold' % data_set
    annotation_key_name = 'Hardware %s Labels -- Gold' % data_set
    gold_candidates, annotation_key = load_hardware_labels(session,\
                           label_set_name, \
                           annotation_key_name, \
                           candidates, \
                           gold_file[data_set], \
                           ATTRIBUTE)
    candidates_gold = session.query(CandidateSet).filter(
        CandidateSet.name == candidate_set_name + ' -- Gold').one()
    print "%d/%d Candidates in %s have positive Labels" % (
        len(candidates_gold), len(candidates), candidates)

    
dev_gold = session.query(CandidateSet).filter(
    CandidateSet.name == 'Hardware Scale Candidates -- Gold').one()

from snorkel.annotations import LabelManager
label_manager = LabelManager()
L_scale = label_manager.load(session, scale, 'Hardware Scale Labels -- Gold')
L_scale.shape

%time tp, fp, tn, fn = disc_model.score(F_scale, L_scale, dev_gold)

### Timing Results

| ATTRIBUTE    | PARALLEL | PARALLEL_EXTRACTION | SCALE_SIZE |    RUNTIME |
| :----------- | -------: | ------------------: | ---------: | ---------: |
| stg_temp_min |       80 |                   8 |        1e2 | 2.5s |
| stg_temp_min |       80 |                   8 |        1e3 |   17s        |
| stg_temp_min |       80 |                   8 |        1e4 |            |
| stg_temp_min |       80 |                   8 |        1e5 |            |
| stg_temp_min |       80 |                   8 |        1e6 |            |