# Virus-Host Species Relation Extraction
## Notebook 4 -  End Model (Sparse Logistic Regression)
### UC Davis Epicenter for Disease Dynamics

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import re
import numpy as np
from snorkel import SnorkelSession
import pandas as pd
# Connect to the database backend and initialize a Snorkel session.
# Every later cell queries and loads data through this `session` object.
session = SnorkelSession()

#from lib.init import *
from snorkel.models import candidate_subclass
from snorkel.annotations import load_gold_labels

from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)

# Declare the VirusHost candidate class with its two arguments, 'virus' and
# 'host'. The candidate queries in later cells filter on this class.
VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

In [2]:
from snorkel.annotations import load_marginals

def _candidates_in_split(split):
    """Fetch all VirusHost candidates assigned to `split`, ordered by id."""
    in_split = session.query(VirusHost).filter(VirusHost.split == split)
    return in_split.order_by(VirusHost.id).all()


# Split ids used throughout this notebook: 0 = train, 1 = dev, 2 = test.
train_cands = _candidates_in_split(0)
dev_cands   = _candidates_in_split(1)
test_cands  = _candidates_in_split(2)

# Gold labels for the dev split, requested as an array with 0/1 encoding.
L_gold_dev = load_gold_labels(
    session,
    annotator_name='gold',
    split=1,
    load_as_array=True,
    zero_one=True,
)

In [3]:
from util_virushost import load_external_labels

# Both calls target the same annotator and the same (test) split.
gold_test_args = dict(annotator_name='gold', split=2)

# Import the externally stored gold annotations for the test split ...
missed = load_external_labels(session, VirusHost, **gold_test_args)

# ... then read them back with 0/1 encoding and display the result.
L_gold_test = load_gold_labels(session, zero_one=True, **gold_test_args)
L_gold_test

AnnotatorLabels created: 0


<539x1 sparse matrix of type '<class 'numpy.int32'>'
	with 110 stored elements in Compressed Sparse Row format>

### Feature Extraction
Instead of starting with a deep learning approach, let's look at a standard sparse logistic regression model. First, we need to extract features. This can take a while, but we only have to do it once!

In [4]:
from features import hybrid_span_mention_ftrs
from snorkel.annotations import FeatureAnnotator

# Feature annotator driven by the project's hybrid_span_mention_ftrs function;
# it is used below to load or compute the feature matrix of each split.
featurizer = FeatureAnnotator(f=hybrid_span_mention_ftrs)

In [5]:

F_train = featurizer.load_matrix(session, split=0)
F_dev   = featurizer.load_matrix(session, split=1)
F_test  = featurizer.load_matrix(session, split=2)

if F_train.size == 0:    
    %time F_train = featurizer.apply(split=0, parallelism=1)
if F_dev.size == 0:     
    %time F_dev  = featurizer.apply_existing(split=1, parallelism=1)
if F_test.size == 0:
    %time F_test = featurizer.apply_existing(split=2, parallelism=1)

print(F_train.shape)
print(F_dev.shape)
print(F_test.shape)

Clearing existing...
Running UDF...


100%|██████████████████████████████████████| 3805/3805 [03:16<00:00, 19.32it/s]


Wall time: 3min 19s
Clearing existing...
Running UDF...


100%|████████████████████████████████████████| 428/428 [00:27<00:00, 15.74it/s]


Wall time: 29.5 s
Clearing existing...
Running UDF...


100%|████████████████████████████████████████| 539/539 [01:27<00:00,  6.17it/s]


Wall time: 1min 29s
(3805, 54775)
(428, 54775)
(539, 54775)


In [6]:
# Reload the training marginals for split 0 (train); they serve as the
# training targets handed to the model search below.
train_marginals = load_marginals(session, split=0)

In [7]:
from snorkel.learning import RandomSearch
from snorkel.learning.tensorflow import SparseLogisticRegression

seed = 1234
num_model_search = 5

# Hyperparameter values the random search samples from.
param_grid = {}
param_grid['batch_size'] = [64, 128]
param_grid['lr']         = [1e-4, 1e-3, 1e-2]
param_grid['l1_penalty'] = [1e-6, 1e-4, 1e-2]
param_grid['l2_penalty'] = [1e-6, 1e-4, 1e-2]
param_grid['rebalance']  = [0.0, 0.5]

# Passed to the model class by the searcher.
model_class_params = {
    'n_threads':1
}

# Fixed settings shared by every sampled configuration; the dev split is
# supplied so training can report dev scores and checkpoint on them.
model_hyperparams = {
    'n_epochs': 30,
    'print_freq': 10,
    'dev_ckpt_delay': 0.5,
    'X_dev': F_dev,
    'Y_dev': L_gold_dev
}


def build_searcher():
    """Return a new RandomSearch over `param_grid`, seeded with `seed`."""
    return RandomSearch(SparseLogisticRegression, param_grid, F_train, train_marginals,
                        n=num_model_search, seed=seed,
                        model_class_params=model_class_params,
                        model_hyperparams=model_hyperparams)


# Preview the search space from a throwaway searcher. When the preview and
# fit() shared one instance, the printed tuples did not match the
# configurations that were then tested, i.e. search_space() draws new samples
# on every call. Fitting a second, identically seeded instance keeps the
# preview and the actual search in step.
# NOTE(review): assumes RandomSearch seeds its own RNG from `seed` at
# construction -- confirm against the installed Snorkel version.
print("Discriminitive Model Parameter Space (seed={}):".format(seed))
for i, params in enumerate(build_searcher().search_space()):
    print("{} {}".format(i, params))

searcher = build_searcher()
disc_model, run_stats = searcher.fit(X_valid=F_dev, Y_valid=L_gold_dev, n_threads=1)
run_stats


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Discriminitive Model Parameter Space (seed=1234):
0 (128, 0.0001, 0.01, 0.01, 0.0)
1 (128, 0.0001, 0.01, 0.01, 0.5)
2 (64, 0.001, 0.01, 0.01, 0.0)
3 (128, 0.001, 1e-06, 1e-06, 0.5)
4 (64, 0.001, 1e-06, 1e-06, 0.5)
[1] Testing batch_size = 64, lr = 1.00e-04, l1_penalty = 1.00e-06, l2_penalty = 1.00e-04, rebalance = 0.00e+00
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[SparseLogisticRegression] Training model
[SparseLogisticRegression] n_train=3747  #epochs=30  batch size=64
[SparseLogisticRegression] Epoch 0 (3.74s)	Average loss=0.766726	Dev F1=31.46
[SparseLogisticRegression] Epoch 10 (25.09s)	Average loss=0.634736	Dev F1=33.99
[SparseLogisticRegression] Epoch 20 (46.08s)	Average loss=0.611430	Dev F1=34.73
[SparseLogisticRegression] Model saved as <SparseLogisticRegression>
[SparseLogisticRegression] Epoch 29 (65.78s)	Average loss=0.600980	Dev F1=35.10
[SparseLogisticRegression] Model saved as <SparseLogisticRegression>
[SparseLogisticRegression] Training done (66.32s)
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from checkpoints\SparseLogisticRegression\SparseLogisticRegression-29
[SparseLogisticRegression] Loaded model <SparseLogisticRegression>
[SparseLogisticRegression] F-1 Score: 0.35097493036211697
[SparseLogisticRegression] Model saved as <SparseLogisticRegressio

Unnamed: 0,batch_size,lr,l1_penalty,l2_penalty,rebalance,Prec.,Rec.,F-1
3,64,0.01,0.01,0.01,0.0,0.328125,0.807692,0.466667
4,64,0.01,1e-06,1e-06,0.5,0.253846,0.846154,0.390533
1,64,0.001,0.0001,0.01,0.5,0.241135,0.871795,0.377778
0,64,0.0001,1e-06,0.0001,0.0,0.224199,0.807692,0.350975
2,128,0.01,1e-06,0.0001,0.0,0.208202,0.846154,0.334177


In [8]:
# Extracting features allows us to inspect and interpret our learned weights.
# NOTE(review): the wildcard import hides where print_top_k_features comes
# from; presumably it is defined in the project's `scoring` module -- confirm
# and consider importing it by name.
from scoring import *
# Print the top_k=25 features of the selected model with their weights.
print_top_k_features(session, disc_model, F_train, top_k=25)

54775
[-0.23769023, 'BETWEEN_SEQ_POS_TAGS[NNP NNP NNP]']
[-0.15579031, 'WIN_RIGHT_SEQ_POS_TAGS[NNP NNP]']
[-0.14817533, 'BETWEEN_SEQ_POS_TAGS[NNP NNP]']
[-0.14448415, 'BETWEEN_LEMMAS[40]']
[-0.1382108, 'BETWEEN_SEQ_POS_TAGS[CD NNP]']
[-0.107101396, 'BETWEEN_SEQ_POS_TAGS[NNP NNP CD]']
[-0.09070521, 'BETWEEN_SEQ_POS_TAGS[CD NNP NNP]']
[-0.086096965, 'BETWEEN_POS_TAGS[CD]']
[-0.085858606, 'BETWEEN_LEMMAS[wnv]']
[-0.047634784, 'BETWEEN_SEQ_POS_TAGS[NNP NNP ,]']
[-0.03262218, 'BETWEEN_POS_TAGS[RB]']
[-0.022592578, 'WIN_LEFT_POS_TAGS[NNP]']
[-0.022129225, 'BETWEEN_SEQ_POS_TAGS[NN -LRB-]']
[-0.021860994, 'BETWEEN_SEQ_POS_TAGS[VBN NN -LRB-]']
[-0.019906584, 'BETWEEN_SEQ_POS_TAGS[JJ NN]']
[-0.019017868, 'BETWEEN_LEMMAS[4]']
[-0.017826296, 'BETWEEN_POS_TAGS[$]']
[-0.01670544, 'WIN_LEFT_SEQ_POS_TAGS[NNP NNP]']
[-0.016157726, 'BETWEEN_SEQ_POS_TAGS[NNP XX CD]']
[-0.01613489, 'BETWEEN_LEMMAS[1]']
[-0.015286283, 'BETWEEN_LEMMAS[13]']
[-0.015048918, 'BETWEEN_LEMMAS[usuv]']
[-0.014508486, 'BETWEEN_SEQ_

In [9]:
# Evaluate the selected model on the test split: precision, recall and F1.
report = "Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}"
p, r, f1 = disc_model.score(F_test, L_gold_test)
print(report.format(p, r, f1))

Prec: 0.320, Recall: 0.642, F1 Score: 0.427
