# Virus-Host Species Relation Extraction
## Notebook 4
### UC Davis Epicenter for Disease Dynamics

In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import os
import re
import numpy as np
from snorkel import SnorkelSession
import pandas as pd
session = SnorkelSession()
# Connect to the database backend and initalize a Snorkel session

#from lib.init import *
from snorkel.models import candidate_subclass
from snorkel.annotations import load_gold_labels

from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)

VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

In [2]:
from snorkel.annotations import load_marginals

train_cands = session.query(VirusHost).filter(VirusHost.split == 0).order_by(VirusHost.id).all()
dev_cands   = session.query(VirusHost).filter(VirusHost.split == 1).order_by(VirusHost.id).all()
test_cands  = session.query(VirusHost).filter(VirusHost.split == 2).order_by(VirusHost.id).all()

L_gold_dev  = load_gold_labels(session, annotator_name='gold', split=1, load_as_array=True)

In [3]:
from util_virushost import load_external_labels

missed = load_external_labels(session, VirusHost, annotator_name = 'gold', split = 2)

L_gold_test = load_gold_labels(session, annotator_name = 'gold', split = 2)

L_gold_test

AnnotatorLabels created: 37


<362x1 sparse matrix of type '<class 'numpy.int32'>'
	with 37 stored elements in Compressed Sparse Row format>

### Feature Extraction
Instead of using a deep learning approach to start, let's look at a standard sparse logistic regression model. First, we need to extract out features. This can take a while, but we only have to do it once!

In [4]:
from features import hybrid_span_mention_ftrs
from snorkel.annotations import FeatureAnnotator

featurizer = FeatureAnnotator(f=hybrid_span_mention_ftrs)

In [5]:

F_train = featurizer.load_matrix(session, split=0)
F_dev   = featurizer.load_matrix(session, split=1)
F_test  = featurizer.load_matrix(session, split=2)

if F_train.size == 0:    
    %time F_train = featurizer.apply(split=0, parallelism=1)
if F_dev.size == 0:     
    %time F_dev  = featurizer.apply_existing(split=1, parallelism=1)
if F_test.size == 0:
    %time F_test = featurizer.apply_existing(split=2, parallelism=1)

print(F_train.shape)
print(F_dev.shape)
print(F_test.shape)

Clearing existing...
Running UDF...

Wall time: 5min 46s
Clearing existing...
Running UDF...

Wall time: 14.3 s
Clearing existing...
Running UDF...

Wall time: 50.9 s
(3631, 52127)
(175, 52127)
(362, 52127)


In [6]:
# reload marginals
train_marginals = load_marginals(session, split=0)

### Logistic Regression Grid Search for Tuning Hyperparameters

In [7]:
from snorkel.learning import RandomSearch
from snorkel.learning.tensorflow import SparseLogisticRegression

seed = 1234
num_model_search = 5

# search over this parameter grid
param_grid = {}
param_grid['batch_size'] = [64, 128]
param_grid['lr']         = [1e-4, 1e-3, 1e-2]
param_grid['l1_penalty'] = [1e-6, 1e-4, 1e-2]
param_grid['l2_penalty'] = [1e-6, 1e-4, 1e-2]
param_grid['rebalance']  = [0.0, 0.5]

model_class_params = {
    'n_threads':1
}

model_hyperparams = {
    'n_epochs': 30,
    'print_freq': 10,
    'dev_ckpt_delay': 0.5,
    'X_dev': F_dev,
    'Y_dev': L_gold_dev
}

searcher = RandomSearch(SparseLogisticRegression, param_grid, F_train, train_marginals,
                        n=num_model_search, seed=seed,
                        model_class_params=model_class_params,
                        model_hyperparams=model_hyperparams)

print("Discriminitive Model Parameter Space (seed={}):".format(seed))
for i, params in enumerate(searcher.search_space()):
    print("{} {}".format(i, params))

disc_model, run_stats = searcher.fit(X_valid=F_dev, Y_valid=L_gold_dev, n_threads=1)
run_stats


For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
If you depend on functionality not listed there, please file an issue.

Discriminitive Model Parameter Space (seed=1234):
0 (128, 0.0001, 0.01, 0.01, 0.0)
1 (128, 0.0001, 0.01, 0.01, 0.5)
2 (64, 0.001, 0.01, 0.01, 0.0)
3 (128, 0.001, 1e-06, 1e-06, 0.5)
4 (64, 0.001, 1e-06, 1e-06, 0.5)
[1] Testing batch_size = 64, lr = 1.00e-04, l1_penalty = 1.00e-06, l2_penalty = 1.00e-04, rebalance = 0.00e+00
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[SparseLogisticRegression] Training model
[SparseLogisticRegression] n_train=3585  #epochs=30  batch size=64
[SparseLogisticRegression] Epoch 0 (3.80s)	Average loss=0.927738	Dev F1=66.05
[SparseLogisticRegression] Epoch 10 (32.43s)	Average loss=0.644058	Dev F1=67.29
[SparseLogisticRegression] Epoch 20 (62.57s)	Average loss=0.614977	Dev F1=67.58
[SparseLogisticRegression] Model saved as <SparseLogisticRegression>
[SparseLogisticRegression] Epoch 29 (88.43s)	Average loss=0.598269	Dev F1=68.47
[SparseLogisticRegression] Model saved as <SparseLogisticRegression>
[SparseLogisticRegression] Training done (88.75s)
Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from checkpoints\SparseLogisticRegression\SparseLogisticRegression-29
[SparseLogisticRegression] Loaded model <SparseLogisticRegression>
[SparseLogisticRegression] F-1 Score: 0.6846846846846847
[SparseLogisticRegression] Model saved as <SparseLogisticRegression

Unnamed: 0,batch_size,lr,l1_penalty,l2_penalty,rebalance,Prec.,Rec.,F-1
2,128,0.01,1e-06,0.0001,0.0,0.830189,0.77193,0.8
3,64,0.01,0.01,0.01,0.0,0.723404,0.894737,0.8
1,64,0.001,0.0001,0.01,0.5,0.751938,0.850877,0.798354
4,64,0.01,1e-06,1e-06,0.5,0.752137,0.77193,0.761905
0,64,0.0001,1e-06,0.0001,0.0,0.703704,0.666667,0.684685


In [8]:
# Just the best model
from snorkel.learning.tensorflow import SparseLogisticRegression

log_reg = SparseLogisticRegression()

log_reg.train(
    X_train = F_train, Y_train = train_marginals, 
    lr = 0.1,
    batch_size = 64,
    l1_penalty = 1.00e-06,
    l2_penalty = 1.00e-04,
    rebalance = 0.0,
    n_threads = 1,
    n_epochs = 30,
    print_freq = 10,
    dev_ckpt_delay = 0.5,
)


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


[SparseLogisticRegression] Training model
[SparseLogisticRegression] n_train=3585  #epochs=30  batch size=64
[SparseLogisticRegression] Epoch 0 (5.09s)	Average loss=3.730745
[SparseLogisticRegression] Epoch 10 (37.81s)	Average loss=4.923562
[SparseLogisticRegression] Epoch 20 (71.60s)	Average loss=5.155086
[SparseLogisticRegression] Epoch 29 (104.67s)	Average loss=5.454622
[SparseLogisticRegression] Training done (104.67s)


In [16]:
import numpy as np
print('Prec, Recall, F1 scores')
print(np.round(log_reg.score(F_dev, L_gold_dev), 3)) # dev set
print(np.round((log_reg.score(F_test, L_gold_test)), 3)) # test set

Prec, Recall, F1 scores
[0.794 0.746 0.769]
[0.19  0.811 0.308]


In [11]:
# Extracting features allows us to inspect and interperet our learned weights
from scoring import *
print_top_k_features(session, disc_model, F_train, top_k=10)

52127
[-0.5328904, 'WIN_RIGHT_SEQ_LEMMAS[( je]']
[-0.46939763, 'WIN_LEFT_SEQ_LEMMAS[neutralization of]']
[-0.4480359, 'WIN_LEFT_LEMMAS[.70%]']
[-0.43169013, 'WIN_RIGHT_SEQ_LEMMAS[fever (]']
[-0.42790008, 'WIN_RIGHT_SEQ_POS_TAGS[NNP -RRB- VBN]']
[-0.42789805, 'WIN_RIGHT_SEQ_LEMMAS[) report]']
[-0.42785242, 'WIN_RIGHT_SEQ_LEMMAS[( chik]']
[-0.42784595, 'WIN_RIGHT_SEQ_LEMMAS[fever ( chik]']
[-0.42782688, 'WIN_RIGHT_SEQ_LEMMAS[chik ) report]']
[-0.4278252, 'WIN_RIGHT_SEQ_LEMMAS[chik )]']
[-0.42781186, 'WIN_RIGHT_SEQ_LEMMAS[( chik )]']
[-0.42696872, 'WIN_RIGHT_SEQ_LEMMAS[je ) ,]']
[-0.42696062, 'WIN_RIGHT_SEQ_LEMMAS[( je )]']
[-0.426956, 'WIN_RIGHT_SEQ_LEMMAS[je )]']
[-0.4224598, 'BETWEEN_SEQ_LEMMAS[be retest]']
[-0.40965173, 'WIN_LEFT_SEQ_LEMMAS[with .70% neutralization]']
[-0.4094547, 'WIN_LEFT_SEQ_LEMMAS[with .70%]']
[-0.4091316, 'BETWEEN_LEMMAS[retest]']
[-0.40907568, 'BETWEEN_SEQ_LEMMAS[be retest through]']
[-0.4080431, 'WIN_LEFT_SEQ_LEMMAS[sample with .70%]']
[-0.40576074, 'BETWEEN_SE

In [17]:
# performance on the test set
p, r, f1 = log_reg.score(X_test = F_test, Y_test = L_gold_test)
print("Prec: {0:.3f}, Recall: {1:.3f}, F1 Score: {2:.3f}".format(p, r, f1))

Prec: 0.190, Recall: 0.811, F1 Score: 0.308


In [19]:
tp, fp, tn, fn = log_reg.error_analysis(session, F_test, L_gold_test)

Scores (Un-adjusted)
Pos. class accuracy: 0.811
Neg. class accuracy: 0.606
Precision            0.19
Recall               0.811
F1                   0.308
----------------------------------------
TP: 30 | FP: 128 | TN: 197 | FN: 7



In [20]:
# view false positives
from snorkel.viewer import SentenceNgramViewer

SentenceNgramViewer(fp, session, height = 350)

<IPython.core.display.Javascript object>

SentenceNgramViewer(cids=[[[22], [26], [61, 62, 63, 64, 84, 85]], [[13, 73, 101], [17, 69, 80, 93], [54, 95]],…

In [None]:
# A look at the test marginals:

# save the predictions of the model on the test set back to the database
disc_model.save_marginals(session, test_cands, training=False)

# load marginals
test_marginals = load_marginals(session, X=test_cands, split=2, training=False)

# plot the test marginals
import matplotlib.pyplot as plt
plt.hist(test_marginals, bins=20, range=(0.0, 1.0))
plt.title('Distribution of Test Marginals')
plt.show()

In [22]:
# export all candidates to csv file
df = pd.DataFrame({'id': [], 'virus': [], 'host': []}) # initialize df w three columns

# list all candidates (to export later and merge w predicted probabilities table?)
for c in session.query(VirusHost).all():
    df = df.append({'id': c.id, 'virus': c.virus.get_attrib_tokens("words"), 'host': c.host.get_attrib_tokens("words")}, ignore_index=True)
    
df2 = df[:] # make a copy of the df
df2['virus'] = df['virus'].str.join(' ')
df2['host'] = df['host'].str.join(' ')
df2['virus'] = df2['virus'].str.replace('[^a-zA-Z ]', '') # remove non alphaetic characters
df2['host'] = df2['host'].str.replace('[^a-zA-Z ]', '')
df2['id'] = df['id'].astype('int64') 
df2.to_csv('candidates.csv', index = False)  # exports the candidates to a file called candidates.csv