- Tested working on 4/1/2019
    - documents: 'pdfs.tsv'
    - host/species names: 'domestic_names.csv', 'ictv_animals.csv', 'ictv_viruses.csv'

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
from pathlib import Path

In [3]:
# Load Snorkel
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()

n_docs = 500 if 'CI' in os.environ else 2591

In [4]:
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from snorkel.parser import TSVDocPreprocessor

### First read in the documents, stored in TSV format, using a document preprocessor

In [5]:
doc_preprocessor = TSVDocPreprocessor('pdfs.tsv', max_docs=n_docs)

In [6]:
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(doc_preprocessor, count=n_docs)

Clearing existing...
Running UDF...


  2%|▍                                | 39/2591 [01:10<1:05:29,  1.54s/it]


Wall time: 1min 19s


In [7]:
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

Documents: 39
Sentences: 14435


### Next step: create matcher functions from dictionaries to match virus/host names in the text data

In [8]:
# Create a list of animal host names to use as our dictionary in the matcher
domestic_names = pd.read_csv('domestic_names.csv')

In [9]:
names1 = domestic_names.iloc[:,0]
names2 = domestic_names.iloc[:,1]
names3 = domestic_names.iloc[:,2]
names_list = names1.append([names2,names3])
names_list = names_list.tolist()

In [10]:
names_list.append("dromedary")
names_list.append("Peking")

In [11]:
ictv_animals = pd.read_csv('ictv_animals.csv')
#print('Total animal names:', ictv_animals.count().sum()) # total number of animal names in the ddict

In [12]:
ictv_series = ictv_animals.stack().reset_index().iloc[:,2]

In [13]:
ictv_list = ictv_series.tolist()

In [14]:
# Function that gets first letter of genus + species name 
def name(s): 
    # split the string into a list  
    l = s.split() 
    new_word = ""  # begins as empty string
    if len(l) == 2:
        for i in range(len(l)-1): 
            s = l[i] 
            # adds the capital first character  
            new_word += (s[0].upper()+'. ') 
        new_word += l[-1].title() # add the last word
        return new_word 
    else:
        return s

In [15]:
ictv_list2 = [name(s) for s in ictv_list] # shortened species names list

In [16]:
animals_list = names_list + ictv_list + ictv_list2

In [17]:
animals_list = list(set(animals_list)) # remove duplicates from the list

In [18]:
# Create a list of virus names
ictv_viruses = pd.read_csv('ictv_viruses.csv')
# create copies of certain virus names without the digit at the end
ictv_viruses['Species2'] = ictv_viruses['Species'].str.replace('\d+', '', regex=True)

In [182]:
ictv_v_series = ictv_viruses.stack().reset_index().iloc[:,2].drop_duplicates()
virus_list = ictv_v_series.tolist()

In [183]:
virus_abbrev = pd.read_csv('virus_abbrev.csv', header = None)
virus_abbrev_noparen = pd.read_csv('virus_abbrev_noparen.csv', header = None)

In [184]:
virus_list = virus_list + virus_abbrev.iloc[:,0].tolist() + virus_abbrev_noparen.iloc[:,0].tolist()

In [185]:
# Clean up white space
animals_list = [animal.strip() for animal in animals_list]
virus_list = [virus.strip() for virus in virus_list]

In [186]:
print('Number of virus species to match:', len(virus_list))
print('Number of host species to match:', len(animals_list))

Number of virus species to match: 6392
Number of host species to match: 69877


### Writing a basic `CandidateExtractor`

* Basic function to extract **candidate Virus-Host relation mentions** from the corpus, using the list of names.

* We will extract `Candidate` objects by identifying, for each `Sentence`, all pairs of n-grams (up to 7-grams) that were tagged. (An n-gram is a span of text made up of n tokens.) (A token is a string of contiguous characters between two spaces). We do this with three objects:

* A `ContextSpace` defines the "space" of all candidates we even potentially consider; in this case we use the `Ngrams` subclass, and look for all n-grams up to 7 words long

* A `Matcher` heuristically filters the candidates we use. The keyword argument `longest_match_only` means that we'll skip n-grams contained in other n-grams.

* A `CandidateExtractor` combines this all together

In [188]:
from snorkel.matchers import DictionaryMatch
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.models import candidate_subclass

In [189]:
# Define the candidate to extract (virus-host pair)
VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

In [190]:
# Define the dictionary matchers, define the candidate extractor
ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d = virus_list)
animals_matcher = DictionaryMatch(d = animals_list, stemmer = 'porter')
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams], [virus_matcher, animals_matcher], nested_relations = True)

### Split the docs into 3 sets: training, development, and testing sets

In [191]:
from snorkel.models import Document

docs = session.query(Document).order_by(Document.name).all()

train_sents = set()
dev_sents   = set()
test_sents  = set()

for i, doc in enumerate(docs):
    for s in doc.sentences:
        if i % 10 == 8:
            dev_sents.add(s)
        elif i % 10 == 9:
            test_sents.add(s)
        else:
            train_sents.add(s)

In [192]:
print(len(train_sents))
print(len(dev_sents))
print(len(test_sents))

11869
1759
807


### Apply the candidator extractor to the three sets of sentences. The results will be persisted in the database backend.

In [193]:
%%time
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i)
    print("Number of candidates:", session.query(VirusHost).filter(VirusHost.split == i).count())

Clearing existing...
Running UDF...


100%|██████████████████████████████| 11869/11869 [01:00<00:00, 195.19it/s]


Number of candidates: 1808
Clearing existing...
Running UDF...


100%|████████████████████████████████| 1759/1759 [00:09<00:00, 189.75it/s]


Number of candidates: 629
Clearing existing...
Running UDF...


100%|██████████████████████████████████| 807/807 [00:04<00:00, 172.40it/s]


Number of candidates: 218
Wall time: 1min 15s


In [194]:
## To do next: loading gold labels, writing labelling functions

In [195]:
print("Number of training candidates:", session.query(VirusHost).filter(VirusHost.split == 0).count())
print("Number of development candidates:", session.query(VirusHost).filter(VirusHost.split == 1).count())
print("Number of test candidates:", session.query(VirusHost).filter(VirusHost.split == 2).count())
print("Total candidates extracted:", session.query(VirusHost).count())

Number of training candidates: 1808
Number of development candidates: 629
Number of test candidates: 218
Total candidates extracted: 2655


## Next step: write Labeling Functions

Labeling functions encode our heuristics and weak supervision signals to generate (noisy) labels for our training candidates.

In Snorkel, our primary interface through which we provide training signal to the end extraction model we are training is by writing **labeling functions (LFs)** (as opposed to hand-labeling massive training sets). 

A labeling function is just a Python function that accepts a `Candidate` and returns `1` to mark the `Candidate` as true, `-1` to mark the `Candidate` as false, and `0` to abstain from labeling the `Candidate`

In [196]:
# Labelling functions
import re
from snorkel.lf_helpers import (
    get_left_tokens, get_right_tokens, get_between_tokens,
    get_text_between, get_tagged_text,
)

In [197]:
def LF_related(c):
    return 1 if 'related' in c.get_parent().words else 0

def LF_isolated(c):
    return 1 if 'isolated' in c.get_parent().words else 0

def LF_detected(c):
    return 1 if 'detected' in c.get_parent().words else 0
    

In [198]:
labeled = []
for c in session.query(VirusHost).filter(VirusHost.split == 0).all():
    if LF_related(c) != 0 or LF_isolated(c) != 0 or LF_detected(c) != 0:
        labeled.append(c)
print("Number labeled:", len(labeled))

Number labeled: 250


In [199]:
from snorkel.viewer import SentenceNgramViewer

SentenceNgramViewer(labeled, session)

<IPython.core.display.Javascript object>

SentenceNgramViewer(cids=[[[0, 1, 2, 3, 4, 5], [81], [93]], [[17, 18, 19, 20, 21, 22, 23], [89, 90, 91], [142,…

In [200]:
# Running the LFs
from snorkel.annotations import LabelAnnotator
LFs = [
    LF_related, LF_isolated, LF_detected
]
labeler = LabelAnnotator(lfs=LFs)

In [201]:
np.random.seed(1701)
%time L_train = labeler.apply(split=0)
L_train

Clearing existing...
Running UDF...


100%|████████████████████████████████| 1808/1808 [00:11<00:00, 163.11it/s]


Wall time: 11.1 s


<1808x3 sparse matrix of type '<class 'numpy.int32'>'
	with 280 stored elements in Compressed Sparse Row format>

In [202]:
L_train.get_candidate(session, 0)

VirusHost(Span("b'West Nile virus'", sentence=6238, chars=[0,14], words=[0,2]), Span("b''", sentence=6238, chars=[39,38], words=[6,6]))

In [203]:
L_train.get_key(session, 0)

LabelKey (LF_related)

Getting statistics about the resulting label matrix:

* **Coverage** is the fraction of candidates that the labeling function emits a non-zero label for.
* **Overlap** is the fraction candidates that the labeling function emits a non-zero label for and that another labeling function emits a non-zero label for.
* **Conflict** is the fraction candidates that the labeling function emits a non-zero label for and that another labeling function emits a *conflicting* non-zero label for.

In [204]:
L_train.lf_stats(session)

Unnamed: 0,j,Coverage,Overlaps,Conflicts
LF_related,0,0.042588,0.011615,0.0
LF_isolated,1,0.07135,0.010509,0.0
LF_detected,2,0.040929,0.007743,0.0
