- Tested working on 3/26/2019
- A total of 378 candidates extracted
- Input files required to work:
    - documents: 'pdfs.tsv'
    - host/species names: 'domestic_names.csv', 'ictv_animals.csv', 'ictv_viruses.csv'

In [27]:
import numpy as np
import pandas as pd

In [28]:
import os
from pathlib import Path

In [29]:
# Load Snorkel
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
session = SnorkelSession()

n_docs = 500 if 'CI' in os.environ else 2591

In [30]:
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from snorkel.parser import TSVDocPreprocessor

### First read in the documents, stored in TSV format, using a document preprocessor

In [69]:
doc_preprocessor = TSVDocPreprocessor('pdfs.tsv', max_docs=n_docs)

In [70]:
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(doc_preprocessor, count=n_docs)

Clearing existing...
Running UDF...



  0%|                                            | 0/2591 [00:00<?, ?it/s]
  0%|                                  | 2/2591 [00:03<1:05:01,  1.51s/it]
  0%|                                  | 3/2591 [00:05<1:23:36,  1.94s/it]
  0%|                                  | 4/2591 [00:07<1:12:12,  1.67s/it]
  0%|                                  | 5/2591 [00:07<1:01:00,  1.42s/it]
  0%|                                  | 6/2591 [00:10<1:11:52,  1.67s/it]
  0%|                                  | 7/2591 [00:11<1:14:53,  1.74s/it]
  0%|                                  | 8/2591 [00:18<2:17:12,  3.19s/it]
  0%|                                  | 9/2591 [00:23<2:37:10,  3.65s/it]
  0%|▏                                | 10/2591 [00:26<2:27:19,  3.42s/it]
  0%|▏                                | 11/2591 [00:31<2:46:28,  3.87s/it]
  0%|▏                                | 12/2591 [00:32<2:11:50,  3.07s/it]
  1%|▏                                | 13/2591 [00:33<1:50:45,  2.58s/it]
  1%|▏                  

Wall time: 1min 34s


In [71]:
print("Documents:", session.query(Document).count())
print("Sentences:", session.query(Sentence).count())

Documents: 39
Sentences: 14435


### Next steps: create matcher functions from dictionaries to match virus/host names in the text data

In [72]:
# Create a list of animal host names to use as our dictionary in the matcher
domestic_names = pd.read_csv('domestic_names.csv')

In [73]:
names1 = domestic_names.iloc[:,0]
names2 = domestic_names.iloc[:,1]
names3 = domestic_names.iloc[:,2]
names_list = names1.append([names2,names3])
names_list = names_list.tolist()

In [74]:
names_list.append("dromedary")
names_list.append("Peking")

In [75]:
ictv_animals = pd.read_csv('ictv_animals.csv')
#print('Total animal names:', ictv_animals.count().sum()) # total number of animal names in the ddict

Total animal names: 46430


In [76]:
ictv_series = ictv_animals.stack().reset_index().iloc[:,2]

In [77]:
ictv_list = ictv_series.tolist()

In [78]:
# Function that gets first letter of genus + species name 
def name(s): 
    # split the string into a list  
    l = s.split() 
    new_word = ""  # begins as empty string
    if len(l) == 2:
        for i in range(len(l)-1): 
            s = l[i] 
            # adds the capital first character  
            new_word += (s[0].upper()+'. ') 
        new_word += l[-1].title() # add the last word
        return new_word 
    else:
        return s

In [79]:
ictv_list2 = [name(s) for s in ictv_list] # shortened species names list

In [80]:
animals_list = names_list + ictv_list + ictv_list2

In [81]:
animals_list = list(set(animals_list)) # remove duplicates from the list

In [82]:
# Create a list of virus names
ictv_viruses = pd.read_csv('ictv_viruses.csv')
# create copies of certain virus names without the digit at the end
ictv_viruses['Species2'] = ictv_viruses['Species'].str.replace('\d+', '', regex=True)

In [83]:
ictv_v_series = ictv_viruses.stack().reset_index().iloc[:,2].drop_duplicates()
virus_list = ictv_v_series.tolist()

In [84]:
# Clean up white space
animals_list = [animal.strip() for animal in animals_list]
virus_list = [virus.strip() for virus in virus_list]

In [85]:
print('Number of virus species to match:', len(virus_list))
print('Number of host species to match:', len(animals_list))

Number of virus species to match: 6079
Number of host species to match: 69877


### Writing a basic `CandidateExtractor`

* Basic function to extract **candidate Virus-Host relation mentions** from the corpus, using the list of names.

* We will extract `Candidate` objects by identifying, for each `Sentence`, all pairs of n-grams (up to 7-grams) that were tagged. (An n-gram is a span of text made up of n tokens.) (A token is a string of contiguous characters between two spaces). We do this with three objects:

* A `ContextSpace` defines the "space" of all candidates we even potentially consider; in this case we use the `Ngrams` subclass, and look for all n-grams up to 7 words long

* A `Matcher` heuristically filters the candidates we use. The keyword argument `longest_match_only` means that we'll skip n-grams contained in other n-grams.

* A `CandidateExtractor` combines this all together

In [86]:
from snorkel.matchers import DictionaryMatch
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.models import candidate_subclass

In [87]:
# Define the candidate to extract (virus-host pair)
VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

In [98]:
# Define the dictionary matchers, define the candidate extractor
ngrams = Ngrams(n_max=10)
virus_matcher = DictionaryMatch(d = virus_list, stemmer = 'porter')
animals_matcher = DictionaryMatch(d = animals_list, stemmer = 'porter')
cand_extractor = CandidateExtractor(VirusHost, [ngrams, ngrams], [virus_matcher, animals_matcher], self_relations = True, nested_relations = True)

### Split the docs into 3 sets: training, development, and testing sets

In [99]:
from snorkel.models import Document

docs = session.query(Document).order_by(Document.name).all()

train_sents = set()
dev_sents   = set()
test_sents  = set()

for i, doc in enumerate(docs):
    for s in doc.sentences:
        if i % 10 == 8:
            dev_sents.add(s)
        elif i % 10 == 9:
            test_sents.add(s)
        else:
            train_sents.add(s)

In [100]:
print(len(train_sents))
print(len(dev_sents))
print(len(test_sents))

11869
1759
807


### Apply the candidator extractor to the three sets of sentences. The results will be persisted in the database backend.

In [101]:
%%time
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i)
    print("Number of candidates:", session.query(VirusHost).filter(VirusHost.split == i).count())

Clearing existing...
Running UDF...



  0%|                                           | 0/11869 [00:00<?, ?it/s]
  0%|                                  | 13/11869 [00:00<02:12, 89.28it/s]
  0%|                                  | 21/11869 [00:00<02:25, 81.35it/s]
  0%|                                  | 38/11869 [00:00<02:04, 94.71it/s]
  0%|▏                                | 52/11869 [00:00<01:55, 101.99it/s]
  1%|▏                                 | 61/11869 [00:00<02:26, 80.71it/s]
  1%|▏                                 | 72/11869 [00:00<02:16, 86.74it/s]
  1%|▏                                 | 82/11869 [00:00<02:11, 89.67it/s]
  1%|▎                                 | 93/11869 [00:00<02:11, 89.23it/s]
  1%|▎                                | 106/11869 [00:01<02:01, 97.02it/s]
  1%|▎                               | 118/11869 [00:01<01:56, 101.19it/s]
  1%|▎                               | 131/11869 [00:01<01:49, 107.39it/s]
  1%|▍                               | 142/11869 [00:01<01:48, 107.62it/s]
  1%|▍                  

 15%|████▋                          | 1816/11869 [00:13<01:03, 158.46it/s]
 15%|████▊                          | 1833/11869 [00:14<01:04, 156.40it/s]
 16%|████▊                          | 1850/11869 [00:14<01:20, 124.79it/s]
 16%|████▊                          | 1864/11869 [00:14<01:18, 127.35it/s]
 16%|████▉                          | 1878/11869 [00:14<01:23, 120.15it/s]
 16%|████▉                          | 1891/11869 [00:14<01:32, 108.31it/s]
 16%|████▉                          | 1903/11869 [00:14<01:31, 108.93it/s]
 16%|█████                          | 1918/11869 [00:14<01:25, 116.28it/s]
 16%|█████                          | 1931/11869 [00:15<01:28, 112.01it/s]
 16%|█████                          | 1943/11869 [00:15<01:35, 103.75it/s]
 16%|█████▎                          | 1954/11869 [00:15<01:40, 98.55it/s]
 17%|█████▏                         | 1968/11869 [00:15<01:32, 106.50it/s]
 17%|█████▏                         | 1987/11869 [00:15<01:24, 116.81it/s]
 17%|█████▏              

 30%|█████████▎                     | 3566/11869 [00:27<01:02, 132.18it/s]
 30%|█████████▎                     | 3580/11869 [00:28<01:05, 126.77it/s]
 30%|█████████▍                     | 3594/11869 [00:28<01:12, 114.30it/s]
 30%|█████████▍                     | 3606/11869 [00:28<01:21, 101.82it/s]
 30%|█████████▍                     | 3618/11869 [00:28<01:17, 106.26it/s]
 31%|█████████▍                     | 3630/11869 [00:28<01:22, 100.40it/s]
 31%|█████████▌                     | 3653/11869 [00:28<01:10, 117.37it/s]
 31%|█████████▌                     | 3667/11869 [00:28<01:18, 104.19it/s]
 31%|█████████▌                     | 3679/11869 [00:28<01:16, 106.83it/s]
 31%|█████████▋                     | 3692/11869 [00:29<01:14, 109.44it/s]
 31%|█████████▋                     | 3709/11869 [00:29<01:08, 119.51it/s]
 31%|█████████▋                     | 3729/11869 [00:29<01:00, 133.55it/s]
 32%|█████████▊                     | 3753/11869 [00:29<00:53, 152.08it/s]
 32%|█████████▊          

 46%|██████████████▏                | 5437/11869 [00:41<00:35, 179.61it/s]
 46%|██████████████▎                | 5456/11869 [00:42<00:46, 139.41it/s]
 46%|██████████████▎                | 5472/11869 [00:42<00:46, 137.19it/s]
 46%|██████████████▎                | 5487/11869 [00:42<00:46, 136.66it/s]
 46%|██████████████▎                | 5502/11869 [00:42<00:47, 132.71it/s]
 47%|██████████████▍                | 5522/11869 [00:42<00:43, 145.45it/s]
 47%|██████████████▍                | 5538/11869 [00:42<00:46, 137.25it/s]
 47%|██████████████▌                | 5553/11869 [00:42<00:49, 126.39it/s]
 47%|██████████████▌                | 5568/11869 [00:42<00:49, 127.67it/s]
 47%|██████████████▌                | 5582/11869 [00:42<00:48, 130.49it/s]
 47%|██████████████▋                | 5610/11869 [00:43<00:41, 151.85it/s]
 47%|██████████████▋                | 5627/11869 [00:43<00:40, 152.78it/s]
 48%|██████████████▋                | 5644/11869 [00:43<00:46, 133.70it/s]
 48%|██████████████▊     

 61%|███████████████████▍            | 7213/11869 [00:55<00:56, 82.61it/s]
 61%|██████████████████▉            | 7235/11869 [00:55<00:45, 100.83it/s]
 61%|██████████████████▉            | 7256/11869 [00:55<00:38, 119.11it/s]
 61%|██████████████████▉            | 7272/11869 [00:56<00:45, 101.42it/s]
 61%|███████████████████            | 7285/11869 [00:56<00:43, 105.49it/s]
 61%|███████████████████            | 7299/11869 [00:56<00:41, 111.27it/s]
 62%|███████████████████▏           | 7330/11869 [00:56<00:33, 135.48it/s]
 62%|███████████████████▏           | 7348/11869 [00:56<00:36, 124.34it/s]
 62%|███████████████████▏           | 7364/11869 [00:56<00:35, 125.22it/s]
 62%|███████████████████▉            | 7379/11869 [00:56<00:53, 84.51it/s]
 62%|███████████████████▉            | 7391/11869 [00:57<00:50, 89.08it/s]
 62%|███████████████████▉            | 7403/11869 [00:57<00:50, 88.66it/s]
 62%|███████████████████▉            | 7414/11869 [00:57<00:47, 93.48it/s]
 63%|████████████████████

 73%|██████████████████████▌        | 8651/11869 [01:10<00:28, 113.19it/s]
 73%|██████████████████████▋        | 8666/11869 [01:10<00:27, 116.57it/s]
 73%|██████████████████████▋        | 8679/11869 [01:10<00:26, 120.07it/s]
 73%|██████████████████████▋        | 8694/11869 [01:10<00:25, 126.96it/s]
 73%|██████████████████████▋        | 8708/11869 [01:10<00:27, 114.70it/s]
 73%|██████████████████████▊        | 8721/11869 [01:10<00:28, 111.79it/s]
 74%|██████████████████████▊        | 8740/11869 [01:11<00:24, 126.90it/s]
 74%|██████████████████████▊        | 8754/11869 [01:11<00:26, 117.83it/s]
 74%|██████████████████████▉        | 8767/11869 [01:11<00:28, 110.52it/s]
 74%|██████████████████████▉        | 8784/11869 [01:11<00:25, 121.95it/s]
 74%|██████████████████████▉        | 8802/11869 [01:11<00:22, 134.82it/s]
 74%|███████████████████████        | 8819/11869 [01:11<00:22, 136.81it/s]
 74%|███████████████████████        | 8834/11869 [01:11<00:24, 124.25it/s]
 75%|████████████████████

 89%|██████████████████████████▌   | 10516/11869 [01:24<00:08, 155.23it/s]
 89%|██████████████████████████▌   | 10533/11869 [01:24<00:10, 130.16it/s]
 89%|██████████████████████████▋   | 10548/11869 [01:24<00:09, 133.77it/s]
 89%|██████████████████████████▋   | 10563/11869 [01:24<00:09, 137.67it/s]
 89%|██████████████████████████▋   | 10580/11869 [01:25<00:09, 140.68it/s]
 89%|██████████████████████████▊   | 10595/11869 [01:25<00:09, 131.29it/s]
 89%|██████████████████████████▊   | 10609/11869 [01:25<00:10, 117.14it/s]
 89%|██████████████████████████▊   | 10622/11869 [01:25<00:11, 109.27it/s]
 90%|██████████████████████████▉   | 10634/11869 [01:25<00:11, 104.19it/s]
 90%|██████████████████████████▉   | 10648/11869 [01:25<00:11, 107.13it/s]
 90%|██████████████████████████▉   | 10674/11869 [01:25<00:09, 128.20it/s]
 90%|███████████████████████████   | 10691/11869 [01:25<00:08, 133.73it/s]
 90%|███████████████████████████   | 10706/11869 [01:26<00:09, 127.44it/s]
 90%|████████████████████

Number of candidates: 334
Clearing existing...
Running UDF...



  0%|                                            | 0/1759 [00:00<?, ?it/s]
  0%|▏                                   | 8/1759 [00:00<00:23, 73.69it/s]
  1%|▍                                  | 20/1759 [00:00<00:20, 82.87it/s]
  2%|▌                                  | 31/1759 [00:00<00:19, 88.47it/s]
  3%|█                                 | 56/1759 [00:00<00:15, 107.13it/s]
  4%|█▎                                 | 68/1759 [00:00<00:21, 78.02it/s]
  5%|█▋                                 | 87/1759 [00:00<00:17, 94.39it/s]
  6%|█▉                               | 102/1759 [00:00<00:15, 103.63it/s]
  7%|██▎                              | 125/1759 [00:01<00:14, 114.52it/s]
  8%|██▌                              | 139/1759 [00:01<00:15, 107.57it/s]
  9%|██▊                              | 152/1759 [00:01<00:15, 106.04it/s]
  9%|███                              | 164/1759 [00:01<00:14, 108.76it/s]
 10%|███▎                             | 176/1759 [00:01<00:14, 106.62it/s]
 11%|███▌               

Number of candidates: 35
Clearing existing...
Running UDF...



  0%|                                             | 0/807 [00:00<?, ?it/s]
  1%|▍                                    | 9/807 [00:00<00:10, 79.16it/s]
  2%|▋                                   | 16/807 [00:00<00:10, 72.01it/s]
  5%|█▋                                  | 37/807 [00:00<00:08, 89.61it/s]
  7%|██▍                                | 55/807 [00:00<00:07, 103.73it/s]
  8%|██▉                                 | 66/807 [00:00<00:07, 98.35it/s]
 10%|███▍                                | 77/807 [00:00<00:12, 60.83it/s]
 11%|████                                | 90/807 [00:01<00:10, 71.22it/s]
 13%|████▋                              | 108/807 [00:01<00:08, 83.26it/s]
 15%|█████▍                             | 125/807 [00:01<00:06, 97.85it/s]
 17%|█████▉                            | 141/807 [00:01<00:06, 109.01it/s]
 19%|██████▋                            | 155/807 [00:01<00:06, 95.99it/s]
 21%|███████                           | 167/807 [00:01<00:06, 100.61it/s]
 23%|███████▉           

Number of candidates: 9
Wall time: 1min 55s


In [53]:
## To do next: loading gold labels, writing labelling functions

In [107]:
print("Number of training candidates:", session.query(VirusHost).filter(VirusHost.split == 0).count())
print("Number of development candidates:", session.query(VirusHost).filter(VirusHost.split == 1).count())
print("Number of test candidates:", session.query(VirusHost).filter(VirusHost.split == 2).count())
print("Total candidates extracted:", session.query(VirusHost).count())

Number of training candidates: 334
Number of development candidates: 35
Number of test candidates: 9
Total candidates extracted: 378


In [104]:
## TESTING THE SENTENCE VIEWER

In [105]:
labeled = []
for c in session.query(VirusHost).filter(VirusHost.split == 0).all():
    labeled.append(c)
print("Number labeled:", len(labeled))

Number labeled: 334


In [106]:
from snorkel.viewer import SentenceNgramViewer

SentenceNgramViewer(labeled, session)

<IPython.core.display.Javascript object>

SentenceNgramViewer(cids=[[[101], [182], [127]], [[92, 93], [191], [184]], [[0], [198, 199], [88]], [[109, 110…