- Tested working on 3/26/2019
- Input files required to work:
    - documents: 'pdfs.tsv'
    - host/species names: 'domestic_names.csv', 'ictv_animals.csv', 'ictv_viruses.csv'

In [27]:
import numpy as np
import pandas as pd

In [28]:
import os
from pathlib import Path

In [29]:
# Load Snorkel
%load_ext autoreload
%autoreload 2
%matplotlib inline

from snorkel import SnorkelSession
# Open the Snorkel session used by the database queries in the cells below.
session = SnorkelSession()

# Cap on how many documents to read: 500 when the 'CI' environment variable
# is set, otherwise 2591.
# NOTE(review): the recorded run below reports only 39 documents, far fewer
# than this cap — confirm that pdfs.tsv contains the expected number of rows.
n_docs = 500 if 'CI' in os.environ else 2591

In [30]:
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from snorkel.parser import TSVDocPreprocessor

### First read in the documents, stored in TSV format, using a document preprocessor

In [69]:
# Set up a preprocessor over 'pdfs.tsv', limited to at most n_docs documents.
doc_preprocessor = TSVDocPreprocessor('pdfs.tsv', max_docs=n_docs)

In [70]:
# Parse the preprocessed documents with the spaCy-backed parser.
# The %time line magic reports how long the parsing run takes.
corpus_parser = CorpusParser(parser=Spacy())
%time corpus_parser.apply(doc_preprocessor, count=n_docs)

Clearing existing...
Running UDF...



  0%|                                            | 0/2591 [00:00<?, ?it/s]
  0%|                                  | 2/2591 [00:03<1:05:01,  1.51s/it]
  0%|                                  | 3/2591 [00:05<1:23:36,  1.94s/it]
  0%|                                  | 4/2591 [00:07<1:12:12,  1.67s/it]
  0%|                                  | 5/2591 [00:07<1:01:00,  1.42s/it]
  0%|                                  | 6/2591 [00:10<1:11:52,  1.67s/it]
  0%|                                  | 7/2591 [00:11<1:14:53,  1.74s/it]
  0%|                                  | 8/2591 [00:18<2:17:12,  3.19s/it]
  0%|                                  | 9/2591 [00:23<2:37:10,  3.65s/it]
  0%|▏                                | 10/2591 [00:26<2:27:19,  3.42s/it]
  0%|▏                                | 11/2591 [00:31<2:46:28,  3.87s/it]
  0%|▏                                | 12/2591 [00:32<2:11:50,  3.07s/it]
  1%|▏                                | 13/2591 [00:33<1:50:45,  2.58s/it]
  1%|▏                  

Wall time: 1min 34s


In [71]:
# Report how many Document and Sentence rows the session holds after parsing.
for count_label, orm_class in (("Documents:", Document), ("Sentences:", Sentence)):
    print(count_label, session.query(orm_class).count())

Documents: 39
Sentences: 14435


### Next steps: create matcher functions from dictionaries to match virus/host names in the text data

In [72]:
# Load the table of domestic animal names; its columns are flattened below
# into the list of host names used as the matcher's dictionary.
domestic_names = pd.read_csv('domestic_names.csv')

In [73]:
# Flatten the first three columns of the domestic-names table into one list.
names1 = domestic_names.iloc[:, 0]
names2 = domestic_names.iloc[:, 1]
names3 = domestic_names.iloc[:, 2]
# Series.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat
# stacks the three columns end to end in the same order.
names_list = pd.concat([names1, names2, names3]).tolist()

In [74]:
# Add two extra host terms by hand on top of the CSV-derived names.
names_list.extend(["dromedary", "Peking"])

In [75]:
# Load the ICTV animal-name table and report how many non-null cells it holds.
ictv_animals = pd.read_csv('ictv_animals.csv')
print('Total animal names:', ictv_animals.count().sum()) # total number of non-null animal-name entries across all columns

Total animal names: 46430


In [76]:
# Flatten the whole table into a single Series of names: stack() turns the
# cells into one long Series, and after reset_index() the values sit in the
# third column (the first two hold the original row and column labels).
ictv_series = ictv_animals.stack().reset_index().iloc[:,2]

In [77]:
# Convert the flattened Series of ICTV animal names to a plain Python list.
ictv_list = ictv_series.tolist()

In [78]:
# Function that gets first letter of genus + species name 
def name(s):
    """Abbreviate a two-word name to "<Initial>. <Second word>".

    The input is split on whitespace. When it has exactly two words, the
    result is the upper-cased first character of the first word, a period
    and a space, then the second word passed through ``str.title()``
    (e.g. "canis lupus" -> "C. Lupus"). Any other input is returned unchanged.
    """
    words = s.split()
    if len(words) != 2:
        # Not a two-word name: hand the original string back untouched.
        return s
    first, second = words
    return first[0].upper() + '. ' + second.title()

In [79]:
# Abbreviated variants of the ICTV names: two-word entries become
# "<Initial>. <Second word>"; all other entries are copied unchanged.
ictv_list2 = [name(s) for s in ictv_list] # shortened species names list

In [80]:
# Combine the domestic names, the full ICTV names, and their abbreviated
# variants into one host-name list.
animals_list = names_list + ictv_list + ictv_list2

In [81]:
# Going through a set drops exact duplicates; the resulting list order is arbitrary.
animals_list = list(set(animals_list)) # remove duplicates from the list

In [82]:
# Create a list of virus names
ictv_viruses = pd.read_csv('ictv_viruses.csv')
# Add a 'Species2' column holding each species name with every run of digits
# removed (e.g. a trailing type number). The pattern is a raw string so that
# the backslash in \d is not treated as an invalid string escape by newer
# Python versions. Whitespace left behind is stripped later, when the name
# lists are cleaned.
ictv_viruses['Species2'] = ictv_viruses['Species'].str.replace(r'\d+', '', regex=True)

In [83]:
# Flatten every column of the virus table (including 'Species2') into one
# Series of values, drop exact duplicates, and convert the result to a list.
ictv_v_series = ictv_viruses.stack().reset_index().iloc[:,2].drop_duplicates()
virus_list = ictv_v_series.tolist()

In [84]:
# Clean up white space, then de-duplicate again: entries that differed only by
# surrounding whitespace become identical once stripped, and would otherwise
# inflate the list sizes. dict.fromkeys keeps the first occurrence of each
# name and preserves the existing order.
animals_list = list(dict.fromkeys(animal.strip() for animal in animals_list))
virus_list = list(dict.fromkeys(virus.strip() for virus in virus_list))

In [85]:
# Report the sizes of the two name lists used as matcher dictionaries.
# NOTE(review): virus_list is built from every column of ictv_viruses, so this
# count may include entries that are not species names — confirm the CSV layout.
print('Number of virus species to match:', len(virus_list))
print('Number of host species to match:', len(animals_list))

Number of virus species to match: 6079
Number of host species to match: 69877


### Writing a basic `CandidateExtractor`

* Basic function to extract **candidate Virus-Host relation mentions** from the corpus, using the list of names.

* We will extract `Candidate` objects by identifying, for each `Sentence`, all pairs of n-grams (up to 10-grams, matching `n_max=10` in the code below) in which one n-gram matches a virus name and the other matches a host name. (An n-gram is a span of text made up of n tokens; a token is roughly a string of contiguous characters between two spaces.) We do this with three objects:

* A `ContextSpace` defines the "space" of all candidates we even potentially consider; in this case we use the `Ngrams` subclass, and look for all n-grams up to 10 words long (`n_max=10`)

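* A `Matcher` heuristically filters the candidates we use; here, two `DictionaryMatch` matchers keep only n-grams found in the virus and host name lists. The keyword argument `longest_match_only` means that we'll skip n-grams contained in other n-grams; the code below does not pass it explicitly, so the library default applies.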

* A `CandidateExtractor` combines this all together

In [86]:
from snorkel.matchers import DictionaryMatch
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.models import candidate_subclass

In [87]:
# Define the candidate to extract (virus-host pair).
# The two argument names ('virus', 'host') are listed in the same order as the
# matchers handed to the CandidateExtractor later on.
VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

In [88]:
# Candidate space: every n-gram of up to 10 tokens.
ngrams = Ngrams(n_max=10)

# Dictionary matchers over the virus and host name lists, both with the
# 'porter' stemmer option.
virus_matcher = DictionaryMatch(d=virus_list, stemmer='porter')
animals_matcher = DictionaryMatch(d=animals_list, stemmer='porter')

# Extractor pairing a virus n-gram with a host n-gram as a VirusHost candidate.
cand_extractor = CandidateExtractor(
    VirusHost,
    [ngrams, ngrams],
    [virus_matcher, animals_matcher],
)

### Split the docs into 3 sets: training, development, and testing sets

In [89]:
from snorkel.models import Document

# Fetch all documents in name order so the split is the same on every run.
docs = session.query(Document).order_by(Document.name).all()

train_sents = set()
dev_sents = set()
test_sents = set()

# Out of every ten documents (by position), the ninth goes to dev, the tenth
# to test, and the remaining eight to train; all sentences of a document land
# in the same set.
for doc_index, doc in enumerate(docs):
    bucket = doc_index % 10
    if bucket == 8:
        target_sents = dev_sents
    elif bucket == 9:
        target_sents = test_sents
    else:
        target_sents = train_sents
    target_sents.update(doc.sentences)

In [90]:
# Sentence counts for the train, dev and test sets, one per line.
print(len(train_sents), len(dev_sents), len(test_sents), sep='\n')

11869
1759
807


### Apply the candidate extractor to the three sets of sentences. The results will be persisted in the database backend.

In [91]:
%%time
# Run the extractor over each sentence set, storing the candidates under split
# index 0 (train), 1 (dev) and 2 (test), then report the per-split count.
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i)
    print("Number of candidates:", session.query(VirusHost).filter(VirusHost.split == i).count())

Clearing existing...
Running UDF...



  0%|                                           | 0/11869 [00:00<?, ?it/s]
  0%|                                   | 4/11869 [00:00<04:57, 39.89it/s]
  0%|                                  | 13/11869 [00:00<04:08, 47.75it/s]
  0%|                                  | 21/11869 [00:00<03:40, 53.70it/s]
  0%|                                  | 29/11869 [00:00<03:31, 55.96it/s]
  0%|                                  | 38/11869 [00:00<03:10, 62.12it/s]
  0%|▏                                 | 53/11869 [00:00<02:37, 75.25it/s]
  1%|▏                                 | 64/11869 [00:00<02:25, 81.20it/s]
  1%|▏                                 | 85/11869 [00:00<01:58, 99.09it/s]
  1%|▎                                 | 98/11869 [00:00<01:59, 98.76it/s]
  1%|▎                                | 110/11869 [00:01<02:20, 83.69it/s]
  1%|▎                                | 121/11869 [00:01<02:15, 86.76it/s]
  1%|▎                                | 131/11869 [00:01<02:32, 77.07it/s]
  1%|▍                  

 13%|████                           | 1579/11869 [00:14<01:14, 137.24it/s]
 13%|████▏                          | 1594/11869 [00:14<01:15, 135.48it/s]
 14%|████▏                          | 1609/11869 [00:14<01:25, 119.97it/s]
 14%|████▎                          | 1631/11869 [00:14<01:14, 138.21it/s]
 14%|████▎                          | 1654/11869 [00:14<01:05, 154.87it/s]
 14%|████▎                          | 1672/11869 [00:14<01:04, 158.35it/s]
 14%|████▍                          | 1690/11869 [00:14<01:03, 159.07it/s]
 14%|████▍                          | 1707/11869 [00:15<01:09, 146.05it/s]
 15%|████▌                          | 1723/11869 [00:15<01:10, 144.03it/s]
 15%|████▌                          | 1738/11869 [00:15<01:19, 127.56it/s]
 15%|████▌                          | 1754/11869 [00:15<01:19, 127.20it/s]
 15%|████▌                          | 1770/11869 [00:15<01:18, 128.77it/s]
 15%|████▋                          | 1784/11869 [00:15<01:17, 129.48it/s]
 15%|████▋               

 27%|████████▎                      | 3177/11869 [00:27<01:24, 103.17it/s]
 27%|████████▎                      | 3189/11869 [00:28<01:24, 102.81it/s]
 27%|████████▋                       | 3200/11869 [00:28<01:30, 95.91it/s]
 27%|████████▋                       | 3211/11869 [00:28<01:31, 94.44it/s]
 27%|████████▍                      | 3224/11869 [00:28<01:24, 102.15it/s]
 27%|████████▍                      | 3235/11869 [00:28<01:24, 102.71it/s]
 27%|████████▊                       | 3246/11869 [00:28<01:26, 99.76it/s]
 27%|████████▊                       | 3257/11869 [00:28<02:00, 71.39it/s]
 28%|████████▊                       | 3277/11869 [00:29<01:38, 87.33it/s]
 28%|████████▊                       | 3289/11869 [00:29<01:34, 91.05it/s]
 28%|████████▉                       | 3300/11869 [00:29<01:33, 92.00it/s]
 28%|████████▉                       | 3313/11869 [00:29<01:25, 99.49it/s]
 28%|████████▋                      | 3327/11869 [00:29<01:19, 106.98it/s]
 28%|█████████           

 39%|████████████▏                  | 4664/11869 [00:41<01:08, 105.29it/s]
 39%|████████████▏                  | 4677/11869 [00:42<01:10, 102.27it/s]
 40%|████████████▋                   | 4689/11869 [00:42<01:18, 91.88it/s]
 40%|████████████▎                  | 4705/11869 [00:42<01:10, 101.38it/s]
 40%|████████████▎                  | 4719/11869 [00:42<01:06, 107.73it/s]
 40%|████████████▍                  | 4741/11869 [00:42<00:57, 123.63it/s]
 40%|████████████▍                  | 4757/11869 [00:42<00:54, 131.47it/s]
 40%|████████████▍                  | 4772/11869 [00:42<00:52, 134.80it/s]
 40%|████████████▌                  | 4787/11869 [00:42<00:56, 126.17it/s]
 40%|████████████▌                  | 4801/11869 [00:43<00:55, 126.25it/s]
 41%|████████████▌                  | 4815/11869 [00:43<01:00, 115.98it/s]
 41%|████████████▌                  | 4828/11869 [00:43<00:59, 117.68it/s]
 41%|████████████▋                  | 4842/11869 [00:43<00:56, 123.34it/s]
 41%|████████████▋       

 52%|████████████████▋               | 6168/11869 [00:56<01:08, 83.65it/s]
 52%|████████████████▋               | 6180/11869 [00:56<01:05, 87.26it/s]
 52%|████████████████▋               | 6191/11869 [00:57<01:02, 91.40it/s]
 52%|████████████████▏              | 6212/11869 [00:57<00:51, 109.74it/s]
 52%|████████████████▊               | 6225/11869 [00:57<01:00, 93.01it/s]
 53%|████████████████▎              | 6240/11869 [00:57<00:54, 104.16it/s]
 53%|████████████████▊               | 6253/11869 [00:57<00:57, 98.03it/s]
 53%|████████████████▉               | 6265/11869 [00:57<00:58, 96.54it/s]
 53%|████████████████▉               | 6276/11869 [00:57<01:08, 81.24it/s]
 53%|████████████████▉               | 6294/11869 [00:57<00:58, 95.59it/s]
 53%|█████████████████               | 6306/11869 [00:58<01:02, 88.39it/s]
 53%|█████████████████               | 6317/11869 [00:58<01:26, 64.49it/s]
 53%|█████████████████               | 6329/11869 [00:58<01:18, 70.78it/s]
 53%|█████████████████   

 63%|████████████████████▏           | 7506/11869 [01:11<00:48, 89.54it/s]
 63%|████████████████████▎           | 7519/11869 [01:11<00:44, 98.16it/s]
 63%|████████████████████▎           | 7530/11869 [01:11<00:50, 85.34it/s]
 64%|████████████████████▎           | 7548/11869 [01:11<00:43, 99.73it/s]
 64%|███████████████████▊           | 7567/11869 [01:11<00:37, 115.71it/s]
 64%|███████████████████▊           | 7581/11869 [01:12<00:41, 103.46it/s]
 64%|███████████████████▊           | 7598/11869 [01:12<00:37, 114.22it/s]
 64%|███████████████████▉           | 7623/11869 [01:12<00:31, 136.26it/s]
 64%|███████████████████▉           | 7640/11869 [01:12<00:32, 128.69it/s]
 65%|████████████████████           | 7664/11869 [01:12<00:29, 144.04it/s]
 65%|████████████████████           | 7681/11869 [01:12<00:31, 132.67it/s]
 65%|████████████████████           | 7696/11869 [01:12<00:30, 135.89it/s]
 65%|████████████████████▏          | 7711/11869 [01:12<00:31, 131.27it/s]
 65%|████████████████████

 76%|███████████████████████▌       | 9002/11869 [01:25<00:27, 103.23it/s]
 76%|███████████████████████▌       | 9014/11869 [01:25<00:28, 100.72it/s]
 76%|████████████████████████▎       | 9026/11869 [01:26<00:28, 98.59it/s]
 76%|███████████████████████▌       | 9043/11869 [01:26<00:28, 100.67it/s]
 76%|████████████████████████▍       | 9054/11869 [01:26<00:30, 92.70it/s]
 76%|████████████████████████▍       | 9064/11869 [01:26<00:44, 62.97it/s]
 76%|████████████████████████▍       | 9072/11869 [01:26<00:43, 64.08it/s]
 77%|████████████████████████▌       | 9089/11869 [01:26<00:36, 75.90it/s]
 77%|████████████████████████▌       | 9102/11869 [01:27<00:32, 85.24it/s]
 77%|███████████████████████▊       | 9123/11869 [01:27<00:26, 103.31it/s]
 77%|████████████████████████▋       | 9137/11869 [01:27<00:28, 95.87it/s]
 77%|████████████████████████▋       | 9149/11869 [01:27<00:31, 85.13it/s]
 77%|███████████████████████▉       | 9168/11869 [01:27<00:26, 100.14it/s]
 77%|████████████████████

 89%|███████████████████████████▋   | 10612/11869 [01:39<00:15, 80.91it/s]
 89%|███████████████████████████▋   | 10621/11869 [01:39<00:15, 81.12it/s]
 90%|███████████████████████████▊   | 10631/11869 [01:40<00:15, 82.04it/s]
 90%|███████████████████████████▊   | 10643/11869 [01:40<00:14, 86.40it/s]
 90%|███████████████████████████▊   | 10652/11869 [01:40<00:13, 87.00it/s]
 90%|██████████████████████████▉   | 10674/11869 [01:40<00:11, 104.96it/s]
 90%|███████████████████████████   | 10690/11869 [01:40<00:10, 114.35it/s]
 90%|███████████████████████████▉   | 10703/11869 [01:40<00:11, 98.92it/s]
 90%|███████████████████████████   | 10725/11869 [01:40<00:09, 116.66it/s]
 91%|███████████████████████████▏  | 10748/11869 [01:40<00:08, 134.33it/s]
 91%|███████████████████████████▏  | 10770/11869 [01:40<00:07, 147.01it/s]
 91%|███████████████████████████▎  | 10787/11869 [01:41<00:07, 146.56it/s]
 91%|███████████████████████████▎  | 10808/11869 [01:41<00:07, 148.94it/s]
 91%|████████████████████

Number of candidates: 315
Clearing existing...
Running UDF...



  0%|                                            | 0/1759 [00:00<?, ?it/s]
  0%|▏                                   | 8/1759 [00:00<00:27, 63.66it/s]
  1%|▎                                  | 16/1759 [00:00<00:25, 67.51it/s]
  2%|▌                                  | 28/1759 [00:00<00:23, 75.06it/s]
  3%|▉                                  | 45/1759 [00:00<00:19, 89.94it/s]
  3%|█▏                                 | 58/1759 [00:00<00:18, 94.01it/s]
  4%|█▎                                 | 68/1759 [00:00<00:25, 65.17it/s]
  5%|█▋                                 | 83/1759 [00:00<00:21, 76.48it/s]
  5%|█▊                                 | 93/1759 [00:01<00:22, 74.13it/s]
  6%|█▉                                | 102/1759 [00:01<00:21, 75.38it/s]
  6%|██▏                               | 114/1759 [00:01<00:19, 84.18it/s]
  7%|██▍                               | 126/1759 [00:01<00:20, 81.10it/s]
  8%|██▌                               | 135/1759 [00:01<00:23, 67.96it/s]
  8%|██▊                

 94%|██████████████████████████████▏ | 1656/1759 [00:14<00:00, 105.66it/s]
 95%|███████████████████████████████▎ | 1668/1759 [00:14<00:01, 86.61it/s]
 96%|██████████████████████████████▋ | 1684/1759 [00:14<00:00, 100.42it/s]
 97%|██████████████████████████████▉ | 1698/1759 [00:14<00:00, 109.55it/s]
 98%|███████████████████████████████▏| 1717/1759 [00:14<00:00, 121.70it/s]
 98%|███████████████████████████████▍| 1731/1759 [00:14<00:00, 121.82it/s]
 99%|███████████████████████████████▊| 1750/1759 [00:14<00:00, 134.97it/s]
100%|████████████████████████████████| 1759/1759 [00:14<00:00, 117.28it/s]

Number of candidates: 33
Clearing existing...
Running UDF...



  0%|                                             | 0/807 [00:00<?, ?it/s]
  1%|▌                                  | 12/807 [00:00<00:07, 104.77it/s]
  2%|▉                                   | 20/807 [00:00<00:08, 94.33it/s]
  5%|█▊                                 | 42/807 [00:00<00:06, 109.92it/s]
  7%|██▍                                | 56/807 [00:00<00:06, 115.25it/s]
  8%|██▉                                | 67/807 [00:00<00:06, 107.10it/s]
 10%|███▍                                | 77/807 [00:00<00:10, 70.15it/s]
 11%|███▉                                | 88/807 [00:00<00:09, 77.04it/s]
 12%|████▎                               | 97/807 [00:01<00:09, 74.83it/s]
 13%|████▌                              | 106/807 [00:01<00:09, 75.32it/s]
 14%|█████                              | 117/807 [00:01<00:08, 80.86it/s]
 16%|█████▍                             | 126/807 [00:01<00:08, 76.82it/s]
 17%|█████▉                             | 138/807 [00:01<00:07, 84.00it/s]
 19%|██████▌            

Number of candidates: 9
Wall time: 2min 13s


In [53]:
# candidate results: training 315, dev 33, test 9
## To do next: loading gold labels, writing labelling functions

In [54]:
# Ratio of extracted candidates to parsed sentences, computed from the database
# rather than from hand-copied counts so it stays correct when the corpus or
# the matchers change.
n_candidates = sum(
    session.query(VirusHost).filter(VirusHost.split == split).count()
    for split in range(3)  # 0 = train, 1 = dev, 2 = test
)
n_sentences = session.query(Sentence).count()
if n_sentences:
    print('Ratio of candidates to sentences: %.3f' % (n_candidates / n_sentences))
else:
    # Avoid a ZeroDivisionError when nothing has been parsed yet.
    print('Ratio of candidates to sentences: n/a (no sentences parsed)')
# Goal: increase number of candidates extracted (can implement better matchers?)

Ratio of candidates to sentences: 0.025
