- Tested working on 3/26/2019
- Input files required to work:
    - documents: 'pdfs.tsv'
    - host/species names: 'domestic_names.csv', 'ictv_animals.csv', 'ictv_viruses.csv'

In [1]:
import numpy as np
import pandas as pd

In [2]:
import os
from pathlib import Path

In [3]:
# Load Snorkel
# Notebook setup: enable the autoreload extension (mode 2) and inline matplotlib output.
%load_ext autoreload
%autoreload 2
%matplotlib inline

# Session object used by the later `session.query(...)` cells in this notebook.
from snorkel import SnorkelSession
session = SnorkelSession()

# Document cap: 500 when a 'CI' variable is present in the environment, 2591 otherwise.
n_docs = 500 if 'CI' in os.environ else 2591

In [4]:
from snorkel.parser.spacy_parser import Spacy
from snorkel.parser import CorpusParser
from snorkel.models import Document, Sentence
from snorkel.parser import TSVDocPreprocessor

### First read in the documents, stored in TSV format, using a document preprocessor

In [5]:
# Document source for the parser: the TSV file 'pdfs.tsv', capped at n_docs via max_docs
doc_preprocessor = TSVDocPreprocessor('pdfs.tsv', max_docs=n_docs)

In [6]:
# Parse the preprocessed documents using the Spacy parser wrapper.
corpus_parser = CorpusParser(parser=Spacy())
# NOTE(review): count=n_docs presumably only sizes the progress bar — the recorded run
# stops at 39/2591, so pdfs.tsv appears to hold fewer documents than n_docs; confirm.
%time corpus_parser.apply(doc_preprocessor, count=n_docs)

Clearing existing...
Running UDF...


  2%|▍                                | 39/2591 [01:25<1:15:27,  1.77s/it]


Wall time: 1min 35s


In [17]:
# Report how many rows of each parsed context type the session currently holds
for label, model in (("Documents:", Document), ("Sentences:", Sentence)):
    print(label, session.query(model).count())

Documents: 39
Sentences: 14435


### Next steps: create matcher functions from dictionaries to match virus/host names in the text data

In [66]:
# Create a list of animal host names to use as our dictionary in the matcher
# (raw table read from 'domestic_names.csv'; its first three columns are flattened in the next cell)
domestic_names = pd.read_csv('domestic_names.csv')

In [67]:
# Flatten the first three columns of the domestic-names table into one Python list
# (column 0 first, then column 1, then column 2).
names1 = domestic_names.iloc[:, 0]
names2 = domestic_names.iloc[:, 1]
names3 = domestic_names.iloc[:, 2]
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and removed
# in pandas 2.0; the resulting order and values are the same.
names_list = pd.concat([names1, names2, names3]).tolist()

In [68]:
# Hand-picked extra host terms added to the dictionary.
# Note: this mutates names_list in place, so re-running the cell appends the terms again.
names_list.extend(["dromedary", "Peking"])

In [43]:
# Load the ICTV animal-name table and report the total number of non-missing
# names in the dictionary (summed over every column)
ictv_animals = pd.read_csv('ictv_animals.csv')
print('Total animal names:', ictv_animals.notna().sum().sum())

Total animal names: 46430


In [57]:
# Collapse the whole table into a single Series of names: after stack() and
# reset_index(), the cell values sit in the third column.
ictv_series = (
    ictv_animals
    .stack()
    .reset_index()
    .iloc[:, 2]
)

In [60]:
# Plain Python list of the flattened ICTV animal names
ictv_list = ictv_series.tolist()

In [62]:
# Full host dictionary: domestic names (plus the manual extras) followed by the ICTV names
animals_list = [*names_list, *ictv_list]

In [121]:
# Create a list of virus names
ictv_viruses = pd.read_csv('ictv_viruses.csv')
# Add a 'Species2' column holding a copy of each species name with its digits
# removed, so numbered species can also be matched without the number.
# The pattern is a raw string: '\d' inside a normal string literal is an invalid
# escape sequence (DeprecationWarning, and a SyntaxWarning on Python 3.12+).
# NOTE(review): the pattern removes every run of digits anywhere in the name, not
# only a trailing one — confirm that this is the intended behaviour.
ictv_viruses['Species2'] = ictv_viruses['Species'].str.replace(r'\d+', '', regex=True)

In [122]:
# Flatten every column of the virus table (including 'Species2') into one Series,
# keep the first occurrence of each name, then convert to a plain list
ictv_v_series = (
    ictv_viruses
    .stack()
    .reset_index()
    .iloc[:, 2]
    .drop_duplicates()
)
virus_list = ictv_v_series.tolist()

In [123]:
# Trim leading/trailing whitespace from every dictionary entry
animals_list = [
    host_name.strip()
    for host_name in animals_list
]
virus_list = [
    virus_name.strip()
    for virus_name in virus_list
]

In [148]:
# Report the size of each dictionary handed to the matchers
for label, names in (
    ('Number of virus species to match:', virus_list),
    ('Number of host species to match:', animals_list),
):
    print(label, len(names))

Number of virus species to match: 6079
Number of host species to match: 46480


### Writing a basic `CandidateExtractor`

* Basic function to extract **candidate Virus-Host relation mentions** from the corpus, using the list of names.

* We will extract `Candidate` objects by identifying, for each `Sentence`, all pairs of n-grams (up to 10-grams, matching `n_max=10` in the code below) that were tagged. (An n-gram is a span of text made up of n tokens; a token is a string of contiguous characters between two spaces.) We do this with three objects:

* A `ContextSpace` defines the "space" of all candidates we even potentially consider; in this case we use the `Ngrams` subclass, and look for all n-grams up to 10 words long

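* A `Matcher` heuristically filters the candidates we use; here we use `DictionaryMatch` with Porter stemming against the virus and host name lists. The matcher option `longest_match_only` (not set explicitly in the code below) means that we'll skip n-grams contained in other n-grams.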

* A `CandidateExtractor` combines this all together

In [131]:
from snorkel.matchers import DictionaryMatch
from snorkel.candidates import Ngrams, CandidateExtractor
from snorkel.models import candidate_subclass

In [132]:
# Define the candidate to extract (virus-host pair)
# The two names in the list are given in the same order as the
# [virus_matcher, animals_matcher] pair later handed to the CandidateExtractor.
VirusHost = candidate_subclass('VirusHost', ['virus', 'host'])

In [149]:
# Candidate space: n-grams of up to 10 tokens, shared by both argument slots
ngrams = Ngrams(n_max=10)

# One Porter-stemmed dictionary matcher per slot: virus names first, host names second
virus_matcher, animals_matcher = (
    DictionaryMatch(d=names, stemmer='porter')
    for names in (virus_list, animals_list)
)

# Extractor that emits VirusHost candidates from (virus n-gram, host n-gram) pairs
cand_extractor = CandidateExtractor(
    VirusHost,
    [ngrams, ngrams],
    [virus_matcher, animals_matcher],
)

### Split the docs into 3 sets: training, development, and testing sets

In [150]:
from snorkel.models import Document

# Documents in name order, so the split assignment below is stable between runs
docs = session.query(Document).order_by(Document.name).all()

train_sents, dev_sents, test_sents = set(), set(), set()

# Within every run of ten documents, the 9th goes to dev, the 10th to test and
# the other eight to train; all sentences of a document land in the same split.
split_for_remainder = {8: dev_sents, 9: test_sents}
for doc_idx, document in enumerate(docs):
    target_split = split_for_remainder.get(doc_idx % 10, train_sents)
    target_split.update(document.sentences)

In [151]:
# Sentence counts, in order: train, dev, test
for split_sents in (train_sents, dev_sents, test_sents):
    print(len(split_sents))

11869
1759
807


### Apply the candidate extractor to the three sets of sentences. The results will be persisted in the database backend.

In [152]:
%%time
for i, sents in enumerate([train_sents, dev_sents, test_sents]):
    cand_extractor.apply(sents, split=i)
    print("Number of candidates:", session.query(VirusHost).filter(VirusHost.split == i).count())

Clearing existing...
Running UDF...


100%|███████████████████████████████| 11869/11869 [02:48<00:00, 77.32it/s]


Number of candidates: 315
Clearing existing...
Running UDF...


100%|█████████████████████████████████| 1759/1759 [00:22<00:00, 76.70it/s]


Number of candidates: 33
Clearing existing...
Running UDF...


100%|███████████████████████████████████| 807/807 [00:11<00:00, 69.25it/s]


Number of candidates: 9
Wall time: 3min 22s


In [16]:
# candidate results: training 315, dev 33, test 9
## To do next: loading gold labels, writing labelling functions

In [147]:
# Derive the ratio from live state rather than from numbers copied by hand out of
# earlier cell output (315+33+9 candidates, 14435 sentences), which silently go
# stale whenever the corpus, the dictionaries or the matchers change.
n_candidates = sum(
    session.query(VirusHost).filter(VirusHost.split == split_id).count()
    for split_id in range(3)  # 0 = train, 1 = dev, 2 = test, as assigned at extraction
)
n_sentences = len(train_sents) + len(dev_sents) + len(test_sents)
# Guard the empty-corpus case so the cell reports nan instead of raising ZeroDivisionError
candidate_ratio = n_candidates / n_sentences if n_sentences else float('nan')
print('Ratio of candidates to sentences: %.3f' % candidate_ratio)
# Goal: increase number of candidates extracted (can implement better matchers?)

Ratio of candidates to sentences: 0.025
