In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../../..')
from typing import Dict
from wiser.data.dataset_readers import CDRDiseaseDatasetReader, BioASQDatasetReader
from allennlp.data.token_indexers import TokenIndexer
from wiser.lf import LabelingFunction, LinkingFunction, UMLSMatcher, DictionaryMatcher
from allennlp.common.params import Params

## Loads Data

In [2]:
# All three CDR splits live in the same directory below root_directory.
root_directory = '../../..'
cdr_directory = root_directory + '/data/cdr'

# A single reader instance parses every BioC XML split.
cdr_reader = CDRDiseaseDatasetReader()
train_data = cdr_reader.read(cdr_directory + '/CDR_TrainingSet.BioC.xml')
dev_data = cdr_reader.read(cdr_directory + '/CDR_DevelopmentSet.BioC.xml')
test_data = cdr_reader.read(cdr_directory + '/CDR_TestSet.BioC.xml')

0it [00:00, ?it/s]

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

500it [01:13,  7.91it/s]
0it [00:00, ?it/s]

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

500it [01:14,  7.40it/s]
0it [00:00, ?it/s]

HBox(children=(IntProgress(value=0, max=500), HTML(value='')))

500it [01:18,  5.89it/s]


In [7]:
# Pool the three splits so every labeling/linking function below is applied to
# all of them in a single pass.
# NOTE(review): presumably each split is a list, so `+` builds a new list that
# still references the same instance objects (the splits themselves are what get
# pickled at the end of the notebook) — confirm against the reader's return type.
cdr_docs = train_data + dev_data + test_data

## Applies Labeling Functions

In [9]:
def apply_umls(docs, semantic_type, positive=True, umls_path='/data/bats/2018AB'):
    """Run a UMLSMatcher for one semantic type and post-process its labels in place.

    Steps, applied to ``doc['WISER_LABELS'][semantic_type]`` of every document:

    1. ``UMLSMatcher.apply`` writes the initial labels.
    2. Tokens of three characters or fewer are reset to 'ABS'.
    3. A token that directly follows a matched span inside parentheses,
       i.e. ``<match> ( TOKEN <punctuation>``, is collected as an acronym.
    4. Every occurrence of a collected acronym in the same document is labeled
       with ``i_label`` (this intentionally overrides step 2).

    Args:
        docs: Iterable of instances exposing ``doc['tokens']`` (objects with
            ``.text`` and ``.pos_``) and ``doc['WISER_LABELS']``.
        semantic_type: UMLS semantic type to match; also used as the labeling
            function name / key under ``WISER_LABELS``.
        positive: If True matches are labeled 'I', otherwise 'O'.
        umls_path: Path handed to ``UMLSMatcher`` as its second argument.
            Defaults to the location that used to be hard-coded here; override
            it when the UMLS release lives elsewhere.
    """
    types = {semantic_type}
    additional_stop_words = {
        "analgesic", "anesthesia", "anesthetic", "anterior", "antibiotic",
        "battery", "brain",
        "capillary", "cortex", "face", "grip", "group", "illness", "injury", "medulla",
        "nervous", "nose", "posterior", "liver", "secondary", "suffer", "symptom",
        "toxic", "toxic effect"
    }
    i_label = 'I' if positive else 'O'
    lf = UMLSMatcher(
        semantic_type, umls_path, types,
        additional_stop_words=additional_stop_words, i_label=i_label)
    lf.apply(docs)

    for doc in docs:
        tokens = doc['tokens']
        labels = doc['WISER_LABELS'][semantic_type]

        # Very short tokens are too ambiguous to trust: abstain on them.
        for i, token in enumerate(tokens):
            if len(token.text) <= 3:
                labels[i] = 'ABS'

        acronyms = set()
        active = False  # True while the previous token carried i_label
        for i, label in enumerate(labels):
            if label[0] == i_label:
                active = True
            else:
                # The bounds check prevents an IndexError when "(" is one of
                # the last two tokens of the document.
                if (active and tokens[i].text == '('
                        and i + 2 < len(tokens)
                        and tokens[i + 2].pos_ == "PUNCT"):
                    acronyms.add(tokens[i + 1].text)
                active = False

        # Propagate the label to every mention of a discovered acronym.
        for i, token in enumerate(tokens):
            if token.text in acronyms:
                labels[i] = i_label

In [10]:
# UMLS semantic types applied with the default positive=True, in the original order.
for semantic_type in ('Congenital Abnormality',
                      'Acquired Abnormality',
                      'Injury or Poisoning'):
    apply_umls(cdr_docs, semantic_type)

In [13]:
apply_umls(cdr_docs, 'Disease or Syndrome')

# Reset the 'Disease or Syndrome' label to 'ABS' on a handful of specific words
# (compared case-insensitively).
disease_overrides = {'tubular', 'ganglia', 'albino', 'prostate'}
for doc in cdr_docs:
    disease_labels = doc['WISER_LABELS']['Disease or Syndrome']
    for i, token in enumerate(doc['tokens']):
        if token.text.lower() in disease_overrides:
            disease_labels[i] = 'ABS'

In [14]:
# Remaining UMLS semantic types applied with the default positive=True,
# in the original order.
for semantic_type in ('Mental or Behavioral Dysfunction',
                      'Cell or Molecular Dysfunction',
                      'Sign or Symptom',
                      'Neoplastic Process'):
    apply_umls(cdr_docs, semantic_type)

In [18]:
apply_umls(cdr_docs, 'Body Part, Organ, or Organ Component', positive=False)


class BodyTerms(LabelingFunction):
    """Labels "<body part> <condition word>" bigrams as 'I'.

    A token counts as a body part when the 'Body Part, Organ, or Organ
    Component' labels written by the apply_umls call above mark it 'O'
    (that call runs with positive=False). If the following token is one of
    the condition words below, both tokens are labeled 'I'; everything else
    stays 'ABS'.
    """

    def apply_instance(self, instance):
        tokens = [token.text.lower() for token in instance['tokens']]
        labels = ['ABS'] * len(tokens)
        body_part_labels = instance['WISER_LABELS']['Body Part, Organ, or Organ Component']

        # Every entry needs its own comma: adjacent string literals are silently
        # concatenated, which previously merged "diseases" and "pain" into the
        # single, never-matching term "diseasespain".
        terms = {
            "cancer", "cancers",
            "damage",
            "disease", "diseases",
            "pain",
            "injury", "injuries",
        }

        # Stop one short of the end because each step looks at token i + 1.
        for i in range(len(tokens) - 1):
            if body_part_labels[i] == 'O' and tokens[i + 1] in terms:
                labels[i] = "I"
                labels[i + 1] = "I"
        return labels

lf = BodyTerms()
lf.apply(cdr_docs)

# The body-part labels were only an intermediate input for BodyTerms; drop them
# so they are not kept as a labeling function of their own.
for doc in cdr_docs:
    del doc['WISER_LABELS']['Body Part, Organ, or Organ Component']

In [19]:
# DictionaryMatcher receives each term as a list of tokens; these are all
# single-token terms, so every word is wrapped in its own one-element list.
terms = [[word] for word in (
    "anemic", "bradycardia", "dyskinesia", "dyskinetic",
    "hyperthermia", "hyperthermic", "hypertension", "hypertensive",
    "hypothermia", "hypothermic", "hypotension", "hypotensive",
)]

lf = DictionaryMatcher("Other Terms", terms)
lf.apply(cdr_docs)

In [20]:
class CancerLike(LabelingFunction):
    """Labels tokens ending in 'edema', 'toma' or 'coma' as 'I'; 'ABS' otherwise.

    Matching is done on the lower-cased token text.
    """

    # str.endswith accepts a tuple and returns True if any suffix matches.
    suffixes = ('edema', 'toma', 'coma')

    def apply_instance(self, instance):
        # The earlier version also counted token frequencies and required
        # count > 0, which is true for every token present, so the counting
        # pass had no effect on the output and has been removed.
        # NOTE(review): if a minimum-frequency filter was intended, reintroduce
        # it with the intended threshold.
        return [
            'I' if token.text.lower().endswith(self.suffixes) else 'ABS'
            for token in instance['tokens']
        ]

lf = CancerLike()
lf.apply(cdr_docs)

In [21]:
class CommonPrefixes(LabelingFunction):
    """Labels a token 'I' when its lemma starts with one of `prefixes`; 'ABS' otherwise."""

    prefixes = {"hyper", "hypo"}

    def apply_instance(self, instance):
        # str.startswith takes a tuple of candidates, replacing the inner loop.
        candidates = tuple(self.prefixes)
        return [
            'I' if token.lemma_.startswith(candidates) else 'ABS'
            for token in instance['tokens']
        ]

lf = CommonPrefixes()
lf.apply(cdr_docs)

In [22]:
# Single-token terms, each wrapped in its own list as DictionaryMatcher is given
# a list of token lists.
terms = [[word] for word in ("acute", "chronic", "disease", "syndrome")]

lf = DictionaryMatcher("Partial Terms", terms)
lf.apply(cdr_docs)

In [23]:
class CommonSuffixes(LabelingFunction):
    """Labels a token 'I' when its lemma ends with one of `suffixes`; 'ABS' otherwise."""

    suffixes = {"agia", "cardia", "hypo",
                "trophy", "toxic"}

    def apply_instance(self, instance):
        labels = []
        for token in instance['tokens']:
            lemma = token.lemma_
            matched = any(lemma.endswith(suffix) for suffix in self.suffixes)
            labels.append('I' if matched else 'ABS')
        return labels

lf = CommonSuffixes()
lf.apply(cdr_docs)

In [24]:
# Substance-related UMLS semantic types, applied with positive=False so their
# matches are labeled 'O'. Order matches the original sequence of calls.
for semantic_type in ('Chemical',
                      'Chemical Viewed Functionally',
                      'Organic Chemical',
                      'Inorganic Chemical',
                      'Pharmacologic Substance',
                      'Element, Ion, or Isotope',
                      'Biologically Active Substance',
                      'Clinical Drug',
                      'Vitamin',
                      'Hazardous or Poisonous Substance',
                      'Antibiotic'):
    apply_umls(cdr_docs, semantic_type, positive=False)

In [35]:
class OtherPOS(LabelingFunction):
    """Labels a token 'O' when its coarse POS tag is in `other_pos`; 'ABS' otherwise."""

    other_pos = {"ADP", "ADV", "DET", "VERB"}

    def apply_instance(self, instance):
        return [
            "O" if token.pos_ in self.other_pos else 'ABS'
            for token in instance['tokens']
        ]

lf = OtherPOS()
lf.apply(cdr_docs)

In [36]:
class BoundaryWords(LabelingFunction):
    """Uses trigger lemmas to mark where a mention starts or cannot be.

    A token whose lemma is in `start_words` is labeled 'O' and the token after
    it 'I'. A token whose lemma is in `boundary_words` is then labeled 'O',
    overriding any 'I' set by the first pass. All other tokens stay 'ABS'.
    """

    start_words = {"develop", "induce"}
    boundary_words = {"after", "before"}

    def apply_instance(self, instance):
        lemmas = [token.lemma_ for token in instance['tokens']]
        labels = ['ABS'] * len(lemmas)

        # First pass: the final token is skipped because it has no successor.
        for position, lemma in enumerate(lemmas[:-1]):
            if lemma in self.start_words:
                labels[position] = 'O'
                labels[position + 1] = 'I'

        # Second pass: boundary words always win.
        for position, lemma in enumerate(lemmas):
            if lemma in self.boundary_words:
                labels[position] = 'O'

        return labels

lf = BoundaryWords()
lf.apply(cdr_docs)

In [37]:
class CommonFP(LabelingFunction):
    """Labels a token 'O' when its lemma is in `words`; 'ABS' otherwise."""

    # "withdrawal" is the correct spelling and was missing, so tokens with that
    # lemma were never matched. The historical misspelling "withdrawl" is kept
    # so text containing it is still handled as before.
    words = {"abnormality", "abuse", "acute", "bacterial",
             "chronic", "consumption",
             "discharge", "drug", "effect", "evaluable",
             "fall", "group", "hepatic",
             "infant", "infectious", "intoxication",
             "mild", "myopic", "overload",
             "perinatal", "plan", "postpartum",
             "regression", "severe", "toxic",
             "withdrawal", "withdrawl"}

    def apply_instance(self, instance):
        labels = ['ABS'] * len(instance['tokens'])

        for i in range(len(instance['tokens'])):
            if instance['tokens'][i].lemma_ in self.words:
                labels[i] = 'O'
        return labels

lf = CommonFP()
lf.apply(cdr_docs)

In [38]:
class StopWords(LabelingFunction):
    """Labels a token 'O' when its lemma is in `boundary_words`; 'ABS' otherwise."""

    boundary_words = {"a", "and", "as", "be", "in", "is", "of", "or",
                      "that", "the", "with"}

    def apply_instance(self, instance):
        labels = []
        for token in instance['tokens']:
            is_stop_word = token.lemma_ in self.boundary_words
            labels.append('O' if is_stop_word else 'ABS')
        return labels

lf = StopWords()
lf.apply(cdr_docs)

In [39]:
class EndNounPhrase(LabelingFunction):
    """Labels the first non-noun token after a run of nouns as 'O'.

    A run starts at any token tagged NOUN or PROPN. A "-" token neither ends
    the run nor receives a label. All other tokens stay 'ABS'.
    """

    noun_tags = {"NOUN", "PROPN"}

    def apply_instance(self, instance):
        labels = []
        in_noun_run = False
        for token in instance['tokens']:
            label = 'ABS'
            if token.pos_ in self.noun_tags:
                in_noun_run = True
            elif in_noun_run and token.text != '-':
                # This token closes the run.
                in_noun_run = False
                label = 'O'
            labels.append(label)
        return labels

lf = EndNounPhrase()
lf.apply(cdr_docs)

In [40]:
class Punctuation(LabelingFunction):
    """Labels a token 'O' when its text is one of `other_punc`; 'ABS' otherwise."""

    other_punc = {".", ",", "?", "!", ";", ":", "(", ")",
                  "%", "<", ">", "=", "+", "/", "\\"}

    def apply_instance(self, instance):
        return [
            'O' if token.text in self.other_punc else 'ABS'
            for token in instance['tokens']
        ]

lf = Punctuation()
lf.apply(cdr_docs)

## Applies Linking Functions

In [41]:
class PossessivePhrase(LinkingFunction):
    """Sets links[i] = 1 when token i or token i-1 is "'s"; index 0 is always 0."""

    def apply_instance(self, instance):
        texts = [token.text for token in instance['tokens']]
        links = [0] * len(texts)

        # Walk adjacent pairs; `position` is the index of the second token.
        for position, (previous, current) in enumerate(zip(texts, texts[1:]), start=1):
            if previous == "'s" or current == "'s":
                links[position] = 1

        return links

lf = PossessivePhrase()
lf.apply(cdr_docs)

In [42]:
class HyphenatedPhrase(LinkingFunction):
    """Sets links[i] = 1 when token i or token i-1 is "-"; index 0 is always 0."""

    def apply_instance(self, instance):
        texts = [token.text for token in instance['tokens']]
        links = [0] * len(texts)

        position = 1
        while position < len(texts):
            if "-" in (texts[position - 1], texts[position]):
                links[position] = 1
            position += 1

        return links

lf = HyphenatedPhrase()
lf.apply(cdr_docs)

In [43]:
# ElmoLinkingFunction is imported here, in the only cell that uses it.
from wiser.lf import ElmoLinkingFunction

# NOTE(review): .8 is presumably a similarity threshold above which adjacent
# tokens are linked — confirm against ElmoLinkingFunction's signature.
lf = ElmoLinkingFunction(.8)
lf.apply(cdr_docs)

In [44]:
class CommonBigram(LinkingFunction):
    """Links token i to token i-1 when their lower-cased bigram occurs at least
    four times within the same instance."""

    def apply_instance(self, instance):
        lowered = [token.text.lower() for token in instance['tokens']]
        pairs = list(zip(lowered, lowered[1:]))

        # Frequency of every adjacent (previous, current) pair in this instance.
        frequency = {}
        for pair in pairs:
            frequency[pair] = frequency.get(pair, 0) + 1

        links = [0] * len(lowered)
        # pairs[k] ends at token k + 1, hence start=1.
        for position, pair in enumerate(pairs, start=1):
            if frequency[pair] >= 4:
                links[position] = 1

        return links

lf = CommonBigram()
lf.apply(cdr_docs)

In [45]:
class CompoundPhrase(LinkingFunction):
    """Sets links[i] = 1 when token i-1 has the dependency label "compound"."""

    def apply_instance(self, instance):
        tokens = list(instance['tokens'])
        links = [0] * len(tokens)

        # Each token except the last may link its successor back to itself.
        for position, previous in enumerate(tokens[:-1], start=1):
            if previous.dep_ == "compound":
                links[position] = 1

        return links

lf = CompoundPhrase()
lf.apply(cdr_docs)

## Saves Weak Supervision to Disk

In [51]:
import os
import pickle

# Create the output directory first: open(..., 'wb') does not create missing
# parent directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('tmp', exist_ok=True)

# Persist each split to its own pickle file.
for path, split in (('tmp/train_data.p', train_data),
                    ('tmp/dev_data.p', dev_data),
                    ('tmp/test_data.p', test_data)):
    with open(path, 'wb') as f:
        pickle.dump(split, f)

###### End of Part 1