In [1]:
%load_ext autoreload
%autoreload 2

import sys
sys.path.append('../../..')
import os
import allennlp
from allennlp.data import Instance
from allennlp.data.fields import ArrayField
import numpy as np
from wiser.data.util import save_label_distribution

import pickle

## Reload Data

In [2]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(review): ``pickle.load`` can execute arbitrary code while
    deserializing, so only point this at files produced by a trusted step
    of this pipeline.
    """
    with open(path, 'rb') as f:
        return pickle.load(f)


# Reload the three pickled data splits from the tmp/ directory.
train_data = _load_pickle('tmp/train_data.p')
dev_data = _load_pickle('tmp/dev_data.p')
test_data = _load_pickle('tmp/test_data.p')

In [3]:
from collections import Counter

# Tally every tag occurrence across the training instances.
cnt = Counter(tag for instance in train_data for tag in instance['tags'])

# Index for the discriminative labels: tags numbered from most to least frequent.
disc_label_to_ix = {label: ix for ix, (label, _) in enumerate(cnt.most_common())}
# Fixed index handed to the generative-model helpers in later cells.
gen_label_to_ix = {'ABS': 0, 'O': 1, 'I': 2, 'B': 3}

# Count the training documents and gather the distinct labeling- and
# linking-function names that appear on them.
count = 0
labeling_functions = set()
linking_functions = set()
for doc in train_data:
    count += 1
    labeling_functions.update(doc['WISER_LABELS'].keys())
    linking_functions.update(doc['WISER_LINKS'].keys())

In [4]:
# Hand the test and dev splits to save_label_distribution with output paths
# under data/. No label-distribution argument is passed in these two calls.
# NOTE(review): presumably this serializes the instances as-is — confirm in
# wiser.data.util.
save_label_distribution('data/test_data.p', test_data)
save_label_distribution('data/dev_data.p', dev_data)

## Evaluation

In [5]:
# score_labels_majority_vote is imported here but is not called in this cell.
from wiser.eval import score_labeling_functions, score_labels_majority_vote
# Left as the bare last expression so the notebook renders the returned table
# (the saved output shows one row per labeling function with TP / FP / FN,
# token accuracy and token votes, computed on dev_data).
score_labeling_functions(dev_data)

Unnamed: 0,TP,FP,FN,Token Acc.,Token Votes
Acquired Abnormality,10,15,4302,0.4706,34
Antibiotic,0,0,4312,1.0,386
Biologically Active Substance,0,0,4312,0.9801,1304
BodyTerms,69,42,4243,0.982,222
BoundaryWords,228,468,4084,0.822,1921
CancerLike,65,27,4247,1.0,96
Cell or Molecular Dysfunction,8,24,4304,0.2157,51
Chemical,0,0,4312,0.9943,176
Chemical Viewed Functionally,0,0,4312,1.0,104
Clinical Drug,0,0,4312,1.0,67


## Majority Vote

In [14]:
from wiser.eval import score_labels_majority_vote, get_mv_label_distribution
from wiser.eval import get_unweighted_label_distribution
from wiser.data.util import save_label_distribution

In [7]:
# Score the majority vote on dev_data with the function's default settings;
# kept as a bare expression so the result table is rendered.
# NOTE(review): the default appears to be span-level scoring, since the
# token-level variant requires span_level=False — confirm in wiser.eval.
score_labels_majority_vote(dev_data)

Unnamed: 0,TP,FP,FN,P,R,F1
Majority Vote,2631,1243,1681,0.6791,0.6102,0.6428


In [8]:
# Same majority-vote scoring on dev_data, but with span_level=False; the saved
# output labels this row "Majority Vote (Token Level)".
score_labels_majority_vote(dev_data, span_level=False)

Unnamed: 0,TP,FP,FN,P,R,F1
Majority Vote (Token Level),4387,793,2508,0.8469,0.6363,0.7266


In [9]:
# Compute the majority-vote label distribution for train_data using the
# discriminative label index, then save it alongside the training instances.
# NOTE(review): the 'O' argument is presumably the default/fallback label —
# confirm against get_mv_label_distribution.
# `dist` is a notebook-level name that later cells overwrite.
dist = get_mv_label_distribution(train_data, disc_label_to_ix, 'O')
save_label_distribution('data/train_data_mv_labels.p', train_data, dist)

In [10]:
# Compute the unweighted label distribution for train_data with the same
# arguments as the majority-vote call, then save it to its own file.
# NOTE(review): the 'O' argument is presumably the default/fallback label —
# confirm against get_unweighted_label_distribution.
dist = get_unweighted_label_distribution(train_data, disc_label_to_ix, 'O')
save_label_distribution('data/train_data_unweighted_labels.p', train_data, dist)

## Generative Modeling

In [5]:
from experiments.util import train_generative_model, get_gen_model_inputs
from labelmodels.hmm import HMM
from labelmodels.label_model import LearningConfig
from labelmodels.linked_hmm import LinkedHMM
from labelmodels.naive_bayes import NaiveBayes
import numpy as np

In [6]:
# NOTE(review): neither num_classes nor epochs is referenced by any cell
# visible in this notebook (the training calls pass a literal 5 as their
# fourth argument) — confirm whether these are still needed.
num_classes = 2
epochs = 10

# Gather the distinct labeling- and linking-function names found in train_data.
labeling_functions, linking_functions = set(), set()
for doc in train_data:
    labeling_functions.update(doc['WISER_LABELS'].keys())
    linking_functions.update(doc['WISER_LINKS'].keys())

### Naive Bayes

In [7]:
# Learning configuration for this model; only the batch size is overridden.
config = LearningConfig()
config.batch_size = 16

# Naive Bayes label model sized by the number of discriminative labels and the
# number of labeling functions.
# NOTE(review): the class count comes from disc_label_to_ix (tags observed in
# train_data) while training below uses gen_label_to_ix — confirm the two
# stay in agreement.
nb = NaiveBayes(
    len(disc_label_to_ix),
    len(labeling_functions),
    init_acc=0.7,
    acc_prior=0.5,
    balance_prior=1.0,
)

# Train on train_data and unpack the three returned scores.
p, r, f1 = train_generative_model(
    nb, train_data, dev_data, 5, gen_label_to_ix, config
)

In [8]:
# Show the three scores returned by the training cell, one per line.
print(p, r, f1, sep='\n')

0.6791
0.6102
0.6428


In [9]:
# Build the generative-model inputs for train_data and query the trained
# Naive Bayes model. link_votes and seq_starts are unpacked but not used by
# this call.
label_votes, link_votes, seq_starts = get_gen_model_inputs(train_data, gen_label_to_ix)
dist = nb.get_label_distribution(label_votes)

In [10]:
# Save train_data together with the Naive Bayes label distribution (`dist`
# from the previous cell).
save_label_distribution('data/train_data_nb_labels.p', train_data, dist)

### HMM

In [18]:
# Learning configuration for this model; only the batch size is overridden.
config = LearningConfig()
config.batch_size = 16

# HMM label model sized by the number of discriminative labels and the number
# of labeling functions.
# NOTE(review): `model` is a generic notebook-level name; the sibling cells
# use `nb` and `link_hmm`.
model = HMM(
    len(disc_label_to_ix),
    len(labeling_functions),
    init_acc=0.9,
    acc_prior=100,
    balance_prior=500,
)

# Train on train_data and unpack the three returned scores.
p, r, f1 = train_generative_model(
    model, train_data, dev_data, 5, gen_label_to_ix, config
)

In [19]:
# Show the three scores returned by the training cell, one per line.
print(p, r, f1, sep='\n')

0.6785
0.6122
0.6436


In [20]:
# Rebuild the generative-model inputs for train_data and query the trained
# HMM, which takes the label votes and sequence starts. link_votes is unpacked
# but not passed to this model.
# NOTE(review): the result names suggest unary and pairwise distributions —
# confirm in labelmodels.hmm.
label_votes, link_votes, seq_starts = get_gen_model_inputs(train_data, gen_label_to_ix)
p_unary, p_pairwise = model.get_label_distribution(label_votes, seq_starts)

In [21]:
# Save train_data together with both outputs of the HMM (p_unary and
# p_pairwise from the previous cell).
save_label_distribution('data/train_data_hmm_labels.p', train_data, p_unary, p_pairwise)

### Linked HMM

In [22]:
# Learning configuration for this model; only the batch size is overridden.
config = LearningConfig()
config.batch_size = 16

# Linked HMM label model sized by the number of discriminative labels,
# labeling functions and linking functions.
link_hmm = LinkedHMM(
    len(disc_label_to_ix),
    len(labeling_functions),
    len(linking_functions),
    init_acc=0.9,
    acc_prior=100,
    balance_prior=500,
)

# Train on train_data and unpack the three returned scores.
p, r, f1 = train_generative_model(
    link_hmm, train_data, dev_data, 5, gen_label_to_ix, config
)

In [23]:
# Show the three scores returned by the training cell, one per line.
print(p, r, f1, sep='\n')

0.7424
0.6276
0.6802


In [24]:
# Rebuild the generative-model inputs for train_data and query the trained
# linked HMM, which takes the label votes, link votes and sequence starts.
# NOTE(review): the result names suggest unary and pairwise distributions —
# confirm in labelmodels.linked_hmm.
label_votes, link_votes, seq_starts = get_gen_model_inputs(train_data, gen_label_to_ix)
p_unary, p_pairwise = link_hmm.get_label_distribution(label_votes, link_votes, seq_starts)

In [26]:
# Save train_data together with both outputs of the linked HMM (p_unary and
# p_pairwise from the previous cell).
save_label_distribution('data/train_data_link_hmm_labels.p', train_data, p_unary, p_pairwise)