In [1]:
%load_ext autoreload
%autoreload 2
import sys
sys.path.append('../../..')
from wiser.data.util import save_label_distribution

## Reload Data

In [2]:
import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    pickle.load can execute arbitrary code, so only point this at trusted files.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Read the three pickled CoNLL splits in the same order as before: train, dev, test.
train_data_steve = _load_pickle('/data/bats/results/ner-ws/conll/train_data.p')
dev_data_steve = _load_pickle('/data/bats/results/ner-ws/conll/dev_data.p')
test_data_steve = _load_pickle('/data/bats/results/ner-ws/conll/test_data.p')

# Short aliases used by the rest of the notebook.
test_data, train_data, dev_data = test_data_steve, train_data_steve, dev_data_steve

In [3]:
from collections import Counter
# Tally how often each tag occurs across all training instances.
cnt = Counter(tag for instance in train_data for tag in instance['tags'])

# Index the tags by descending frequency, so the most common tag gets index 0.
disc_label_to_ix = {tag: ix for ix, (tag, _count) in enumerate(cnt.most_common())}

In [4]:
# Tag-to-index mapping passed to the generative-model helpers below; index 0 is 'ABS'.
gen_label_to_ix = {
    'ABS': 0,
    'O': 1,
    'I-PER': 2,
    'I-LOC': 3,
    'I-ORG': 4,
    'I-MISC': 5,
    'B-PER': 6,
    'B-LOC': 7,
    'B-ORG': 8,
    'B-MISC': 9,
}

# Collect the distinct key names found under 'WISER_LABELS' and 'WISER_LINKS'
# across every training document.
labeling_functions, linking_functions = set(), set()
for doc in train_data:
    labeling_functions.update(doc['WISER_LABELS'].keys())
    linking_functions.update(doc['WISER_LINKS'].keys())

In [5]:
# Pass the test and dev splits to save_label_distribution without a label
# distribution argument, test first and then dev.
for out_path, split in (('data/test_data.p', test_data), ('data/dev_data.p', dev_data)):
    save_label_distribution(out_path, split)

## Eval

In [6]:
from wiser.eval import score_labeling_functions, score_labels_majority_vote

# Score the labeling-function votes on the dev split. As the last expression in
# the cell, the returned table (TP/FP/FN, token accuracy, token votes) is rendered below.
score_labeling_functions(dev_data)

Unnamed: 0,TP,FP,FN,Token Acc.,Token Votes
ConsecutiveLowerCase,0,0,5943,0.9993,26640
CountryAbbrvs,113,18,5830,0.8779,131
DBPediaMISC,393,110,5550,0.8484,521
DBpediaCountries,807,39,5136,0.9617,914
DBpediaORG,161,132,5782,0.7277,459
DBpediaPeople,639,24,5304,0.9932,1320
DBpediaPopPlaces,964,257,4979,0.7901,1339
FirstNames,288,1082,5655,0.8502,1589
LongLowerCase,0,0,5943,0.9979,13570
MiscAdj,14,9,5929,0.9649,57


In [7]:
# Majority-vote scores on the dev split with span_level=False; the resulting
# row is labelled "Majority Vote (Token Level)".
score_labels_majority_vote(dev_data, span_level=False)

Unnamed: 0,TP,FP,FN,P,R,F1
Majority Vote (Token Level),4957,998,3646,0.8324,0.5762,0.681


## Majority Vote

In [8]:
from wiser.eval import score_labels_majority_vote, get_mv_label_distribution
from wiser.eval import get_unweighted_label_distribution

In [9]:
# Majority-vote scores on the dev split with span_level left at its default.
# The output row is labelled plain "Majority Vote", so the default is presumably
# span-level scoring — TODO confirm in wiser.eval.
score_labels_majority_vote(dev_data)

Unnamed: 0,TP,FP,FN,P,R,F1
Majority Vote,3154,1415,2789,0.6903,0.5307,0.6001


In [10]:
# Same token-level call as in the Eval section above; it is repeated here and
# produces the same numbers.
score_labels_majority_vote(dev_data, span_level=False)

Unnamed: 0,TP,FP,FN,P,R,F1
Majority Vote (Token Level),4957,998,3646,0.8324,0.5762,0.681


In [11]:
# Majority-vote label distribution for the training split, indexed with disc_label_to_ix.
# NOTE(review): 'O' is presumably the fallback tag for tokens without votes — confirm in wiser.eval.
dist = get_mv_label_distribution(train_data, disc_label_to_ix, 'O')
# Hand the training split and its distribution to save_label_distribution
# (presumably written to the given path under data/).
save_label_distribution('data/train_data_mv_labels.p', train_data, dist)

In [12]:
# Unweighted label distribution for the training split, indexed with disc_label_to_ix.
# This rebinds `dist`, replacing whatever an earlier cell stored there.
# NOTE(review): 'O' is presumably the fallback tag for tokens without votes — confirm in wiser.eval.
dist = get_unweighted_label_distribution(train_data, disc_label_to_ix, 'O')
# Hand the training split and its distribution to save_label_distribution
# (presumably written to the given path under data/).
save_label_distribution('data/train_data_unweighted_labels.p', train_data, dist)

## Generative Modeling

In [13]:
from experiments.util import train_generative_model, get_gen_model_inputs
from labelmodels import NaiveBayes, HMM, LinkedHMM, LearningConfig
import numpy as np

In [14]:
# Epoch count passed to train_generative_model by the Naive Bayes and HMM cells below.
epochs = 5

### Naive Bayes

In [15]:
# Learning configuration with its default settings.
config = LearningConfig()
# NOTE(review): the first argument (presumably the number of classes) is sized from
# disc_label_to_ix, while training below is given gen_label_to_ix — confirm the two agree.
nb = NaiveBayes(len(disc_label_to_ix), len(labeling_functions), init_acc=0.9,
                   acc_prior=0.01, balance_prior=5.0)
# Train on train_data with dev_data also supplied; the three returned values are
# printed in the next cell (presumably precision, recall and F1 — confirm).
p, r, f1 = train_generative_model(nb, train_data, dev_data,
                                  epochs, gen_label_to_ix, config)

In [16]:
# Show the three scores from the Naive Bayes run, one per line.
print(p, r, f1, sep='\n')

0.6908
0.5443
0.6089


In [17]:
# Convert the training split into label-model inputs using gen_label_to_ix.
# link_votes and seq_starts are unpacked but not used by the Naive Bayes call below.
label_votes, link_votes, seq_starts = get_gen_model_inputs(train_data, gen_label_to_ix)
# Label distribution from the trained Naive Bayes model; rebinds `dist`.
dist = nb.get_label_distribution(label_votes)

In [18]:
# Hand the training split and the Naive Bayes distribution to save_label_distribution.
# The return value is kept in `instances`, which no visible cell reads afterwards.
instances = save_label_distribution('data/train_data_nb_labels.p', train_data, dist)

### HMM

In [19]:
# Fresh learning configuration for the HMM run, with batch_size overridden to 16.
config = LearningConfig()
config.batch_size = 16
# NOTE(review): the first argument (presumably the number of classes) is sized from
# disc_label_to_ix, while training below is given gen_label_to_ix — confirm the two agree.
hmm = HMM(len(disc_label_to_ix), len(labeling_functions), init_acc=0.7,
                   acc_prior=100, balance_prior=500)
# Train on train_data with dev_data also supplied; rebinds p, r and f1, which the
# next cell prints (presumably precision, recall and F1 — confirm).
p, r, f1 = train_generative_model(hmm, train_data, dev_data,
                                  epochs, gen_label_to_ix, config)

In [20]:
# Show the three scores from the HMM run, one per line.
print(p, r, f1, sep='\n')

0.6544
0.5088
0.5725


In [21]:
# Convert the training split into label-model inputs using gen_label_to_ix.
# link_votes is unpacked but not used by the HMM call below.
label_votes, link_votes, seq_starts = get_gen_model_inputs(train_data, gen_label_to_ix)
# The HMM takes seq_starts alongside label_votes and returns two results,
# bound here as p_unary and p_pairwise.
p_unary, p_pairwise = hmm.get_label_distribution(label_votes, seq_starts)

In [22]:
# Hand the training split plus both HMM outputs (p_unary and p_pairwise) to
# save_label_distribution; the return value is kept in `instances`.
instances = save_label_distribution('data/train_data_hmm_labels.p', train_data, p_unary, p_pairwise)

### Linked HMM

In [None]:
# Fresh learning configuration for the linked HMM run, with batch_size overridden to 16.
config = LearningConfig()
config.batch_size = 16
# The linked HMM additionally takes the number of linking functions.
# NOTE(review): the first argument (presumably the number of classes) is sized from
# disc_label_to_ix, while training below is given gen_label_to_ix — confirm the two agree.
link_hmm = LinkedHMM(len(disc_label_to_ix), len(labeling_functions), len(linking_functions),
                     init_acc=0.7, acc_prior=100, balance_prior=500)
# Train for the shared `epochs` setting, as the Naive Bayes and HMM cells do,
# instead of a separately hard-coded epoch count.
p, r, f1 = train_generative_model(link_hmm, train_data, dev_data,
                                  epochs, gen_label_to_ix, config)

In [None]:
# Show the three scores from the linked HMM run, one per line.
print(p, r, f1, sep='\n')

In [None]:
# Display whatever get_accuracies() returns for the linked HMM trained above.
link_hmm.get_accuracies()

In [None]:
# Convert the training split into label-model inputs using gen_label_to_ix.
label_votes, link_votes, seq_starts = get_gen_model_inputs(train_data, gen_label_to_ix)
# The linked HMM uses all three inputs (label votes, link votes, sequence starts)
# and returns two results, bound here as p_unary and p_pairwise.
p_unary, p_pairwise = link_hmm.get_label_distribution(label_votes, link_votes, seq_starts)

In [None]:
# Hand the training split plus both linked-HMM outputs (p_unary and p_pairwise)
# to save_label_distribution; the return value is kept in `instances`.
instances = save_label_distribution('data/train_data_link_hmm_labels.p',  train_data, p_unary, p_pairwise)






End of Part 2