In [56]:
import pycrfsuite
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer


In [57]:
# Input data files: one for training, one held out for evaluation.
# Each is read as plain text and parsed line by line by split_data below
# (first space-separated field = label, remaining fields = features;
# an empty line ends a sentence).
train_filename = "data_Train_megam.dat"
test_filename = "data_Devel_megam.dat"

In [58]:
# Read each data file fully and split it into raw lines.
# `with` guarantees the file handles are closed as soon as the contents
# have been read, instead of leaking them until garbage collection.
with open(train_filename, "r") as train_file:
    train_samples = train_file.read().split("\n")

with open(test_filename, "r") as test_file:
    test_samples = test_file.read().split("\n")

In [59]:
def split_data(word_features):
    """Group raw token lines into per-sentence feature and label sequences.

    Each non-empty line is split on single spaces: the first field is the
    token's label and the remaining fields are its features. An empty
    string marks the end of the current sentence.

    Args:
        word_features: iterable of lines (strings), e.g. the result of
            ``file.read().split("\\n")``.

    Returns:
        A tuple ``(X_samples, Y_labels)`` of two parallel lists. Each entry
        of ``X_samples`` is a list of per-token feature lists for one
        sentence; the matching entry of ``Y_labels`` is the list of labels
        for that sentence.

    Notes:
        * A final sentence that is not followed by an empty line is still
          returned (previously it was silently dropped).
        * Runs of consecutive empty lines do not produce empty sentences.
    """
    X_samples = []
    Y_labels = []

    sample = []
    label = []

    for line in word_features:
        if line != "":
            fields = line.split(" ")
            label.append(fields[0])
            sample.append(fields[1:])
        elif sample:
            # Sentence boundary: store the finished sentence and start anew.
            # The `sample` guard skips boundaries with nothing accumulated
            # (leading or repeated blank lines).
            X_samples.append(sample)
            Y_labels.append(label)
            sample = []
            label = []

    # Flush the last sentence when the input does not end with a separator.
    if sample:
        X_samples.append(sample)
        Y_labels.append(label)

    return X_samples, Y_labels

In [60]:
%%time 
# Parse the raw lines of each file into parallel lists of per-sentence
# feature sequences (X_*) and label sequences (Y_*).
X_train, Y_train = split_data(train_samples)
X_test, Y_test = split_data(test_samples)

CPU times: user 368 ms, sys: 72.5 ms, total: 441 ms
Wall time: 402 ms


In [61]:
%%time
# Create a quiet trainer (no progress output during training).
trainer = pycrfsuite.Trainer(verbose=False)

# Register every training sentence together with its label sequence.
for sentence_features, sentence_labels in zip(X_train, Y_train):
    trainer.append(sentence_features, sentence_labels)

CPU times: user 822 ms, sys: 14.7 ms, total: 837 ms
Wall time: 847 ms


In [62]:
# Training hyper-parameters, collected in one place before being applied.
crf_params = {
    # coefficient for L1 penalty
    'c1': 1.0,
    # coefficient for L2 penalty
    'c2': 1e-3,
    # stop earlier
    'max_iterations': 50,
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}
trainer.set_params(crf_params)

In [63]:
%%time
# Run training; the model path passed here is derived from the training
# file name by appending '_model.crfsuite'.
trainer.train(train_filename + '_model.crfsuite')

CPU times: user 8.12 s, sys: 6.9 ms, total: 8.13 s
Wall time: 8.21 s


In [64]:
# Show the training log entry for the final iteration (bare expression so
# the notebook renders it as the cell output).
trainer.logparser.last_iteration


{'active_features': 3504,
 'error_norm': 116.401182,
 'feature_norm': 65.003895,
 'linesearch_step': 1.0,
 'linesearch_trials': 1,
 'loss': 9534.191645,
 'num': 50,
 'scores': {},
 'time': 0.102}

In [65]:
# Number of logged training iterations, followed by the last log entry.
# Formatting into a single string first keeps the output identical under
# both the Python 2 print statement and the Python 3 print function, and
# matches the call-style `print(...)` used in the rest of the notebook.
print("%d %s" % (len(trainer.logparser.iterations), trainer.logparser.iterations[-1]))


50 {'loss': 9534.191645, 'error_norm': 116.401182, 'linesearch_trials': 1, 'active_features': 3504, 'num': 50, 'time': 0.102, 'scores': {}, 'linesearch_step': 1.0, 'feature_norm': 65.003895}


In [66]:
# Create a tagger and load the model from the same path expression that
# was passed to trainer.train (training file name + '_model.crfsuite').
tagger = pycrfsuite.Tagger()
tagger.open(train_filename + '_model.crfsuite')

<contextlib.closing at 0x7fc9c5c31a90>

In [67]:

def bio_classification_report(y_true, y_pred):
    """
    Build a token-level classification report for BIO-tagged sequences.

    Both arguments are iterables of label sequences (one sequence per
    sentence). The sequences are flattened, one-hot encoded with a
    LabelBinarizer fitted on the true labels, and scored with
    scikit-learn's ``classification_report``. The "O" label is left out
    of the report; remaining tags are ordered by the part after the
    first "-" and then by the part before it.

    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    binarizer = LabelBinarizer()

    # Flatten the per-sentence label lists into one token-level list each.
    true_tokens = list(chain.from_iterable(y_true))
    y_true_combined = binarizer.fit_transform(true_tokens)
    pred_tokens = list(chain.from_iterable(y_pred))
    y_pred_combined = binarizer.transform(pred_tokens)

    # Map each class name to its column in the binarized matrices.
    class_indices = dict(
        (name, column) for column, name in enumerate(binarizer.classes_)
    )

    # Every tag except "O", sorted on [entity type, B/I prefix].
    tagset = [name for name in binarizer.classes_ if name != 'O']
    tagset.sort(key=lambda tag: tag.split('-', 1)[::-1])

    report_columns = [class_indices[name] for name in tagset]
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels=report_columns,
        target_names=tagset,
    )

In [68]:
%%time
# Tag every test sentence, keeping predictions in the same order as X_test.
Y_pred = []
for sentence_features in X_test:
    Y_pred.append(tagger.tag(sentence_features))

CPU times: user 57.3 ms, sys: 4.92 ms, total: 62.2 ms
Wall time: 64.4 ms


In [71]:
# Compare the predicted label sequences with the gold labels of the
# evaluation set and print the resulting text report.
print(bio_classification_report(Y_test, Y_pred))


              precision    recall  f1-score   support

     B-brand       0.62      0.14      0.23        71
     I-brand       0.00      0.00      0.00         2
      B-drug       0.74      0.46      0.57       349
      I-drug       0.64      0.67      0.65        24
    B-drug_n       0.00      0.00      0.00        34
    I-drug_n       0.00      0.00      0.00         6
     B-group       0.73      0.32      0.45       171
     I-group       0.62      0.25      0.36       134

   micro avg       0.71      0.35      0.47       791
   macro avg       0.42      0.23      0.28       791
weighted avg       0.66      0.35      0.45       791
 samples avg       0.02      0.02      0.02       791



In [73]:
from collections import Counter
# Inspect the loaded model; `info.transitions` is read further down and
# passed to Counter to rank label-to-label transition weights.
info = tagger.info()

def print_transitions(trans_features):
    """Print label transitions and their weights as aligned columns.

    Args:
        trans_features: iterable of ``((label_from, label_to), weight)``
            pairs, e.g. the output of ``Counter(...).most_common(n)``.

    Each line has the form ``<from> -> <to> <weight>``. Column widths are
    derived from the longest label in the input (never narrower than the
    6/7 characters used historically), so labels such as ``B-drug_n`` no
    longer push the following columns out of alignment.
    """
    # Materialize once: the rows are scanned for widths and then printed.
    rows = list(trans_features)

    from_width = max([6] + [len(str(pair[0])) for pair, _ in rows])
    to_width = max([7] + [len(str(pair[1])) for pair, _ in rows])

    for (label_from, label_to), weight in rows:
        # `%-*s` takes the field width from the argument list.
        print("%-*s -> %-*s %0.6f" % (from_width, label_from, to_width, label_to, weight))

# Rank all learned transitions by weight once and reuse the ranking.
transition_weights = Counter(info.transitions)

print("Top likely transitions:")
print_transitions(transition_weights.most_common(15))

print("\nTop unlikely transitions:")
print_transitions(transition_weights.most_common()[-15:])

Top likely transitions:
O      -> O       6.306696
B-group -> I-group 5.584653
B-drug_n -> I-drug_n 5.041959
I-group -> I-group 4.873304
I-drug_n -> I-drug_n 4.130055
B-drug -> I-drug  3.704736
B-brand -> I-brand 3.327522
O      -> B-drug  3.271012
I-drug -> I-drug  3.037962
B-drug -> O       2.737865
B-group -> O       2.628012
B-drug_n -> O       2.242394
B-brand -> O       2.182316
O      -> B-group 2.092251
I-group -> O       1.982841

Top unlikely transitions:
B-drug_n -> B-brand -0.717688
I-drug -> B-brand -0.756350
I-brand -> B-brand -0.808869
I-group -> B-brand -0.895362
B-group -> B-group -0.965026
O      -> I-drug  -0.986978
B-group -> B-brand -1.223175
B-brand -> B-group -1.403115
O      -> I-brand -1.443811
B-drug -> I-group -1.447992
B-drug -> B-drug  -1.764949
B-drug -> B-group -1.980907
O      -> I-group -2.401827
B-brand -> B-brand -2.716318
B-drug -> B-brand -2.795083
