In [56]:
import pycrfsuite
from itertools import chain
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer


In [57]:
train_filename = "data_Train_megam.dat"
test_filename = "data_Devel_megam.dat"

In [58]:
train_samples = open(train_filename, "r").read().split("\n")
test_samples = open(test_filename, "r").read().split("\n")

In [59]:
def split_data(word_features):
    X_samples = []
    sample = []
    
    Y_labels = []
    label = []
    
    for word_feature in word_features:
        if word_feature != "":
            word_feature = word_feature.split(" ")
            label.append(word_feature[0])
            sample.append(word_feature[1:])
        else:
            X_samples.append(sample)
            Y_labels.append(label)

            sample = []
            label = []
    return X_samples, Y_labels

In [60]:
%%time 
X_train, Y_train = split_data(train_samples)
X_test, Y_test = split_data(test_samples)

CPU times: user 368 ms, sys: 72.5 ms, total: 441 ms
Wall time: 402 ms


In [75]:
X_train

[[['form=Increased',
   'suf4=ased',
   'prev=_BoS_',
   'next=nephrotoxicity',
   'capitalized'],
  ['form=nephrotoxicity', 'suf4=city', 'prev=Increased', 'next=has'],
  ['form=has', 'suf4=has', 'prev=nephrotoxicity', 'next=been'],
  ['form=been', 'suf4=been', 'prev=has', 'next=reported'],
  ['form=reported', 'suf4=rted', 'prev=been', 'next=following'],
  ['form=following', 'suf4=wing', 'prev=reported', 'next=concomitant'],
  ['form=concomitant', 'suf4=tant', 'prev=following', 'next=administration'],
  ['form=administration', 'suf4=tion', 'prev=concomitant', 'next=of'],
  ['form=of', 'suf4=of', 'prev=administration', 'next=cephalosporins'],
  ['form=cephalosporins', 'suf4=rins', 'prev=of', 'next=and'],
  ['form=and', 'suf4=and', 'prev=cephalosporins', 'next=aminoglycoside'],
  ['form=aminoglycoside', 'suf4=side', 'prev=and', 'next=antibiotics'],
  ['form=antibiotics', 'suf4=tics', 'prev=aminoglycoside', 'next=.'],
  ['form=.', 'suf4=.', 'prev=antibiotics']],
 [['form=Drug/Laboratory',

In [61]:
%%time
trainer = pycrfsuite.Trainer(verbose=False)

for xseq, yseq in zip(X_train, Y_train):
    trainer.append(xseq, yseq)

CPU times: user 822 ms, sys: 14.7 ms, total: 837 ms
Wall time: 847 ms


In [62]:
trainer.set_params({
    'c1': 1.0,   # coefficient for L1 penalty
    'c2': 1e-3,  # coefficient for L2 penalty
    'max_iterations': 50,  # stop earlier

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

In [63]:
%%time
trainer.train(train_filename + '_model.crfsuite')

CPU times: user 8.12 s, sys: 6.9 ms, total: 8.13 s
Wall time: 8.21 s


In [64]:
trainer.logparser.last_iteration


{'active_features': 3504,
 'error_norm': 116.401182,
 'feature_norm': 65.003895,
 'linesearch_step': 1.0,
 'linesearch_trials': 1,
 'loss': 9534.191645,
 'num': 50,
 'scores': {},
 'time': 0.102}

In [65]:
print len(trainer.logparser.iterations), trainer.logparser.iterations[-1]


50 {'loss': 9534.191645, 'error_norm': 116.401182, 'linesearch_trials': 1, 'active_features': 3504, 'num': 50, 'time': 0.102, 'scores': {}, 'linesearch_step': 1.0, 'feature_norm': 65.003895}


In [66]:
tagger = pycrfsuite.Tagger()
tagger.open(train_filename + '_model.crfsuite')

<contextlib.closing at 0x7fc9c5c31a90>

In [67]:

def bio_classification_report(y_true, y_pred):
    """
    Classification report for a list of BIO-encoded sequences.
    It computes token-level metrics and discards "O" labels.
    
    Note that it requires scikit-learn 0.15+ (or a version from github master)
    to calculate averages properly!
    """
    lb = LabelBinarizer()
    y_true_combined = lb.fit_transform(list(chain.from_iterable(y_true)))
    y_pred_combined = lb.transform(list(chain.from_iterable(y_pred)))
        
    tagset = set(lb.classes_) - {'O'}
    tagset = sorted(tagset, key=lambda tag: tag.split('-', 1)[::-1])
    class_indices = {cls: idx for idx, cls in enumerate(lb.classes_)}
    
    return classification_report(
        y_true_combined,
        y_pred_combined,
        labels = [class_indices[cls] for cls in tagset],
        target_names = tagset,
    )

In [68]:
%%time
Y_pred = [tagger.tag(xseq) for xseq in X_test]

CPU times: user 57.3 ms, sys: 4.92 ms, total: 62.2 ms
Wall time: 64.4 ms


In [79]:
#def output_enti    for x_seq, y_seq in zip(X_train, Y_pred):
for x_seq, y_seq in zip(X_train, Y_pred):
    for word, label in zip(x_seq, y_seq):
        if "drug" in label:
            line = ["drug",]
        print(word, label)

(['form=Increased', 'suf4=ased', 'prev=_BoS_', 'next=nephrotoxicity', 'capitalized'], 'B-drug')
(['form=nephrotoxicity', 'suf4=city', 'prev=Increased', 'next=has'], 'O')
(['form=has', 'suf4=has', 'prev=nephrotoxicity', 'next=been'], 'O')
(['form=been', 'suf4=been', 'prev=has', 'next=reported'], 'O')
(['form=reported', 'suf4=rted', 'prev=been', 'next=following'], 'O')
(['form=following', 'suf4=wing', 'prev=reported', 'next=concomitant'], 'O')
(['form=concomitant', 'suf4=tant', 'prev=following', 'next=administration'], 'O')
(['form=administration', 'suf4=tion', 'prev=concomitant', 'next=of'], 'O')
(['form=of', 'suf4=of', 'prev=administration', 'next=cephalosporins'], 'O')
(['form=cephalosporins', 'suf4=rins', 'prev=of', 'next=and'], 'O')
(['form=Drug/Laboratory', 'suf4=tory', 'prev=_BoS_', 'next=Test', 'capitalized'], 'O')
(['form=Test', 'suf4=Test', 'prev=Drug/Laboratory', 'next=Interactions', 'capitalized'], 'O')
(['form=Interactions', 'suf4=ions', 'prev=Test', 'next=Cephalosporins', '

(['form=1A2', 'suf4=1A2', 'prev=isoforms:', 'next=,'], 'O')
(['form=,', 'suf4=,', 'prev=1A2', 'next=2C9'], 'O')
(['form=2C9', 'suf4=2C9', 'prev=,', 'next=,'], 'O')
(['form=,', 'suf4=,', 'prev=2C9', 'next=2C19'], 'O')
(['form=Drug', 'suf4=Drug', 'prev=_BoS_', 'next=interactions', 'capitalized'], 'O')
(['form=interactions', 'suf4=ions', 'prev=Drug', 'next=caused'], 'O')
(['form=caused', 'suf4=used', 'prev=interactions', 'next=by'], 'O')
(['form=by', 'suf4=by', 'prev=caused', 'next=inhibition'], 'O')
(['form=inhibition', 'suf4=tion', 'prev=by', 'next=of'], 'O')
(['form=of', 'suf4=of', 'prev=inhibition', 'next=P-glycoprotein-mediated'], 'O')
(['form=P-glycoprotein-mediated', 'suf4=ated', 'prev=of', 'next=drug', 'capitalized'], 'O')
(['form=drug', 'suf4=drug', 'prev=P-glycoprotein-mediated', 'next=clearance'], 'O')
(['form=clearance', 'suf4=ance', 'prev=drug', 'next=or'], 'O')
(['form=or', 'suf4=or', 'prev=clearance', 'next=CYP-mediated'], 'O')
(['form=CYP-mediated', 'suf4=ated', 'prev=or',

(['form=known', 'suf4=nown', 'prev=no', 'next=clinically'], 'O')
(['form=clinically', 'suf4=ally', 'prev=known', 'next=significant'], 'O')
(['form=No', 'suf4=No', 'prev=_BoS_', 'next=formal', 'capitalized'], 'O')
(['form=formal', 'suf4=rmal', 'prev=No', 'next=drug/laboratory'], 'O')
(['form=drug/laboratory', 'suf4=tory', 'prev=formal', 'next=test'], 'O')
(['form=test', 'suf4=test', 'prev=drug/laboratory', 'next=interaction'], 'O')
(['form=interaction', 'suf4=tion', 'prev=test', 'next=studies'], 'O')
(['form=studies', 'suf4=dies', 'prev=interaction', 'next=have'], 'O')
(['form=have', 'suf4=have', 'prev=studies', 'next=been'], 'O')
(['form=been', 'suf4=been', 'prev=have', 'next=conducted'], 'O')
(['form=conducted', 'suf4=cted', 'prev=been', 'next=with'], 'O')
(['form=with', 'suf4=with', 'prev=conducted', 'next=CLOLAR'], 'O')
(['form=CLOLAR', 'suf4=OLAR', 'prev=with', 'next=.', 'capitalized'], 'O')
(['form=.', 'suf4=.', 'prev=CLOLAR'], 'O')
(['form=Addition', 'suf4=tion', 'prev=_BoS_', 'n

(['form=of', 'suf4=of', 'prev=particles', 'next=budesonide'], 'O')
(['form=budesonide', 'suf4=nide', 'prev=of', 'next=in'], 'O')
(['form=in', 'suf4=in', 'prev=budesonide', 'next=Survanta'], 'O')
(['form=Survanta', 'suf4=anta', 'prev=in', 'next=,', 'capitalized'], 'O')
(['form=,', 'suf4=,', 'prev=Survanta', 'next=a'], 'O')
(['form=a', 'suf4=a', 'prev=,', 'next=model'], 'O')
(['form=model', 'suf4=odel', 'prev=a', 'next=lung'], 'O')
(['form=lung', 'suf4=lung', 'prev=model', 'next=surfactant'], 'O')
(['form=surfactant', 'suf4=tant', 'prev=lung', 'next=.'], 'O')
(['form=.', 'suf4=.', 'prev=surfactant'], 'O')
(['form=The', 'suf4=The', 'prev=_BoS_', 'next=effect', 'capitalized'], 'O')
(['form=effect', 'suf4=fect', 'prev=The', 'next=of'], 'O')
(['form=of', 'suf4=of', 'prev=effect', 'next=a'], 'O')
(['form=a', 'suf4=a', 'prev=of', 'next=pulmonary'], 'O')
(['form=pulmonary', 'suf4=nary', 'prev=a', 'next=surfactant'], 'O')
(['form=surfactant', 'suf4=tant', 'prev=pulmonary', 'next=extract'], 'O')


(['form=Hyperaminotransferasemia:', 'suf4=mia:', 'prev=Interactions', 'next=Significant', 'capitalized'], 'O')
(['form=Significant', 'suf4=cant', 'prev=Hyperaminotransferasemia:', 'next=elevations', 'capitalized'], 'O')
(['form=elevations', 'suf4=ions', 'prev=Significant', 'next=of'], 'O')
(['form=of', 'suf4=of', 'prev=elevations', 'next=aminotransferase'], 'O')
(['form=aminotransferase', 'suf4=rase', 'prev=of', 'next=(SGOT'], 'O')
(['form=(SGOT', 'suf4=SGOT', 'prev=aminotransferase', 'next=[S-AST]'], 'O')
(['form=Since', 'suf4=ince', 'prev=_BoS_', 'next=aminotransferase', 'capitalized'], 'O')
(['form=A', 'suf4=A', 'prev=_BoS_', 'next=proposed', 'capitalized'], 'O')
(['form=proposed', 'suf4=osed', 'prev=A', 'next=mechanism'], 'O')
(['form=mechanism', 'suf4=nism', 'prev=proposed', 'next=for'], 'O')
(['form=for', 'suf4=for', 'prev=mechanism', 'next=the'], 'O')
(['form=the', 'suf4=the', 'prev=for', 'next=potentiation'], 'B-group')
(['form=potentiation', 'suf4=tion', 'prev=the', 'next=of']

(['form=determine', 'suf4=mine', 'prev=to', 'next=whether'], 'O')
(['form=whether', 'suf4=ther', 'prev=determine', 'next=they'], 'O')
(['form=they', 'suf4=they', 'prev=whether', 'next=respond'], 'O')
(['form=respond', 'suf4=pond', 'prev=they', 'next=differently'], 'O')
(['form=Adverse', 'suf4=erse', 'prev=_BoS_', 'next=reactions', 'capitalized'], 'O')
(['form=reactions', 'suf4=ions', 'prev=Adverse', 'next=related'], 'O')
(['form=related', 'suf4=ated', 'prev=reactions', 'next=to'], 'O')
(['form=to', 'suf4=to', 'prev=related', 'next=alpha'], 'O')
(['form=alpha', 'suf4=lpha', 'prev=to', 'next=interferons'], 'O')
(['form=interferons', 'suf4=rons', 'prev=alpha', 'next=,'], 'O')
(['form=,', 'suf4=,', 'prev=interferons', 'next=such'], 'O')
(['form=such', 'suf4=such', 'prev=,', 'next=as'], 'O')
(['form=as', 'suf4=as', 'prev=such', 'next=CNS'], 'O')
(['form=CNS', 'suf4=CNS', 'prev=as', 'next=,', 'capitalized'], 'O')
(['form=,', 'suf4=,', 'prev=CNS', 'next=cardiac'], 'O')
(['form=cardiac', 'suf4

(['form=during', 'suf4=ring', 'prev=fold)', 'next=coadministration'], 'O')
(['form=These', 'suf4=hese', 'prev=_BoS_', 'next=pharmacokinetic', 'capitalized'], 'O')
(['form=pharmacokinetic', 'suf4=etic', 'prev=These', 'next=effects'], 'O')
(['form=effects', 'suf4=ects', 'prev=pharmacokinetic', 'next=seen'], 'O')
(['form=seen', 'suf4=seen', 'prev=effects', 'next=during'], 'O')
(['form=during', 'suf4=ring', 'prev=seen', 'next=diltiazem'], 'O')
(['form=diltiazem', 'suf4=azem', 'prev=during', 'next=coadministration'], 'O')
(['form=coadministration', 'suf4=tion', 'prev=diltiazem', 'next=can'], 'O')
(['form=can', 'suf4=can', 'prev=coadministration', 'next=result'], 'O')
(['form=result', 'suf4=sult', 'prev=can', 'next=in'], 'O')
(['form=in', 'suf4=in', 'prev=result', 'next=increased'], 'O')
(['form=increased', 'suf4=ased', 'prev=in', 'next=clinical'], 'O')
(['form=clinical', 'suf4=ical', 'prev=increased', 'next=effects'], 'O')
(['form=effects', 'suf4=ects', 'prev=clinical', 'next=(e.g.'], 'O')


In [74]:
Y_pred

[['B-drug', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['B-drug', 'O', 'O', 'O', 'O', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O'],
 ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-drug', 'O', 'O', 'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug',
  'O'],
 ['O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'O',
  'B-drug'],
 ['O

In [71]:
print(bio_classification_report(Y_test, Y_pred))


              precision    recall  f1-score   support

     B-brand       0.62      0.14      0.23        71
     I-brand       0.00      0.00      0.00         2
      B-drug       0.74      0.46      0.57       349
      I-drug       0.64      0.67      0.65        24
    B-drug_n       0.00      0.00      0.00        34
    I-drug_n       0.00      0.00      0.00         6
     B-group       0.73      0.32      0.45       171
     I-group       0.62      0.25      0.36       134

   micro avg       0.71      0.35      0.47       791
   macro avg       0.42      0.23      0.28       791
weighted avg       0.66      0.35      0.45       791
 samples avg       0.02      0.02      0.02       791



In [73]:
from collections import Counter
info = tagger.info()

def print_transitions(trans_features):
    for (label_from, label_to), weight in trans_features:
        print("%-6s -> %-7s %0.6f" % (label_from, label_to, weight))

print("Top likely transitions:")
print_transitions(Counter(info.transitions).most_common(15))

print("\nTop unlikely transitions:")
print_transitions(Counter(info.transitions).most_common()[-15:])

Top likely transitions:
O      -> O       6.306696
B-group -> I-group 5.584653
B-drug_n -> I-drug_n 5.041959
I-group -> I-group 4.873304
I-drug_n -> I-drug_n 4.130055
B-drug -> I-drug  3.704736
B-brand -> I-brand 3.327522
O      -> B-drug  3.271012
I-drug -> I-drug  3.037962
B-drug -> O       2.737865
B-group -> O       2.628012
B-drug_n -> O       2.242394
B-brand -> O       2.182316
O      -> B-group 2.092251
I-group -> O       1.982841

Top unlikely transitions:
B-drug_n -> B-brand -0.717688
I-drug -> B-brand -0.756350
I-brand -> B-brand -0.808869
I-group -> B-brand -0.895362
B-group -> B-group -0.965026
O      -> I-drug  -0.986978
B-group -> B-brand -1.223175
B-brand -> B-group -1.403115
O      -> I-brand -1.443811
B-drug -> I-group -1.447992
B-drug -> B-drug  -1.764949
B-drug -> B-group -1.980907
O      -> I-group -2.401827
B-brand -> B-brand -2.716318
B-drug -> B-brand -2.795083
