In [105]:
import argparse
import os
import subprocess
from itertools import chain

import pycrfsuite
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.preprocessing import LabelBinarizer


def split_data(features):
    """
    Separate space-delimited feature lines into feature sequences and label sequences.

    Each line is treated as its own one-item sequence: the first token is taken
    as the label and every remaining token as a feature of that single item.

    Parameters:
    features(list): Strings of the form "<label> <feature> <feature> ...".

    Returns:
    X_samples(list): One entry per line; each entry is a single-item list holding the line's feature tokens.
    Y_labels(list): One entry per line; each entry is a single-item list holding the line's label.
    """

    # Tokenize once, then project the two views out of the same token lists.
    tokenized = [row.split(" ") for row in features]

    X_samples = [[tokens[1:]] for tokens in tokenized]
    Y_labels = [[tokens[0]] for tokens in tokenized]

    return X_samples, Y_labels


def output_predicted_entities(Y_pred, filename, out=None):
    """
    Write one "|"-separated prediction line per predicted label.

    The predicted labels are flattened and paired, in order, with the lines of
    the detailed features file. For every pair the first three space-separated
    fields of the features line (read here as _id, e1_id, e2_id) are written
    together with an interaction flag and the label:

        <_id>|<e1_id>|<e2_id>|<interaction>|<label>

    where interaction is "0" when the label is "null" and "1" otherwise.
    Pairing stops at the shorter of the two sequences.

    Parameters:
    Y_pred(list): List of lists of predicted labels.
    filename(str): Path of the detailed features file, one record per line.
    out(file-like): Optional writable text stream. When omitted, the
        module-level `outputfile` handle is used, as in the original code.

    Returns:
    None

    Raises:
    ValueError: If a paired features line has fewer than three space-separated fields.
    """

    if out is None:
        # Backward compatible: callers that open a global `outputfile` keep working.
        out = outputfile

    Y_pred_flatten = [el for line in Y_pred for el in line]

    # Close the features file deterministically instead of leaking the handle.
    with open(filename) as feats_file:
        detailed_word_features = feats_file.read().split("\n")

    # Drop only the empty string produced by a trailing newline; a last record
    # without a trailing newline is kept instead of being silently discarded.
    if detailed_word_features and detailed_word_features[-1] == "":
        detailed_word_features.pop()

    for label, detailed_feats in zip(Y_pred_flatten, detailed_word_features):
        _id, e1_id, e2_id = detailed_feats.split(" ")[0:3]
        interaction = "0" if label == "null" else "1"
        out.write("|".join([_id, e1_id, e2_id, interaction, label]) + "\n")
        
def evaluate(inputdir, outputfile):
    """
    Run the external eval/evaluateDIR.jar program on a predictions file.

    Parameters:
    inputdir(str): Path passed to the evaluator as its first argument.
    outputfile(str): Path passed to the evaluator as its second argument.

    Returns:
    None. The evaluator's exit status is not inspected; its output goes to the
    process's standard streams.

    Raises:
    FileNotFoundError: If the `java` executable cannot be found.
    """

    command = [
        "java",
        "-jar",
        "eval/evaluateDIR.jar",
        # No shell is involved any more, so expand a leading "~" explicitly.
        os.path.expanduser(inputdir),
        os.path.expanduser(outputfile),
    ]

    # Passing an argument list (instead of a concatenated shell string) keeps
    # paths containing spaces or shell metacharacters intact.
    subprocess.run(command)


In [106]:
# Feature files that are read line by line and passed to split_data below.
# NOTE(review): both names point at the same file, so the "test" metrics further
# down are computed on the training data — confirm whether a separate test file
# was intended.
train_filename = "megam.dat"
test_filename = "megam.dat"


In [107]:
# output_filename: file the predictions are written to.
# model_filename: file the trained CRF model is saved to and reloaded from.
output_filename = "predicted.txt"
model_filename = "model.crfsuite"

In [108]:
# Read both feature files as lists of lines. The context managers close the
# files right away instead of leaving the handles open for the kernel's lifetime.
# The final element of the split is dropped, as before (the empty string that
# follows a trailing newline).
with open(train_filename, "r") as train_file:
    train_samples = train_file.read().split("\n")[:-1]

with open(test_filename, "r") as test_file:
    test_samples = test_file.read().split("\n")[:-1]

In [109]:
# Turn every line into a one-item feature sequence (X) and a one-item label sequence (Y).
X_train, Y_train = split_data(train_samples)
X_test, Y_test = split_data(test_samples)

In [110]:
# Train a CRF on the training sequences, save it to `model_filename`,
# then load the saved model into a tagger.
crf_params = {
    'c1': 0.05,  # coefficient for L1 penalty
    'c2': 0.1,  # coefficient for L2 penalty
    'max_iterations': 10000,  # upper bound on training iterations
    # include transitions that are possible, but not observed
    'feature.possible_transitions': True,
}

trainer = pycrfsuite.Trainer(verbose=False)
for feature_seq, label_seq in zip(X_train, Y_train):
    trainer.append(feature_seq, label_seq)
trainer.set_params(crf_params)
trainer.train(model_filename)

tagger = pycrfsuite.Tagger()
tagger.open(model_filename)


<contextlib.closing at 0x7f4cee8d0550>

In [111]:
# Predict inside this cell so it does not depend on a cell that appears later
# in the notebook (it previously failed with NameError on Restart & Run All).
Y_pred = [tagger.tag(xseq) for xseq in X_test]

# output_predicted_entities writes to the module-level name `outputfile`;
# the `with` block binds that name and closes the file even if writing fails.
with open(output_filename, "w") as outputfile:
    output_predicted_entities(Y_pred, "feats.dat")

In [38]:
# Tag every test sequence with the loaded model.
Y_pred = list(map(tagger.tag, X_test))

In [39]:
def bio_classification_report(y_true, y_pred):
    """
    Build a token-level classification report for lists of label sequences.

    Both inputs are flattened to one label per token and binarized with a
    LabelBinarizer fitted on the true labels. The "O" label is left out of the
    report; the remaining labels are ordered by the part after the first "-"
    and then by the part before it.

    Note carried over from the original: it requires scikit-learn 0.15+
    (or a version from github master) to calculate averages properly.
    """
    binarizer = LabelBinarizer()
    true_matrix = binarizer.fit_transform([tag for seq in y_true for tag in seq])
    pred_matrix = binarizer.transform([tag for seq in y_pred for tag in seq])

    # Column index of every class in the binarized matrices.
    column_of = {}
    for position, class_name in enumerate(binarizer.classes_):
        column_of[class_name] = position

    def _suffix_then_prefix(tag):
        return tag.split('-', 1)[::-1]

    reported_tags = sorted(set(binarizer.classes_) - {'O'}, key=_suffix_then_prefix)
    reported_columns = [column_of[tag] for tag in reported_tags]

    return classification_report(
        true_matrix,
        pred_matrix,
        labels=reported_columns,
        target_names=reported_tags,
    )

In [40]:
# Token-level precision/recall/F1 of the predictions against the test labels.
# NOTE(review): the unnamed row in the printed report is an empty-string label
# with support 1 — presumably a blank line in the input file; confirm.
print(bio_classification_report(Y_test, Y_pred))

              precision    recall  f1-score   support

                   0.00      0.00      0.00         1
      advise       0.97      0.89      0.93       119
      effect       0.96      0.94      0.95       162
         int       1.00      1.00      1.00         2
   mechanism       0.94      0.82      0.88       201
        null       0.97      0.99      0.98      2219

   micro avg       0.97      0.97      0.97      2704
   macro avg       0.81      0.77      0.79      2704
weighted avg       0.97      0.97      0.97      2704
 samples avg       0.97      0.97      0.97      2704



In [12]:

# Whole pipeline in one cell: train, reload the model, predict, write the
# predictions file and run the external evaluator.
# NOTE(review): `fulltest_filename` and `inputdir` are not defined in any
# visible cell — confirm they are set before this cell is run.
trainer = pycrfsuite.Trainer(verbose=True)

for xseq, yseq in zip(X_train, Y_train):
    trainer.append(xseq, yseq)

trainer.set_params({
    'c1': 0.05,  # coefficient for L1 penalty
    'c2': 0.1,  # coefficient for L2 penalty
    'max_iterations': 10000,  # upper bound on training iterations

    # include transitions that are possible, but not observed
    'feature.possible_transitions': True
})

trainer.train(model_filename)

tagger = pycrfsuite.Tagger()
tagger.open(model_filename)

Y_pred = [tagger.tag(xseq) for xseq in X_test]

# output_predicted_entities writes to the module-level name `outputfile`;
# the `with` block binds that name and closes the file even if writing fails.
with open(output_filename, "w") as outputfile:
    output_predicted_entities(Y_pred, fulltest_filename)

evaluate(inputdir, output_filename)


ValueError: The numbers of items and labels differ: |x| = 31, |y| = 4

In [50]:
# Sanity check: number of predicted labels after flattening the per-sequence lists.
Y_pred_flatten = list(chain.from_iterable(Y_pred))
len(Y_pred_flatten)

2704

In [51]:
# Sanity check: number of elements obtained by splitting feats.dat on newlines
# (a trailing newline contributes one final empty string to this count).
# The context manager closes the file instead of leaking the handle.
with open("feats.dat") as feats_file:
    detailed_word_features = feats_file.read().split("\n")
len(detailed_word_features)

2704

In [None]:



# Scratch copy of the body of output_predicted_entities, run at cell level.
# NOTE(review): `filename` is not assigned in any visible cell, and `outputfile`
# must be an open, writable file when this runs — confirm before executing.
# NOTE(review): unlike the function, the split below keeps its last element; if
# the file ends with a newline that element is "" and the three-way unpacking
# in the loop raises ValueError once the loop reaches it.
Y_pred_flatten = [el for line in Y_pred for el in line]
detailed_word_features = open(filename).read().split("\n")

# Pair each predicted label with the features line at the same position.
for label, detailed_feats in zip(Y_pred_flatten, detailed_word_features):
    # The first three space-separated fields identify the record.
    _id, e1_id, e2_id = detailed_feats.split(" ")[0:3]
    # "null" is written with interaction flag "0"; any other label with "1".
    if label == "null":
        interaction = "0"
    else:
        interaction = "1"
    line = [_id, e1_id, e2_id, interaction, label]
    outputfile.write("|".join(line) + "\n")
