In [1]:
# Data processing libraries
import pandas as pd
import numpy as np

# NLP libraries
import nltk
from nltk.tag import StanfordPOSTagger

# Machine Learning Libraries
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split # Parameter selection
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

# Other libraries
import time # Execution time of some blocks
import statistics
from IPython.display import display # For displaying DataFrames correctly in Jupyter
from itertools import chain
import collections
import pickle

# Import our own defined functions
from xlm_parsers_functions import *
from drug_interaction_functions import *
from drug_functions import *
from NER_functions import *
from ortographic_features import *
from context_features import *
from feature_creation_interaction import *
from crf_functions import *



In [2]:
%%time

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    deserialising, so this must only be pointed at files produced by this
    project, never at data from an untrusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Training split: the feature container plus its two label vectors
# (interaction labels and interaction-type labels).
X_train = _load_pickle("X_train.txt")
y_train_int = _load_pickle("y_train_int.txt")
y_train_type = _load_pickle("y_train_type.txt")

# Test split, stored in the same layout as the training split.
X_test = _load_pickle("X_test.txt")
y_test_int = _load_pickle("y_test_int.txt")
y_test_type = _load_pickle("y_test_type.txt")

print('Number of training sentences readed: ', len(X_train))
print('Number of testing sentences readed: ', len(X_test))

Number of training sentences readed:  27792
Number of testing sentences readed:  5716
CPU times: user 1.95 s, sys: 342 ms, total: 2.29 s
Wall time: 2.37 s


In [3]:
%%time

# First stage: train and evaluate a model on the interaction labels
# (y_train_int / y_test_int), reporting scores for the 'true' and 'false'
# labels on the test split. The returned model is kept in `mod1`.
# hyperparam_optim=False presumably skips the hyperparameter search inside
# trainCRFAndEvaluate -- TODO confirm against the project helper modules.
mod1 = trainCRFAndEvaluate(
    X_train=X_train,
    y_train=y_train_int,
    X_test=X_test,
    y_test=y_test_int,
    labels=['true', 'false'],
    hyperparam_optim=False,
)

             precision    recall  f1-score   support

       true      0.507     0.531     0.519       979
      false      0.902     0.893     0.898      4737

avg / total      0.835     0.831     0.833      5716

CPU times: user 27.1 s, sys: 189 ms, total: 27.3 s
Wall time: 27.4 s


In [4]:
# Predict the interaction labels of the test split with the first-stage model.
# These predictions (not the gold y_test_int labels) are what a later cell
# attaches to X_test as the 'is_there_interaction' feature.
y_pred_int = mod1.predict(X_test)

In [5]:
# Attach the first-stage interaction label to both feature sets under the
# name 'is_there_interaction': the training split receives the gold labels
# (y_train_int), the test split receives the first-stage predictions
# (y_pred_int).
# NOTE(review): the return value is discarded, so createNewFeatureFromVector
# presumably mutates X in place -- confirm, and check that re-running this
# cell is harmless.
# NOTE(review): the feature is always correct on the training split but only
# as accurate as mod1 on the test split; the second-stage model may therefore
# over-trust it -- consider out-of-fold predictions for the training split.
for feature_set, interaction_labels in (
    (X_train, y_train_int),
    (X_test, y_pred_int),
):
    createNewFeatureFromVector(
        X=feature_set,
        new_feature_vector=interaction_labels,
        new_feature_name='is_there_interaction',
    )

In [6]:
%%time

# Second stage: train and evaluate a model on the interaction-type labels
# (y_train_type / y_test_type), using the feature sets that now carry the
# 'is_there_interaction' feature. Only the four interaction types are passed
# as `labels`; the 'none' label that shows up in the learned state features
# is not listed, so it is presumably excluded from the report -- TODO confirm.
# hyperparam_optim=False presumably skips the hyperparameter search inside
# trainCRFAndEvaluate -- TODO confirm against the project helper modules.
mod2 = trainCRFAndEvaluate(
    X_train=X_train,
    y_train=y_train_type,
    X_test=X_test,
    y_test=y_test_type,
    labels=['mechanism', 'int', 'advise', 'effect'],
    hyperparam_optim=False,
)

             precision    recall  f1-score   support

  mechanism      0.398     0.437     0.416       302
        int      0.577     0.156     0.246        96
     advise      0.406     0.421     0.413       221
     effect      0.417     0.456     0.436       360

avg / total      0.424     0.413     0.406       979

CPU times: user 39.5 s, sys: 138 ms, total: 39.7 s
Wall time: 39.7 s


In [8]:
import collections

def print_state_features(state_features):
    """Print one line per state feature as ``weight label attribute``.

    Parameters
    ----------
    state_features : iterable
        Items of the form ``((attr, label), weight)``.

    Each line shows the weight with six decimals, the label left-aligned in
    an eight-character column, and then the attribute name.
    """
    for feature_key, weight in state_features:
        attr, label = feature_key
        print("%0.6f %-8s %s" % (weight, label, attr))

# Rank every (attribute, label) state feature of the second-stage model once,
# from the largest learned weight to the smallest, and reuse that ranking for
# both listings instead of rebuilding and re-sorting the Counter each time.
ranked_state_features = collections.Counter(mod2.state_features_).most_common()

print("Top positive:")
print_state_features(ranked_state_features[:15])

print("\nTop negative:")
print_state_features(ranked_state_features[-15:])



Top positive:
9.918947 none     is_there_interaction:false
4.048308 advise   first_modal_sentence:should
1.490504 int      POS_tags_sentence_simpl:NN-IN-NN-CC-NN-IN-NN-.
1.479522 mechanism POS_tags_sentence_simpl:JJ-NN-MD-VB-DT-JJ-NN-IN-JJ-NN-,-NN-,-NN-CC-NN-,-IN-VB-PR-NN-IN-DT-NN-.
1.442210 advise   ent1_pos_tag_prev_word3:TO
1.403773 advise   POS_tags_sentence_simpl:IN-NN-VB-DT-JJ-NN-IN-NN-,-CC-NN-VB-NN-IN-RB-,-JJ-NN-IN-DT-NN-,-VB-NN-,-CC-NN-VB-JJ-NN-IN-NN-TO-VB-JJ-IN-DT-NN-IN-PR-JJ-NN-VB-.
1.396110 effect   POS_tags_sentence_simpl:NN-:-NN-MD-VB-IN-NN-.
1.384008 effect   3_grams_bw_entities:N a
1.361293 effect   2_grams_bw_entities:- 
1.342038 int      POS_tags_sentence_simpl:VB-IN-JJ-NN-,-EX-MD-VB-DT-NN-IN-NN-CC-NN-.
1.264539 advise   sentence_contains_neg
1.170500 effect   3_grams_bw_entities:rel
1.165166 effect   POS_tags_sentence_simpl:RB-,-DT-NN-IN-NN-IN-NN-,-JJ-NN-CC-NN-VB-JJ
1.155956 effect   POS_tags_sentence_simpl:NN-VB-IN-JJ-NN-VB-DT-NN-IN-NN-,-VB-IN-VB-NN-IN-NN-CC-PR-JJ-NN