In [1]:
# Data processing libraries
import pandas as pd
import numpy as np

# NLP libraries
import nltk
from nltk.tag import StanfordPOSTagger

# Machine Learning Libraries
import sklearn
import scipy.stats
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split # Parameter selection
import sklearn_crfsuite
from sklearn_crfsuite import scorers, metrics
from sklearn.cross_validation import cross_val_score
from sklearn.grid_search import RandomizedSearchCV

# Other libraries
import time # Execution time of some blocks
import statistics
from IPython.display import display # For displaying DataFrames correctly in Jupyter
from itertools import chain
import collections
import pickle

# Import our own defined functions
from xlm_parsers_functions import *
from drug_interaction_functions import *
from drug_functions import *
from NER_functions import *
from ortographic_features import *
from context_features import *
from feature_creation_interaction import *
from crf_functions import *



In [None]:
%%time

def _unpickle(path):
    """Return the object stored in the pickle file at ``path``."""
    # NOTE: pickle.load can execute arbitrary code; only load files that
    # were produced by this project.
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Training split: features, true/false labels and type labels.
X_train = _unpickle("X_train.txt")
y_train_int = _unpickle("y_train_int.txt")
y_train_type = _unpickle("y_train_type.txt")

# Test split, loaded in the same order.
X_test = _unpickle("X_test.txt")
y_test_int = _unpickle("y_test_int.txt")
y_test_type = _unpickle("y_test_type.txt")

print('Number of training sentences readed: ', len(X_train))
print('Number of testing sentences readed: ', len(X_test))

Number of training sentences readed:  27792
Number of testing sentences readed:  5716
CPU times: user 2.54 s, sys: 573 ms, total: 3.11 s
Wall time: 3.47 s


In [None]:
%%time

# Stage 1: fit and evaluate the CRF on the 'true' / 'false' labels (y_*_int).
stage1_labels = ['true', 'false']
mod1 = trainCRFAndEvaluate(
    X_train=X_train,
    y_train=y_train_int,
    X_test=X_test,
    y_test=y_test_int,
    labels=stage1_labels,
    hyperparam_optim=False,
)

In [None]:
# Run the first-stage model (mod1) on the test sentences. The filtering
# cell compares each prediction to ['true'] to choose the stage-2 test set.
y_pred_int = mod1.predict(X_test)

In [None]:
# Training side: keep the positions whose stage-1 label sequence is exactly
# ['true'].
train_positions = [pos for pos, tags in enumerate(y_train_int) if tags == ['true']]
X_train_filtered = [X_train[pos] for pos in train_positions]
# The targets are the type labels (int, mechanism, ...), not the true/false flag.
y_train_filtered = [y_train_type[pos] for pos in train_positions]

# Test side: selection is driven by the stage-1 *predictions* (y_pred_int),
# while the targets still come from y_test_type.
test_positions = [pos for pos, tags in enumerate(y_pred_int) if tags == ['true']]
X_test_filtered = [X_test[pos] for pos in test_positions]
y_test_filtered = [y_test_type[pos] for pos in test_positions]

print('Number of training sentences: ', len(X_train_filtered))
print('Number of testing sentences: ', len(X_test_filtered))

In [None]:
%%time

# Stage 2: fit and evaluate the CRF on the type labels, using only the
# sentences kept by the filtering step.
stage2_labels = ['mechanism', 'int', 'advise', 'effect']
mod2 = trainCRFAndEvaluate(
    X_train=X_train_filtered,
    y_train=y_train_filtered,
    X_test=X_test_filtered,
    y_test=y_test_filtered,
    labels=stage2_labels,
    hyperparam_optim=False,
)

In [None]:
import collections

def print_state_features(state_features):
    """Print one line per state feature as ``weight label attribute``.

    Parameters
    ----------
    state_features : iterable of ((attr, label), weight)
        Pairs whose first item is an ``(attr, label)`` tuple and whose
        second item is the numeric weight.
    """
    for key, weight in state_features:
        attr, label = key
        # Weight with six decimals, label left-aligned in eight columns.
        line = "%0.6f %-8s %s" % (weight, label, attr)
        print(line)

# Build the counter of state-feature weights once and reuse it for both
# rankings.
state_weight_counter = collections.Counter(mod2.state_features_)

print("Top positive:")
print_state_features(state_weight_counter.most_common(15))

print("\nTop negative:")
# most_common() is ordered from highest to lowest count, so the last 15
# entries are the lowest-weighted features.
print_state_features(state_weight_counter.most_common()[-15:])

