In [1]:
%matplotlib inline

import sys
sys.path.append("./packages")

import matplotlib
import datetime
import numpy as np
import data_preparation_tools as dpt
import features_generation_tools as fgt
import model_tools
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import GradientBoostingClassifier



In [6]:
# path used to save temporary doc2vec files
temp_doc2vec_file = r"./demo_output/temp_doc2vec.txt"
# path to text file that contains background sentences used in doc2vec
background_samples_file_path = r"./demo_output/background_samples.txt"


def _make_bow_func(ngram_range):
    """Return an ``(x_train, x_test)`` callable that forwards to
    ``fgt.get_bow_features`` with the given ``ngram_range`` tuple."""
    return lambda x_train, x_test: fgt.get_bow_features(x_train, x_test, ngram_range)


# Shared feature generators: each is defined once and reused both as a
# standalone method and inside the compound "bow_1_3_doc2vec" method, so the
# two variants cannot silently drift apart.
bow_func = _make_bow_func((1, 3))
doc2vec_func = lambda x_train, x_test: fgt.get_doc2vec_features(
    x_train, x_test, temp_doc2vec_file, background_samples_file_path)

# evaluate different features
gen_features_methods = [
    fgt.GenFeaturesMethod("bow_1_gram", _make_bow_func((1, 1))),
    fgt.GenFeaturesMethod("bow_2_gram", _make_bow_func((2, 2))),
    fgt.GenFeaturesMethod("bow_3_gram", _make_bow_func((3, 3))),
    fgt.GenFeaturesMethod("bow_1_3_gram", bow_func),
    fgt.GenFeaturesMethod("doc2vec", doc2vec_func),
    fgt.GenFeaturesMethod("pos_3_3", lambda x_train, x_test: fgt.to_pos_bow(x_train, x_test, (3, 3))),
    fgt.GenFeaturesMethod("bow_1_3_pos_3_3", lambda x_train, x_test: fgt.get_bow_and_pos_features(x_train, x_test, (1, 3), (3, 3))),
    fgt.GenFeaturesMethod("bow_1_3_doc2vec", lambda x_train, x_test: fgt.get_compound_features(x_train, x_test, [bow_func, doc2vec_func])),
]

# Values handed to LogisticRegressionCV as its ``Cs`` grid:
# ten evenly spaced points from 0.005 to 0.25.
# Disabled alternative grid:
#   [0.005, 0.01, 0.03, 0.05, 0.1, 0.3, 0.5, 0.8] + np.linspace(1,5, 9).tolist()
Cs = np.linspace(0.005, 0.25, 10)


def _make_lr_cv(penalty):
    """Return a zero-argument factory building a 5-fold, F1-scored
    LogisticRegressionCV (liblinear solver) with the given ``penalty``."""
    return lambda: LogisticRegressionCV(
        penalty=penalty,
        cv=5,
        scoring=make_scorer(f1_score),
        solver='liblinear',
        Cs=Cs,
        refit=True,
    )


# Classifiers to evaluate; each entry pairs a display name with a factory
# that creates a fresh estimator when called.
evaluation_methods = [
    fgt.EvaluationMethod(
        "logistic regression l1",
        lambda: LogisticRegression(C=0.1, penalty='l1', solver='liblinear'),
    ),
    fgt.EvaluationMethod("lr l1 cv", _make_lr_cv('l1')),
    fgt.EvaluationMethod("lr l2 cv", _make_lr_cv('l2')),
    # Disabled alternative:
    # fgt.EvaluationMethod("GBC", lambda: GradientBoostingClassifier(n_estimators=100, learning_rate=0.5, max_depth=10, random_state=0))
]

# path to input dir
input_dir = r"./demo_output"
startTime = datetime.datetime.now()

# Runs the feature-generation pipeline with the configured feature methods
# and classifiers; the result is kept in `models` for the cells below.
models = fgt.run_gen_features_pipeline(input_dir, gen_features_methods, evaluation_methods)

runTime = datetime.datetime.now() - startTime
# Parenthesised single-argument form: prints the same text under the
# Python 2 print statement and the Python 3 print function.
print("Finished generating features, took:%s" % runTime)

generating bow_1_gram features for 9_18_2016_binary_data_entities_normalize
model evaluation for: 9_18_2016_binary_data_entities_normalize, bow_1_gram, logistic regression l1
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         4

avg / total       0.00      0.00      0.00         4

model evaluation for: 9_18_2016_binary_data_entities_normalize, bow_1_gram, lr l1 cv
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         4

avg / total       0.00      0.00      0.00         4

model evaluation for: 9_18_2016_binary_data_entities_normalize, bow_1_gram, lr l2 cv
             precision    recall  f1-score   support

          1       1.00      1.00      1.00         4

avg / total       1.00      1.00      1.00         4

generating bow_2_gram features for 9_18_2016_binary_data_entiti




generating doc2vec features for 9_18_2016_binary_data_entities_normalize
creating temp file...
creating model...
model built
model evaluation for: 9_18_2016_binary_data_entities_normalize, doc2vec, logistic regression l1
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         4

avg / total       0.00      0.00      0.00         4

model evaluation for: 9_18_2016_binary_data_entities_normalize, doc2vec, lr l1 cv
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       0.00      0.00      0.00         4

avg / total       0.00      0.00      0.00         4

model evaluation for: 9_18_2016_binary_data_entities_normalize, doc2vec, lr l2 cv
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         0
          1       1.00      0.75      0.86         4

avg / total       1.00 

ValueError: too many values to unpack

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# First entry of the pipeline result.
m = models[0]

# Average each column of scores_[1] over its rows, giving one value per column.
# NOTE(review): assumes m is a fitted LogisticRegressionCV whose scores_[1]
# is a 2-D array with one column per value in Cs -- confirm against
# fgt.run_gen_features_pipeline.
scores = np.asarray(m.scores_[1])
av = scores.mean(axis=0)

# Explicit figure/axes with labels so the plot stands on its own.
fig, ax = plt.subplots()
ax.plot(Cs, av)
ax.set_xlabel("C")
ax.set_ylabel("average score")
ax.set_title("Average score per value in Cs");

In [None]:
# Location of the file holding the test sentences (left empty here).
test_file_path = r""

# Location of the context file, which also carries the entities for each
# sentence (produced by the data transformation pipeline; left empty here).
context_file_path = r""

# Parallel lists: the first and second tab-separated field of every context
# line, later indexed side by side with `sentences`.
entities_1, entities_2 = [], []

with open(context_file_path) as context_file:
    for context_line in context_file:
        fields = context_line.rstrip().split("\t")
        entities_1.append(fields[0])
        entities_2.append(fields[1])

sentences, labels = fgt.read_data_from_file(test_file_path)

import heapq
import PrettyTable as pt

# Header row used for the tab-separated results file.
COLUMN_NAMES = ["score", "text", "Entity 1", "Entity2"]

# First entry of the pipeline result.
er = models[0]

# Split every entry of er.scores into its element 0 and element 1.
# `range` replaces the Python-2-only `xrange`; access stays index-based
# because the concrete type of er.scores is not visible from this notebook.
# NOTE(review): presumably element 0 / 1 are the negative / positive class
# scores, as the variable names suggest -- confirm against the pipeline.
neg_scores = [er.scores[i][0] for i in range(len(er.scores))]
pos_scores = [er.scores[i][1] for i in range(len(er.scores))]


def get_top_scoring(scores, sentences, entities_1, entities_2, n):
    """Build table rows for the ``n`` highest-scoring items.

    ``scores``, ``sentences``, ``entities_1`` and ``entities_2`` are indexed
    in parallel. Returns a list of ``[score, sentence, entity_1, entity_2]``
    rows, highest score first, with the score formatted to seven decimal
    places. If ``n`` exceeds ``len(scores)`` every item is returned.
    """
    # Rank positions (not values) so the matching sentence/entities can be
    # looked up by the same index.
    top_indices = heapq.nlargest(n, range(len(scores)), scores.__getitem__)
    return [
        ["%0.7f" % scores[idx], sentences[idx], entities_1[idx], entities_2[idx]]
        for idx in top_indices
    ]

def write_results_to_file(file_path, column_names, table_data):
    """Write a tab-separated table to ``file_path``.

    The first line is ``column_names`` joined by tabs; each following line is
    one row of ``table_data`` joined by tabs. Every cell must already be a
    string. An existing file at ``file_path`` is overwritten.
    """
    with open(file_path, "w") as handle:
        # Use the header passed by the caller rather than the module-level
        # COLUMN_NAMES, so the parameter is actually honoured.
        handle.write("\t".join(column_names) + "\n")
        for row in table_data:
            handle.write("\t".join(row) + "\n")

# Output file for the scored sentences, ordered by pos_scores (highest first).
top_scoring_positive_class_file_path = r"./demo_output/top_scoring_positive_class.txt"

# Requesting len(pos_scores) rows ranks every sentence instead of truncating.
ranked_rows = get_top_scoring(pos_scores, sentences, entities_1, entities_2, len(pos_scores))
write_results_to_file(top_scoring_positive_class_file_path, COLUMN_NAMES, ranked_rows)