In [118]:
from sklearn.ensemble import RandomForestClassifier
import itertools
import numpy as np
import pandas as pd

In [13]:
def split_data(features):
    """
    Receive a list of featured sentences splitted by words and split it into samples and labels.

    Parameters:
    word_features(list): List of feature words. There is an empty element between sentences in order to split each one.

    Returns:
    X_samples(list): List of feature words missing the label of the word.
    Y_labels(list): List of labels for each word.
    """

    X_samples = []
    Y_labels = []

    for feat in features:
        feat = feat.split(" ")
        Y_labels.append(feat[0])
        X_samples.append(feat[1:])

    return X_samples, Y_labels


def output_predicted_entities(Y_pred, filename):
    """
    Receives the predicted list of labels by the ML model and a filename of the detailed word features and construct predicted.txt file
    with the intrinsic details of each word, id, offsets and the predicted label.

    Parameters:
    Y_pred(list): List of list of predicted labels for each word.
    filename(str): filename to read the "feats.dat" data of the testing dataset

    Returns:

    """

    Y_pred_flatten = [el for line in Y_pred for el in line]
    detailed_word_features = open(filename).read().split("\n")[:-1]

    for label, detailed_feats in zip(Y_pred_flatten, detailed_word_features):
        _id, e1_id, e2_id = detailed_feats.split(" ")[0:3]
        if label == "null":
            interaction = "0"
        else:
            interaction = "1"
        line = [_id, e1_id, e2_id, interaction, label]
        outputfile.write("|".join(line) + "\n")
        
def evaluate(inputdir, outputfile):
    """
    Receives an input directory and the outputfile to evaluate the predicted labels with the evaluateNER.jar program.

    Parameters:
    inputdir(str):
    outputfile(str):

    Returns:

    """

    os.system("java -jar eval/evaluateDDI.jar " + inputdir + " " + outputfile)


In [147]:
def transform_feature_vector_to_dataset(feature_vectors_train, feature_vectors_test):
    flatten_X_samples = list(itertools.chain(*feature_vectors_train))
    feature_names = list(set(flatten_X_samples))[1:]
    X_train = np.zeros((len(feature_vectors_train), len(feature_names)))
    X_test = np.zeros((len(feature_vectors_test), len(feature_names)))
    for i, sample in enumerate(feature_vectors_train):
        for j, feature_name in enumerate(feature_names):
            if feature_name in sample:
                X_train[i,j] = 1
    
    for i, sample in enumerate(feature_vectors_test):
        for j, feature_name in enumerate(feature_names):
            if feature_name in sample:
                X_test[i,j] = 1
    return X_train, X_test

def process_feature_vectors(train_feature_vectors, test_feature_vectors):
    X_train, Y_train = split_data(train_feature_vectors)
    X_test, Y_test = split_data(test_feature_vectors)
    
    new_X_train, new_X_test = transform_feature_vector_to_dataset(X_train, X_test)
    new_X_train, new_X_test = process_dataset_to_dataframe(new_X_train, new_X_test)
    return new_X_train, Y_train, new_X_test, Y_test

In [148]:
def process_dataset_to_dataframe(X_train, X_test):
    df_train = pd.DataFrame(X_train)
    df_test = pd.DataFrame(X_test)
    
    criteria_train = df_train.sum(axis=0) >= 50
    criteria_test = df_test.sum(axis=0) >= 50
    
    df_train = df_train.loc[:,criteria_train]
    df_test = df_test.loc[:, criteria_test]
    return df_train, df_test

In [152]:
### MAIN VARIABLES
# train_filename = os.path.join(args.train, "megam.dat")
# test_filename = os.path.join(args.test, "megam.dat")
# fulltest_filename = os.path.join(args.test, "feats.dat")

train_filename = "./extracted_features/Train/megam.dat"
test_filename = "./extracted_features/Devel/megam.dat"
fulltest_filename = "./extracted_features/Devel/feats.dat"

# _type represents the if it has internal or external knowledge
# _type = args.test.split("/")[-2]
# inputdir = os.path.join("data", args.test.split("/")[-1])

# If outputdir does not exist, we safely create it
# outputdir = "./"
# if not os.path.exists(outputdir):
#    os.makedirs(outputdir)


output_filename = "./predicted.txt"


train_feat_vects = open(train_filename, "r").read().split("\n")[:-1]
test_feat_vects = open(test_filename, "r").read().split("\n")[:-1]

X_train, Y_train, X_test, Y_test = process_feature_vectors(train_feat_vects, test_feat_vects)

In [153]:
clf = RandomForestClassifier()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)
Y_pred = [[value] for value in Y_pred]

ValueError: Number of features of the model must match the input. Model n_features is 117 and input n_features is 51 

In [151]:
outputfile = open(output_filename, "w")
output_predicted_entities(Y_pred, fulltest_filename)
outputfile.close()