In [1]:
import os
import joblib
import argparse
import itertools
import collections
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
def split_data(features):
    """
    Separate each feature line into its label and its feature tokens.

    Every line is split on single spaces; the first token is taken as the
    label and the remaining tokens form the sample.

    Parameters:
    features(list): List of strings, one per example, formatted as
                    "<label> <feature> <feature> ...".

    Returns:
    X_samples(list): List of token lists (each line without its first token).
    Y_labels(list): List with the first token of each line.
    """

    tokenized = [line.split(" ") for line in features]

    X_samples = [tokens[1:] for tokens in tokenized]
    Y_labels = [tokens[0] for tokens in tokenized]

    return X_samples, Y_labels

def transform_feature_vector_to_dataset(X_train, X_test):
    """
    Turn token-based samples into dense numeric matrices.

    The column layout is learned from X_train only:
      * one column per distinct "name=value" feature name, filled with the
        numeric value (if a name occurs several times in a sample, the last
        occurrence wins);
      * followed by one column per distinct token without "=", filled with 1
        when the token is present in the sample.
    Columns are ordered by first appearance in X_train, so the layout is
    deterministic from run to run. Empty tokens are ignored, and tokens of
    X_test that were never seen in X_train are dropped.

    Parameters:
    X_train(list): List of token lists used to build the column layout.
    X_test(list): List of token lists encoded with the same layout.

    Returns:
    new_X_train(numpy.ndarray): Array of shape (len(X_train), n_features).
    new_X_test(numpy.ndarray): Array of shape (len(X_test), n_features).

    Raises:
    ValueError: if the value of a known "name=value" token is not numeric.
    """

    # Two separate vocabularies: a "name=value" feature and a bare token may
    # share the same text and must still map to different columns.
    valued_columns = {}
    binary_columns = {}
    for sample in X_train:
        for token in sample:
            if "=" in token:
                name = token.split("=")[0]
                if name not in valued_columns:
                    valued_columns[name] = len(valued_columns)
            elif token != "" and token not in binary_columns:
                binary_columns[token] = len(binary_columns)

    n_valued = len(valued_columns)
    n_features = n_valued + len(binary_columns)

    def _encode(samples):
        # Encode samples with the layout learned above. Feature names are
        # matched exactly (not by substring), so "a" never picks up "ab=3".
        matrix = np.zeros((len(samples), n_features))
        for row, sample in enumerate(samples):
            for token in sample:
                if "=" in token:
                    parts = token.split("=")
                    column = valued_columns.get(parts[0])
                    if column is not None:
                        matrix[row, column] = float(parts[1])
                else:
                    column = binary_columns.get(token)
                    if column is not None:
                        # Binary columns are stored after the valued ones.
                        matrix[row, n_valued + column] = 1
        return matrix

    return _encode(X_train), _encode(X_test)


def process_feature_vectors(train_feature_vectors, test_feature_vectors):
    """
    Build the train and test datasets from raw feature lines.

    Each list of lines is first separated into samples and labels with
    split_data; both sample lists are then encoded together by
    transform_feature_vector_to_dataset.

    Parameters:
    train_feature_vectors(list): Feature lines of the training split.
    test_feature_vectors(list): Feature lines of the test split.

    Returns:
    Tuple (train matrix, train labels, test matrix, test labels).
    """
    train_samples, train_labels = split_data(train_feature_vectors)
    test_samples, test_labels = split_data(test_feature_vectors)

    train_matrix, test_matrix = transform_feature_vector_to_dataset(
        train_samples, test_samples
    )
    return train_matrix, train_labels, test_matrix, test_labels


def output_predicted_entities(Y_pred, filename, out_stream=None):
    """
    Write one "id|e1|e2|interaction|label" line per predicted label.

    The predicted labels are flattened and paired, in order, with the lines of
    the detailed features file. The first three space-separated fields of each
    line are used as the ids. "interaction" is "0" when the label is "null"
    and "1" otherwise. Pairing stops at the shorter of the two sequences.

    Parameters:
    Y_pred(list): List of lists of predicted labels.
    filename(str): Path of the detailed features file ("feats.dat").
    out_stream(file-like): Object with a write() method receiving the output.
                           When omitted, the module-level `outputfile` is used,
                           which keeps existing two-argument calls working.

    Returns:
    None
    """

    if out_stream is None:
        # Legacy behaviour: callers open a handle named `outputfile` at module
        # level before calling this function.
        out_stream = outputfile

    Y_pred_flatten = [el for line in Y_pred for el in line]

    # Close the input file deterministically instead of leaking the handle.
    with open(filename) as feats_file:
        detailed_word_features = feats_file.read().split("\n")
    # Drop only the empty remainder produced by a trailing newline, so a file
    # that does not end with "\n" does not lose its last record.
    if detailed_word_features[-1] == "":
        detailed_word_features.pop()

    for label, detailed_feats in zip(Y_pred_flatten, detailed_word_features):
        _id, e1_id, e2_id = detailed_feats.split(" ")[0:3]
        interaction = "0" if label == "null" else "1"
        line = [_id, e1_id, e2_id, interaction, label]
        out_stream.write("|".join(line) + "\n")

def evaluate(inputdir, outputfile):
    """
    Run the eval/evaluateDDI.jar program on a predictions file.

    The command is handed to the system shell through os.system; both
    arguments are inserted into the command line as given, without quoting.

    Parameters:
    inputdir(str): First argument passed to the evaluator.
    outputfile(str): Second argument passed to the evaluator.

    Returns:
    None
    """

    command = " ".join(["java -jar eval/evaluateDDI.jar", inputdir, outputfile])
    os.system(command)

def gridsearch(model, parameters, train_samples=None, train_labels=None, test_samples=None):
    """
    Fit a GridSearchCV over `parameters` and predict the test samples.

    Parameters:
    model: Estimator handed to GridSearchCV.
    parameters: Parameter grid handed to GridSearchCV.
    train_samples: Training samples. Defaults to the module-level X_train.
    train_labels: Training labels. Defaults to the module-level Y_train.
    test_samples: Samples to predict. Defaults to the module-level X_test.

    Returns:
    Y_pred: Predictions of the fitted search for the test samples.
    """
    # The data used to be read only from module globals; it can now be passed
    # explicitly, while two-argument calls keep the previous behaviour.
    if train_samples is None:
        train_samples = X_train
    if train_labels is None:
        train_labels = Y_train
    if test_samples is None:
        test_samples = X_test

    clf = GridSearchCV(model, parameters)
    clf.fit(train_samples, train_labels)
    Y_pred = clf.predict(test_samples)

    return Y_pred


# Input files: megam.dat files hold the "<label> <features...>" lines used for
# training/prediction; feats.dat is read when writing the predictions file.
train_filename = "./marc_extracted_features/Train/megam.dat"
test_filename = "./marc_extracted_features/Devel/megam.dat"
fulltest_filename = "./marc_extracted_features/Devel/feats.dat"


def _read_feature_lines(path):
    """Return the lines of `path`, closing the file and ignoring only the empty remainder after a final newline."""
    with open(path, "r") as handle:
        lines = handle.read().split("\n")
    if lines[-1] == "":
        lines.pop()
    return lines


# Load and encode the data once (the previous version repeated this whole
# step twice with identical inputs).
train_feat_vects = _read_feature_lines(train_filename)
test_feat_vects = _read_feature_lines(test_filename)

X_train, Y_train, X_test, Y_test = process_feature_vectors(train_feat_vects, test_feat_vects)


In [24]:
# Random-forest sweep: for every (n_estimators, max_depth) pair, fit on the
# training data, predict the test data, write the predictions file, persist
# the fitted model and run the external evaluator.
for n_estimator in [5, 10, 20, 50]:
    for max_depth in [50, 100, 150, 200, 250, 300]:

        params = {
            "class_weight": "balanced",
            "n_estimators": n_estimator,
            "max_depth": max_depth
        }

        prueba_nombre = "RF_estimator={}_depth={}".format(n_estimator, max_depth)
        output_filename = "./pruebas_finales/{}.txt".format(prueba_nombre)

        rfc_cv = RandomForestClassifier(**params)
        rfc_cv.fit(X_train, Y_train)
        # output_predicted_entities expects a list of lists of labels.
        Y_pred = [[value] for value in rfc_cv.predict(X_test)]

        # Create the results directory when it does not exist yet.
        output_dir = os.path.dirname(output_filename)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # The call below passes no explicit stream, so the function writes to
        # the module-level name `outputfile`; the handle must be bound to
        # exactly that name. `with` closes it even if writing fails.
        with open(output_filename, "w") as outputfile:
            output_predicted_entities(Y_pred, fulltest_filename)

        # Persist the estimator fitted in this iteration (the previous code
        # dumped an undefined name `model`).
        filename = '{}.sav'.format(prueba_nombre)
        joblib.dump(rfc_cv, filename)

        evaluate("data/Devel", output_filename)

In [None]:
# SVM sweep: for every (kernel, gamma) pair, fit on the training data, predict
# the test data, write the predictions file, persist the fitted model and run
# the external evaluator.
for kernel in ["linear", "poly", "rbf", "sigmoid"]:
    for gamma in ["auto", "scale"]:

        print(kernel, gamma)
        params = {
            "class_weight": "balanced",
            "kernel": kernel,
            "gamma": gamma,
        }

        prueba_nombre = "SVM_kernel={}_gamma={}".format(kernel, gamma)
        output_filename = "./pruebas_finales/{}.txt".format(prueba_nombre)

        svc_cv = SVC(**params)
        svc_cv.fit(X_train, Y_train)
        # output_predicted_entities expects a list of lists of labels.
        Y_pred = [[value] for value in svc_cv.predict(X_test)]

        # Create the results directory when it does not exist yet.
        output_dir = os.path.dirname(output_filename)
        if not os.path.isdir(output_dir):
            os.makedirs(output_dir)

        # The call below passes no explicit stream, so the function writes to
        # the module-level name `outputfile`; the handle must be bound to
        # exactly that name. `with` closes it even if writing fails.
        with open(output_filename, "w") as outputfile:
            output_predicted_entities(Y_pred, fulltest_filename)

        # Persist the estimator fitted in this iteration (the previous code
        # dumped an undefined name `model`).
        filename = '{}.sav'.format(prueba_nombre)
        joblib.dump(svc_cv, filename)

        evaluate("data/Devel", output_filename)

('linear', 'auto')
