In [1]:
import joblib
from tqdm.auto import tqdm
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
import networkx as nx
import numpy as np

In [2]:
TARGET_SIZES = (1, 4, 24, 124)
FEATURES = ("ed_", "emb_self_", "emb_imp_", "degree_", "min_dist_")

In [3]:
test = joblib.load("../data/graphsTest")
train = joblib.load("../data/graphsTrain")

## Shortest Distance, Number of Paths, Number of Edges to the node, string similarity, embedding similarity

In [4]:
# Copied from Wikipedia
def levenshtein(s1, s2):
    if len(s1) < len(s2):
        return levenshtein(s2, s1)

    # len(s1) >= len(s2)
    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1 # j+1 instead of j since previous_row and current_row are one character longer
            deletions = current_row[j] + 1       # than s2
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]

In [5]:
def calculate_edit_distance(graph):
    for n in TARGET_SIZES:
        graph["ed_" + str(n)] = np.zeros(len(graph["targets_" + str(n)]))
        if len(graph["targets_" + str(n)]) == 0:
            continue
        anchor_str = graph["strings"][graph["targets_" + str(n)][0]]
        anchor_str = anchor_str.split('.')[-1]
        for i in range(len(graph["targets_" + str(n)])):
            node_id = graph["targets_" + str(n)][i]
            node_str = graph["strings"][node_id]
            node_str = node_str.split('.')[-1]
            graph["ed_" + str(n)][i] = levenshtein(anchor_str, node_str) 

In [6]:
def calculate_embedding_distance(graph):
    for n in TARGET_SIZES:
        graph["emb_self_" + str(n)] = np.zeros(len(graph["targets_" + str(n)]))
        graph["emb_imp_" + str(n)] = np.zeros(len(graph["targets_" + str(n)]))
        if len(graph["targets_" + str(n)]) == 0:
            continue
        anchor_emb = graph["annotations"][graph["targets_" + str(n)][0]]
        for i in range(len(graph["targets_" + str(n)])):
            node_id = graph["targets_" + str(n)][i]
            node_emb = graph["annotations"][node_id]
            graph["emb_self_" + str(n)][i] = np.sum((anchor_emb[:50] - node_emb[:50])**2)  
            graph["emb_imp_" + str(n)][i] = np.sum((anchor_emb[50:] - node_emb[50:])**2)  

In [7]:
def nx_graph(graph):
    G = nx.Graph()
    for i in range(len(graph["strings"])):
        node = i
        G.add_node(node)
    for edge in graph["edges"]:
        node_from, _, node_to = edge
        G.add_edge(node_from, node_to)
    return G

In [8]:
def calculate_graph_features(graph):
    G = nx_graph(graph)
    for n in TARGET_SIZES:
        graph["min_dist_" + str(n)] = np.zeros(len(graph["targets_" + str(n)]))
        graph["degree_" + str(n)] = np.zeros(len(graph["targets_" + str(n)]))
        if len(graph["targets_" + str(n)]) == 0:
            continue
        anchor = graph["targets_" + str(n)][0]
        target = graph["targets_" + str(n)][1]
        G.remove_edge(anchor, target)
        path_lengths = nx.single_source_shortest_path_length(G, anchor)
        G.add_edge(anchor, target)
        not_reachable = max(path_lengths.values()) + 1
        # print(path_lengths, not_reachable)
        for i in range(len(graph["targets_" + str(n)])):
            node = graph["targets_" + str(n)][i]
            graph["min_dist_" + str(n)][i] = path_lengths.get(node, not_reachable)
            graph["degree_" + str(n)][i] = G.degree(node)
            # print(graph["strings"][anchor].split('.')[-1], 
            # graph["strings"][node].split('.')[-1], graph["features_min_dist_" + str(n)][i])

In [9]:
for graph in tqdm(test):
    calculate_edit_distance(graph)
    calculate_embedding_distance(graph)
    calculate_graph_features(graph)

HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))




In [10]:
for graph in tqdm(train):
    calculate_edit_distance(graph)
    calculate_embedding_distance(graph)
    calculate_graph_features(graph)

HBox(children=(IntProgress(value=0, max=19503), HTML(value='')))




In [11]:
def take_max_or_min(graph, n, key, ismax=True):
    if ismax:
        extreme = np.max(graph[key + str(n)][1:])
    else:
        extreme = np.min(graph[key + str(n)][1:])
    if graph[key + str(n)][1] != extreme:
        return 0
    options = np.where(graph[key + str(n)][1:] == extreme)[0]
    if len(options) > 1:
        return 0
    return 1

In [25]:
def test_baseline(baseline, n):
    total = 0
    correct = 0
    for graph in test:
        if len(graph["targets_" + str(n)]) == 0:
            continue
        total += 1
        correct += baseline(graph, n)
    return correct/total

In [26]:
for n in TARGET_SIZES:
    print("Testing with %d options:" % n)
    for feature in FEATURES:
        min_result = test_baseline(lambda x, y: take_max_or_min(x, y, feature, ismax=False), n)
        max_result = test_baseline(lambda x, y: take_max_or_min(x, y, feature, ismax=True), n)
        print("\t" + feature + ":\t" + str(round(max(min_result, max_result) * 100, 1)))

Testing with 1 options:
	ed_:	54.7
	emb_self_:	51.1
	emb_imp_:	51.4
	degree_:	70.4
	min_dist_:	34.3
Testing with 4 options:
	ed_:	30.2
	emb_self_:	25.7
	emb_imp_:	22.6
	degree_:	50.4
	min_dist_:	12.0
Testing with 24 options:
	ed_:	10.9
	emb_self_:	10.7
	emb_imp_:	7.5
	degree_:	26.2
	min_dist_:	1.7
Testing with 124 options:
	ed_:	6.2
	emb_self_:	7.1
	emb_imp_:	3.3
	degree_:	11.6
	min_dist_:	0.0


In [14]:
def normalize(dataset):
    for n in tqdm(TARGET_SIZES):
        for graph in dataset:
            if len(graph["targets_" + str(n)]) == 0:
                continue
            for feature in FEATURES:
                max_value = np.max(graph[feature + str(n)][1:])
                if max_value != 0:
                    graph[feature + str(n)] /= max_value
                    graph[feature + str(n)] -= 0.5

## Try z-score

In [15]:
normalize(train)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [16]:
normalize(test)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [17]:
train_y = {}
train_X = {}
for n in tqdm(TARGET_SIZES):
    train_y[n] = []
    train_X[n] = []
    for graph in train:
        if len(graph["targets_" + str(n)]) == 0:
            continue
        features = np.stack([graph[feature + str(n)] for feature in FEATURES], axis=1)
        for i in range(1, n+2):
            train_X[n].append(features[i])  
        curr_y = np.zeros(n+1)
        curr_y[0] = 1
        train_y[n] += list(curr_y)

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [18]:
classsifiers = {}
for n in tqdm(TARGET_SIZES):
    classsifiers[n] = RandomForestClassifier(n_estimators = 100, n_jobs=2)
    classsifiers[n].fit(train_X[n], train_y[n])

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




In [19]:
classifierSVM = svm.SVC(probability=True, verbose=True)
classifierSVM.fit(train_X[1], train_y[1])

[LibSVM]



SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='rbf', max_iter=-1, probability=True, random_state=None,
  shrinking=True, tol=0.001, verbose=True)

In [20]:
def test_classifier(graph, n, classifier):
    features = np.stack([graph[feature + str(n)] for feature in FEATURES], axis=1)
    test_X = []
    for i in range(1, n+2):
        test_X.append(features[i]) 
    proba = classifier.predict_proba(test_X)
    # print(proba)
    extreme = np.max(proba, axis=0)[1]
    # print(extreme)
    # print(proba[0][1])
    if proba[0][1] != extreme:
        return 0
    options = np.where(proba[:, 1] == extreme)[0]
    # print(options)
    if len(options) > 1:
        return 0
    return 1

In [23]:
for n in TARGET_SIZES:
    print("Testing with random forest yields:" + 
          str(test_baseline(lambda x, y: test_classifier(x, y, classsifiers[y]), n)))

HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))


Testing with random forest yields:0.772496538994001


HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))


Testing with random forest yields:0.544026653974298


HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))


Testing with random forest yields:0.265625


HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))


Testing with random forest yields:0.12033195020746888


In [24]:
for n in TARGET_SIZES:
    print("Testing with random forest yields:" + 
          str(test_baseline(lambda x, y: test_classifier(x, y, classifierSVM), n)))

HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))


Testing with random forest yields:0.7503461005999077


HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))


Testing with random forest yields:0.5188005711565921


HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))


Testing with random forest yields:0.15707236842105263


HBox(children=(IntProgress(value=0, max=2167), HTML(value='')))


Testing with random forest yields:0.016597510373443983
