# This notebook contains code to generate baselines for the import prediction problem, against which the neural network results can later be compared

In [1]:
import joblib
from tqdm.notebook import tqdm
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
import networkx as nx
import numpy as np

In [2]:
# Number of negatives a classifier is given alongside the single positive. For a target size of 1 the
# problem is a binary classification problem (one positive, one negative). In the other cases it is a
# multiclass classification problem (one positive among n + 1 candidates).
TARGET_SIZES = (1, 4, 24, 124)
EMBED_SIZE = 50 # Embedding size used; node annotations are sliced at this index into their two parts
# Key prefixes of the per-candidate features computed below; the target size is appended to form the
# actual dictionary key (e.g. "ed_" + "4"). In order: edit distance, node degree, shortest-path distance,
# squared distance between the first embedding parts, squared distance between the second embedding parts.
FEATURES = ("ed_", "degree_", "min_dist_", "emb_self_", "emb_imp_")

In [3]:
# Load the pre-built test and train datasets. Each is iterated below as a collection of graph dictionaries.
# NOTE(review): joblib.load is pickle-based and can execute arbitrary code - only load trusted files.
test = joblib.load("../data/graphsTest50")
train = joblib.load("../data/graphsTrain50")

## Calculating various features to be used in baseline prediction. 

### (i) Edit distance between compilation unit names

In [4]:
# Copied from Wikipedia
def levenshtein(s1, s2):
    """
    Compute the Levenshtein (edit) distance between two strings.

    The distance is the minimum number of single-character insertions, deletions and
    substitutions needed to turn one string into the other. Only two rows of the
    dynamic-programming table are kept at any time.
    :param s1: first string
    :param s2: second string
    :return:   the edit distance as a non-negative integer
    """
    # Iterate over the longer string in the outer loop so that each row is as short as possible.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if not len(shorter):
        return len(longer)

    prev = list(range(len(shorter) + 1))
    for row, ch_long in enumerate(longer, start=1):
        curr = [row]
        for col, ch_short in enumerate(shorter, start=1):
            # prev/curr are one element longer than `shorter`, hence the 1-based column index.
            cost_insert = prev[col] + 1
            cost_delete = curr[col - 1] + 1
            cost_substitute = prev[col - 1] + (ch_long != ch_short)
            curr.append(min(cost_insert, cost_delete, cost_substitute))
        prev = curr

    return prev[-1]

In [5]:
def calculate_edit_distance(graph):
    """
    For each target size, store the edit distance between the anchor's simple class name and the
    simple class name of every node listed in the targets. Results are written into the graph
    under the keys "ed_[n]" (one value per entry of "targets_[n]", the anchor itself included).
    :param graph:   a dictionary of the following form:
                    "annotations" - initial node embeddings. These are composed of two parts. The first one
                    encodes the class name, the second part records external imports in that class/compilation unit
                    "edges"   - a list of tuples of three elements each (source, edge_type, destination)
                    "strings" - a dictionary that maps an id of a node to the fully qualified name of the class
                    "targets_[n]" - list of node indices. The first index is the node to which an import is made,
                    the second index is the node from which the import is made (the node the baseline has to predict),
                    the following n indices are alternative nodes that the system is presented with and has to
                    distinguish from the positive.
    """
    def simple_name(node_id):
        # Last dot-separated component of the fully qualified name
        return graph["strings"][node_id].split('.')[-1]

    for n in TARGET_SIZES:
        targets = graph["targets_" + str(n)]
        distances = np.zeros(len(targets))
        graph["ed_" + str(n)] = distances
        if len(targets) == 0:
            continue
        anchor_name = simple_name(targets[0])
        for position, node_id in enumerate(targets):
            distances[position] = levenshtein(anchor_name, simple_name(node_id))

### (ii) distance between corresponding embeddings

In [6]:
def calculate_embedding_distance(graph):
    """
    For each target size, compare the anchor's embedding with the embedding of every node in the targets.
    The embeddings are split at EMBED_SIZE into two parts that encode conceptually different things,
    so two squared Euclidean distances are stored ("emb_self_[n]" for the first part, "emb_imp_[n]"
    for the remainder). The concatenation of both full embeddings is stored under "emb_full_[n]".
    :param graph: see docstring for calculate_edit_distance()
    """
    for n in TARGET_SIZES:
        suffix = str(n)
        targets = graph["targets_" + suffix]
        count = len(targets)

        self_dist = graph["emb_self_" + suffix] = np.zeros(count)
        imp_dist = graph["emb_imp_" + suffix] = np.zeros(count)
        full_emb = graph["emb_full_" + suffix] = np.zeros((count, EMBED_SIZE * 4))
        if count == 0:
            continue

        anchor_emb = graph["annotations"][targets[0]]
        anchor_self = anchor_emb[:EMBED_SIZE]
        anchor_imp = anchor_emb[EMBED_SIZE:]
        for row, node_id in enumerate(targets):
            node_emb = graph["annotations"][node_id]
            self_dist[row] = np.sum((anchor_self - node_emb[:EMBED_SIZE])**2)
            imp_dist[row] = np.sum((anchor_imp - node_emb[EMBED_SIZE:])**2)
            full_emb[row] = np.concatenate((anchor_emb, node_emb))

### (iii) shortest distance from one node to the other on the graph and (iv) degree of a node

In [7]:
def nx_graph(graph):
    """
    Build an undirected networkx graph from a graph dictionary
    (see the docstring of calculate_edit_distance() for the dictionary layout).

    Nodes are the integers 0 .. len(graph["strings"]) - 1; every (source, edge_type, destination)
    triple contributes an edge between source and destination, the edge type being ignored.
    """
    G = nx.Graph()
    G.add_nodes_from(range(len(graph["strings"])))
    G.add_edges_from((src, dst) for src, _edge_type, dst in graph["edges"])
    return G

In [8]:
def calculate_graph_features(graph):
    """
    For each datapoint in the dataset calculate the graph-level features for each pair of relevant nodes.
    These features are (i) shortest distance from the anchor to the node on the graph, stored under
    "min_dist_[n]", and (ii) degree of the node, stored under "degree_[n]".

    The shortest distances are computed with the anchor-target edge (the edge to be predicted)
    temporarily removed. Nodes that cannot be reached from the anchor get a distance one larger
    than the largest reachable distance.
    :param graph: see docstring for calculate_edit_distance()
    """
    G = nx_graph(graph)
    for n in TARGET_SIZES:
        targets = graph["targets_" + str(n)]
        graph["min_dist_" + str(n)] = np.zeros(len(targets))
        graph["degree_" + str(n)] = np.zeros(len(targets))
        if len(targets) == 0:
            continue
        anchor = targets[0]
        target = targets[1]

        # Only remove (and later restore) the edge if it is really there: remove_edge raises on a
        # missing edge, and an unconditional add_edge would invent an edge the data does not contain.
        had_edge = G.has_edge(anchor, target)
        if had_edge:
            G.remove_edge(anchor, target)
        try:
            path_lengths = nx.single_source_shortest_path_length(G, anchor)
        finally:
            # G is shared across all target sizes, so it must be restored even on failure.
            if had_edge:
                G.add_edge(anchor, target)

        # path_lengths always contains the anchor itself (distance 0), so max() is safe here.
        not_reachable = max(path_lengths.values()) + 1
        for i, node in enumerate(targets):
            graph["min_dist_" + str(n)][i] = path_lengths.get(node, not_reachable)
            # NOTE(review): the degree is taken after the anchor-target edge has been restored, so the
            # positive node's degree includes the edge being predicted - confirm this is intended.
            graph["degree_" + str(n)][i] = G.degree(node)

## Calculate all the features for the training and testing sets

In [9]:
# Annotate every test graph in place with all baseline features (edit distance, embeddings, graph features).
feature_passes = (calculate_edit_distance, calculate_embedding_distance, calculate_graph_features)
for graph in tqdm(test):
    for compute in feature_passes:
        compute(graph)

HBox(children=(FloatProgress(value=0.0, max=2167.0), HTML(value='')))




In [10]:
# Annotate every training graph in place with all baseline features (edit distance, embeddings, graph features).
feature_passes = (calculate_edit_distance, calculate_embedding_distance, calculate_graph_features)
for graph in tqdm(train):
    for compute in feature_passes:
        compute(graph)

HBox(children=(FloatProgress(value=0.0, max=19503.0), HTML(value='')))




## See how good each of the features is for import prediction alone

In [11]:
def take_max_or_min(graph, n, key, ismax=True):
    """
    Predict the import by picking the candidate with the largest (or smallest) value of one feature.

    The candidates are all entries of the feature array except the first one (the anchor); the first
    candidate is the correct answer. Ties are scored as a uniformly random pick among the tied candidates.
    :param graph: see docstring for calculate_edit_distance()
    :param n:     the number of options to consider. Should be an element of TARGET_SIZES
    :param key:   the feature to make the prediction by. Should be a string and an element of FEATURES
    :param ismax: whether to take max or min
    :return:      0 if the correct candidate does not attain the extreme value, otherwise
                  1 / (number of candidates attaining the extreme value)
    """
    candidates = graph[key + str(n)][1:]
    pick = np.max if ismax else np.min
    best = pick(candidates)

    # candidates[0] is the positive example
    if candidates[0] != best:
        return 0
    return 1 / np.count_nonzero(candidates == best)

In [12]:
def test_baseline(baseline, n):
    """
    Test hoe well a given baseline fares with a given number of options in terms of accuraccy.
    Note that accuraccy is obviously dependent on the number of options. The random baseline in always 1/n
    :param baseline:a function that takes a graph and n as options
    :param n:       number of options to consider
    """
    total = 0
    correct = 0
    for graph in test:
        if len(graph["targets_" + str(n)]) == 0:
            continue
        total += 1
        correct += baseline(graph, n, total-1)
    return correct/total

### Below is a breakdown of how well each feature predicts imports on its own

In [13]:
for n in TARGET_SIZES:
    print("Testing with %d options:" % n)
    for feature in FEATURES:
        # Score the feature in both orientations (pick the minimum, then pick the maximum)
        # and report whichever orientation predicts better.
        scores = [
            test_baseline(
                lambda g, k, _, pick_max=pick_max: take_max_or_min(g, k, feature, ismax=pick_max),
                n,
            )
            for pick_max in (False, True)
        ]
        print("\t" + feature + ":\t" + str(round(max(scores) * 100, 1)))

Testing with 1 options:
	ed_:	61.9
	degree_:	74.2
	min_dist_:	62.3
	emb_self_:	50.8
	emb_imp_:	50.2
Testing with 4 options:
	ed_:	31.4
	degree_:	51.8
	min_dist_:	31.2
	emb_self_:	25.1
	emb_imp_:	24.4
Testing with 24 options:
	ed_:	12.9
	degree_:	25.3
	min_dist_:	9.1
	emb_self_:	9.3
	emb_imp_:	6.7
Testing with 124 options:
	ed_:	6.5
	degree_:	9.8
	min_dist_:	2.5
	emb_self_:	4.1
	emb_imp_:	1.7


## Convert all the data into a format that can be easily used with sklearn classifiers and normalize it

In [14]:
def convert_to_xy_format(n, dataset, func, scaler=None):
    """
    Flatten a dataset of graphs into an (X, y) pair usable by sklearn-style classifiers.

    Every graph with a non-empty "targets_[n]" list contributes n + 1 consecutive rows: the
    positive candidate first (label 1) followed by the n negatives (label 0). The anchor
    (row 0 of the per-graph feature array) is dropped.
    :param n:       number of negatives per graph. Should be an element of TARGET_SIZES
    :param dataset: iterable of graphs, see docstring for calculate_edit_distance()
    :param func:    function mapping a graph to an array with one row of features per entry of "targets_[n]"
    :param scaler:  optional object with a fit_transform() method. It is re-fitted on the candidate
                    rows of each graph separately, i.e. the normalization is per graph
    :return:        tuple (X, y) of numpy arrays; X has one row per candidate, y holds the labels
    :raises IndexError: if a graph provides fewer than n + 1 candidate rows
    """
    y = []
    X = []
    for graph in dataset:
        if len(graph["targets_" + str(n)]) == 0:
            continue
        candidates = func(graph)[1:]
        # Explicit None check: some estimators define __len__ and may be falsy.
        if scaler is not None:
            # Keep the scaler's output as-is instead of writing it back into func's array,
            # which would cast it to that array's dtype.
            candidates = scaler.fit_transform(candidates)
        if len(candidates) < n + 1:
            raise IndexError("expected at least %d candidate rows, got %d" % (n + 1, len(candidates)))
        X.extend(candidates[:n + 1])

        labels = np.zeros(n + 1)
        labels[0] = 1  # the first candidate is the positive
        y.extend(labels)
    return np.array(X), np.array(y)

In [15]:
# Per target size: feature matrix and labels for the train and test splits
train_X, train_y = {}, {}
test_X, test_y = {}, {}
for n in tqdm(TARGET_SIZES):
    # One row per entry of "targets_[n]", one column per feature in FEATURES
    func = lambda g: np.stack([g[name + str(n)] for name in FEATURES], axis=1)
    scaler = StandardScaler(copy=False, with_mean=True, with_std=True)
    for split_X, split_y, split_graphs in ((train_X, train_y, train), (test_X, test_y, test)):
        split_X[n], split_y[n] = convert_to_xy_format(n, split_graphs, func, scaler=scaler)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




## Train and test random forest and SVM classifiers

In [None]:
# One random forest per target size, trained on the hand-crafted features
classsifiers = {}
for n in tqdm(TARGET_SIZES):
    forest = RandomForestClassifier(n_estimators=100, n_jobs=2)
    classsifiers[n] = forest
    forest.fit(train_X[n], train_y[n])

In [None]:
# An SVM is trained for the first target size only (TARGET_SIZES[:1])
classsifiersSVMs = {}
for n in tqdm(TARGET_SIZES[:1]):
    svc = svm.SVC(probability=True, verbose=True)
    classsifiersSVMs[n] = svc
    svc.fit(train_X[n], train_y[n])

In [None]:
def test_classifier(X, classifier):
    """
    Test a sklearn classifier. This is analagoues to take_max_or_min function above. 
    The function return 1 if the classifier makes a correct prediction for the given graph and 0 otherwise
    """
    proba = classifier.predict_proba(X)
    extreme = np.max(proba, axis=0)[1]
    if proba[0][1] != extreme:
        return 0
    options = np.where(proba[:, 1] == extreme)[0]
    return 1 / len(options)

In [None]:
for n in TARGET_SIZES:
    print("Testing with %d options:" % n)
    # Rows idx*(k+1) .. (idx+1)*(k+1) of test_X[k] are the k+1 candidates of the idx-th test graph
    # that has targets for k.
    forest_score = test_baseline(
        lambda _graph, k, idx: test_classifier(test_X[k][idx * (k + 1):(idx + 1) * (k + 1)], classsifiers[k]),
        n,
    )
    print("Testing with random forest yields:" + str(forest_score))

In [None]:
for n in TARGET_SIZES:
    print("Testing with %d options:" % n)
    # The SVM stored under key 1 is used for every target size.
    # Rows idx*(k+1) .. (idx+1)*(k+1) of test_X[k] are the k+1 candidates of the idx-th test graph
    # that has targets for k.
    svm_score = test_baseline(
        lambda _graph, k, idx: test_classifier(test_X[k][idx * (k + 1):(idx + 1) * (k + 1)], classsifiersSVMs[1]),
        n,
    )
    print("Testing with svm yields:" + str(svm_score))

# Neural Network Baselines

In [16]:
# Per target size: concatenated (anchor, candidate) embeddings and labels for the train and test splits
embed_train_X, embed_train_y = {}, {}
embed_test_X, embed_test_y = {}, {}
for n in tqdm(TARGET_SIZES):
    func = lambda g: g["emb_full_" + str(n)]
    for split_X, split_y, split_graphs in ((embed_train_X, embed_train_y, train),
                                           (embed_test_X, embed_test_y, test)):
        split_X[n], split_y[n] = convert_to_xy_format(n, split_graphs, func)

HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [17]:
import torch as tt
from torch import nn
import copy

In [26]:
class FeedForward(nn.Module):
    """
    Small feed-forward scorer: three stacked linear layers followed by a sigmoid.

    Layer widths are in_size -> int(sqrt(in_size)) -> half of that -> 1, so the output is one
    value in (0, 1) per input row. There is no non-linearity between the linear layers.
    """
    def __init__(self, in_size):
        super().__init__()
        hidden = int(in_size**0.5)
        bottleneck = int(hidden/2)
        stack = [
            nn.Linear(in_size, hidden),
            nn.Linear(hidden, bottleneck),
            nn.Linear(bottleneck, 1),
            nn.Sigmoid(),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Map a batch of shape (batch, in_size) to scores of shape (batch, 1)."""
        return self.layers(x)

In [27]:
def evaluate_model(model, test_X, n, binary_acc=False):
    """
    Evaluate a scoring model on candidate groups laid out as produced by convert_to_xy_format().

    test_X is read in consecutive groups of n + 1 rows; the first row of each group is the positive
    candidate and the remaining n rows are negatives. Trailing rows that do not fill a whole group
    are not scored. The model is switched to eval mode and inputs are moved to the model's device.
    :param model:      torch module mapping a (rows, features) float tensor to one score per row
    :param test_X:     numpy array of feature rows
    :param n:          number of negatives per group
    :param binary_acc: if True, each group scores the fraction of negatives ranked strictly below the
                       positive. If False, a group scores 1 / (number of rows tied for the top score)
                       when the positive attains the top score, and 0 otherwise
    :return:           summed group scores divided by len(test_X) / (n + 1)
    """
    group_size = n + 1
    num_groups = int(len(test_X) / group_size)

    # Run on whatever device the model lives on instead of assuming a GPU is present.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else tt.device("cpu")

    accurate = 0
    model.eval()
    with tt.no_grad():  # no gradients are needed for evaluation
        for group in range(num_groups):
            rows = test_X[group * group_size:(group + 1) * group_size]
            batch_X = tt.tensor(rows).float().to(device)
            scores = model(batch_X).cpu().detach().numpy().reshape(-1)

            if binary_acc:
                accurate += np.count_nonzero(scores < scores[0]) / n
            else:
                best = scores.max()
                if scores[0] != best:
                    continue
                accurate += 1 / np.count_nonzero(scores == best)
    return accurate / (len(test_X) / group_size)

In [24]:
def train_model(model, train_X, train_y, n, patience=3, batch_size=int(1e5)):
    """
    Train a scoring model with binary cross-entropy and early stopping.

    The first tenth of the rows is held out as a validation set and scored after every epoch with
    evaluate_model(..., binary_acc=True). Training stops once the validation score has not improved
    for `patience` consecutive epochs; the weights of the best epoch are then restored.
    :param model:      torch module whose output is one probability per input row
    :param train_X:    numpy array of feature rows, grouped as produced by convert_to_xy_format()
    :param train_y:    numpy array of 0/1 labels aligned with train_X
    :param n:          number of negatives per candidate group (used for validation only)
    :param patience:   number of non-improving epochs tolerated before stopping
    :param batch_size: number of rows per optimisation step
    :return:           the same model object, moved to the GPU when one is available
    """
    split = int(len(train_X)/10)
    valid_X = train_X[:split]
    train_X = train_X[split:]
    train_y = train_y[split:]

    criterion = nn.BCELoss()
    optimizer = tt.optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
    # Prefer the GPU, but fall back to the CPU rather than failing when none is present.
    device = tt.device("cuda" if tt.cuda.is_available() else "cpu")
    model.to(device)

    best_acc = -1
    best_state_dict = None
    patience_left = patience
    epoch = -1
    bar = tqdm()

    try:
        while patience_left > 0:
            model.train()
            order = np.random.permutation(len(train_X))
            train_X = train_X[order]
            train_y = train_y[order]
            epoch += 1
            total_loss = 0.0
            # Slicing past the end is safe, so the last (possibly shorter) batch needs no special case.
            for start in range(0, len(train_X), batch_size):
                batch_X = tt.tensor(train_X[start:start+batch_size]).float().to(device)
                batch_y = tt.tensor(train_y[start:start+batch_size]).float().to(device)
                output = model(batch_X)
                # BCELoss requires input and target of identical shape; the model emits (batch, 1)
                # while the labels are flat.
                loss = criterion(output, batch_y.reshape(output.shape))
                # Weight by batch length so that a shorter last batch does not skew the mean.
                total_loss += loss.item() * len(batch_y)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            mean_loss = total_loss / len(train_X)
            acc = evaluate_model(model, valid_X, n, binary_acc=True)

            if best_acc == -1 or best_acc < acc:
                best_acc = acc
                patience_left = patience
                best_state_dict = copy.deepcopy(model.state_dict())
            else:
                patience_left -= 1
            bar.set_description("Epochs: %d, Loss: %f, Acc: %f" %(epoch, mean_loss, acc))
            bar.update(1)
    finally:
        bar.close()

    # No epoch runs when patience <= 0; in that case there is nothing to restore.
    if best_state_dict is not None:
        model.load_state_dict(best_state_dict)
    return model

In [None]:
# Feed-forward baseline on the concatenated (anchor, candidate) embeddings only
for n in TARGET_SIZES:
    print("Testing with %d options:" % n)
    model = FeedForward(EMBED_SIZE * 4)
    train_model(model, embed_train_X[n], embed_train_y[n], n, patience=3, batch_size=int(1e4))
    embedding_accuracy = evaluate_model(model, embed_test_X[n], n)
    print("Testing with ffnn on embeddings yields:" + str(embedding_accuracy))

In [None]:
# Append the raw embedding columns to the hand-crafted feature columns of train_X / test_X.
# The width check makes this cell idempotent: the matrices start with len(FEATURES) columns, so
# re-running the cell does not concatenate the embeddings a second time.
for n in TARGET_SIZES:
    if train_X[n].shape[1] == len(FEATURES):
        train_X[n] = np.concatenate((train_X[n], embed_train_X[n]), axis=1)
    if test_X[n].shape[1] == len(FEATURES):
        test_X[n] = np.concatenate((test_X[n], embed_test_X[n]), axis=1)

In [28]:
# Feed-forward baseline on the embeddings plus the len(FEATURES) hand-crafted feature columns.
# NOTE(review): the printed label only mentions embeddings although the input is wider than that.
for n in TARGET_SIZES:
    print("Testing with %d options:" % n)
    input_width = EMBED_SIZE * 4 + len(FEATURES)
    model = FeedForward(input_width)
    train_model(model, train_X[n], train_y[n], n, patience=3, batch_size=int(1e4))
    combined_accuracy = evaluate_model(model, test_X[n], n)
    print("Testing with ffnn on embeddings yields:" + str(combined_accuracy))

Testing with 1 options:


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Testing with ffnn on embeddings yields:0.7741116751269036
Testing with 4 options:


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Testing with ffnn on embeddings yields:0.5402189433603046
Testing with 24 options:


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Testing with ffnn on embeddings yields:0.27631578947368424
Testing with 124 options:


HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Testing with ffnn on embeddings yields:0.08713692946058091
