# Link Prediction

We take the node embeddings and compute the cosine similarity between pairs of nodes. Cosine similarity is the dot product of two vectors after each has been normalized to unit length (source: https://zhang-yang.medium.com/cosine-similarity-dot-product-for-normalized-vectors-c07bdb61c9d1). If two nodes are sufficiently similar but not linked yet, we predict a link between them.

## Imports

In [None]:
import json
import itertools
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report

In [None]:
# Load the link dictionaries. Each is a JSON object keyed by node ID
# (as a string) whose value is the list of that node's neighbors.
with open('preprocessed_data/all_links.json', 'r') as fp:
    all_links = json.load(fp)

with open('preprocessed_data/removed_links.json', 'r') as fp:
    removed_links = json.load(fp)

# Use the numeric maximum of the keys rather than the last key: the last
# key is only the highest ID if the file happens to be written in
# ascending order.
highest_node_id = max(int(node_id) for node_id in removed_links)

## Functions

In [None]:
def predict_links(df, threshold=0.85):
    """Predict links between similar nodes that are not linked yet.

    For every node ID in ``0..highest_node_id`` the node's row in ``df`` is
    compared, by cosine similarity, with the rows of all nodes it is not
    linked to according to the module-level ``removed_links`` dictionary.

    Args:
        df: DataFrame of embeddings whose index labels are node IDs. Every
            column of a row is treated as a component of that node's vector.
        threshold: A link is predicted when the cosine similarity is strictly
            greater than this value. Defaults to 0.85.

    Returns:
        A list of ``(node, other_node)`` tuples. Pairs are directed: a
        mutually similar pair appears once for each direction.
    """
    predicted_links = []
    all_nodes = range(highest_node_id + 1)

    for node in all_nodes:
        # Nodes that are never candidates: the node's current neighbors and
        # the node itself. Built once per node as a set so the membership
        # test below is O(1) instead of re-concatenating a list every time.
        excluded = set(removed_links[str(node)])
        excluded.add(node)
        unconnected_nodes = [n for n in all_nodes if n not in excluded]

        # Slices of the dataframe containing the node itself
        # and the nodes it is not connected to
        node_df = df[df.index == node]
        unconnected_nodes_df = df[df.index.isin(unconnected_nodes)]

        # cosine_similarity rejects empty input, so skip nodes that have no
        # embedding row or no remaining candidates.
        if node_df.empty or unconnected_nodes_df.empty:
            continue

        similarity_scores = cosine_similarity(node_df, unconnected_nodes_df)[0]

        # Pair each score with the index label of the row it was computed
        # from, so scores stay aligned even if df lacks some candidate nodes.
        candidate_ids = unconnected_nodes_df.index.tolist()
        for other_node, score in zip(candidate_ids, similarity_scores.tolist()):
            if score > threshold:
                predicted_links.append((node, other_node))

    return predicted_links

In [None]:
def link_dict_to_pairs(link_dict):
    """Flatten a link dictionary into a list of directed node pairs.

    ``link_dict`` is looked up with the string form of every node ID in
    ``0..highest_node_id``. Each listed neighbor contributes two tuples,
    one per direction, e.g. ``(1, 0)`` and ``(0, 1)``.
    """
    return [
        directed_pair
        for source in range(highest_node_id + 1)
        for target in link_dict[str(source)]
        for directed_pair in ((source, target), (target, source))
    ]

## Predict for Node2Vec

In [None]:
# Load the Node2Vec embeddings from CSV and run link prediction on them.
# NOTE(review): read_csv is called without index_col, so the frame gets the
# default positional index and predict_links will treat the row position as
# the node ID -- confirm the CSV has one row per node, in node-ID order, and
# no separate ID column.
node2vec = pd.read_csv('embedded_data/node2vec.csv')
node2vec_pred = predict_links(node2vec)
# Bare expression: shown as the cell output when run in a notebook.
node2vec_pred

## Evaluate

In [None]:
# Flatten both link dictionaries into lists of (node, neighbor) tuples so
# they can be compared with the predictions via set operations
all_link_pairs, removed_link_pairs = (
    link_dict_to_pairs(link_dict) for link_dict in (all_links, removed_links)
)

In [None]:
# Build each set once; every confusion-matrix entry below reuses them.
pred_pairs = set(node2vec_pred)
true_pairs = set(all_link_pairs)
known_pairs = set(removed_link_pairs)

# False positive predictions node2vec
# i.e. it predicted a link that isn't there
fp_node2vec = pred_pairs - true_pairs

# False negative predictions node2vec
# i.e. it didn't predict a link that should have been there.
# Pairs in removed_links are never offered to predict_links as candidates,
# so they are not counted as misses.
fn_node2vec = true_pairs - pred_pairs - known_pairs

# True positive predictions node2vec
# i.e. all correct predictions
tp_node2vec = pred_pairs & true_pairs

# True negative predictions node2vec
# i.e. all links that were correctly not predicted.
# Only ordered pairs of two *different* nodes are possible links:
# predict_links never compares a node with itself, so counting (i, i)
# would inflate the true negatives by one per node.
all_possible_pairs = set(itertools.permutations(range(highest_node_id + 1), 2))
tn_node2vec = all_possible_pairs - true_pairs - pred_pairs


In [None]:
# Print the four confusion-matrix sets on one line, each preceded by its label
labelled_sets = [
    ('fp: ', fp_node2vec),
    ('fn: ', fn_node2vec),
    ('tp: ', tp_node2vec),
    ('tn: ', tn_node2vec),
]
print(*[part for entry in labelled_sets for part in entry])

In [None]:
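# Accuracy
# TODO: accuracy = (TP + TN) / (TP + TN + FP + FN), e.g.
# accuracy_node2vec = (len(tp_node2vec) + len(tn_node2vec)) / (
#     len(tp_node2vec) + len(tn_node2vec) + len(fp_node2vec) + len(fn_node2vec))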