# Link Prediction

We take the embedded data and calculate the cosine similarity between nodes. Cosine similarity is equal to normalized dot product similarity (source: https://zhang-yang.medium.com/cosine-similarity-dot-product-for-normalized-vectors-c07bdb61c9d1). If nodes are similar, but not linked yet, we predict a link between them.

## Imports

In [1]:
import json
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, classification_report

In [2]:
# Open link dictionaries
with open('preprocessed_data/all_links.json', 'r') as fp:
    all_links = json.load(fp)

with open('preprocessed_data/removed_links.json', 'r') as fp:
    removed_links = json.load(fp)

## Functions

In [3]:
def predict_links(df):
    # Initialize empty predicted links
    predicted_links = []

    highest_node_id = int(list(removed_links.keys())[-1])

    # Iterate over all nodes and compare them to nodes they are not yet linked to
    # Predicts link when cosine similarity > 70%
    for node in range(0, highest_node_id + 1):
        # Slices of dataframes containing the node itself
        # and the nodes it is not connected to
        node_df = df[df.index == node]
        unconnected_nodes = [n for n in range(0, highest_node_id + 1) 
                             if n not in removed_links[str(node)] +
                             [node]]
        unconnected_nodes_df = df[df.index.isin(unconnected_nodes)]

        # Cosine similarity between the node itself
        # and the nodes it is not already connected to
        similarity_scores = cosine_similarity(node_df, unconnected_nodes_df)[0].tolist()
        similarity_dict = dict(zip(unconnected_nodes, similarity_scores))
            
        # If similarity > 70%, append to the list of predictions
        for pair in similarity_dict.items():
            if pair[1] > 0.7:
                predicted_links.append((node, pair[0]))

    return predicted_links

In [4]:
def link_dict_to_pairs(link_dict):
    pairs = []

    highest_node_id = int(list(link_dict.keys())[-1])

    # Add a bidirectional pair for each link
    # E.g. (1,0) and (0,1)
    for node in range(0, highest_node_id + 1):
        for neighbor in link_dict[str(node)]:
            pairs.append((node, neighbor))
            pairs.append((neighbor, node))

    return pairs

## Predict for Node2Vec

In [5]:
# Import and make predictions
node2vec = pd.read_csv('embedded_data/node2vec.csv')
node2vec_pred = predict_links(node2vec)
node2vec_pred

[(0, 4),
 (0, 8),
 (0, 23),
 (0, 24),
 (0, 40),
 (0, 49),
 (0, 52),
 (0, 59),
 (0, 61),
 (0, 62),
 (0, 63),
 (0, 75),
 (0, 76),
 (0, 81),
 (0, 82),
 (0, 88),
 (0, 89),
 (0, 90),
 (0, 92),
 (0, 106),
 (0, 110),
 (0, 112),
 (0, 115),
 (0, 118),
 (0, 126),
 (0, 134),
 (0, 141),
 (0, 143),
 (0, 149),
 (0, 150),
 (0, 156),
 (0, 157),
 (0, 176),
 (0, 177),
 (0, 182),
 (0, 200),
 (0, 205),
 (0, 214),
 (0, 225),
 (0, 231),
 (0, 233),
 (0, 237),
 (0, 239),
 (0, 241),
 (0, 249),
 (0, 263),
 (0, 264),
 (0, 266),
 (0, 272),
 (0, 278),
 (0, 289),
 (0, 300),
 (0, 307),
 (0, 309),
 (0, 312),
 (0, 315),
 (0, 319),
 (0, 321),
 (0, 326),
 (0, 327),
 (0, 333),
 (0, 339),
 (0, 345),
 (0, 347),
 (1, 0),
 (1, 3),
 (1, 5),
 (1, 7),
 (1, 9),
 (1, 10),
 (1, 11),
 (1, 13),
 (1, 15),
 (1, 16),
 (1, 18),
 (1, 21),
 (1, 22),
 (1, 24),
 (1, 25),
 (1, 26),
 (1, 27),
 (1, 30),
 (1, 33),
 (1, 35),
 (1, 36),
 (1, 37),
 (1, 38),
 (1, 39),
 (1, 40),
 (1, 42),
 (1, 43),
 (1, 45),
 (1, 47),
 (1, 50),
 (1, 51),
 (1, 55),
 (

## Evaluate

In [6]:
# Convert the dictionaries to lists of pairs for easier comparison
all_link_pairs = link_dict_to_pairs(all_links)
removed_link_pairs = link_dict_to_pairs(removed_links)

In [7]:
# Merge predictions with already existing links
all_links_and_pred_node2vec = removed_link_pairs + node2vec_pred

# Compare predictions + existing links with all actual links
# It takes a while to run

# False positive predictions node2vec
# i.e. it predicted a link that isn't there
fp_node2vec = [x for x in all_links_and_pred_node2vec if x not in set(all_link_pairs)]
fp_rate_node2vec = fp_node2vec.len() / node2vec_pred.len()

# False negative predictions node2vec
# i.e. it didnt predict a link that should have been there
fn_node2vec = [x for x in all_link_pairs if x not in set(all_links_and_pred_node2vec)]
fn_rate_node2vec = fn_node2vec.len() / all_link_pairs.len()

tp_rate_node2vec = 1 - fp_rate_node2vec
tn_rate_node2vec = 1 - fn_rate_node2vec

KeyboardInterrupt: 

KeyboardInterrupt: 