<a href="https://colab.research.google.com/github/Adamphoenix003/GNN-LinkPrediction/blob/main/CoraDataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [57]:
import pandas as pd
import numpy as np
import networkx as nx
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    roc_auc_score,
    average_precision_score,
    confusion_matrix,
    classification_report
)
from node2vec import Node2Vec
from sklearn.linear_model import LogisticRegression
import random

In [58]:


content_path = "/content/sample_data/cora.content"

cora_content = pd.read_csv(
    content_path,
    sep="\t",
    header=None
)

print("Shape:", cora_content.shape)
cora_content.head()


Shape: (2708, 1435)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1425,1426,1427,1428,1429,1430,1431,1432,1433,1434
0,31336,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,Neural_Networks
1,1061127,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,Rule_Learning
2,1106406,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
3,13195,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Reinforcement_Learning
4,37879,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Probabilistic_Methods


In [59]:
paper_ids = cora_content.iloc[:, 0]
features = cora_content.iloc[:, 1:-1]
labels = cora_content.iloc[:, -1]

print("Features shape:", features.shape)
print("Labels shape:", labels.shape)


Features shape: (2708, 1433)
Labels shape: (2708,)


In [60]:
cites_path = "/content/sample_data/cora.cites"

cora_cites = pd.read_csv(
    cites_path,
    sep="\t",
    header=None
)

cora_cites.columns = ["cited", "citing"]
cora_cites.head()


Unnamed: 0,cited,citing
0,35,1033
1,35,103482
2,35,103515
3,35,1050679
4,35,1103960


In [61]:
# Create mapping
id_map = {id_: i for i, id_ in enumerate(paper_ids)}

# Map citation IDs
cora_cites["cited"] = cora_cites["cited"].map(id_map)
cora_cites["citing"] = cora_cites["citing"].map(id_map)

cora_cites.head()


Unnamed: 0,cited,citing
0,163,402
1,163,659
2,163,1696
3,163,2295
4,163,1274


In [62]:

G = nx.DiGraph()

# citing → cited
G.add_edges_from(
    zip(cora_cites["citing"], cora_cites["cited"])
)

print("Nodes:", G.number_of_nodes())
print("Edges:", G.number_of_edges())


Nodes: 2708
Edges: 5429


In [63]:

G = G.to_undirected()


In [66]:
edges = list(G.edges())
non_edges = list(nx.non_edges(G))

# Sample negative edges equal to positive edges
np.random.seed(42)
non_edges_sample = np.random.choice(len(non_edges), len(edges), replace=False)
non_edges_sample = [non_edges[i] for i in non_edges_sample]

# Split positive edges
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)

# Split negative edges
train_non_edges, test_non_edges = train_test_split(non_edges_sample, test_size=0.2, random_state=42)


In [67]:
node2vec = Node2Vec(
    G,
    dimensions=128,
    walk_length=30,
    num_walks=200,
    workers=4
)

model = node2vec.fit(
    window=10,
    min_count=1,
    batch_words=4
)


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]

In [68]:
embeddings = {str(node): model.wv[str(node)] for node in G.nodes()}


In [69]:
def edge_embedding(edge):
    node1, node2 = edge
    emb1 = embeddings[str(node1)]
    emb2 = embeddings[str(node2)]
    return emb1 * emb2


In [70]:

X_train = []
y_train = []

for edge in train_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(1)

for edge in train_non_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(0)

X_train = np.array(X_train)
y_train = np.array(y_train)

#test split

X_test = []
y_test = []

for edge in test_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(1)

for edge in test_non_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(0)

X_test = np.array(X_test)
y_test = np.array(y_test)


In [71]:

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Probabilities for AUC & AP
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Binary predictions (threshold = 0.5)
y_pred = (y_pred_proba >= 0.5).astype(int)

# -------------------------
# Metrics
# -------------------------

# ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)

# Average Precision
ap = average_precision_score(y_test, y_pred_proba)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Node2Vec Link Prediction")
print("ROC-AUC:", auc)
print("Average Precision (AP):", ap)

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Node2Vec Link Prediction
ROC-AUC: 0.9955915691000918
Average Precision (AP): 0.9949591330699443

Confusion Matrix:
[[1029   27]
 [  24 1032]]

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.97      0.98      1056
           1       0.97      0.98      0.98      1056

    accuracy                           0.98      2112
   macro avg       0.98      0.98      0.98      2112
weighted avg       0.98      0.98      0.98      2112



**99% That is almost certainly data leakage.Lets train only on the train edges instead of the whole graph**

In [72]:
edges = list(G.edges())
train_edges, test_edges = train_test_split(edges, test_size=0.2, random_state=42)

G_train = G.copy()
G_train.remove_edges_from(test_edges)


**p = 1, q = 0.5**

In [84]:
# p = 1
# q = 0.5


node2vec = Node2Vec(
    G_train,
    dimensions=128,
    walk_length=30,
    num_walks=150,
    p=1,
    q=0.5,
    workers=1
)

model4 = node2vec.fit(
    window=10,
    min_count=1,
    epochs=5
)


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|          | 0/150 [00:00<?, ?it/s][A
Generating walks (CPU: 1):   1%|▏         | 2/150 [00:00<00:46,  3.16it/s][A
Generating walks (CPU: 1):   2%|▏         | 3/150 [00:01<01:05,  2.24it/s][A
Generating walks (CPU: 1):   3%|▎         | 4/150 [00:01<01:13,  1.98it/s][A
Generating walks (CPU: 1):   3%|▎         | 5/150 [00:02<01:18,  1.85it/s][A
Generating walks (CPU: 1):   4%|▍         | 6/150 [00:03<01:22,  1.74it/s][A
Generating walks (CPU: 1):   5%|▍         | 7/150 [00:03<01:12,  1.98it/s][A
Generating walks (CPU: 1):   5%|▌         | 8/150 [00:03<01:05,  2.17it/s][A
Generating walks (CPU: 1):   6%|▌         | 9/150 [00:04<00:59,  2.35it/s][A
Generating walks (CPU: 1):   7%|▋         | 10/150 [00:04<00:56,  2.50it/s][A
Generating walks (CPU: 1):   7%|▋         | 11/150 [00:04<00:53,  2.58it/s][A
Generating walks (CPU: 1):   8%|▊         | 12/150 [00:05<00:51,  2.66it/s][A
Generating walks (CPU: 1):   9%|▊         | 13/150 [00:05<00:50,  2.

In [85]:
embeddings = {str(node): model4.wv[str(node)] for node in G_train.nodes()}


non_edges = list(nx.non_edges(G_train))

# Sample equal number of negatives as positives
random.seed(42)
train_non_edges = random.sample(non_edges, len(train_edges))
test_non_edges = random.sample(
    list(set(non_edges) - set(train_non_edges)),
    len(test_edges)
)


In [86]:
def edge_embedding(edge):
    u, v = edge
    emb_u = embeddings[str(u)]
    emb_v = embeddings[str(v)]
    return emb_u * emb_v  # Hadamard product




In [87]:
X_train = []
y_train = []

# Positive edges
for edge in train_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(1)

# Negative edges
for edge in train_non_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(0)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_test = []
y_test = []

# Positive test edges (UNSEEN during embedding training)
for edge in test_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(1)

# Negative test edges
for edge in test_non_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(0)

X_test = np.array(X_test)
y_test = np.array(y_test)

In [88]:


# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Probabilities for AUC & AP
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Binary predictions (threshold = 0.5)
y_pred = (y_pred_proba >= 0.5).astype(int)

# -------------------------
# Metrics
# -------------------------

# ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)

# Average Precision
ap = average_precision_score(y_test, y_pred_proba)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Node2Vec Link Prediction")
print("ROC-AUC:", auc)
print("Average Precision (AP):", ap)

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Node2Vec Link Prediction
ROC-AUC: 0.8479230336030762
Average Precision (AP): 0.8776935629318559

Confusion Matrix:
[[1029   27]
 [ 499  557]]

Classification Report:
              precision    recall  f1-score   support

           0       0.67      0.97      0.80      1056
           1       0.95      0.53      0.68      1056

    accuracy                           0.75      2112
   macro avg       0.81      0.75      0.74      2112
weighted avg       0.81      0.75      0.74      2112



**p = 1, q = 0.25**

In [42]:
# p = 1
# q = 0.25


node2vec = Node2Vec(
    G_train,
    dimensions=128,
    walk_length=30,
    num_walks=150,
    p=1,
    q=0.25,
    workers=1
)

model2 = node2vec.fit(
    window=10,
    min_count=1,
    epochs=5
)


Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|          | 0/150 [00:00<?, ?it/s][A
Generating walks (CPU: 1):   1%|▏         | 2/150 [00:00<00:27,  5.44it/s][A
Generating walks (CPU: 1):   2%|▏         | 3/150 [00:00<00:36,  4.01it/s][A
Generating walks (CPU: 1):   3%|▎         | 4/150 [00:01<00:40,  3.61it/s][A
Generating walks (CPU: 1):   3%|▎         | 5/150 [00:01<00:43,  3.31it/s][A
Generating walks (CPU: 1):   4%|▍         | 6/150 [00:01<00:45,  3.16it/s][A
Generating walks (CPU: 1):   5%|▍         | 7/150 [00:02<00:46,  3.11it/s][A
Generating walks (CPU: 1):   5%|▌         | 8/150 [00:02<00:46,  3.03it/s][A
Generating walks (CPU: 1):   6%|▌         | 9/150 [00:02<00:47,  2.98it/s][A
Generating walks (CPU: 1):   7%|▋         | 10/150 [00:03<00:46,  2.99it/s][A
Generating walks (CPU: 1):   7%|▋         | 11/150 [00:03<00:46,  2.96it/s][A
Generating walks (CPU: 1):   8%|▊         | 12/150 [00:03<00:46,  2.95it/s][A
Generating walks (CPU: 1):   9%|▊         | 13/150 [00:04<00:46,  2.

In [89]:
embeddings = {str(node): model2.wv[str(node)] for node in G_train.nodes()}


In [90]:
X_train = []
y_train = []

# Positive edges
for edge in train_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(1)

# Negative edges
for edge in train_non_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(0)

X_train = np.array(X_train)
y_train = np.array(y_train)

X_test = []
y_test = []

# Positive test edges (UNSEEN during embedding training)
for edge in test_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(1)

# Negative test edges
for edge in test_non_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(0)

X_test = np.array(X_test)
y_test = np.array(y_test)

In [91]:
# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Probabilities for AUC & AP
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Binary predictions (threshold = 0.5)
y_pred = (y_pred_proba >= 0.5).astype(int)

# -------------------------
# Metrics
# -------------------------

# ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)

# Average Precision
ap = average_precision_score(y_test, y_pred_proba)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Node2Vec Link Prediction")
print("ROC-AUC:", auc)
print("Average Precision (AP):", ap)

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Node2Vec Link Prediction
ROC-AUC: 0.8573945240759872
Average Precision (AP): 0.8865467402756483

Confusion Matrix:
[[1030   26]
 [ 468  588]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.98      0.81      1056
           1       0.96      0.56      0.70      1056

    accuracy                           0.77      2112
   macro avg       0.82      0.77      0.76      2112
weighted avg       0.82      0.77      0.76      2112



**p = 1, q= 0.2**

In [92]:
# p = 1 , q = 0.2
node2vec = Node2Vec(
    G_train,




    dimensions=128,
    walk_length=30,
    num_walks=150,
    p=1,
    q=0.2,
    workers=1
)

model5 = node2vec.fit(
    window=10,
    min_count=1,
    epochs=5
)

Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|          | 0/150 [00:00<?, ?it/s][A
Generating walks (CPU: 1):   1%|▏         | 2/150 [00:00<00:25,  5.77it/s][A
Generating walks (CPU: 1):   2%|▏         | 3/150 [00:00<00:37,  3.93it/s][A
Generating walks (CPU: 1):   3%|▎         | 4/150 [00:01<00:42,  3.44it/s][A
Generating walks (CPU: 1):   3%|▎         | 5/150 [00:01<00:44,  3.27it/s][A
Generating walks (CPU: 1):   4%|▍         | 6/150 [00:01<00:47,  3.06it/s][A
Generating walks (CPU: 1):   5%|▍         | 7/150 [00:02<00:47,  3.04it/s][A
Generating walks (CPU: 1):   5%|▌         | 8/150 [00:02<00:46,  3.02it/s][A
Generating walks (CPU: 1):   6%|▌         | 9/150 [00:02<00:48,  2.92it/s][A
Generating walks (CPU: 1):   7%|▋         | 10/150 [00:03<00:47,  2.92it/s][A
Generating walks (CPU: 1):   7%|▋         | 11/150 [00:03<00:47,  2.94it/s][A
Generating walks (CPU: 1):   8%|▊         | 12/150 [00:03<00:47,  2.88it/s][A
Generating walks (CPU: 1):   9%|▊         | 13/150 [00:04<00:47,  2.

In [93]:
embeddings = {str(node): model5.wv[str(node)] for node in G_train.nodes()}
# training dta

X_train = []
y_train = []

# Positive edges
for edge in train_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(1)

# Negative edges
for edge in train_non_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(0)

X_train = np.array(X_train)
y_train = np.array(y_train)

#testing data
X_test = []
y_test = []

# Positive test edges (UNSEEN during embedding training)
for edge in test_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(1)

# Negative test edges
for edge in test_non_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(0)

X_test = np.array(X_test)
y_test = np.array(y_test)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Probabilities for AUC & AP
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Binary predictions (threshold = 0.5)
y_pred = (y_pred_proba >= 0.5).astype(int)

# -------------------------
# Metrics
# -------------------------

# ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)

# Average Precision
ap = average_precision_score(y_test, y_pred_proba)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Node2Vec Link Prediction")
print("p = 1 and q = 0.2")
print("ROC-AUC:", auc)
print("Average Precision (AP):", ap)

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Node2Vec Link Prediction
p = 1 and q = 0.2
ROC-AUC: 0.843731616592057
Average Precision (AP): 0.8747584590430282

Confusion Matrix:
[[1021   35]
 [ 473  583]]

Classification Report:
              precision    recall  f1-score   support

           0       0.68      0.97      0.80      1056
           1       0.94      0.55      0.70      1056

    accuracy                           0.76      2112
   macro avg       0.81      0.76      0.75      2112
weighted avg       0.81      0.76      0.75      2112



**p = 1, q = 0.15**

In [94]:
# p = 1 , q = 0.15
node2vec = Node2Vec(
    G_train,
    dimensions=128,
    walk_length=30,
    num_walks=150,
    p=1,
    q=0.15,
    workers=1
)

model6 = node2vec.fit(
    window=10,
    min_count=1,
    epochs=5
)

Computing transition probabilities:   0%|          | 0/2708 [00:00<?, ?it/s]


Generating walks (CPU: 1):   0%|          | 0/150 [00:00<?, ?it/s][A
Generating walks (CPU: 1):   1%|▏         | 2/150 [00:00<00:27,  5.36it/s][A
Generating walks (CPU: 1):   2%|▏         | 3/150 [00:00<00:37,  3.90it/s][A
Generating walks (CPU: 1):   3%|▎         | 4/150 [00:01<00:43,  3.36it/s][A
Generating walks (CPU: 1):   3%|▎         | 5/150 [00:01<00:47,  3.06it/s][A
Generating walks (CPU: 1):   4%|▍         | 6/150 [00:01<00:49,  2.94it/s][A
Generating walks (CPU: 1):   5%|▍         | 7/150 [00:02<00:50,  2.85it/s][A
Generating walks (CPU: 1):   5%|▌         | 8/150 [00:02<00:51,  2.78it/s][A
Generating walks (CPU: 1):   6%|▌         | 9/150 [00:02<00:50,  2.80it/s][A
Generating walks (CPU: 1):   7%|▋         | 10/150 [00:03<00:51,  2.73it/s][A
Generating walks (CPU: 1):   7%|▋         | 11/150 [00:03<00:50,  2.73it/s][A
Generating walks (CPU: 1):   8%|▊         | 12/150 [00:04<00:50,  2.73it/s][A
Generating walks (CPU: 1):   9%|▊         | 13/150 [00:04<00:51,  2.

In [95]:
embeddings = {str(node): model6.wv[str(node)] for node in G_train.nodes()}
# training dta

X_train = []
y_train = []

# Positive edges
for edge in train_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(1)

# Negative edges
for edge in train_non_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(0)

X_train = np.array(X_train)
y_train = np.array(y_train)

#testing data
X_test = []
y_test = []

# Positive test edges (UNSEEN during embedding training)
for edge in test_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(1)

# Negative test edges
for edge in test_non_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(0)

X_test = np.array(X_test)
y_test = np.array(y_test)

# Train classifier
clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

# Probabilities for AUC & AP
y_pred_proba = clf.predict_proba(X_test)[:, 1]

# Binary predictions (threshold = 0.5)
y_pred = (y_pred_proba >= 0.5).astype(int)

# -------------------------
# Metrics
# -------------------------

# ROC-AUC
auc = roc_auc_score(y_test, y_pred_proba)

# Average Precision
ap = average_precision_score(y_test, y_pred_proba)

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Node2Vec Link Prediction")
print("p = 1 and q = 0.15")
print("ROC-AUC:", auc)
print("Average Precision (AP):", ap)

print("\nConfusion Matrix:")
print(cm)

print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Node2Vec Link Prediction
p = 1 and q = 0.15
ROC-AUC: 0.8455188425447658
Average Precision (AP): 0.8768273083126923

Confusion Matrix:
[[1029   27]
 [ 472  584]]

Classification Report:
              precision    recall  f1-score   support

           0       0.69      0.97      0.80      1056
           1       0.96      0.55      0.70      1056

    accuracy                           0.76      2112
   macro avg       0.82      0.76      0.75      2112
weighted avg       0.82      0.76      0.75      2112



**MATRIX FACTORIZATION METHOD OF LINK PREDICTION**

In [108]:
from sklearn.decomposition import TruncatedSVD
A = nx.to_numpy_array(G_train)

degree = np.sum(A, axis=1)
D_inv_sqrt = np.diag(1.0 / np.sqrt(degree + 1e-10))

A_norm = D_inv_sqrt @ A @ D_inv_sqrt

# -------------------------
# SVD
# -------------------------

svd = TruncatedSVD(n_components=128, random_state=42)
node_embeddings = svd.fit_transform(A_norm)

In [103]:
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import normalize
A = normalize(A, norm='l2')
svd = TruncatedSVD(n_components=128, random_state=42)
node_embeddings = svd.fit_transform(A)

print("Embedding shape:", node_embeddings.shape)


Embedding shape: (2708, 128)


In [109]:
non_edges = list(nx.non_edges(G_train))

random.seed(42)
train_non_edges = random.sample(non_edges, len(train_edges))
test_non_edges = random.sample(
    list(set(non_edges) - set(train_non_edges)),
    len(test_edges)
)

In [110]:
def edge_embedding(edge):
    u, v = edge
    return node_embeddings[u] * node_embeddings[v]


In [111]:
X_train = []
y_train = []

for edge in train_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(1)

for edge in train_non_edges:
    X_train.append(edge_embedding(edge))
    y_train.append(0)

X_train = np.array(X_train)
y_train = np.array(y_train)


X_test = []
y_test = []

for edge in test_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(1)

for edge in test_non_edges:
    X_test.append(edge_embedding(edge))
    y_test.append(0)

X_test = np.array(X_test)
y_test = np.array(y_test)


In [112]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, average_precision_score

clf = LogisticRegression(max_iter=1000)
clf.fit(X_train, y_train)

y_pred_proba = clf.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, y_pred_proba)
ap = average_precision_score(y_test, y_pred_proba)

print("Matrix Factorization AUC:", auc)
print("Matrix Factorization AP:", ap)


Matrix Factorization AUC: 0.5
Matrix Factorization AP: 0.5
