In [1]:
import torch
import torch.nn as nn
import torch.optim as optim 
import numpy as np
import networkx as nx
import random
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from torch_geometric.utils import to_networkx
from torch_geometric.datasets import Planetoid



# Prepare data

In [2]:
# Seed every RNG this notebook draws from (the `random` shuffle for the
# train/test split, the numpy-driven random walks, and torch weight init) so
# that a fresh "Restart & Run All" produces the same walks, split and model.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Load the Cora Planetoid dataset stored under ./data and take its first graph.
dataset = Planetoid('./data', 'Cora')
data = dataset[0]
# networkx copy of the graph; the random walks query it for neighbours.
data_nx = to_networkx(data)

# Task 1 Node2Vec

In [3]:
def node2vec_once(G, start_node, walk_length, p, q):
    """Sample one second-order biased random walk (node2vec) from ``start_node``.

    The first step is uniform over the current node's neighbours. Every later
    step weights each candidate neighbour by its relation to the previously
    visited node: ``1 / p`` for stepping back to it, ``1`` for a candidate
    that is also a neighbour of it, and ``1 / q`` otherwise. The weights are
    then normalised into probabilities.

    Args:
        G: Graph object exposing ``neighbors(node)`` (e.g. a networkx graph).
        start_node: Node the walk starts from.
        walk_length: Maximum number of nodes in the returned walk.
        p: Return parameter; must be non-zero.
        q: In-out parameter; must be non-zero.

    Returns:
        List of visited nodes beginning with ``start_node``. It is shorter
        than ``walk_length`` when the walk reaches a node without neighbours.
    """
    walk = [start_node]
    current_node = start_node
    previous_node = None

    for _ in range(walk_length - 1):
        idx2neighbor = list(set(G.neighbors(current_node)))
        if not idx2neighbor:
            # Dead end (isolated node, or a sink in a directed graph): stop
            # here instead of letting np.random.choice fail on an empty list.
            break
        neighbor2idx = {node: idx for idx, node in enumerate(idx2neighbor)}

        if previous_node is None:
            probabilities = np.ones(len(idx2neighbor)) / len(idx2neighbor)
        else:
            weights = np.full(len(idx2neighbor), 1 / q)

            # Candidates that are also neighbours of the previous node.
            neighbor_prev = set(G.neighbors(previous_node))
            common_neighbor = [neighbor2idx[n] for n in neighbor_prev if n in neighbor2idx]
            if common_neighbor:
                weights[common_neighbor] = 1

            # Assign the return weight last so it is not overwritten when the
            # previous node has a self-loop, and only when stepping back is
            # actually possible (it may not be in a directed graph).
            if previous_node in neighbor2idx:
                weights[neighbor2idx[previous_node]] = 1 / p

            probabilities = weights / weights.sum()

        previous_node = current_node
        # Sample a position rather than a node so that nodes keep their
        # native Python type and need not be numpy-convertible.
        choice = np.random.choice(len(idx2neighbor), p=probabilities)
        current_node = idx2neighbor[choice]
        walk.append(current_node)

    return walk


def node2vec(G, num_walks, walk_length, p, q):
    """Build the walk corpus: ``num_walks`` rounds of one walk per node of ``G``.

    Each round starts a walk from every node in ``G.nodes()`` via
    ``node2vec_once``; a tqdm bar tracks the rounds.

    Returns:
        List of walks, each a list of nodes.
    """
    corpus = []

    for _ in tqdm(range(num_walks)):
        corpus.extend(
            node2vec_once(G, origin, walk_length, p, q) for origin in G.nodes()
        )

    return corpus


In [4]:
# Walk hyperparameters: 100 rounds of 30-node walks from every node.
# p weights stepping back to the previous node; q weights moving to nodes
# that are not adjacent to it.
num_walks = 100
walk_length = 30
p = 0.5
q = 2
# Generating the corpus takes about 5 minutes.
node2vec_walks = node2vec(
    G=data_nx, num_walks=num_walks, walk_length=walk_length, p=p, q=q
)

100%|██████████| 100/100 [04:56<00:00,  2.96s/it]


In [5]:
# Train skip-gram (sg=1) Word2Vec on the walks, treating each walk as a
# sentence of node ids; takes about 40 s.
# NOTE(review): Word2Vec runs with its default seed and worker settings here;
# confirm whether run-to-run reproducibility of the embeddings is required.
dimensions = 100
node2vec_model = Word2Vec(
    sentences=node2vec_walks,
    vector_size=dimensions,
    window=5,
    min_count=1,
    sg=1,
)
# One embedding row per node, in the graph's node iteration order.
node2vec_embeds = np.stack([node2vec_model.wv[node] for node in data_nx.nodes()])

In [6]:
# Sanity check: expect one row per graph node and `dimensions` columns.
node2vec_embeds.shape

(2708, 100)

## Train MLP Classifier

In [7]:
# Overwrite the node features on `data` with the learned node2vec embeddings;
# every later cell that reads data.x therefore sees the embeddings.
data.x = torch.tensor(node2vec_embeds)

In [8]:
# Display the Data object to confirm that x now has the embedding width.
data

Data(x=[2708, 100], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [9]:
# Random 90/10 split over node indices. This deliberately ignores the
# train/val/test masks carried by `data` and shuffles all nodes instead.
all_samples = list(range(data.x.shape[0]))
N_train = len(all_samples) * 9 // 10
random.shuffle(all_samples)
train_samples, test_samples = all_samples[:N_train], all_samples[N_train:]
len(train_samples), len(test_samples)

(2437, 271)

In [10]:
# Gather the feature rows and labels that belong to each side of the split.
train_x = data.x[train_samples]
train_y = data.y[train_samples]
test_x = data.x[test_samples]
test_y = data.y[test_samples]
train_x.shape, test_x.shape, train_y.shape, test_y.shape

(torch.Size([2437, 100]),
 torch.Size([271, 100]),
 torch.Size([2437]),
 torch.Size([271]))

In [52]:
class MLP(nn.Module):
    """Three-layer perceptron: two ReLU hidden layers and a linear output.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width shared by both hidden layers.
        output_dim: Number of output units (raw logits; no softmax applied).
    """

    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, output_dim)
        self.act = nn.ReLU()

    def forward(self, x):
        """Apply fc1 and fc2, each followed by ReLU, then return fc3's output."""
        hidden = self.act(self.fc1(x))
        hidden = self.act(self.fc2(hidden))
        return self.fc3(hidden)

In [48]:
# Network size follows the data: one input per embedding dimension and one
# output logit per distinct label value.
input_dim = data.x.size(1)
hidden_dim = 128
output_dim = data.y.unique().numel()

model = MLP(input_dim=input_dim, hidden_dim=hidden_dim, output_dim=output_dim)
# Adam starting at lr=1e-2, with an exponential schedule using gamma=0.99.
optimizer = optim.Adam(model.parameters(), lr=1e-2)
lr_schedule = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)

loss_fn = nn.CrossEntropyLoss()

In [53]:
# Print the layer structure of the instantiated MLP.
print(model)

MLP(
  (fc1): Linear(in_features=100, out_features=128, bias=True)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (fc3): Linear(in_features=128, out_features=7, bias=True)
  (act): ReLU()
)


In [50]:
# Full-batch training: each epoch does one forward/backward pass over all
# training nodes, one optimizer step, and one scheduler step.
epochs = 1000
for epoch in tqdm(range(epochs)):
    model.train()

    logits = model(train_x)
    loss = loss_fn(logits, train_y)

    # Clear stale gradients right before computing the new ones.
    optimizer.zero_grad()
    loss.backward()

    optimizer.step()
    lr_schedule.step()

100%|██████████| 1000/1000 [00:09<00:00, 110.22it/s]


In [51]:
# Score the trained MLP on the held-out nodes with gradient tracking disabled;
# the predicted class is the index of the largest logit.
model.eval()
with torch.no_grad():
    pred = torch.argmax(model(test_x), dim=1)
accuracy = accuracy_score(y_true=test_y, y_pred=pred)
print(accuracy)

0.8671586715867159


# Task 2 Other Classifiers

In [16]:
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier

In [17]:
# Baseline: support-vector classifier with default settings, fitted on the
# same embedding split as the MLP (tensors converted to numpy arrays).
svc = SVC().fit(train_x.detach().numpy(), train_y.detach().numpy())
pred_svc = svc.predict(test_x.detach().numpy())
accuracy = accuracy_score(y_true=test_y, y_pred=pred_svc)
print(accuracy)

0.8634686346863468


In [18]:
# Baseline: k-nearest-neighbours classifier with default settings, fitted on
# the same embedding split as the MLP (tensors converted to numpy arrays).
knn = KNeighborsClassifier().fit(train_x.detach().numpy(), train_y.detach().numpy())
pred_knn = knn.predict(test_x.detach().numpy())
accuracy = accuracy_score(y_true=test_y, y_pred=pred_knn)
print(accuracy)

0.8487084870848709


在使用Node2Vec特征进行分类时，MLP、SVC、kNN的准确率相近（分别约为86.7%、86.3%、84.9%），均在80%以上，表明Node2Vec提取的特征在这个数据集上适用于节点分类任务。

继续增加MLP的训练轮数，指标也不会有显著提升，表明87%左右可能是当前设置下利用Node2Vec提取的特征进行分类的一个上限。需要注意的是，该结论仅基于一次随机划分得到的271个测试样本，不同的划分可能会带来一定波动。