In [22]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from sklearn.metrics import roc_auc_score
from torch_geometric.datasets import Planetoid
from torch_geometric.utils import negative_sampling
import pandas as pd
import torch
import networkx as nx
import scipy.sparse as sp
from torch_geometric.nn import SAGEConv
import torch.nn.functional as F

In [23]:
class GraphSAGE(nn.Module):
    def __init__(self, in_channels, hidden_channels, out_channels, num_layers):
        super(GraphSAGE, self).__init__()
        self.convs = nn.ModuleList()
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        for i in range(num_layers - 2):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))
        self.convs.append(SAGEConv(hidden_channels, out_channels))
        
    def forward(self, x, edge_index):
        for i, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            if i != len(self.convs) - 1:
                x = F.relu(x)
                x = F.dropout(x, p=0.5, training=self.training)
        return x

In [4]:
path = "hw2_data/"
dss = ['dataset1','dataset2','dataset3'] #datasets
dataset = dict()
for ds in dss:
    dataset[ds] = dict()
    dataset[ds]['content'] = pd.read_csv(path+ds+"/content.csv",delimiter = '\t',header = None)
    dataset[ds]['train'] = pd.read_csv(path+ds+"/train.csv",delimiter = ',')
    dataset[ds]['test'] = pd.read_csv(path+ds+"/test.csv",delimiter = ',')
    dataset[ds]['upload'] = pd.read_csv(path+ds+"/upload.csv",delimiter = ',')
dataset[dss[2]]['test'].head()

Unnamed: 0,id,to,from
0,E370,26,317
1,E667,196,323
2,E3190,739,468
3,E848,576,156
4,E2161,466,199


In [5]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Device:', device)

Device: cuda


In [10]:
# Load the Cora dataset
dataset = Planetoid(root='/tmp/cora', name='Cora')

In [11]:
# Extract node features and edges from the dataset
x = dataset.data.x
edge_index = dataset.data.edge_index

In [26]:
# Define a binary classification task for link prediction
train_mask = dataset.data.train_mask
pos_edge_index = edge_index[:, torch.where(train_mask)[0]]
neg_edge_index = negative_sampling(
    edge_index=edge_index, num_nodes=dataset.data.num_nodes,
    num_neg_samples=pos_edge_index.size(1))

x = nn.functional.normalize(x, p=2, dim=-1)

# Train the model on the positive and negative edges
model = GraphSAGE(in_channels=dataset.num_features, hidden_channels=16,
                  out_channels=1, num_layers=2)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.BCEWithLogitsLoss()

for epoch in range(200):
    optimizer.zero_grad()
    pos_pred = model(x, pos_edge_index).flatten()
    neg_pred = model(x, neg_edge_index).flatten()
    pred = torch.cat([pos_pred, neg_pred])
    pos_labels = torch.ones(pos_pred.size(0), dtype=torch.float)
    neg_labels = torch.zeros(neg_pred.size(0), dtype=torch.float)
    labels = torch.cat([pos_labels, neg_labels])
    loss = criterion(pred, labels)
    loss.backward()
    optimizer.step()

    train_auc = roc_auc_score(labels.cpu().detach().numpy(), 
                              pred.cpu().detach().numpy())

    print(f'Epoch: {epoch + 1:03d}, Loss: {loss:.4f}, Train AUC: {train_auc:.4f}')

Epoch: 001, Loss: 0.6943, Train AUC: 0.4941
Epoch: 002, Loss: 0.6929, Train AUC: 0.5171
Epoch: 003, Loss: 0.6916, Train AUC: 0.5337
Epoch: 004, Loss: 0.6907, Train AUC: 0.5404
Epoch: 005, Loss: 0.6907, Train AUC: 0.5310
Epoch: 006, Loss: 0.6891, Train AUC: 0.5473
Epoch: 007, Loss: 0.6875, Train AUC: 0.5612
Epoch: 008, Loss: 0.6865, Train AUC: 0.5492
Epoch: 009, Loss: 0.6854, Train AUC: 0.5515
Epoch: 010, Loss: 0.6845, Train AUC: 0.5512
Epoch: 011, Loss: 0.6830, Train AUC: 0.5484
Epoch: 012, Loss: 0.6818, Train AUC: 0.5505
Epoch: 013, Loss: 0.6808, Train AUC: 0.5478
Epoch: 014, Loss: 0.6790, Train AUC: 0.5586
Epoch: 015, Loss: 0.6782, Train AUC: 0.5576
Epoch: 016, Loss: 0.6768, Train AUC: 0.5444
Epoch: 017, Loss: 0.6749, Train AUC: 0.5548
Epoch: 018, Loss: 0.6740, Train AUC: 0.5651
Epoch: 019, Loss: 0.6729, Train AUC: 0.5633
Epoch: 020, Loss: 0.6721, Train AUC: 0.5577
Epoch: 021, Loss: 0.6719, Train AUC: 0.5567
Epoch: 022, Loss: 0.6691, Train AUC: 0.5741
Epoch: 023, Loss: 0.6679, Train 

Epoch: 196, Loss: 0.6356, Train AUC: 0.5913
Epoch: 197, Loss: 0.6360, Train AUC: 0.5921
Epoch: 198, Loss: 0.6362, Train AUC: 0.5844
Epoch: 199, Loss: 0.6363, Train AUC: 0.5894
Epoch: 200, Loss: 0.6372, Train AUC: 0.5869


In [27]:
# Evaluate the model on a test set of positive and negative edges
test_mask = dataset.data.test_mask
pos_edge_index = edge_index[:, torch.where(test_mask)[0]]
neg_edge_index = negative_sampling(
    edge_index=edge_index, num_nodes=dataset.data.num_nodes,
    num_neg_samples=pos_edge_index.size(1))

model.eval()
with torch.no_grad():
    pos_pred = model(x, pos_edge_index).flatten()
    neg_pred = model(x, neg_edge_index).flatten()
    pred = torch.cat([pos_pred, neg_pred])
    pos_labels = torch.ones(pos_pred.size(0), dtype=torch.float)
    neg_labels = torch.zeros(neg_pred.size(0), dtype=torch.float)
    labels = torch.cat([pos_labels, neg_labels])
    test_auc = roc_auc_score(labels.cpu().detach().numpy(), 
                             pred.cpu().detach().numpy())
    print(f'Test AUC: {test_auc:.4f}')

Test AUC: 0.5211


In [31]:
dataset.data

Data(x=[2708, 1433], edge_index=[2, 10556], y=[2708], train_mask=[2708], val_mask=[2708], test_mask=[2708])

In [37]:
np.unique(dataset.data.y)

array([0, 1, 2, 3, 4, 5, 6], dtype=int64)

In [35]:
2708*3

8124