<a href="https://colab.research.google.com/github/BonanYang/GNN/blob/main/GNN_GraphSAGE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import urllib.request
import zipfile
import scipy.sparse as sp
import torch.nn as nn
import torch.optim as optim
from torch.optim import Adam
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import scipy.sparse as sp

In [2]:
import os

# Fetch the Reddit archive only when it is not already on disk, so re-running
# this cell does not download the whole file again.
if not os.path.exists('reddit.zip'):
    # Download under a temporary name and rename afterwards: an interrupted
    # transfer then never leaves a truncated 'reddit.zip' that a later run
    # would mistake for a complete archive.
    urllib.request.urlretrieve('https://data.dgl.ai/dataset/reddit.zip', 'reddit.zip.part')
    os.replace('reddit.zip.part', 'reddit.zip')

# Unpack every member of the archive into the 'reddit/' directory.
with zipfile.ZipFile('reddit.zip', 'r') as z:
    z.extractall('reddit/')

# Data at a glance

In [None]:
# Open both extracted .npz archives with NumPy for a first look:
# `data` is the node-data archive, `graph` the raw graph archive.
data, graph = (
    np.load('reddit/reddit_data.npz'),
    np.load('reddit/reddit_graph.npz'),
)

In [None]:
# Small preview table: the first 10 feature columns of the first 10 nodes,
# rounded to two decimals, with each node's label appended as a last column.
feature_preview = data['feature'][:10, :10]
label_preview = data['label'][:10]

df1 = pd.DataFrame(feature_preview).round(2)
df1['label'] = label_preview
print(df1)

In [None]:
# Display the first row of the preview table (its rounded features and label).
df1.iloc[0]

In [None]:
# Full table: one row per entry of data['feature'], with the matching
# entry of data['label'] appended as a 'label' column.
all_features = data['feature']
all_labels = data['label']

df = pd.DataFrame(all_features)
df['label'] = all_labels

In [None]:
# Display the first row of the full table: every feature column plus the label.
df.iloc[0]

In [None]:
# Load the graph as a SciPy sparse matrix and report, one per line,
# its concrete class, its shape and its number of stored entries.
adj = sp.load_npz('reddit/reddit_graph.npz')
for summary in (type(adj), adj.shape, adj.nnz):
    print(summary)

# MLP

In [None]:

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Reload the node data and turn it into tensors for the MLP baseline.
data = np.load('reddit/reddit_data.npz')
node_types = data['node_types']
labels = torch.LongTensor(data['label'])
features = torch.FloatTensor(data['feature'])

# node_types codes each node's split: 1 -> train, 2 -> validation, 3 -> test.
# For every code, collect the positions that carry it.
train_idx, val_idx, test_idx = (
    np.where(node_types == split_code)[0] for split_code in (1, 2, 3)
)

class RedditDataset(Dataset):
    """Map-style dataset over a chosen subset of node ids.

    Each item is the ``(feature, label)`` pair of one node, looked up in the
    module-level ``features`` and ``labels`` tensors at access time.
    """

    def __init__(self, idx):
        # idx: indexable collection of node ids that make up this split.
        self.idx = idx

    def __len__(self):
        # One sample per stored node id.
        n_samples = len(self.idx)
        return n_samples

    def __getitem__(self, i):
        # Translate the dataset position into a node id, then fetch its data.
        node_id = self.idx[i]
        sample = (features[node_id], labels[node_id])
        return sample

# Mini-batch iterator over the training nodes: 1024 samples per batch,
# reshuffled every epoch.
train_dataset = RedditDataset(train_idx)
train_loader = DataLoader(train_dataset, batch_size=1024, shuffle=True)

class MLP(nn.Module):
    """Three-layer perceptron: two ReLU hidden layers with dropout, then a
    linear output layer that returns raw (un-normalised) scores.

    Args:
        in_dim: size of each input feature vector.
        hidden_dim: width of both hidden layers.
        out_dim: number of output scores per sample.
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, out_dim)
        # A single dropout module (p=0.5) is reused after both hidden layers.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return the output scores for a batch ``x`` of input vectors."""
        hidden = x
        # Both hidden layers share the same pattern: linear -> ReLU -> dropout.
        for layer in (self.fc1, self.fc2):
            hidden = self.dropout(torch.relu(layer(hidden)))
        return self.fc3(hidden)

# Run on the GPU when one is visible, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# 602 input features, 256 hidden units, 41 output scores.
model = MLP(602, 256, 41).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)


for epoch in range(100):
    # Training pass: one optimizer step per mini-batch from train_loader.
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)
        logits = model(batch_x)
        loss = criterion(logits, batch_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation pass: the whole validation split in a single forward call,
    # with dropout disabled and no gradient tracking.
    model.eval()
    with torch.no_grad():
        val_out = model(features[val_idx].to(device))
        val_pred = val_out.argmax(1)
        val_target = labels[val_idx].to(device)
        val_acc = (val_pred == val_target).float().mean().item()

    # The reported loss is the average of the per-batch losses of this epoch.
    print(f"Epoch {epoch+1:02d} | Loss: {total_loss/len(train_loader):.4f} | Val Acc: {val_acc:.4f}")


# Final evaluation on the held-out test split.
model.eval()
with torch.no_grad():
    test_out = model(features[test_idx].to(device))
    test_pred = test_out.argmax(1)
    test_target = labels[test_idx].to(device)
    test_acc = (test_pred == test_target).float().mean().item()
print(f"\nTest Accuracy: {test_acc:.4f}")

# GraphSAGE

In [4]:


# Node data (features, labels, split codes) for the GraphSAGE experiment.
data = np.load('reddit/reddit_data.npz')

# Graph structure as a sparse matrix, converted to CSR format.
adj = sp.load_npz('reddit/reddit_graph.npz')
adj = adj.tocsr()

In [5]:
class GraphSAGELayer(nn.Module):
    """One GraphSAGE layer with mean neighbor aggregation.

    For every target node the layer returns
    ``W(mean of its neighbors' rows in X) + B(its own row in X)``.
    No non-linearity is applied here; the caller adds one if needed.

    Args:
        in_dim: size of each input row.
        out_dim: size of each output row.
    """

    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.W = nn.Linear(in_dim, out_dim)  # applied to the neighbor mean
        self.B = nn.Linear(in_dim, out_dim)  # applied to the node's own row

    def forward(self, X, self_idx, neigh_lists):
        """Compute the layer output for a set of target nodes.

        Args:
            X: 2-D tensor whose rows are indexed by ``self_idx`` and by the
                entries of ``neigh_lists``.
            self_idx: row indices of the target nodes, one per output row.
            neigh_lists: one collection of row indices per target node, in
                the same order as ``self_idx``; may be empty for a node
                without neighbors.

        Returns:
            Tensor with one row per target node and ``out_dim`` columns.
        """
        in_dim = X.size(1)
        # Nodes without neighbors aggregate to a zero vector. Create it with
        # X's dtype and device so it matches the layer weights even when the
        # model runs in double or half precision.
        no_neighbors = torch.zeros(in_dim, dtype=X.dtype, device=X.device)
        neighbor_feats = [
            X[neighs].mean(dim=0) if len(neighs) > 0 else no_neighbors
            for neighs in neigh_lists
        ]
        if neighbor_feats:
            neighbor_feats = torch.stack(neighbor_feats)
        else:
            # torch.stack rejects an empty list; an empty batch yields an
            # empty (0, in_dim) aggregate instead of raising.
            neighbor_feats = torch.zeros((0, in_dim), dtype=X.dtype, device=X.device)
        return self.W(neighbor_feats) + self.B(X[self_idx])


class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE model with uniform neighbor sampling.

    Args:
        in_dim: size of each input feature row.
        hid_dim: size of the hidden representation after the first layer.
        out_dim: size of each output row.
        S1: maximum number of neighbors sampled per node when computing the
            first-layer (hidden) representations. Defaults to 25.
        S2: maximum number of neighbors sampled per batch node for the
            second layer. Defaults to 10.
    """

    def __init__(self, in_dim, hid_dim, out_dim, S1=25, S2=10):
        super().__init__()
        self.sage1 = GraphSAGELayer(in_dim, hid_dim)
        self.sage2 = GraphSAGELayer(hid_dim, out_dim)
        self.S1, self.S2 = S1, S2

    def sample(self, nodes, adj_list, size):
        """Pick at most ``size`` neighbors for every node in ``nodes``.

        Args:
            nodes: iterable of node ids.
            adj_list: ``adj_list[n]`` is the list of neighbor ids of node ``n``.
            size: maximum number of neighbors kept per node.

        Returns:
            Dict mapping each node id to a list of neighbor ids. Lists no
            longer than ``size`` are returned unchanged (the same object);
            longer ones are replaced by ``size`` distinct entries drawn with
            NumPy's global random generator.
        """
        sampled = {}
        for n in nodes:
            neighs = adj_list[n]
            if len(neighs) > size:
                neighs = np.random.choice(neighs, size, replace=False).tolist()
            sampled[n] = neighs
        return sampled

    def forward(self, X, batch, adj_list):
        """Compute output rows for the nodes in ``batch``.

        Neighbors are sampled on every call, in eval mode as well.

        Args:
            X: 2-D feature tensor with one row per node id.
            batch: iterable of node ids to produce outputs for.
            adj_list: ``adj_list[n]`` is the list of neighbor ids of node ``n``.

        Returns:
            Tensor with one row per entry of ``batch`` (same order) and
            ``out_dim`` columns; no activation is applied to it.
        """
        # Normalise ids to plain ints so NumPy or tensor scalars hash and
        # compare like the ints stored in adj_list when used as dict keys.
        batch = [int(n) for n in batch]

        # Second-layer neighborhood: sampled neighbors of the batch nodes.
        batch_neighs = self.sample(batch, adj_list, self.S2)

        # Every node whose hidden representation the second layer needs:
        # the batch nodes themselves plus their sampled neighbors.
        L1_nodes = set(batch)
        for neighs in batch_neighs.values():
            L1_nodes.update(neighs)
        L1_nodes = list(L1_nodes)

        # First layer: hidden representations for all of those nodes.
        L1_neighs = self.sample(L1_nodes, adj_list, self.S1)
        H1 = F.relu(self.sage1(X, L1_nodes, [L1_neighs[n] for n in L1_nodes]))

        # H1 is indexed by position in L1_nodes, so translate node ids into
        # row positions before running the second layer.
        L1_map = {n: i for i, n in enumerate(L1_nodes)}
        batch_idx = [L1_map[n] for n in batch]
        neigh_idx = [[L1_map[nb] for nb in batch_neighs[n]] for n in batch]
        H2 = self.sage2(H1, batch_idx, neigh_idx)

        return H2

In [None]:
# Node features and labels as tensors, plus the per-node split codes.
X = torch.FloatTensor(data['feature'])
y = torch.LongTensor(data['label'])
node_types = data['node_types']

# node_types codes each node's split: 1 -> train, 2 -> validation, 3 -> test.
train_idx = np.where(node_types == 1)[0]
val_idx = np.where(node_types == 2)[0]
test_idx = np.where(node_types == 3)[0]

# Plain-Python neighbor lists, one per row of the adjacency matrix.
# Slicing the CSR index arrays directly yields the same lists as
# adj[i].indices but avoids building a temporary sparse matrix per node.
csr = adj.tocsr()
row_bounds = csr.indptr.tolist()
adj_list = [
    csr.indices[start:stop].tolist()
    for start, stop in zip(row_bounds[:-1], row_bounds[1:])
]

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X = X.to(device)
y = y.to(device)

# 602 input features, 256 hidden units, 41 output scores.
model = GraphSAGE(602, 256, 41).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

batch_size = 512
for epoch in range(10):
    model.train()
    # Reshuffle the training nodes in place so every epoch sees new batches.
    np.random.shuffle(train_idx)
    total_loss = 0.0

    for i in range(0, len(train_idx), batch_size):
        batch_nodes = train_idx[i:i+batch_size]
        out = model(X, batch_nodes, adj_list)
        loss = F.cross_entropy(out, y[batch_nodes])

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        # cross_entropy returns the mean over the batch; weight it by the
        # batch size so a smaller final batch does not skew the epoch mean.
        total_loss += loss.item() * len(batch_nodes)

    # Validation: the whole split in one forward call, without gradients.
    model.eval()
    with torch.no_grad():
        val_out = model(X, val_idx, adj_list)
        val_acc = (val_out.argmax(1) == y[val_idx]).float().mean().item()

    print(f"Epoch {epoch+1} | Loss: {total_loss/len(train_idx):.4f} | Val Acc: {val_acc:.4f}")

# Final evaluation on the held-out test split.
model.eval()
with torch.no_grad():
    test_out = model(X, test_idx, adj_list)
    test_acc = (test_out.argmax(1) == y[test_idx]).float().mean().item()
print(f"\nTest Acc: {test_acc:.4f}")

In [19]:
# Sanity check: the largest neighbor id stored in adj_list should be a valid
# row index of X. Nodes without neighbors count as 0 in the maximum.
print(f"X shape: {X.shape}")
print(f"max node in adj_list: {max(max(neighs) if neighs else 0 for neighs in adj_list)}")

X shape: torch.Size([232965, 602])
max node in adj_list: 232964
