<a href="https://colab.research.google.com/github/BonanYang/GNN/blob/main/GNN_GraphSAGE.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import urllib.request
import zipfile
import scipy.sparse as sp
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Fetch the Reddit archive only when no usable local copy exists, so that
# "Restart & Run All" does not re-download the file on every run.
# zipfile.is_zipfile() returns False for a missing file as well as for a file
# that is not a valid zip (typically the case after an interrupted download),
# so both situations trigger a fresh download.
if not zipfile.is_zipfile('reddit.zip'):
    urllib.request.urlretrieve('https://data.dgl.ai/dataset/reddit.zip', 'reddit.zip')

# Unpack the archive into reddit/; later cells read the .npz files from there.
with zipfile.ZipFile('reddit.zip', 'r') as z:
    z.extractall('reddit/')

In [2]:
# Open the two .npz archives that were extracted into reddit/:
# `data` is indexed by key in later cells ('feature', 'label', ...),
# `graph` is the archive that holds the graph.
data_path, graph_path = 'reddit/reddit_data.npz', 'reddit/reddit_graph.npz'
data = np.load(data_path)
graph = np.load(graph_path)

In [3]:
# Small preview: the first 10 rows x first 10 columns of the feature matrix,
# rounded to two decimals, with the matching labels appended as a 'label' column.
preview_features = data['feature'][:10, :10]
preview_labels = data['label'][:10]
df1 = pd.DataFrame(preview_features).round(2).assign(label=preview_labels)
print(df1)

      0     1     2     3     4     5     6     7     8     9  label
0  1.23  9.04 -0.92  1.05 -1.11 -0.02  0.04  2.15 -0.91  0.71     30
1 -0.14 -0.20  0.13 -0.42  0.11  0.30 -0.94 -0.98 -0.10  0.63     17
2 -0.13 -0.20 -0.03  0.31  0.07  1.35  0.70 -0.66  1.14 -1.34     18
3 -0.14 -0.20  0.18  0.57  0.37 -0.13 -0.13  0.39  1.67  0.05     23
4 -0.16  0.01 -0.99  1.67  1.60 -0.30 -0.06  0.79 -0.78  0.74     22
5 -0.13 -0.21  1.10  0.35  0.04 -0.08 -0.38 -1.02  0.45 -0.39     15
6 -0.10 -0.19 -0.91 -0.30  0.20 -0.17 -0.52  0.34  0.00  0.34     33
7 -0.14 -0.21 -2.06  0.07 -1.05 -0.56  0.53 -0.29  0.88 -1.09     14
8 -0.16 -0.20  4.09  0.12 -0.90  2.71  1.37 -0.66 -0.85  1.28     38
9 -0.16 -0.20  0.93  0.15  2.47  2.12 -2.59  0.61 -1.09 -1.56     18


In [None]:
# Inspect the first row of the preview frame; as the last expression of the
# cell it is rendered as the cell's output.
df1.iloc[0]

Unnamed: 0,0
0,1.23
1,9.04
2,-0.92
3,1.05
4,-1.11
5,-0.02
6,0.04
7,2.15
8,-0.91
9,0.71


In [None]:
# Full table: every row of the feature matrix, with the labels appended as a
# final 'label' column (inserted at the end, i.e. after the last feature column).
df = pd.DataFrame(data['feature'])
df.insert(df.shape[1], 'label', data['label'])

In [None]:
# Inspect the first row of the full feature/label frame; as the last
# expression of the cell it is rendered as the cell's output.
df.iloc[0]

Unnamed: 0,0
0,1.233415
1,9.043012
2,-0.923280
3,1.054183
4,-1.112501
...,...
598,-0.443911
599,-0.257895
600,0.311193
601,-0.377212


In [None]:
# Load the graph as a scipy sparse matrix and print a quick summary:
# its concrete class, its shape, and the number of stored entries.
adj = sp.load_npz('reddit/reddit_graph.npz')
for summary_item in (type(adj), adj.shape, adj.nnz):
    print(summary_item)

<class 'scipy.sparse._coo.coo_matrix'>
(232965, 232965)
114615892


In [None]:

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# Re-open the node archive and copy features / labels into CPU torch tensors
# (float32 features, int64 labels — the dtypes CrossEntropyLoss is fed below).
data = np.load('reddit/reddit_data.npz')
features = torch.tensor(data['feature'], dtype=torch.float32)
labels = torch.tensor(data['label'], dtype=torch.int64)
node_types = data['node_types']

# node_types codes 1 / 2 / 3 select the train / validation / test node indices.
train_idx, val_idx, test_idx = (
    np.nonzero(node_types == split_code)[0] for split_code in (1, 2, 3)
)

class RedditDataset(Dataset):
    """Map-style dataset yielding ``(feature_vector, label)`` for selected nodes.

    Args:
        idx: Sequence/array of node ids that make up this split.
        node_features: Optional tensor indexed by node id. When omitted, the
            module-level ``features`` tensor is looked up at access time
            (the original behaviour).
        node_labels: Optional tensor indexed by node id. When omitted, the
            module-level ``labels`` tensor is looked up at access time.

    Passing the tensors explicitly removes the dependency on notebook globals,
    so the dataset can be reused with other tensors and tested in isolation.
    """

    def __init__(self, idx, node_features=None, node_labels=None):
        self.idx = idx
        self.node_features = node_features
        self.node_labels = node_labels

    def __len__(self):
        """Number of nodes in this split."""
        return len(self.idx)

    def __getitem__(self, i):
        """Return the ``(features, label)`` pair of the i-th node of the split."""
        node_id = self.idx[i]
        # Fall back to the notebook-level tensors only when none were supplied.
        feats = features if self.node_features is None else self.node_features
        labs = labels if self.node_labels is None else self.node_labels
        return feats[node_id], labs[node_id]

# Mini-batch iterator over the training split: 1024 nodes per batch,
# reshuffled every time the loader is iterated.
train_loader = DataLoader(
    dataset=RedditDataset(train_idx),
    batch_size=1024,
    shuffle=True,
)

class MLP(nn.Module):
    """Three-layer perceptron: two hidden ReLU layers with dropout, linear head.

    Args:
        in_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        out_dim: Number of output logits.
    """

    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        # Layers are created in the order fc1, fc2, fc3 so parameter
        # registration (and therefore initialisation order) stays the same.
        self.fc1 = nn.Linear(in_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.fc3 = nn.Linear(hidden_dim, out_dim)
        # A single dropout module (p=0.5) is shared by both hidden layers.
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        """Return raw (unnormalised) logits for the batch ``x``."""
        hidden = x
        for layer in (self.fc1, self.fc2):
            # linear -> ReLU -> dropout for each hidden layer
            hidden = self.dropout(torch.relu(layer(hidden)))
        return self.fc3(hidden)

# Seed torch's random number generators before the model is built, so that
# weight initialisation, dropout masks and DataLoader shuffling are repeatable
# across "Restart & Run All" (GPU kernels may still add small nondeterminism).
SEED = 42
torch.manual_seed(SEED)

# Train on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The input width is taken from the feature matrix instead of a hard-coded 602;
# 256 is the hidden width and 41 the number of output logits
# (presumably the number of label classes in this dataset — confirm).
model = MLP(features.shape[1], 256, 41).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()


def _evaluate(node_idx):
    """Score the current ``model`` on the nodes selected by ``node_idx``.

    Puts the model in eval mode and runs one full-batch forward pass without
    gradient tracking, using the notebook-level ``features``, ``labels`` and
    ``device``.

    Args:
        node_idx: Index array into ``features`` / ``labels``.

    Returns:
        ``(logits, accuracy)``: the model outputs for the selected nodes and
        the fraction of them whose arg-max prediction equals the label.
    """
    model.eval()
    with torch.no_grad():
        logits = model(features[node_idx].to(device))
        accuracy = (logits.argmax(1) == labels[node_idx].to(device)).float().mean().item()
    return logits, accuracy


for epoch in range(100):
    # One optimisation pass over the shuffled training mini-batches.
    model.train()
    total_loss = 0
    for batch_x, batch_y in train_loader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_x), batch_y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    # Validation accuracy after every epoch (same helper as the final test).
    val_out, val_acc = _evaluate(val_idx)

    # Loss is reported as the mean over mini-batches of this epoch.
    print(f"Epoch {epoch+1:02d} | Loss: {total_loss/len(train_loader):.4f} | Val Acc: {val_acc:.4f}")


# Final score on the held-out test split.
test_out, test_acc = _evaluate(test_idx)
print(f"\nTest Accuracy: {test_acc:.4f}")

Epoch 01 | Loss: 2.0036 | Val Acc: 0.5988
Epoch 02 | Loss: 1.7953 | Val Acc: 0.6110
Epoch 03 | Loss: 1.7420 | Val Acc: 0.6205
Epoch 04 | Loss: 1.7227 | Val Acc: 0.6189
Epoch 05 | Loss: 1.7086 | Val Acc: 0.6128
Epoch 06 | Loss: 1.6862 | Val Acc: 0.6131
Epoch 07 | Loss: 1.6782 | Val Acc: 0.6179
Epoch 08 | Loss: 1.6724 | Val Acc: 0.6172
Epoch 09 | Loss: 1.6597 | Val Acc: 0.6248
Epoch 10 | Loss: 1.6807 | Val Acc: 0.6200
Epoch 11 | Loss: 1.6773 | Val Acc: 0.6192
Epoch 12 | Loss: 1.6757 | Val Acc: 0.6215
Epoch 13 | Loss: 1.6746 | Val Acc: 0.6252
Epoch 14 | Loss: 1.6711 | Val Acc: 0.6227
Epoch 15 | Loss: 1.6718 | Val Acc: 0.6232
Epoch 16 | Loss: 1.6498 | Val Acc: 0.6345
Epoch 17 | Loss: 1.6535 | Val Acc: 0.6268
Epoch 18 | Loss: 1.6590 | Val Acc: 0.6312
Epoch 19 | Loss: 1.6506 | Val Acc: 0.6340
Epoch 20 | Loss: 1.6374 | Val Acc: 0.6338
Epoch 21 | Loss: 1.6447 | Val Acc: 0.6287
Epoch 22 | Loss: 1.6432 | Val Acc: 0.6322
Epoch 23 | Loss: 1.6200 | Val Acc: 0.6361
Epoch 24 | Loss: 1.6227 | Val Acc: