In [5]:
import dgl
from dgl.data import PubmedGraphDataset
from dgl import AddSelfLoop
from dgl import to_networkx
import numpy as np
import scipy.sparse as sp
import torch

# Load the dataset
transform = AddSelfLoop()  # Add self-loops
data = PubmedGraphDataset(transform=transform)
g = data[0]  # Get the first graph object from the dataset

# Convert to NetworkX graph to easily get adjacency matrix, 
# or use DGL's own functions for adjacency matrix
#adj = g.adjacency_matrix()  # Get adjacency matrix in CSR format #scipy_fmt="csr"

# Extract features
features = g.ndata['feat']  # Node features

# Extract labels
labels = g.ndata['label']  # Node labels

# Training, validation, and test indices
# Assuming these are provided in the dataset (common in citation network datasets)
# Otherwise, you'll need to create these splits
idx_train = torch.nonzero(g.ndata['train_mask'], as_tuple=False).squeeze()
idx_val = torch.nonzero(g.ndata['val_mask'], as_tuple=False).squeeze()
idx_test = torch.nonzero(g.ndata['test_mask'], as_tuple=False).squeeze()

# Convert adjacency matrix to dense format if needed
# adj_dense = adj.todense()

# Convert everything to the desired format, e.g., numpy arrays or torch tensors
# Note: This step depends on what format you need for your use case
features_np = features.numpy()  # Convert features to numpy array if needed
labels_np = labels.numpy()  # Convert labels to numpy array if needed

# idx_train, idx_val, and idx_test are already torch tensors; convert if needed

  NumNodes: 19717
  NumEdges: 88651
  NumFeats: 500
  NumClasses: 3
  NumTrainingSamples: 60
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [8]:
src, dst = g.edges()
num_nodes = g.num_nodes()

# Create a sparse adjacency matrix
# Note: Adjust 'dtype' as needed, e.g., for weighted graphs
adj_sparse = sp.coo_matrix((torch.ones(src.shape[0]), (src.numpy(), dst.numpy())),
                           shape=(num_nodes, num_nodes),
                           dtype=np.float32)

In [1]:
from scipy.sparse import load_npz
load_npz('pubmed_dgl/adj_sparse.npz')

FileNotFoundError: [Errno 2] No such file or directory: 'pubmed_dgl/adj_sparse.npz'

In [10]:
print(adj_sparse)

  (0, 14442)	1.0
  (0, 1378)	1.0
  (0, 1544)	1.0
  (0, 6092)	1.0
  (0, 7636)	1.0
  (1, 10199)	1.0
  (1, 8359)	1.0
  (1, 2943)	1.0
  (2, 11485)	1.0
  (2, 15572)	1.0
  (2, 10471)	1.0
  (3, 8249)	1.0
  (4, 14044)	1.0
  (5, 1312)	1.0
  (5, 12968)	1.0
  (6, 17284)	1.0
  (6, 8661)	1.0
  (6, 3150)	1.0
  (6, 18614)	1.0
  (6, 7296)	1.0
  (6, 2216)	1.0
  (6, 8981)	1.0
  (6, 13656)	1.0
  (6, 6572)	1.0
  (6, 3509)	1.0
  :	:
  (19692, 19692)	1.0
  (19693, 19693)	1.0
  (19694, 19694)	1.0
  (19695, 19695)	1.0
  (19696, 19696)	1.0
  (19697, 19697)	1.0
  (19698, 19698)	1.0
  (19699, 19699)	1.0
  (19700, 19700)	1.0
  (19701, 19701)	1.0
  (19702, 19702)	1.0
  (19703, 19703)	1.0
  (19704, 19704)	1.0
  (19705, 19705)	1.0
  (19706, 19706)	1.0
  (19707, 19707)	1.0
  (19708, 19708)	1.0
  (19709, 19709)	1.0
  (19710, 19710)	1.0
  (19711, 19711)	1.0
  (19712, 19712)	1.0
  (19713, 19713)	1.0
  (19714, 19714)	1.0
  (19715, 19715)	1.0
  (19716, 19716)	1.0


In [18]:
print(features.shape)
print(labels.shape)
print(labels)


torch.Size([19717, 500])
torch.Size([19717])
tensor([1, 1, 0,  ..., 2, 0, 2])


In [11]:
print(features)

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0554, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0114, 0.0047,  ..., 0.0000, 0.0000, 0.0000],
        [0.0531, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0145, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])


In [12]:
print(labels)

tensor([1, 1, 0,  ..., 2, 0, 2])


In [13]:
print(idx_train)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
        36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
        54, 55, 56, 57, 58, 59])


In [15]:
def encode_onehot(labels):
    # The classes must be sorted before encoding to enable static class encoding.
    # In other words, make sure the first class always maps to index 0.
    classes = sorted(list(set(labels)))
    classes_dict = {c: np.identity(len(classes))[i, :] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    return labels_onehot

def normalize_adj(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv_sqrt = np.power(rowsum, -0.5).flatten()
    r_inv_sqrt[np.isinf(r_inv_sqrt)] = 0.
    r_mat_inv_sqrt = sp.diags(r_inv_sqrt)
    return mx.dot(r_mat_inv_sqrt).transpose().dot(r_mat_inv_sqrt)


def normalize_features(mx):
    """Row-normalize sparse matrix"""
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx = r_mat_inv.dot(mx)
    return mx


def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()
    return correct / len(labels)

In [16]:
features = normalize_features(features)

In [20]:
features.shape
features = torch.FloatTensor(np.array(features))
print(features)

tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0554, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        ...,
        [0.0000, 0.0114, 0.0047,  ..., 0.0000, 0.0000, 0.0000],
        [0.0531, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0145, 0.0000,  ..., 0.0000, 0.0000, 0.0000]])


In [21]:
print(labels)

tensor([1, 1, 0,  ..., 2, 0, 2])


#ENDS HERE

In [None]:
g = dgl.graph(([0, 1, 2], [1, 2, 3]))
g.adj()

In [11]:


from dgl.data import CiteseerGraphDataset, CoraGraphDataset, PubmedGraphDataset
from dgl import AddSelfLoop

transform = (
        AddSelfLoop()
    )  # by default, it will first remove self-loops to prevent duplication
data = PubmedGraphDataset(transform=transform)



  NumNodes: 19717
  NumEdges: 88651
  NumFeats: 500
  NumClasses: 3
  NumTrainingSamples: 60
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


Graph(num_nodes=19717, num_edges=108365,
      ndata_schemes={'feat': Scheme(shape=(500,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'val_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})

In [None]:
path = "/Users/ariellerosinski/My Drive/Cambridge/MLMI4/Project/Code/pyGAT/data/cora/Pubmed-Diabetes.NODE.paper.tab"