In [None]:
import pandas as pd
import networkx as nx
import dgl
    
# # pandas reads csv
# edges_data = pd.read_csv('data/knowledge_aquisition_reference.csv')
# # networkx reads pandas
# g_nx: nx.DiGraph = nx.from_pandas_edgelist(edges_data,
#                                                'paper_id',
#                                                'reference_id',
#                                                create_using=nx.DiGraph())

# # dgl reads networkx
# # ATTENTION!!!: nodes in the dgl graph are ordered by paper_id
# g = dgl.from_networkx(g_nx)
# print(g.number_of_nodes())
# print(g.number_of_edges())

In [None]:
import dgl.function as fn
import torch as th
import torch.nn as nn
import torch.nn.functional as F

# Message function: every edge carries its source node's feature 'h' as message 'm'.
# `fn.copy_u` is the current name of this builtin; the older `fn.copy_src`
# spelling is deprecated and is not available in recent DGL releases.
gcn_msg = fn.copy_u('h', 'm')
# Reduce function: every node sums its incoming messages 'm' and stores the result in 'h'.
gcn_reduce = fn.sum(msg='m', out='h')

class GCNLayer(nn.Module):
    """One graph-convolution step: sum-aggregate over edges, then a linear map.

    Message passing uses the module-level ``gcn_msg`` / ``gcn_reduce`` builtins;
    the aggregated node features are then projected by
    ``nn.Linear(in_feats, out_feats)``.
    """

    def __init__(self, in_feats, out_feats):
        super().__init__()
        self.linear = nn.Linear(in_feats, out_feats)

    def forward(self, g, feature):
        """Aggregate ``feature`` over graph ``g`` and apply the linear projection.

        Args:
            g: graph object providing ``local_scope``, ``ndata`` and ``update_all``.
            feature: per-node input features, stored temporarily as ``ndata['h']``.

        Returns:
            The output of ``self.linear`` applied to the aggregated features.
        """
        # The local scope keeps the temporary 'h' entry from leaking into the
        # caller's graph: it is discarded again when the `with` block ends.
        with g.local_scope():
            g.ndata['h'] = feature
            g.update_all(gcn_msg, gcn_reduce)
            aggregated = g.ndata['h']
        return self.linear(aggregated)

class Net(nn.Module):
    """Three stacked ``GCNLayer`` modules with ReLU between them.

    The layer widths used to be hard-coded; they are now keyword arguments
    whose defaults reproduce the original architecture, so ``Net()`` builds
    exactly the same model as before.

    Args:
        in_feats: width of the input node features (default 1433).
        hidden1: output width of the first layer (default 160).
        hidden2: output width of the second layer (default 16).
        num_classes: output width of the last layer, i.e. the number of
            logits produced per node (default 6). When the output is used with
            ``F.nll_loss``, every label must lie in ``[0, num_classes)``.
    """

    def __init__(self, in_feats=1433, hidden1=160, hidden2=16, num_classes=6):
        super().__init__()
        self.layer1 = GCNLayer(in_feats, hidden1)
        self.layer2 = GCNLayer(hidden1, hidden2)
        self.layer3 = GCNLayer(hidden2, num_classes)

    def forward(self, g, features):
        """Return per-node logits (no softmax applied) for graph ``g``."""
        x = F.relu(self.layer1(g, features))
        x = F.relu(self.layer2(g, x))
        # The last layer is left un-activated so the caller can apply log_softmax.
        return self.layer3(g, x)

# Instantiate the model and print its layer structure as a quick sanity check.
net = Net()
print(net)

In [None]:
from dgl.data import citation_graph as citegrh
import networkx as nx

def load_cora_data():
    """Load the Cora citation dataset as a DGL graph with one self-loop per node.

    Uses the dataset-object API (``data[0]`` plus ``g.ndata``) that matches the
    ``dgl.from_networkx``-era DGL used elsewhere in this notebook, instead of
    the legacy ``data.features`` / ``data.graph`` attributes and the
    ``dgl.DGLGraph(nx_graph)`` constructor.

    Returns:
        tuple: ``(g, features, labels, train_mask, test_mask)`` where ``g`` is
        the DGL graph and the remaining items are float, long, bool and bool
        tensors respectively, read from the graph's node data.
    """
    data = citegrh.load_cora()
    g = data[0]
    features = g.ndata['feat'].float()
    labels = g.ndata['label'].long()
    train_mask = g.ndata['train_mask'].bool()
    test_mask = g.ndata['test_mask'].bool()
    # Drop any existing self-loops first so that adding one per node below
    # cannot leave a node with two of them.
    g = dgl.remove_self_loop(g)
    g = dgl.add_self_loop(g)
    return g, features, labels, train_mask, test_mask



In [None]:
def load_paper_data(edges_path='data/knowledge_aquisition_reference.csv',
                    labels_path='data/rank_id.csv',
                    feat_dim=1433):
    """Build the paper-citation graph and its training tensors from CSV files.

    Args:
        edges_path: CSV with ``paper_id`` and ``reference_id`` columns; each
            row becomes a directed edge ``paper_id -> reference_id``.
        labels_path: CSV with a ``reference_count`` column used as the label.
        feat_dim: width of the (all-zero) node feature matrix.

    Returns:
        tuple: ``(g, features, labels, train_mask, test_mask)``.
        ``features`` is a float32 zero matrix of shape ``(num_nodes, feat_dim)``.
        Nodes whose label is 0 are excluded from both masks; of the remaining
        nodes, those with ``index % 10 <= 1`` go to the training mask and the
        rest to the test mask.

    Raises:
        ValueError: if the label file has fewer rows than the graph has nodes.
    """
    # pandas reads csv
    edges_data = pd.read_csv(edges_path)
    # networkx reads pandas
    g_nx = nx.from_pandas_edgelist(edges_data,
                                   'paper_id',
                                   'reference_id',
                                   create_using=nx.DiGraph())

    # dgl reads networkx
    # ATTENTION: nodes in the dgl graph are expected to be ordered by paper_id,
    # and labels are matched to nodes purely by position (row i <-> node i).
    # NOTE(review): confirm that the label file follows the same ordering.
    g = dgl.from_networkx(g_nx)
    num_nodes = g.number_of_nodes()

    # Allocate the zero features directly in torch (no numpy round-trip).
    features = th.zeros((num_nodes, feat_dim), dtype=th.float32)

    label_pd = pd.read_csv(labels_path)
    labels = th.tensor(label_pd['reference_count'].to_numpy(), dtype=th.long)
    if labels.shape[0] < num_nodes:
        raise ValueError(
            'label file has {} rows but the graph has {} nodes'.format(
                labels.shape[0], num_nodes))

    # Vectorised split: same rule as a per-node loop, without touching the
    # tensors element by element from Python.
    node_ids = th.arange(num_nodes)
    has_label = labels[:num_nodes] != 0
    in_train_bucket = (node_ids % 10) <= 1
    train_mask = has_label & in_train_bucket
    test_mask = has_label & ~in_train_bucket
    return g, features, labels, train_mask, test_mask


In [None]:
def evaluate(model, g, features, labels, mask):
    """Compute classification accuracy of ``model`` on the nodes selected by ``mask``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        model: callable module invoked as ``model(g, features)``, returning
            one row of logits per node.
        g: graph passed through to the model.
        features: node features passed through to the model.
        labels: ground-truth class index per node.
        mask: boolean selector of the nodes to score.

    Returns:
        tuple: ``(labels, indices, accuracy)`` where ``labels`` and ``indices``
        are the masked ground truth and the predicted classes, and ``accuracy``
        is the fraction of matches. If the mask selects no nodes, ``accuracy``
        is ``nan`` (undefined) instead of raising ``ZeroDivisionError``.
    """
    model.eval()
    with th.no_grad():
        logits = model(g, features)[mask]
        labels = labels[mask]
        # Predicted class = position of the largest logit in each row.
        _, indices = th.max(logits, dim=1)
        total = len(labels)
        if total == 0:
            # Nothing to score: accuracy is undefined for an empty selection.
            return labels, indices, float('nan')
        correct = th.sum(indices == labels)
        return labels, indices, correct.item() * 1.0 / total

In [None]:
import time
import numpy as np
# Alternative data source: uncomment the next line (and comment out the
# load_paper_data call) to run on the Cora dataset instead.
# g, features, labels, train_mask, test_mask = load_cora_data()
print('loading graph')
g, features, labels, train_mask, test_mask = load_paper_data()
print('done')

In [None]:
# Create a fresh model instance so the training cell below starts from newly
# initialised weights each time this cell is re-run.
net = Net()

In [None]:
# Adam at a fixed learning rate. StepLR with gamma=1 multiplies the rate by 1
# every 10 epochs, so as configured it leaves the learning rate unchanged.
optimizer = th.optim.Adam(net.parameters(), lr=0.04)
scheduler = th.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=1)
dur = []  # wall-clock seconds of each training step (evaluation excluded)
for epoch in range(100):
    t0 = time.time()

    # Forward pass over the whole graph; only training nodes contribute to the loss.
    net.train()
    optimizer.zero_grad()
    logits = net(g, features)
    logp = F.log_softmax(logits, dim=1)
    loss = F.nll_loss(logp[train_mask], labels[train_mask])

    # Backward pass, parameter update, then advance the LR schedule.
    loss.backward()
    optimizer.step()
    scheduler.step()

    dur.append(time.time() - t0)

    # Accuracy on the test-mask nodes after this epoch's update; tmpl / tmpi
    # hold the masked labels and the predicted classes.
    tmpl, tmpi, acc = evaluate(net, g, features, labels, test_mask)
    print("Epoch {:05d} | Loss {:.4f} | Test Acc {:.4f} | Time(s) {:.4f}".format(
        epoch, loss.item(), acc, np.mean(dur)))