### Load and preprocess data

In [1]:
from collections import defaultdict, OrderedDict

# Locations of the two tab-separated input files and the length of the
# dense TF-IDF vector built for every node.
DATA_DIR = "data/PubMed/"
EDGE_PATH = DATA_DIR + "Pubmed-Diabetes.DIRECTED.cites.tab"
NODE_PATH = DATA_DIR + "Pubmed-Diabetes.NODE.paper.tab"
TF_IDF_DIM = 500

# Load and process graph links
print("Loading and processing graph links...")
node_pairs = set()  # unique (src, dest) node-id pairs
with open(EDGE_PATH, 'r') as f:
    next(f)  # skip header
    next(f)  # skip header
    for line in f:
        columns = line.split()
        if not columns:
            continue  # tolerate blank lines (e.g. a trailing newline)
        # The node id follows a six-character prefix in columns 1 and 3.
        src = int(columns[1][6:])
        dest = int(columns[3].strip()[6:])
        node_pairs.add((src, dest))

# Load and process graph nodes
print("Loading and processing graph nodes...")
node2vec = OrderedDict()  # node id -> dense TF-IDF vector (list of floats)
node2label = dict()       # node id -> zero-based class label
class_1 = list()
class_2 = list()
class_3 = list()
nodes_by_label = {1: class_1, 2: class_2, 3: class_3}
with open(NODE_PATH, 'r') as f:
    next(f)  # skip header
    # Second header line: every field after the first is colon-separated and
    # its second component is the vocabulary word.
    vocabs = [e.split(':')[1] for e in next(f).split()[1:]]

    # Word -> position lookup so each term costs O(1) instead of a linear
    # vocabs.index() scan. setdefault keeps the first position of a word,
    # matching what list.index() would return.
    vocab_index = {}
    for position, vocab_word in enumerate(vocabs):
        vocab_index.setdefault(vocab_word, position)

    for line in f:
        columns = line.split()
        if not columns:
            continue  # tolerate blank lines
        node = int(columns[0])
        label = int(columns[1][-1])  # last character of the label field
        tf_idf_vec = [0.0] * TF_IDF_DIM

        # columns[2:-1] are "word=value" entries; the final column is skipped.
        for e in columns[2:-1]:
            word, value = e.split('=')
            tf_idf_vec[vocab_index[word]] = float(value)

        node2vec[node] = tf_idf_vec
        node2label[node] = label - 1
        if label in nodes_by_label:
            nodes_by_label[label].append(node)

# Debug statistics
# .format() keeps the output identical on Python 2 and 3 (a multi-argument
# print(...) call prints a tuple on Python 2).
print("Number of links: {}".format(len(node_pairs)))
assert len(node2vec) == (len(class_1) + len(class_2) + len(class_3)), \
    "every node must carry a label of 1, 2 or 3"
print("Number of nodes: {}".format(len(node2vec)))
print("Number of nodes belong to Class 1 {}".format(len(class_1)))
print("Number of nodes belong to Class 2 {}".format(len(class_2)))
print("Number of nodes belong to Class 3 {}".format(len(class_3)))


Loading and processing graph links...
Loading and processing graph nodes...
('Number of links:', 44338)
('Number of nodes:', 19717)
('Number of nodes belong to Class 1', 4103)
('Number of nodes belong to Class 2', 7875)
('Number of nodes belong to Class 3', 7739)


### Neural Network related parameters

In [2]:
# Path prefix handed to NGM_FFNN.save() when the trained models are written.
MODEL_DIR = "model/"
# Number of nodes taken from the tail of each class list to build the test set.
TEST_SIZE = 1000
# Number of nodes taken from the head of each class list as initial labeled seeds.
SEED_NODES = 20
# Size of the network's output layer (one unit per class).
NUM_CATEGORIES = 3

# Weight of the hidden-representation distance term in the training loss;
# the baseline model is built with 0 instead, which disables that term.
ALPHA = 0.2
# Widths of the first and second hidden layers.
HIDDEN_1_DIM = 250
HIDDEN_2_DIM = 100

# Epochs run per label-propagation generation.
NUM_EPOCH = 12
# Number of edges (node pairs) per optimisation step.
BATCH_SIZE = 12
# Step size for the SGD optimizer.
LEARNING_RATE = 0.0001

### Split data into train/test set

In [3]:
# Important variables from previous cells: node_pairs, class_1, class_2, class_3

# Hold out the last TEST_SIZE nodes of every class for evaluation.
test_nodes = class_1[-TEST_SIZE:] + class_2[-TEST_SIZE:] + class_3[-TEST_SIZE:]

# Set copy used only for O(1) membership tests; test_nodes itself stays a
# list because later cells iterate over it and take its length.
test_node_set = set(test_nodes)

# Keep only edges that do not touch any held-out node.
train_node_pairs = [
    (src, dest)
    for src, dest in node_pairs
    if src not in test_node_set and dest not in test_node_set
]

# The first SEED_NODES nodes of every class start out labeled.
seed_nodes = class_1[:SEED_NODES] + class_2[:SEED_NODES] + class_3[:SEED_NODES]

# A class smaller than TEST_SIZE + SEED_NODES would leak test labels into training.
assert not test_node_set.intersection(seed_nodes), "seed nodes overlap with test nodes"

### Model Architecture

In [4]:
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm_notebook as tqdm

class NGM_FFNN(nn.Module):
    """Feed-forward classifier with two hidden layers and a graph-regularised loss.

    The network maps a TF-IDF vector to log-probabilities over ``output_dim``
    classes. During training, an extra term weighted by ``alpha`` penalises
    the distance between the last hidden representations of the two
    endpoints of an edge; ``alpha == 0`` turns that term off.

    Args:
        alpha: weight of the hidden-representation distance term.
        input_dim: length of the input feature vector.
        hidden1_dim: width of the first hidden layer.
        hidden2_dim: width of the second hidden layer.
        output_dim: number of classes.
        device: torch device the parameters and all created tensors live on.
    """

    def __init__(self, alpha, input_dim, hidden1_dim, hidden2_dim, output_dim, device=torch.device('cpu')):
        super(NGM_FFNN, self).__init__()

        self.alpha = alpha

        self.hidden1 = nn.Linear(input_dim, hidden1_dim)
        self.hidden2 = nn.Linear(hidden1_dim, hidden2_dim)
        self.output = nn.Linear(hidden2_dim, output_dim)

        self.device = device
        self.to(device)

    def save(self, output_dir, model_name):
        """Write the state dict to ``output_dir + model_name + ".pt"``.

        ``output_dir`` is used as a plain string prefix, so it must end with a
        path separator. The directory is created when it does not exist yet,
        so a finished training run is not lost to a missing folder.
        """
        import os

        print("Saving model...")
        if output_dir and not os.path.isdir(output_dir):
            os.makedirs(output_dir)
        torch.save(self.state_dict(), output_dir + model_name + ".pt")
        print("Model saved.")

    def load(self, output_dir, model_name):
        """Load a state dict previously written by :meth:`save`.

        Tensors are mapped onto ``self.device`` so a checkpoint saved on
        another device can still be restored.
        """
        print("Loading model...")
        state = torch.load(output_dir + model_name + ".pt", map_location=self.device)
        self.load_state_dict(state)
        print("Model loaded.")

    def forward(self, tf_idf_vec):
        """Return log-softmax class scores (over the last dimension) for ``tf_idf_vec``."""
        return F.log_softmax(self.output(self.get_last_hidden(tf_idf_vec)), -1)

    def reset_parameters(self):
        """Re-initialise the weights of all three linear layers."""
        self.hidden1.reset_parameters()
        self.hidden2.reset_parameters()
        self.output.reset_parameters()

    def get_last_hidden(self, tf_idf_vec):
        """Return the ReLU activation of the second hidden layer for ``tf_idf_vec``."""
        # First feed-forward layer
        hidden1 = F.relu(self.hidden1(tf_idf_vec))

        # Second feed-forward layer
        return F.relu(self.hidden2(hidden1))

    def _classification_loss(self, node, label, node2vec, loss_function):
        """Return the classification loss of ``node`` against ``label``."""
        features = torch.tensor(node2vec[node], device=self.device)
        target = torch.tensor([label], device=self.device)
        return loss_function(self.forward(features).view(1, -1), target)

    def _train_batch(self, batch, labeled_nodes, node2vec, loss_function, optimizer):
        """Run one optimisation step on ``batch`` and return its loss as a float.

        Returns 0.0 without stepping the optimizer when the batch produced no
        loss at all (for example no labeled endpoint and ``alpha == 0``).
        """
        optimizer.zero_grad()
        loss = torch.tensor(0, dtype=torch.float32, device=self.device)
        # Classification losses grouped by node, kept apart by edge type so
        # each node's losses are averaged within its own group.
        label_label_loss = defaultdict(list)
        label_unlabel_loss = defaultdict(list)

        for src, dest in batch:
            src_labeled = src in labeled_nodes
            dest_labeled = dest in labeled_nodes

            if src_labeled:
                nll_loss = self._classification_loss(
                    src, labeled_nodes[src], node2vec, loss_function)
                if dest_labeled:
                    label_label_loss[src].append(nll_loss)
                else:
                    label_unlabel_loss[src].append(nll_loss)

            if dest_labeled:
                label_label_loss[dest].append(self._classification_loss(
                    dest, labeled_nodes[dest], node2vec, loss_function))

            # Distance term for labeled-labeled, labeled-unlabeled and
            # unlabeled-unlabeled edges.
            # NOTE(review): an edge whose src is unlabeled and whose dest is
            # labeled gets no distance term (and its dest loss is filed under
            # label_label_loss). Kept as in the original training runs --
            # confirm whether this asymmetry is intended.
            if self.alpha != 0 and (src_labeled or not dest_labeled):
                src_hidden = self.get_last_hidden(
                    torch.tensor(node2vec[src], device=self.device))
                dest_hidden = self.get_last_hidden(
                    torch.tensor(node2vec[dest], device=self.device))
                loss = loss + self.alpha * torch.dist(src_hidden, dest_hidden)

        # Each node contributes the mean of its losses within each group.
        for grouped_losses in (label_label_loss, label_unlabel_loss):
            for node_losses in grouped_losses.values():
                loss = loss + torch.stack(node_losses).sum() / len(node_losses)

        if loss.item() == 0:
            return 0.0

        assert not torch.isnan(loss)
        loss.backward()
        optimizer.step()
        return loss.item()

    @staticmethod
    def _propagate_labels(labeled_nodes, node2neighbors):
        """Copy each labeled node's label to its unlabeled neighbors.

        Only nodes labeled before the call are expanded. Mutates
        ``labeled_nodes`` in place and returns True if any node was labeled.
        """
        label_changed = False
        for node in list(labeled_nodes.keys()):
            label = labeled_nodes[node]
            for neighbor in node2neighbors[node]:
                if neighbor not in labeled_nodes:
                    # The neighbor inherits this node's label; the model's own
                    # prediction is not consulted.
                    labeled_nodes[neighbor] = label
                    label_changed = True
        return label_changed

    def train_(self, seed_nodes, train_node_pairs, node2vec, node2label,
               num_epoch, batch_size, learning_rate):
        """Train with SGD while growing the labeled set along graph edges.

        Each "generation" runs ``num_epoch`` epochs over ``train_node_pairs``
        and then copies every labeled node's label to its still-unlabeled
        neighbors. Training stops after the first generation that labels no
        new node.

        Args:
            seed_nodes: nodes whose labels are read from ``node2label`` up front.
            train_node_pairs: iterable of ``(src, dest)`` edges.
            node2vec: mapping node -> feature vector (list of floats).
            node2label: mapping node -> class index; only read for seed nodes.
            num_epoch: epochs per generation.
            batch_size: edges per optimisation step; a trailing partial batch
                is dropped each epoch.
            learning_rate: SGD learning rate.

        Raises:
            ValueError: if ``batch_size`` is smaller than 1.
        """
        if batch_size < 1:
            raise ValueError("batch_size must be a positive integer")

        print("Training...")
        self.train()

        loss_function = nn.NLLLoss()
        optimizer = optim.SGD(self.parameters(), lr=learning_rate)

        # Edges are treated as undirected for label propagation.
        node2neighbors = defaultdict(list)
        for src, dest in train_node_pairs:
            node2neighbors[src].append(dest)
            node2neighbors[dest].append(src)

        labeled_nodes = dict()
        for node in seed_nodes:
            labeled_nodes[node] = node2label[node]

        iteration = 1
        while True:
            print("=" * 80)
            print("Generation: {} (with {} labeled nodes)".format(iteration, len(labeled_nodes)))
            iteration += 1

            for e in range(num_epoch):
                # Shuffle once and slice into batches: this draws batches
                # without replacement like sample-and-remove does, but without
                # a linear-time list.remove() for every edge.
                shuffled_pairs = list(train_node_pairs)
                random.shuffle(shuffled_pairs)

                total_loss = 0.0
                last_start = len(shuffled_pairs) - batch_size
                for start in range(0, last_start + 1, batch_size):
                    batch = shuffled_pairs[start:start + batch_size]
                    total_loss += self._train_batch(
                        batch, labeled_nodes, node2vec, loss_function, optimizer)

                # max(..., 1) avoids a ZeroDivisionError when there are no seeds.
                avg_loss = total_loss / max(len(labeled_nodes), 1)
                print("Epoch: {} Loss: {} (avg: {})".format(e + 1, total_loss, avg_loss))

            # Assign label to immediate neighbors
            if not self._propagate_labels(labeled_nodes, node2neighbors):
                break

    def predict(self, tf_idf_vec):
        """Return the index of the highest-scoring class as a Python int."""
        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            return torch.argmax(self.forward(tf_idf_vec)).item()

    def evaluate(self, test_nodes, node2vec, node2label):
        """Return the fraction of ``test_nodes`` whose prediction equals ``node2label``.

        Switches the module to eval mode. Raises ZeroDivisionError when
        ``test_nodes`` is empty.
        """
        self.eval()

        correct_count = 0
        for node in test_nodes:
            predicted = self.predict(torch.tensor(node2vec[node], device=self.device))
            if predicted == node2label[node]:
                correct_count += 1

        return float(correct_count) / len(test_nodes)

### Baseline feed-forward neural network

In [5]:
# Reads from earlier cells: node2vec, node2label, seed_nodes, train_node_pairs
# and the hyper-parameter constants.
from datetime import datetime

# alpha=0 switches off the graph distance term, leaving a plain feed-forward baseline.
baseline_model = NGM_FFNN(
    alpha=0,
    input_dim=TF_IDF_DIM,
    hidden1_dim=HIDDEN_1_DIM,
    hidden2_dim=HIDDEN_2_DIM,
    output_dim=NUM_CATEGORIES,
)

# Time the whole training run in wall-clock seconds.
start = datetime.now()
baseline_model.train_(
    seed_nodes=seed_nodes,
    train_node_pairs=train_node_pairs,
    node2vec=node2vec,
    node2label=node2label,
    num_epoch=NUM_EPOCH,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
)
baseline_time = (datetime.now() - start).total_seconds()

Training...
Generation: 1 (with 60 labeled nodes)
Epoch: 1 Loss: 265.468328595 (avg: 4.42447214325)
Epoch: 2 Loss: 265.283905506 (avg: 4.4213984251)
Epoch: 3 Loss: 263.944190741 (avg: 4.39906984568)
Epoch: 4 Loss: 264.928966284 (avg: 4.4154827714)
Epoch: 5 Loss: 263.604874253 (avg: 4.39341457089)
Epoch: 6 Loss: 262.295018077 (avg: 4.37158363461)
Epoch: 7 Loss: 264.429841042 (avg: 4.40716401736)
Epoch: 8 Loss: 263.191653252 (avg: 4.38652755419)
Epoch: 9 Loss: 264.103588104 (avg: 4.4017264684)
Epoch: 10 Loss: 263.947010994 (avg: 4.3991168499)
Epoch: 11 Loss: 262.664132357 (avg: 4.37773553928)
Epoch: 12 Loss: 262.519589663 (avg: 4.37532649438)
Generation: 2 (with 293 labeled nodes)
Epoch: 1 Loss: 3583.54682088 (avg: 12.2305352248)
Epoch: 2 Loss: 3531.30918151 (avg: 12.0522497663)
Epoch: 3 Loss: 3479.75781226 (avg: 11.8763065265)
Epoch: 4 Loss: 3444.04305166 (avg: 11.7544131456)
Epoch: 5 Loss: 3418.04156131 (avg: 11.6656708577)
Epoch: 6 Loss: 3387.50046641 (avg: 11.5614350389)
Epoch: 7 Los

### Neural graph machine feed-forward neural network

In [6]:
# Reads from earlier cells: node2vec, node2label, seed_nodes, train_node_pairs,
# the hyper-parameter constants and the `datetime` import.

# Same architecture as the baseline, but with the distance term weighted by ALPHA.
NGM_model = NGM_FFNN(
    alpha=ALPHA,
    input_dim=TF_IDF_DIM,
    hidden1_dim=HIDDEN_1_DIM,
    hidden2_dim=HIDDEN_2_DIM,
    output_dim=NUM_CATEGORIES,
)

# Time the whole training run in wall-clock seconds.
start = datetime.now()
NGM_model.train_(
    seed_nodes=seed_nodes,
    train_node_pairs=train_node_pairs,
    node2vec=node2vec,
    node2label=node2label,
    num_epoch=NUM_EPOCH,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
)
NGM_time = (datetime.now() - start).total_seconds()

Training...
Generation: 1 (with 60 labeled nodes)
Epoch: 1 Loss: 456.32806924 (avg: 7.60546782066)
Epoch: 2 Loss: 441.315778628 (avg: 7.35526297713)
Epoch: 3 Loss: 430.109133132 (avg: 7.1684855522)
Epoch: 4 Loss: 424.046260335 (avg: 7.06743767224)
Epoch: 5 Loss: 419.131263293 (avg: 6.98552105489)
Epoch: 6 Loss: 410.430202842 (avg: 6.8405033807)
Epoch: 7 Loss: 406.970913898 (avg: 6.78284856497)
Epoch: 8 Loss: 402.675507873 (avg: 6.71125846455)
Epoch: 9 Loss: 397.419615209 (avg: 6.62366025349)
Epoch: 10 Loss: 391.325186882 (avg: 6.52208644804)
Epoch: 11 Loss: 385.212630328 (avg: 6.42021050546)
Epoch: 12 Loss: 385.697685815 (avg: 6.42829476359)
Generation: 2 (with 293 labeled nodes)
Epoch: 1 Loss: 3662.01440263 (avg: 12.4983426711)
Epoch: 2 Loss: 3613.83835028 (avg: 12.3339192842)
Epoch: 3 Loss: 3569.20593877 (avg: 12.1815902347)
Epoch: 4 Loss: 3543.33056805 (avg: 12.0932783892)
Epoch: 5 Loss: 3508.97634298 (avg: 11.9760284743)
Epoch: 6 Loss: 3499.08436466 (avg: 11.9422674562)
Epoch: 7 Lo

### Time taken

In [7]:
# Wall-clock training time in seconds: baseline first, then the NGM model.
print(baseline_time)
print(NGM_time)

5136.538772
8578.217842


### Evaluations

In [8]:
# Important variable from previous cells: node2vec, node2label, test_nodes

# Fraction of held-out test nodes classified correctly: baseline first, then the NGM model.
print(baseline_model.evaluate(test_nodes, node2vec, node2label))
print(NGM_model.evaluate(test_nodes, node2vec, node2label))

0.694666666667
0.722666666667


### Save model

In [9]:
# Write each model's state dict to MODEL_DIR + "<name>.pt".
baseline_model.save(MODEL_DIR, "PubMed_baseline")
NGM_model.save(MODEL_DIR, "PubMed_NGM")

Saving model...
Model saved.
Saving model...
Model saved.
