In [1]:
import numpy as np 
import torch 
import torch.nn as nn 
import torch.nn.functional as F 

In [2]:
import torch
import numpy as np 
import scipy.sparse as sp 

def load_data():
    
    #dataset="cora"
    dataset="citeseer"
    
    path="./data/"+dataset+"/" 
    
    idx_features_labels = np.genfromtxt("{}{}.content".format(path,dataset), dtype=np.dtype(str))
    features = sp.csr_matrix(idx_features_labels[:,1:-1], dtype=np.float32)
    labels = encode_onehot(idx_features_labels[:,-1])

    idx = np.array(idx_features_labels[:,0],dtype=np.dtype(str))
    idx_map = {j: i for i,j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}{}.cites".format(path,dataset), dtype=np.dtype(str))
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())), dtype=np.dtype(str)).reshape(edges_unordered.shape)
    
    #edges = np.concatenate((edges,np.flip(edges,1)))
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:,0], edges[:,1])), shape=(labels.shape[0], labels.shape[0]), dtype=np.float32)

    adj = adj + adj.T.multiply(adj.T>adj) - adj.multiply(adj.T>adj)
    features = normalize_features(features)
    adj = normalize_adj(adj+sp.eye(adj.shape[0]))

    idx_train = range(140)
    idx_val = range(200,500)
    idx_test = range(500,1500)

    adj = torch.FloatTensor(np.array(adj.todense()))
    features = torch.FloatTensor(np.array(features.todense()))
    labels = torch.LongTensor(np.where(labels)[1])

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train, idx_val, idx_test 

def accuracy(output, labels):
    preds = output.max(1)[1].type_as(labels)
    correct = preds.eq(labels).double()
    correct = correct.sum()

    return correct / len(labels)

def normalize_adj(mx): 
    rowsum = np.array(mx.sum(1))
    r_inv_sqrt = np.power(rowsum, -0.5).flatten()
    r_inv_sqrt[np.isinf(r_inv_sqrt)] = 0.
    r_mat_inv_sqrt = sp.diags(r_inv_sqrt)
    mx_to =  mx.dot(r_mat_inv_sqrt).transpose().dot(r_mat_inv_sqrt)
    return mx_to

def normalize_features(mx):
    rowsum = np.array(mx.sum(1))
    r_inv = np.power(rowsum, -1).flatten()
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    mx_to =  r_mat_inv.dot(mx) 
    return mx_to 

def encode_onehot(labels):
    classes = set(labels)
    classes_dict = {c: np.identity(len(classes))[i,:] for i, c in enumerate(classes)}
    labels_onehot = np.array(list(map(classes_dict.get, labels)), dtype=np.int32)
    return labels_onehot


In [3]:
class Attention(nn.Module):
    def __init__(self, in_features, out_features, alpha):
        super(Attention, self).__init__()
        self.alpha = alpha

        self.W = nn.Linear(in_features, out_features, bias = False)
        self.a_T = nn.Linear(2 * out_features, 1, bias = False)

        nn.init.xavier_uniform_(self.W.weight)
        nn.init.xavier_uniform_(self.a_T.weight)

    def forward(self, h, adj):
        N = h.size(0)
        Wh = self.W(h) 
        
        H1 = Wh.unsqueeze(1).repeat(1,N,1)
        H2 = Wh.unsqueeze(0).repeat(N,1,1)
        attn_input = torch.cat([H1, H2], dim = -1)

        e = F.leaky_relu(self.a_T(attn_input).squeeze(-1), negative_slope = self.alpha)
        
        attn_mask = -1e18*torch.ones_like(e)
        masked_e = torch.where(adj > 0, e, attn_mask)
        attn_scores = F.softmax(masked_e, dim = -1) 

        h_prime = torch.mm(attn_scores, Wh) 

        return F.relu(h_prime) 

In [4]:
class GraphAttentionLayer(nn.Module):
    def __init__(self, in_features, out_features, num_heads, alpha, concat=True):
        super(GraphAttentionLayer, self).__init__()
        self.concat = concat
        self.attentions = nn.ModuleList([Attention(in_features, out_features, alpha) for _ in range(num_heads)])
        
    def forward(self, input, adj):

        if self.concat :
            # concatenate
            outputs = []
            for attention in self.attentions:
                outputs.append(attention(input, adj))
            
            return torch.cat(outputs, dim = -1) 

        else :
            # average
            output = None
            for attention in self.attentions:
                if output == None:
                    output = attention(input, adj)
                else:
                    output += attention(input, adj)
            
            return output/len(self.attentions) 

In [5]:
class GAT(nn.Module):
    def __init__(self, F, H, C, dropout, alpha, K):
        super(GAT, self).__init__()
        self.layer1 = GraphAttentionLayer(F, H, K, alpha)
        self.layer2 = GraphAttentionLayer(K * H, C, 1, alpha, concat = False)
        self.dropout = nn.Dropout(p = dropout)

    def forward(self, x, adj):

        x = self.dropout(F.relu(self.layer1(x, adj))) 
        return self.layer2(x, adj) 

In [6]:
import os 
import time 
import random 
import glob
import matplotlib.pyplot as plt

import numpy as np 
import torch
import torch.nn as nn 
import torch.nn.functional as F 
import torch.optim as optim 
from torch.autograd import Variable
 
    
device = torch.device('cuda' if(torch.cuda.is_available()) else 'cpu')

# load the data
adj, features, labels, idx_train, idx_val, idx_test = load_data()
features = features.to(device)
labels = labels.to(device)

# parameter intialization
N = features.size(0) # num_of_nodes
F_F = features.size(1) # num_of_features
H = 4 # hidden nodes
C = labels.max().item() + 1 # num_classes

# for validation
epochs_since_improvement = 0
best_loss = 10.

# init training object
network = GAT(F_F, H, C, 0.5, 0.2, 3).to(device)
adj = adj.to(device)

optimizer = optim.Adam(network.parameters(), lr = 0.005, weight_decay = 5e-4)
criterion = nn.CrossEntropyLoss()

train_losses, train_accs, val_losses, val_accs = [], [], [], []


# Train
for epoch in range(350):
    t = time.time()
    network.train()

    preds = network(features, adj) 
    train_loss = criterion(preds[idx_train], labels[idx_train])
    train_acc = accuracy(preds[idx_train], labels[idx_train])

    optimizer.zero_grad()
    train_loss.backward()
    optimizer.step()

    # validation
    with torch.no_grad():
        network.eval()
        preds_val = network(features, adj)
        val_loss = criterion(preds_val[idx_val], labels[idx_val])
        val_acc = accuracy(preds_val[idx_val], labels[idx_val])

        # early stopping
        if val_loss < best_loss :
            best_loss = val_loss
            epochs_since_improvement = 0
        else:
            epochs_since_improvement += 1

    train_losses.append(train_loss.item())
    train_accs.append(train_acc.item())
    val_losses.append(val_loss.item())
    val_accs.append(val_acc.item())

    print('[%d/%d] train loss : %.4f | train acc %.2f%% | val loss %.4f | val acc %.2f%% | time %.3fs'
                %(epoch+1, 350, train_loss.item(), train_acc.item() * 100, val_loss.item(), val_acc.item() * 100, time.time() - t))

    if epochs_since_improvement > 10 - 1 :
        print("There's no improvements during %d epochs and so stop the training."%(10))
        break

# Test
with torch.no_grad():
    network.eval()
    preds = network(features, adj)
    test_acc = accuracy(preds[idx_test], labels[idx_test])
    print('Test Accuracy : %.2f'%(test_acc * 100))

[1/350] train loss : 1.7917 | train acc 25.71% | val loss 1.7904 | val acc 35.33% | time 0.273s
[2/350] train loss : 1.7899 | train acc 41.43% | val loss 1.7887 | val acc 44.00% | time 0.268s
[3/350] train loss : 1.7875 | train acc 53.57% | val loss 1.7868 | val acc 46.33% | time 0.270s
[4/350] train loss : 1.7849 | train acc 53.57% | val loss 1.7849 | val acc 46.00% | time 0.276s
[5/350] train loss : 1.7815 | train acc 56.43% | val loss 1.7829 | val acc 46.33% | time 0.269s
[6/350] train loss : 1.7786 | train acc 55.71% | val loss 1.7807 | val acc 47.67% | time 0.271s
[7/350] train loss : 1.7768 | train acc 57.86% | val loss 1.7784 | val acc 50.00% | time 0.271s
[8/350] train loss : 1.7719 | train acc 62.86% | val loss 1.7760 | val acc 54.33% | time 0.270s
[9/350] train loss : 1.7698 | train acc 62.86% | val loss 1.7736 | val acc 57.67% | time 0.270s
[10/350] train loss : 1.7642 | train acc 66.43% | val loss 1.7712 | val acc 59.67% | time 0.270s
[11/350] train loss : 1.7639 | train ac

[86/350] train loss : 1.2756 | train acc 76.43% | val loss 1.3950 | val acc 67.00% | time 0.272s
[87/350] train loss : 1.2552 | train acc 77.14% | val loss 1.3881 | val acc 67.00% | time 0.273s
[88/350] train loss : 1.2473 | train acc 74.29% | val loss 1.3811 | val acc 67.00% | time 0.271s
[89/350] train loss : 1.2280 | train acc 77.86% | val loss 1.3742 | val acc 67.00% | time 0.271s
[90/350] train loss : 1.2122 | train acc 74.29% | val loss 1.3674 | val acc 67.00% | time 0.271s
[91/350] train loss : 1.2366 | train acc 75.00% | val loss 1.3606 | val acc 67.00% | time 0.271s
[92/350] train loss : 1.2184 | train acc 76.43% | val loss 1.3538 | val acc 66.67% | time 0.271s
[93/350] train loss : 1.2090 | train acc 77.86% | val loss 1.3469 | val acc 66.67% | time 0.271s
[94/350] train loss : 1.1979 | train acc 78.57% | val loss 1.3400 | val acc 66.67% | time 0.271s
[95/350] train loss : 1.1811 | train acc 74.29% | val loss 1.3333 | val acc 66.67% | time 0.272s
[96/350] train loss : 1.1582 |

[170/350] train loss : 0.7544 | train acc 83.57% | val loss 1.0762 | val acc 71.67% | time 0.269s
[171/350] train loss : 0.7637 | train acc 83.57% | val loss 1.0750 | val acc 71.67% | time 0.269s
[172/350] train loss : 0.8260 | train acc 81.43% | val loss 1.0742 | val acc 71.33% | time 0.267s
[173/350] train loss : 0.7588 | train acc 81.43% | val loss 1.0739 | val acc 71.00% | time 0.267s
[174/350] train loss : 0.7507 | train acc 82.14% | val loss 1.0738 | val acc 71.00% | time 0.267s
[175/350] train loss : 0.7851 | train acc 82.86% | val loss 1.0740 | val acc 71.00% | time 0.267s
[176/350] train loss : 0.7618 | train acc 80.71% | val loss 1.0739 | val acc 71.33% | time 0.266s
[177/350] train loss : 0.7373 | train acc 82.86% | val loss 1.0738 | val acc 71.33% | time 0.266s
[178/350] train loss : 0.7811 | train acc 78.57% | val loss 1.0737 | val acc 71.33% | time 0.268s
[179/350] train loss : 0.7160 | train acc 84.29% | val loss 1.0734 | val acc 71.33% | time 0.267s
[180/350] train loss

[254/350] train loss : 0.6418 | train acc 83.57% | val loss 1.0342 | val acc 71.33% | time 0.267s
[255/350] train loss : 0.6644 | train acc 83.57% | val loss 1.0344 | val acc 71.33% | time 0.268s
[256/350] train loss : 0.6297 | train acc 85.71% | val loss 1.0345 | val acc 71.33% | time 0.267s
[257/350] train loss : 0.6650 | train acc 79.29% | val loss 1.0346 | val acc 71.33% | time 0.268s
[258/350] train loss : 0.6424 | train acc 85.00% | val loss 1.0346 | val acc 71.33% | time 0.267s
There's no improvements during 10 epochs and so stop the training.
Test Accuracy : 63.40
