https://github.com/DaehanKim/vgae_pytorch

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import os
import numpy as np

In [13]:

class VGAE(nn.Module):
    """Variational graph auto-encoder with a dot-product decoder.

    A shared ``GraphConvSparse`` layer feeds two identity-activation layers
    that produce the mean and the log standard deviation of the latent
    Gaussian.  Layer sizes are read from the module-level ``input_dim``,
    ``hidden1_dim`` and ``hidden2_dim`` settings when the model is built.

    Args:
        adj: adjacency matrix handed to every ``GraphConvSparse`` layer.
    """

    def __init__(self, adj):
        super(VGAE, self).__init__()
        self.base_gcn = GraphConvSparse(input_dim, hidden1_dim, adj)
        self.gcn_mean = GraphConvSparse(hidden1_dim, hidden2_dim, adj, activation=lambda x: x)
        self.gcn_logstddev = GraphConvSparse(hidden1_dim, hidden2_dim, adj, activation=lambda x: x)

    def _use_adj(self, adj):
        """Point every layer at ``adj`` when the caller supplies one."""
        if adj is None:
            return
        for layer in (self.base_gcn, self.gcn_mean, self.gcn_logstddev):
            layer.adj = adj

    def encode(self, X, adj=None):
        """Sample latent node embeddings for the features ``X``.

        Args:
            X: node feature matrix passed to the first layer.
            adj: optional adjacency; when given it replaces the adjacency
                stored on the layers, otherwise the one passed to the
                constructor is used.

        Returns:
            ``mean + noise * exp(logstd)``.  ``self.mean`` and ``self.logstd``
            are stored on the module as a side effect.
        """
        self._use_adj(adj)
        # GraphConvSparse.forward only accepts the features; the adjacency is
        # an attribute of the layer, so it must not be passed as an argument.
        hidden = self.base_gcn(X)
        self.mean = self.gcn_mean(hidden)
        self.logstd = self.gcn_logstddev(hidden)
        # randn_like keeps the noise on the same device/dtype as the mean and
        # avoids reading the latent size from a global.
        gaussian_noise = torch.randn_like(self.mean)
        sampled_z = gaussian_noise * torch.exp(self.logstd) + self.mean
        return sampled_z

    def forward(self, X, adj=None):
        """Return the reconstructed adjacency probabilities for ``X``."""
        Z = self.encode(X, adj)
        A_pred = dot_product_decode(Z)
        return A_pred

class GraphConvSparse(nn.Module):
    """Single graph-convolution layer: ``activation(adj @ (inputs @ weight))``.

    Args:
        input_dim: number of input features per node.
        output_dim: number of output features per node.
        adj: adjacency matrix multiplied from the left in ``forward``.
        activation: callable applied to the propagated features
            (``F.relu`` unless overridden).
        **kwargs: forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, input_dim, output_dim, adj, activation = F.relu, **kwargs):
        super().__init__(**kwargs)
        self.activation = activation
        self.adj = adj
        # Trainable projection created by glorot_init
        self.weight = glorot_init(input_dim, output_dim)

    def forward(self, inputs):
        """Project ``inputs`` with the weight, propagate over ``adj``, activate."""
        projected = torch.mm(inputs, self.weight)
        propagated = torch.mm(self.adj, projected)
        return self.activation(propagated)


def dot_product_decode(Z):
    """Decode embeddings into edge probabilities: ``sigmoid(Z @ Z^T)``."""
    pairwise_scores = torch.matmul(Z, Z.t())
    return torch.sigmoid(pairwise_scores)

def glorot_init(input_dim, output_dim):
    """Return an ``nn.Parameter`` of shape ``(input_dim, output_dim)``.

    Values are drawn uniformly from ``[-r, r)`` with
    ``r = sqrt(6 / (input_dim + output_dim))``.
    """
    init_range = np.sqrt(6.0 / (input_dim + output_dim))
    # Rescale U[0, 1) samples to the symmetric interval around zero
    unit_uniform = torch.rand(input_dim, output_dim)
    scaled = unit_uniform * 2 * init_range - init_range
    return nn.Parameter(scaled)


class GAE(nn.Module):
    """Graph auto-encoder: two graph-convolution layers and a dot-product decoder.

    Layer sizes come from the module-level ``input_dim``, ``hidden1_dim`` and
    ``hidden2_dim`` settings at construction time.

    Args:
        adj: adjacency matrix handed to both ``GraphConvSparse`` layers.
    """

    def __init__(self,adj):
        super().__init__()
        self.base_gcn = GraphConvSparse(input_dim, hidden1_dim, adj)
        # Second layer is linear: identity activation
        self.gcn_mean = GraphConvSparse(hidden1_dim, hidden2_dim, adj, activation=lambda x:x)

    def encode(self, X):
        """Return node embeddings for ``X``; they are also kept in ``self.mean``."""
        self.mean = self.gcn_mean(self.base_gcn(X))
        return self.mean

    def forward(self, X):
        """Return the reconstructed adjacency probabilities for ``X``."""
        return dot_product_decode(self.encode(X))

In [14]:
# --- Experiment selection ---
model = 'VGAE'        # name of the model variant
dataset = 'cora'      # name of the dataset
use_feature = True

# --- Layer widths read by the VGAE / GAE constructors ---
hidden2_dim = 16      # output size of the second graph-convolution layer(s)
hidden1_dim = 32      # output size of the first graph-convolution layer
input_dim = 1433      # input size of the first graph-convolution layer

# --- Optimisation settings ---
learning_rate = 0.01
num_epoch = 200

In [15]:
'''
****************NOTE*****************
CREDITS : Thomas Kipf
Since the datasets are the same as those in Kipf's implementation,
their preprocessing source was used as-is.
*************************************
'''
import numpy as np
import scipy.sparse as sp

def sparse_to_tuple(sparse_mx):
    """Break a SciPy sparse matrix into ``(coords, values, shape)``.

    ``coords`` has one ``[row, col]`` pair per stored entry, ``values`` holds
    the matching data and ``shape`` is the matrix shape.  Matrices that are
    not already in COO format are converted first.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    index_pairs = np.vstack((coo.row, coo.col)).T
    return index_pairs, coo.data, coo.shape

def preprocess_graph(adj):
    """Add self-loops to ``adj`` and scale it by the inverse square-root degrees.

    Computes ``D^-1/2 (A + I)^T D^-1/2`` where ``D`` holds the row sums of
    ``A + I``, and returns the result as a ``sparse_to_tuple`` triple.
    """
    coo = sp.coo_matrix(adj)
    with_self_loops = coo + sp.eye(coo.shape[0])
    degrees = np.array(with_self_loops.sum(1))
    inv_sqrt_degree = sp.diags(np.power(degrees, -0.5).flatten())
    scaled = with_self_loops.dot(inv_sqrt_degree).transpose().dot(inv_sqrt_degree)
    return sparse_to_tuple(scaled.tocoo())

def mask_test_edges(adj):
    """Split the edges of a graph into train / validation / test sets.

    10% of the (upper-triangular) edges become test positives and 5% become
    validation positives; an equal number of node pairs that are NOT edges is
    sampled as negatives for each set.  Splits use NumPy's global random
    state, so results vary unless the caller seeds it.

    Args:
        adj: SciPy sparse adjacency matrix; diagonal entries are ignored.

    Returns:
        ``(adj_train, train_edges, val_edges, val_edges_false, test_edges,
        test_edges_false)``.  ``adj_train`` is the symmetric CSR adjacency
        rebuilt from the training edges; the ``*_edges`` values are arrays of
        ``[i, j]`` pairs holding ONE direction per edge, and the ``*_false``
        values are lists of ``[i, j]`` pairs.

    Note:
        Negative sampling loops until enough non-edges are found, so it does
        not terminate on a graph that has too few non-edges.
    """
    # Remove diagonal elements
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()
    # Check that diag is zero (read straight from the sparse matrix instead of
    # materialising a dense N x N copy)
    assert adj.diagonal().sum() == 0

    # Upper triangle lists every undirected edge once; edges_all has all entries
    edges = sparse_to_tuple(sp.triu(adj))[0]
    edges_all = sparse_to_tuple(adj)[0]
    num_test = int(np.floor(edges.shape[0] / 10.))
    num_val = int(np.floor(edges.shape[0] / 20.))

    all_edge_idx = list(range(edges.shape[0]))
    np.random.shuffle(all_edge_idx)
    val_edge_idx = all_edge_idx[:num_val]
    test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
    test_edges = edges[test_edge_idx]
    val_edges = edges[val_edge_idx]
    # Explicit integer dtype: stacking an empty list would otherwise produce
    # float indices, which np.delete rejects on small graphs.
    held_out_idx = np.array(test_edge_idx + val_edge_idx, dtype=np.int64)
    train_edges = np.delete(edges, held_out_idx, axis=0)

    num_nodes = adj.shape[0]
    # Hash set of every stored (i, j) entry for O(1) membership tests
    true_edges = set(map(tuple, edges_all.tolist()))

    def sample_non_edges(count):
        """Draw ``count`` distinct node pairs that are not edges in ``adj``."""
        picked = []
        seen = set()
        while len(picked) < count:
            idx_i = np.random.randint(0, num_nodes)
            idx_j = np.random.randint(0, num_nodes)
            if idx_i == idx_j:
                continue
            pair, flipped = (idx_i, idx_j), (idx_j, idx_i)
            # Reject real edges in either direction.  This covers train,
            # validation AND test positives, so a validation negative can
            # never be a held-out test edge.
            if pair in true_edges or flipped in true_edges:
                continue
            if pair in seen or flipped in seen:
                continue
            seen.add(pair)
            picked.append([idx_i, idx_j])
        return picked

    # Same draw order as before: test negatives first, then validation
    test_edges_false = sample_non_edges(len(test_edges))
    val_edges_false = sample_non_edges(len(val_edges))

    def as_pairs(rows):
        """Turn an (n, 2) array or list of pairs into a set of tuples."""
        return set(map(tuple, np.asarray(rows).reshape(-1, 2).tolist()))

    train_set = as_pairs(train_edges)
    val_set = as_pairs(val_edges)
    test_set = as_pairs(test_edges)
    assert true_edges.isdisjoint(as_pairs(test_edges_false))
    assert true_edges.isdisjoint(as_pairs(val_edges_false))
    assert val_set.isdisjoint(train_set)
    assert test_set.isdisjoint(train_set)
    assert val_set.isdisjoint(test_set)

    data = np.ones(train_edges.shape[0])

    # Re-build adj matrix
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T

    # NOTE: these edge lists only contain single direction of edge!
    return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false

In [16]:
'''
****************NOTE*****************
CREDITS : Thomas Kipf
Since the datasets are the same as those in Kipf's implementation,
their preprocessing source was used as-is.
*************************************
'''
import numpy as np
import sys
import pickle as pkl
import networkx as nx
import scipy.sparse as sp

def parse_index_file(filename):
    """Read one integer per line from ``filename`` and return them as a list.

    Blank lines are skipped.  The file is closed before returning.

    Raises:
        ValueError: if a non-blank line is not a valid integer.
    """
    with open(filename) as handle:
        return [int(line.strip()) for line in handle if line.strip()]

def load_data(dataset, data_dir="data"):
    """Load a citation dataset stored as ``ind.<dataset>.*`` files.

    Args:
        dataset: dataset name used in the file names (e.g. ``'cora'``);
            ``'citeseer'`` gets extra handling for its isolated test nodes.
        data_dir: directory that holds the ``ind.*`` files.

    Returns:
        ``(adj, features)``: the adjacency matrix produced by
        ``nx.adjacency_matrix`` from the pickled graph, and the feature
        matrix as a SciPy LIL matrix with the test rows put back in index
        order.
    """
    # load the data: x, tx, allx, graph
    names = ['x', 'tx', 'allx', 'graph']
    objects = []
    for name in names:
        with open("{}/ind.{}.{}".format(data_dir, dataset, name), 'rb') as f:
            # NOTE: unpickling can execute arbitrary code; only load dataset
            # files from a trusted source.
            if sys.version_info > (3, 0):
                objects.append(pkl.load(f, encoding='latin1'))
            else:
                objects.append(pkl.load(f))
    x, tx, allx, graph = tuple(objects)
    test_idx_reorder = parse_index_file("{}/ind.{}.test.index".format(data_dir, dataset))
    test_idx_range = np.sort(test_idx_reorder)

    if dataset == 'citeseer':
        # Fix citeseer dataset (there are some isolated nodes in the graph)
        # Find isolated nodes, add them as zero-vecs into the right position
        test_idx_range_full = range(min(test_idx_reorder), max(test_idx_reorder)+1)
        tx_extended = sp.lil_matrix((len(test_idx_range_full), x.shape[1]))
        tx_extended[test_idx_range-min(test_idx_range), :] = tx
        tx = tx_extended

    features = sp.vstack((allx, tx)).tolil()
    # The test rows are stored in shuffled order; move each to its real index
    features[test_idx_reorder, :] = features[test_idx_range, :]
    adj = nx.adjacency_matrix(nx.from_dict_of_lists(graph))

    return adj, features

In [17]:
import torch
import torch.nn.functional as F
from torch.optim import Adam
from sklearn.metrics import roc_auc_score, average_precision_score
import scipy.sparse as sp
import numpy as np
import os
import time


# Train on CPU (hide GPU) due to memory constraints
os.environ['CUDA_VISIBLE_DEVICES'] = ""

# Load the dataset named in the configuration instead of a hard-coded name
adj, features = load_data(dataset)

# Store original adjacency matrix (without diagonal entries) for later
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

# Hold out validation/test edges; training continues on the reduced graph
adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
adj = adj_train



In [18]:
# Notebook display: show the first row of the feature matrix
features[0]

<1x1433 sparse matrix of type '<class 'numpy.float32'>'
	with 9 stored elements in List of Lists format>

In [19]:
# Notebook display: show the summary of the training adjacency matrix
adj

<2708x2708 sparse matrix of type '<class 'numpy.float64'>'
	with 8976 stored elements in Compressed Sparse Row format>

In [20]:
# Some preprocessing
adj_norm = preprocess_graph(adj)


num_nodes = adj.shape[0]

features = sparse_to_tuple(features.tocoo())
num_features = features[2][1]
features_nonzero = features[1].shape[0]

# Create Model
# pos_weight re-weights the positive entries of the target by the ratio of
# zero to non-zero entries; norm rescales the resulting loss.
pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)


# Reconstruction target: training adjacency with self-loops added
adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = sparse_to_tuple(adj_label)


def _to_sparse_tensor(coords, values, shape):
    """Build a float32 sparse COO tensor from a ``sparse_to_tuple`` triple.

    Uses ``torch.sparse_coo_tensor`` in place of the deprecated
    ``torch.sparse.FloatTensor`` constructor.
    """
    return torch.sparse_coo_tensor(
        torch.tensor(coords.T, dtype=torch.long),  # indices are (2, nnz)
        torch.tensor(values, dtype=torch.float32),
        torch.Size(shape),
    )


adj_norm = _to_sparse_tensor(*adj_norm)
adj_label = _to_sparse_tensor(*adj_label)
features = _to_sparse_tensor(*features)

# Per-entry loss weights: pos_weight where the target is 1, otherwise 1
weight_mask = adj_label.to_dense().view(-1) == 1
weight_tensor = torch.ones(weight_mask.size(0))
weight_tensor[weight_mask] = pos_weight


In [21]:

# Build the graph auto-encoder on adj_norm and let Adam optimise all of its
# parameters.  This rebinds ``model`` to the network instance.
model = GAE(adj=adj_norm)
optimizer = Adam(params=model.parameters(), lr=learning_rate)



In [23]:

def get_scores(edges_pos, edges_neg, adj_rec):
    """Compute ROC-AUC and average precision for edge prediction.

    Args:
        edges_pos: sequence of ``[i, j]`` index pairs treated as positives.
        edges_neg: sequence of ``[i, j]`` index pairs treated as negatives.
        adj_rec: 2-D torch tensor of edge scores, read as ``adj_rec[i, j]``.

    Returns:
        ``(roc_score, ap_score)`` from scikit-learn's ``roc_auc_score`` and
        ``average_precision_score``.

    The scores are used as they are.  Both metrics depend only on how the
    scores rank the pairs, so passing probabilities or logits through an
    extra sigmoid is unnecessary.  Positives and negatives go through the
    same float64 path so their scores are directly comparable.
    """

    def edge_scores(edges):
        """Gather ``adj_rec[i, j]`` for all pairs in one indexing call."""
        pairs = np.asarray(edges, dtype=np.int64).reshape(-1, 2)
        rows = torch.as_tensor(pairs[:, 0], device=adj_rec.device)
        cols = torch.as_tensor(pairs[:, 1], device=adj_rec.device)
        # detach(): the scores are only evaluated, never back-propagated
        return adj_rec.detach()[rows, cols].cpu().numpy().astype(np.float64)

    preds = edge_scores(edges_pos)
    preds_neg = edge_scores(edges_neg)

    preds_all = np.hstack([preds, preds_neg])
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
    roc_score = roc_auc_score(labels_all, preds_all)
    ap_score = average_precision_score(labels_all, preds_all)

    return roc_score, ap_score

def get_acc(adj_rec, adj_label):
    """Return the fraction of entries where ``adj_rec > 0.5`` matches ``adj_label``.

    ``adj_label`` is densified and both inputs are flattened before the
    element-wise comparison; the result is a 0-d float tensor.
    """
    target = adj_label.to_dense().view(-1).long()
    predicted = (adj_rec > 0.5).view(-1).long()
    num_correct = (predicted == target).sum().float()
    return num_correct / target.size(0)

# train model
# The dense reconstruction target never changes, so flatten it once instead
# of densifying the sparse label matrix in every epoch.
adj_label_flat = adj_label.to_dense().view(-1)

for epoch in range(num_epoch):
    t = time.time()

    A_pred = model(features)
    # Debug output: shapes of the prediction and the target
    print(np.shape(A_pred))
    print(np.shape(adj_label))
    optimizer.zero_grad()
    loss = log_lik = norm*F.binary_cross_entropy(A_pred.view(-1), adj_label_flat, weight = weight_tensor)
    # ``model`` is the network instance here, so test its type; comparing it
    # with the string 'VGAE' is always False and would skip the KL term.
    if isinstance(model, VGAE):
        kl_divergence = 0.5/ A_pred.size(0) * (1 + 2*model.logstd - model.mean**2 - torch.exp(model.logstd)**2).sum(1).mean()
        # Out-of-place so log_lik keeps the pure reconstruction term
        loss = log_lik - kl_divergence

    loss.backward()
    optimizer.step()

    train_acc = get_acc(A_pred,adj_label)

    val_roc, val_ap = get_scores(val_edges, val_edges_false, A_pred)
    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(loss.item()),
          "train_acc=", "{:.5f}".format(train_acc), "val_roc=", "{:.5f}".format(val_roc),
          "val_ap=", "{:.5f}".format(val_ap),
          "time=", "{:.5f}".format(time.time() - t))


# Final evaluation uses the predictions of the last training step
test_roc, test_ap = get_scores(test_edges, test_edges_false, A_pred)
print("End of training!", "test_roc=", "{:.5f}".format(test_roc),
      "test_ap=", "{:.5f}".format(test_ap))

torch.Size([2708, 2708])
torch.Size([2708, 2708])
Epoch: 0001 train_loss= 0.47183 train_acc= 0.60303 val_roc= 0.90778 val_ap= 0.90528 time= 0.23836
torch.Size([2708, 2708])
torch.Size([2708, 2708])
Epoch: 0002 train_loss= 0.47087 train_acc= 0.60307 val_roc= 0.90862 val_ap= 0.90647 time= 0.26330
torch.Size([2708, 2708])
torch.Size([2708, 2708])
Epoch: 0003 train_loss= 0.46973 train_acc= 0.60324 val_roc= 0.90863 val_ap= 0.90607 time= 0.22639
torch.Size([2708, 2708])
torch.Size([2708, 2708])
Epoch: 0004 train_loss= 0.46858 train_acc= 0.60348 val_roc= 0.90885 val_ap= 0.90619 time= 0.22639
torch.Size([2708, 2708])
torch.Size([2708, 2708])
Epoch: 0005 train_loss= 0.46748 train_acc= 0.60356 val_roc= 0.90918 val_ap= 0.90708 time= 0.23038
torch.Size([2708, 2708])
torch.Size([2708, 2708])
Epoch: 0006 train_loss= 0.46641 train_acc= 0.60363 val_roc= 0.90893 val_ap= 0.90660 time= 0.24834
torch.Size([2708, 2708])
torch.Size([2708, 2708])
Epoch: 0007 train_loss= 0.46537 train_acc= 0.60334 val_roc= 0.

KeyboardInterrupt: 

## TEXT DATA


In [1]:
import os
import pickle
import torch

class Vocab(object):
    """Two-way mapping between tokens and integer ids, with optional embeddings.

    Args:
        filename: path of a pickled ``id2word`` dict; only read when ``load``
            is true.
        load: when true, restore the vocabulary from ``filename``; otherwise
            start with just the ``<PAD>`` and ``<UNK>`` tokens.
        threshold: minimum count a token needs for ``add_words`` to keep it.
    """

    def __init__(self, filename='', load=False, threshold=0):
        self.threshold = threshold
        self.wordCounter = None
        if load:
            assert os.path.exists(filename), "Vocab file does not exist at " + filename

            # load() also stores both mappings and the size on the instance
            self.id2word, self.word2id = self.load(filename)
            self.size = len(self.id2word)
        else:
            self.id2word, self.word2id = {}, {}
            self.size = 0
            # We always add some custom tokens into the vocabulary.
            # An infinite count passes any threshold.
            self.add_words(
                {'<PAD>': float('inf'), '<UNK>': float('inf')})
        self.word_embed = None

    def add_words(self, counterOfTokens):
        """Add every token whose count is at least ``self.threshold``.

        Args:
            counterOfTokens: mapping from token to count.  Tokens that are
                already known keep their id.
        """
        for item, value in counterOfTokens.items():
            if value >= self.threshold and item not in self.word2id:
                # add it to the vocab
                self.word2id[item] = self.size
                self.id2word[self.size] = item
                self.size += 1

    def load(self, filename):
        """Restore the mappings from a pickled ``id2word`` dict.

        Returns:
            ``(id2word, word2id)``; both are also stored on the instance.
        """
        with open(filename, 'rb') as infile:
            # NOTE: unpickling can execute arbitrary code; only load
            # vocabulary files from a trusted source.
            id2word = pickle.load(infile)
        word2id = {word: idx for idx, word in id2word.items()}
        self.id2word, self.word2id = id2word, word2id
        self.size = len(self.id2word)

        return id2word, word2id

    def save(self, filename):
        """Pickle ``id2word`` to ``filename``, replacing any existing file."""
        if os.path.exists(filename):
            os.remove(filename)

        with open(filename, 'wb') as outfile:
            pickle.dump(self.id2word, outfile)

    def __len__(self):
        return self.size

    def init_word_embed(self, cfg, cache_dir='datasets/.word_vectors_cache'):
        """Create ``self.word_embed`` from pretrained vectors.

        Args:
            cfg: mapping with the keys ``'word_vectors'``, ``'language'`` and
                ``'device'``.
            cache_dir: cache directory passed to the vector loader.

        Raises:
            NotImplementedError: if ``cfg['word_vectors']`` is not supported.
        """
        if cfg['word_vectors'] == 'Word2Vec':
            # NOTE(review): the option is called 'Word2Vec' but the vectors
            # loaded are FastText - confirm this is intended.
            from torchnlp.word_to_vector import FastText
            all_word_vector = FastText(language=cfg['language'], cache=cache_dir, aligned=True)
        else:
            raise NotImplementedError('No word_vectors found which are called {}.'.format(cfg['word_vectors']))

        # The vectors only correspond to lower-case words:
        all_words = [word.lower() for word in list(self.word2id.keys())]
        weights = all_word_vector[all_words]

        word_embed = torch.nn.Embedding(*weights.shape, _weight=weights)
        if cfg['device'] == 'cuda':
            word_embed.cuda()

        self.word_embed = word_embed
        self.embed_size = weights.shape[1]

    def _require_word_embed(self):
        """Raise if ``init_word_embed`` has not provided an embedding yet."""
        if self.word_embed is None:
            raise AttributeError("The word embeddings aren't initialized yet.")

    def words2vecs(self, words: list):
        """Return the embedding vectors of ``words``.

        Unknown words use the ``<UNK>`` id.

        Raises:
            AttributeError: if the embeddings are not initialized.
        """
        self._require_word_embed()
        # Build the indices as int64 on the embedding's own device so the
        # lookup also works on CUDA and for an empty word list.
        ids = torch.tensor(self.map(words), dtype=torch.long,
                           device=self.word_embed.weight.device)
        return self.word_embed(ids)

    def one_hot_ids2vecs(self, ids):
        """Return the embedding vectors of the id tensor ``ids``.

        Raises:
            AttributeError: if the embeddings are not initialized.
        """
        self._require_word_embed()
        return self.word_embed(ids)

    def map(self, token_list):
        """
        Map a list of tokens to their ids.

        Tokens that are not in the vocabulary map to the id of ``<UNK>``.
        """
        ids = []
        for token in token_list:
            idx = self.word2id.get(token)
            if idx is None:
                # Looked up only when needed, as before
                idx = self.word2id['<UNK>']
            ids.append(idx)
        return ids

    def unmap(self, idx_list):
        """
        Unmap ids back to tokens.
        """
        return [self.id2word[idx] for idx in idx_list]
    
def get_pos_vocab():
    """
    Build the hand-crafted part-of-speech vocabulary.

    Returns a ``Vocab`` whose mappings are replaced by a fixed tag list in
    which ``<PAD>`` has id 0 and ``<UNK>`` has id 1.
    """
    # The position in this list is the id of the tag
    tags = ['<PAD>', '<UNK>', 'DET', 'PROPN', 'VERB', 'PART', 'ADJ', 'PUNCT', 'CCONJ',
            'ADP', 'PRON', 'NOUN', 'ADV', 'INTJ', 'NUM', 'X', 'SYM']

    pos_vocab = Vocab()
    pos_vocab.id2word = dict(enumerate(tags))
    pos_vocab.word2id = {tag: idx for idx, tag in enumerate(tags)}
    pos_vocab.size = len(pos_vocab.id2word)

    return pos_vocab


In [3]:
# Code to load sst data
from torch.utils.data import Dataset

import torch
import numpy as np


class SSTData(Dataset):
    """Sentiment dataset that parses each text and returns ids plus a graph.

    Samples labelled ``'neutral'`` are dropped; the remaining labels are
    mapped with ``{'negative': 0, 'positive': 1}``.

    Args:
        sst_data: sequence of dicts with the keys ``'text'`` and ``'label'``.
        vocab: vocabulary whose ``map`` turns token texts into ids.
        nlp: parsing pipeline; a ``'sentencizer'`` pipe is added first if
            ``nlp.pipe_names`` does not contain one (this modifies ``nlp``).
        lemma_vocab: vocabulary whose ``map`` turns lemmas into ids.
        pos_vocab: vocabulary for part-of-speech tags; built with
            ``get_pos_vocab()`` when omitted.
        self_loop: forwarded to ``doc_to_adj``.
    """

    def __init__(self,
                 sst_data,
                 vocab,
                 nlp,
                 lemma_vocab,
                 pos_vocab = None,
                 self_loop=True):

        self.lemma_vocab = lemma_vocab
        self.self_loop = self_loop

        sst_data = [sample for sample in sst_data if sample['label'] != 'neutral']
        self.sst_data = sst_data

        self.sentiment_vocab = {'negative': 0, 'positive': 1}

        # Add sentencizer in the nlp if not already in it:
        if "sentencizer" not in nlp.pipe_names:
            nlp.add_pipe('sentencizer', first=True)
        self.nlp = nlp

        self.vocab = vocab
        if pos_vocab is None:
            self.pos_vocab = get_pos_vocab()
        else:
            self.pos_vocab = pos_vocab

    def __getitem__(self, idx):
        """Parse sample ``idx``.

        Returns:
            ``(token_ids, pos_ids, lamb, label, root_id, lemma_ids)`` where
            ``lamb`` is the normalised adjacency matrix of the parse.
        """
        doc = self.nlp(self.sst_data[idx]['text'])

        # Undirected adjacency of the parse plus the index of its root token
        adj, root_id = doc_to_adj(doc, directed=False, self_loop=self.self_loop)

        lamb = adj

        # normalize: entry [i, j] is divided by the sum of row j (the
        # division broadcasts over the last axis).
        # NOTE(review): if row normalisation was intended this needs
        # ``denom.unsqueeze(1)`` - confirm.
        denom = lamb.sum(1)
        # A token without any connection (possible when self_loop is False)
        # sums to 0; dividing by 1 instead keeps its entries 0 rather than NaN.
        denom = torch.where(denom == 0, torch.ones_like(denom), denom)
        lamb /= denom

        # make text indices
        token_ids = self.vocab.map([token.text for token in doc])

        # make pos ids
        pos_ids = self.pos_vocab.map([token.pos_ for token in doc])

        # make lemma ids
        lemma_ids = self.lemma_vocab.map([token.lemma_ for token in doc])

        # make label
        label = self.sentiment_vocab[self.sst_data[idx]['label']]

        return token_ids, pos_ids, lamb, label, root_id, lemma_ids

    def __len__(self):
        return len(self.sst_data)


def collate_fn_sentim(batch):
    """Pad a list of samples to a common length and stack them into tensors.

    Each sample starts with ``(text, pos, lamb, label, root_id, lemma)``.
    Every sequence is zero-padded to the largest ``lamb.shape[1]`` in the
    batch.

    Returns:
        ``(lambs, poss, texts, labels, lens, root_ids, lemmas)`` - note that
        the POS ids come before the text ids.  ``lens`` is a plain list of
        the unpadded lengths; the other items are tensors with the batch on
        the first axis.
    """
    unpacked = []
    for sample in batch:
        text, pos, lamb, label, root_id, lemma = sample[:6]
        unpacked.append((text, pos, lamb, label, root_id, lemma))

    lens = [lamb.shape[1] for _, _, lamb, _, _, _ in unpacked]
    max_len = max(lens)

    def pad_matrix(mat):
        """Copy ``mat`` into the top-left corner of a zero square."""
        padded = torch.zeros(1, max_len, max_len)
        padded[0, :mat.shape[0], :mat.shape[1]] = mat
        return padded

    def pad_ids(ids):
        """Copy ``ids`` into the front of a zero row of int64."""
        padded = torch.zeros(1, max_len, dtype=torch.long)
        padded[0, :len(ids)] = torch.tensor(ids)
        return padded

    def as_long_scalar(value):
        """Wrap one number into a 1-element int64 tensor."""
        return (torch.ones(1) * value).long()

    lambs = torch.cat([pad_matrix(lamb) for _, _, lamb, _, _, _ in unpacked], dim=0)
    texts = torch.cat([pad_ids(text) for text, _, _, _, _, _ in unpacked], dim=0)
    poss = torch.cat([pad_ids(pos) for _, pos, _, _, _, _ in unpacked], dim=0)
    labels = torch.cat([as_long_scalar(label) for _, _, _, label, _, _ in unpacked], dim=0)
    root_ids = torch.cat([as_long_scalar(root) for _, _, _, _, root, _ in unpacked], dim=0)
    lemmas = torch.cat([pad_ids(lemma) for _, _, _, _, _, lemma in unpacked], dim=0)

    return lambs, poss, texts, labels, lens, root_ids, lemmas


def doc_to_adj(sent, directed=True, self_loop=False):
    """Build the dependency adjacency matrix of a parsed text.

    Args:
        sent: sequence of tokens (should be a spaCy document; it can be
            longer than one sentence).  Each token provides ``i``,
            ``children`` and ``dep_``.
        directed: when false, the matrix is added to its transpose so every
            head->child edge also appears as child->head.
        self_loop: when true, the diagonal is set to 1.

    Returns:
        ``(ret, root_id)``: a float32 ``(len(sent), len(sent))`` tensor with
        ``ret[head, child] = 1``, and the index of the last token whose
        ``dep_`` is ``'ROOT'``.

    Raises:
        ValueError: if no token has ``dep_ == 'ROOT'`` (for example an empty
            input), instead of failing on an unbound local.
    """
    # Sent should be a spacy document. Can also be longer than a sentence.
    sent_len = len(sent)

    ret = torch.zeros(sent_len, sent_len, dtype=torch.float32)

    root_id = None
    for token in sent:
        for child in token.children:
            if child.i >= sent_len:
                # Diagnostics before the out-of-range assignment below fails
                print('Something goes wrong here.')
                print(child.i, sent_len, sent, token.i, token)
            ret[token.i, child.i] = 1
        if token.dep_ == 'ROOT':
            root_id = token.i

    if root_id is None:
        raise ValueError("No token with dep_ == 'ROOT' found; cannot determine root_id.")

    if not directed:
        ret = ret + ret.transpose(0, 1)

    if self_loop:
        ret.fill_diagonal_(1)

    return ret, root_id
