# 第五次作业

本次作业我们强化对GAE和VGAE的实践。具体地，我们用它们来完成（无监督）节点分类任务。

## 0. 课后习题

1. 图的自编码器和图卷积神经网络的区别是什么？
2. 图的变分自编码器比起图的自编码器的优点是什么？

## 1. 加载数据集

本次作业我们使用Cora数据集。

In [1]:
# set up session:
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import dgl
import dgl.function as fn
from dgl.nn import GraphConv
from dgl.data import CoraGraphDataset

from sklearn.preprocessing import normalize
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.metrics import classification_report, accuracy_score

In [2]:
def generate_negative_samples(g, k=19, seed=None):
    """ generate negative sample dataset for edge prediction whose size is k times of positive sample dataset
    """
    # set seed for numpy.random if provided:
    if seed is not None:
        np.random.seed(seed)

    # parse graph configuration:
    num_nodes = g.num_nodes() - 1
    adj = g.adj(transpose=False, ctx='cpu', scipy_fmt='csr')

    # generate negative samples:
    num_neg_samples = k*g.num_edges()
    neg_samples = set()
    while len(neg_samples) < num_neg_samples:
        proposal = np.random.randint(0, num_nodes, [num_neg_samples, 2])

        proposal = set(
            list(map(tuple, proposal[adj[proposal[:, 0], proposal[:, 1]].A1 < 1.0]))
        )

        neg_samples = neg_samples.union(proposal)
    
    # format:
    neg_samples = list(neg_samples)[:num_neg_samples]
    neg_samples = tuple(zip(*neg_samples))
    
    # done:
    return dgl.graph(neg_samples, num_nodes=g.num_nodes())

In [3]:
def get_train_valid_test_split(g, train_ratio=0.85, valid_ratio=0.05):
    """ get train-validation-test split of edges
    """
    edge_idx = np.arange(g.num_edges())
    np.random.shuffle(edge_idx)
    
    num_train = int(train_ratio*g.num_edges())
    num_valid = int(valid_ratio*g.num_edges())
    
    train_edge_idx = edge_idx[:num_train]
    valid_edge_idx = edge_idx[num_train:(num_train+num_valid)]
    test_edge_idx = edge_idx[(num_train+num_valid):]
    
    return (train_edge_idx, valid_edge_idx, test_edge_idx)

In [4]:
# set device:
device = torch.device('cuda:0')
# load dataset:
dataset = CoraGraphDataset('./data')

# get dataset
pos_g = dataset[0]
neg_g = generate_negative_samples(pos_g)

# train-validation-test split:
pos_train_edge_idx, pos_valid_edge_idx, pos_test_edge_idx = get_train_valid_test_split(pos_g)
neg_train_edge_idx, neg_valid_edge_idx, neg_test_edge_idx = get_train_valid_test_split(neg_g)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


In [5]:
# dataset overview, positive
pos_g

Graph(num_nodes=2708, num_edges=10556,
      ndata_schemes={'feat': Scheme(shape=(1433,), dtype=torch.float32), 'label': Scheme(shape=(), dtype=torch.int64), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'val_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={'__orig__': Scheme(shape=(), dtype=torch.int64)})

In [6]:
# dataset overview, negative
neg_g

Graph(num_nodes=2708, num_edges=200564,
      ndata_schemes={}
      edata_schemes={})

## 2. 定义GAE和VGAE
首先请同学们定义两个类：GAE和VGAE。

In [7]:
class GCNEncoder(torch.nn.Module):
    """ deep GCN encoder
    """
    def __init__(self, in_feats, out_feats):
        """ init layers
        """
        super().__init__()
        
        self.gcn1 = GraphConv(
            in_feats=in_feats, out_feats=2*out_feats, 
            weight=True, bias=True, 
            activation=F.relu, 
            allow_zero_in_degree=True
        )
        
        self.output = GraphConv(
            in_feats=2*out_feats, out_feats=out_feats, 
            weight=True, bias=True, 
            activation=None, 
            allow_zero_in_degree=True
        )
    
    def forward(self, g, x):
        """ forward propagation
        """
        h = self.gcn1(g, x)
        h = self.output(g, h)
        
        return h
    
class InnerProductDecoder(torch.nn.Module):
    """ inner product decoder
    """
    def __init__(self):
        super().__init__()
    
    def forward(self, g, z, sigmoid=True):
        """ forward propagation
        """
        g.ndata['z'] = z
        g.apply_edges(fn.u_dot_v('z', 'z', 'logit'))
        logit = g.edata['logit'].sum(dim=1)
        
        return torch.sigmoid(logit) if sigmoid else logit
    
class GAE(torch.nn.Module):
    """ graph autoencoder
    """
    EPSILON = 1e-16
    
    def __init__(self, encoder, decoder):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
    
    def encode(self, *args, **kwargs):
        """ encode
        """
        return self.encoder(*args, **kwargs)
    
    def decode(self, *args, **kwargs):
        """ decode
        """
        return self.decoder(*args, **kwargs)
    
    def get_reconstruction_loss(self, z, pos_g, neg_g, pos_edge_idx, neg_edge_idx):
        """ get edge reconstruction loss
        """
        pos_edge_prob = self.decode(g=pos_g, z=z)[pos_edge_idx] + GAE.EPSILON
        neg_edge_prob = 1.0 - self.decode(g=neg_g, z=z)[neg_edge_idx] + GAE.EPSILON
        
        pos_edge_loss = (-torch.log(pos_edge_prob)).mean()
        neg_edge_loss = (-torch.log(neg_edge_prob)).mean()
        
        return pos_edge_loss + neg_edge_loss
    
    def get_loss(self, *args, **kwargs):
        """ wrapper for loss function evaluation
        """
        return self.get_reconstruction_loss(*args, **kwargs)

In [8]:
class VariationalGCNEncoder(torch.nn.Module):
    MAX_LOGSTD = 10.0
    
    def __init__(self, in_feats, out_feats, variation_scale=3):
        super().__init__()
        
        # gcn1:
        self.gcn1 = GraphConv(
            in_feats=in_feats, out_feats=2*out_feats, 
            weight=True, bias=True, 
            activation=F.relu, 
            allow_zero_in_degree=True
        )
        
        # output, mu
        self.output_mu = GraphConv(
            in_feats=2*out_feats, out_feats=out_feats, 
            weight=True, bias=True, 
            activation=None, 
            allow_zero_in_degree=True
        )
        
        # output, log(std):
        self.output_logstd = GraphConv(
            in_feats=2*out_feats, out_feats=out_feats, 
            weight=True, bias=True, 
            activation=None, 
            allow_zero_in_degree=True
        )
        
        # for sampling from encoded Gaussian
        self.output_std_scale = variation_scale
        
    def forward(self, g, x):
        """ forward propagation
        """
        h = self.gcn1(g, x)
        
        mu = self.output_mu(g, h)
        logstd = self.output_logstd(g, h)
        
        return (mu, logstd)
    
    def sample_from_encoded_gaussian(self, mu, logstd, training):
        """ sample from encoded Gaussian
        """
        if training:
            return mu + (2*torch.randn_like(logstd) - 1) * self.output_std_scale * torch.exp(logstd)
        
        return mu
    
class VGAE(GAE): 
    """变分自编码器。继承自GAE这个类，可以使用GAE里面定义的函数。
    """
    
    def __init__(self, encoder, decoder):
        super().__init__(encoder=encoder, decoder=decoder)
    
    
    def encode(self, *args, **kwargs):
        """ encode
        """
        # get encoding Gaussian:
        self.__mu__, self.__logstd__ = self.encoder(*args, **kwargs)
        
        # limit standard deviation scale:
        self.__logstd__ = self.__logstd__.clamp(max=self.encoder.MAX_LOGSTD)
        
        # sample from encoding Gaussian:
        return self.encoder.sample_from_encoded_gaussian(
            self.__mu__, self.__logstd__, 
            self.training
        )

    def get_regulation_loss(self, mu=None, logstd=None):
        """ get encoding Gaussian regulation loss
        """
        mu = mu if not mu is None else self.__mu__
        logstd = logstd.clamp(max=self.encoder.MAX_LOGSTD) if not logstd is None else self.__logstd__
        
        # KL(p||q), with p as actual Gaussian and q as prior Gaussian:
        return -0.5*torch.mean(
            torch.mean(1.0 + 2*logstd - mu**2 - logstd.exp()**2, dim=1)
        )
    
    def get_loss(self, *args, **kwargs):
        """ wrapper for loss function evaluation
        """
        # TODO: the introduction of Gaussian prior regulation seems to hurt the performance
        return super().get_loss(*args, **kwargs) # + self.get_regulation_loss()

##  3. 训练模型

这部分请同学们自由发挥。

In [9]:
def train(model, g, lr=0.01, weight_decay=5e-4, epochs=1000, validation_step_size=125):
    """ train (variational) autoencoder
    """
    # parse dataset:
    (pos, neg) = g
    (pos_g, pos_train_edge_idx, pos_valid_edge_idx) = pos
    (neg_g, neg_train_edge_idx, neg_valid_edge_idx) = neg
    
    # init optimizer:
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    print(
        """Training..."""
    )
    
    # optimize:
    for i in range(epochs + 1):
        model.train()
        
        optimizer.zero_grad()
        # get encoding:
        z = model.encode(g=pos_g, x=pos_g.ndata['feat'])
        # get loss:
        train_loss = model.get_loss(z, pos_g, neg_g, pos_train_edge_idx, neg_train_edge_idx)
        
        # back propagation
        train_loss.backward()
        optimizer.step()
        
        # do validation
        if i % validation_step_size == 0:
            valid_loss = model.get_loss(z, pos_g, neg_g, pos_valid_edge_idx, neg_valid_edge_idx)
            print(
                """\tEpoch {}:\n"""
                """\t\t training / validation losses: {:.4f} / {:.4f}""".format(
                    i, 
                    train_loss.item(), valid_loss.item()
                )
            )

In [10]:
@torch.no_grad()
def test(model, pos_g, neg_g, pos_edge_idx, neg_edge_idx):
    model.eval()
    
    x = pos_g.ndata['feat']
    
    pos_z = model.encode(g=pos_g, x=x)
    neg_z = model.encode(g=neg_g, x=x)
    
    pos_y = pos_z.new_ones(pos_edge_idx.size)
    neg_y = neg_z.new_zeros(neg_edge_idx.size)
    y = torch.cat([pos_y, neg_y], dim=0)

    pos_pred = model.decoder(pos_g, pos_z)[pos_edge_idx]
    neg_pred = model.decoder(neg_g, neg_z)[neg_edge_idx]
    pred = torch.cat([pos_pred, neg_pred], dim=0)

    y, pred = y.detach().cpu().numpy(), pred.detach().cpu().numpy()

    return roc_auc_score(y, pred), average_precision_score(y, pred)

### 3.1 GAE

In [11]:
# config:
in_feats, out_feats = pos_g.ndata['feat'].shape[1], 16

# init:
gae = GAE(
    encoder=GCNEncoder(in_feats, out_feats),
    decoder=InnerProductDecoder()
).to(device)
pos_g = pos_g.to(device)
neg_g = neg_g.to(device)

# training:
train(
    # model:
    gae, 
    # dataset:
    (
        (pos_g, pos_train_edge_idx, pos_valid_edge_idx),
        (neg_g, neg_train_edge_idx, neg_valid_edge_idx)
    ),
    # optimizer:
    lr=5e-3, weight_decay=5e-4, 
    # training and validation:
    epochs=2500
)

Training...
	Epoch 0:
		 training / validation losses: 1.3862 / 1.3862
	Epoch 125:
		 training / validation losses: 1.1126 / 1.1431
	Epoch 250:
		 training / validation losses: 1.1006 / 1.1350
	Epoch 375:
		 training / validation losses: 1.0989 / 1.1346
	Epoch 500:
		 training / validation losses: 1.0985 / 1.1343
	Epoch 625:
		 training / validation losses: 1.0985 / 1.1333
	Epoch 750:
		 training / validation losses: 1.0983 / 1.1339
	Epoch 875:
		 training / validation losses: 1.0983 / 1.1334
	Epoch 1000:
		 training / validation losses: 1.0982 / 1.1337
	Epoch 1125:
		 training / validation losses: 1.0982 / 1.1335
	Epoch 1250:
		 training / validation losses: 1.0982 / 1.1338
	Epoch 1375:
		 training / validation losses: 1.0984 / 1.1333
	Epoch 1500:
		 training / validation losses: 1.0984 / 1.1344
	Epoch 1625:
		 training / validation losses: 1.0982 / 1.1334
	Epoch 1750:
		 training / validation losses: 1.0983 / 1.1333
	Epoch 1875:
		 training / validation losses: 1.0983 / 1.1342
	Epoch

In [12]:
# test:
(roc_auc, acc) = test(
    gae,
    pos_g, neg_g,
    pos_test_edge_idx, neg_test_edge_idx
)

print(
    """GAE Edge Prediction Summary:\n"""
    """\t ROC AUC / Accuracy: {:.4f} / {:.4f}""".format(
        roc_auc, acc
    )
)

GAE Edge Prediction Summary:
	 ROC AUC / Accuracy: 0.9263 / 0.9275


In [13]:
# get embedding:
embedding_gae = gae.encode(g=pos_g, x=pos_g.ndata['feat']).cpu().detach().numpy()

### 3.2 VGAE

In [14]:
# config:
in_feats, out_feats = pos_g.ndata['feat'].shape[1], 16

# init:
vgae = VGAE(
    encoder=VariationalGCNEncoder(in_feats, out_feats),
    decoder=InnerProductDecoder()
).to(device)

pos_g = pos_g.to(device)
neg_g = neg_g.to(device)

# training:
train(
    # model:
    vgae, 
    # dataset:
    (
        (pos_g, pos_train_edge_idx, pos_valid_edge_idx),
        (neg_g, neg_train_edge_idx, neg_valid_edge_idx)
    ),
    # optimizer:
    lr=5e-3, weight_decay=5e-4, 
    # training and validation:
    epochs=2500
)

Training...
	Epoch 0:
		 training / validation losses: 34.9745 / 34.9409
	Epoch 125:
		 training / validation losses: 1.3785 / 1.3834
	Epoch 250:
		 training / validation losses: 1.3324 / 1.3343
	Epoch 375:
		 training / validation losses: 1.3203 / 1.3391
	Epoch 500:
		 training / validation losses: 1.3150 / 1.3203
	Epoch 625:
		 training / validation losses: 1.3096 / 1.3180
	Epoch 750:
		 training / validation losses: 1.2992 / 1.3112
	Epoch 875:
		 training / validation losses: 1.2788 / 1.2919
	Epoch 1000:
		 training / validation losses: 1.1771 / 1.2069
	Epoch 1125:
		 training / validation losses: 1.1417 / 1.1736
	Epoch 1250:
		 training / validation losses: 1.1238 / 1.1534
	Epoch 1375:
		 training / validation losses: 1.0772 / 1.1054
	Epoch 1500:
		 training / validation losses: 1.0451 / 1.0670
	Epoch 1625:
		 training / validation losses: 1.0329 / 1.0555
	Epoch 1750:
		 training / validation losses: 1.0211 / 1.0507
	Epoch 1875:
		 training / validation losses: 1.0121 / 1.0378
	Epo

In [15]:
# test:
(roc_auc, acc) = test(
    vgae,
    pos_g, neg_g,
    pos_test_edge_idx, neg_test_edge_idx
)

print(
    """Variational GAE Edge Prediction Summary:\n"""
    """\t ROC AUC / Accuracy: {:.4f} / {:.4f}""".format(
        roc_auc, acc
    )
)

Variational GAE Edge Prediction Summary:
	 ROC AUC / Accuracy: 0.9833 / 0.9838


In [16]:
# get embedding:
embedding_vgae = vgae.encode(g=pos_g, x=pos_g.ndata['feat']).cpu().detach().numpy()

## 4. 测试模型

训练一个线性模型（比如逻辑回归模型）来预测节点的标签，并输出预测准确率。

In [23]:
def evaluate_node_classification(
    embeddings, labels, 
    train_mask, test_mask, 
    normalize_embedding=True, 
    max_iter=1000
):
    """ use single-layer MLP for node label prediction using (variational) graph auto-encoder embeddings
    """
    # normalize:
    X = embeddings
    if normalize_embedding:
        X = normalize(embeddings)
    
    # split train-test sets:
    X_train, y_train = X[train_mask, :], labels[train_mask]
    X_test, y_test = X[test_mask, :], labels[test_mask]
    
    # build classifier:
    clf = MLPClassifier(
        random_state=42,
        hidden_layer_sizes=[32],
        max_iter=max_iter
    ).fit(X_train, y_train)
    
    # make prediction:
    preds = clf.predict(X_test)
    
    # get classification report:
    print(
        classification_report(
            y_true=y_test, y_pred=preds
        )
    )
    # get accuracy score:
    test_acc = accuracy_score(y_true=y_test, y_pred=preds)
    
    return preds, test_acc

In [24]:
preds, test_acc = evaluate_node_classification(
    embedding_gae, pos_g.ndata['label'].cpu().detach().numpy(), 
    pos_g.ndata['train_mask'].cpu().detach().numpy(), pos_g.ndata['test_mask'].cpu().detach().numpy()
)

print('GAE Test Accuracy: %.4f' % test_acc)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       130
           1       0.15      0.95      0.25        91
           2       0.00      0.00      0.00       144
           3       0.00      0.00      0.00       319
           4       0.34      0.92      0.49       149
           5       1.00      0.01      0.02       103
           6       0.00      0.00      0.00        64

    accuracy                           0.22      1000
   macro avg       0.21      0.27      0.11      1000
weighted avg       0.17      0.22      0.10      1000

GAE Test Accuracy: 0.2240


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
preds, test_acc = evaluate_node_classification(
    embedding_vgae, pos_g.ndata['label'].cpu().detach().numpy(), 
    pos_g.ndata['train_mask'].cpu().detach().numpy(), pos_g.ndata['test_mask'].cpu().detach().numpy()
)

print('Variational GAE Test Accuracy: %.4f' % test_acc)

              precision    recall  f1-score   support

           0       0.41      0.51      0.46       130
           1       0.38      0.69      0.49        91
           2       0.78      0.88      0.83       144
           3       0.74      0.36      0.49       319
           4       0.70      0.54      0.61       149
           5       0.46      0.70      0.55       103
           6       0.28      0.36      0.32        64

    accuracy                           0.55      1000
   macro avg       0.54      0.58      0.53      1000
weighted avg       0.61      0.55      0.55      1000

Variational GAE Test Accuracy: 0.5470




到这里本次作业就结束了。这次的任务其实是无监督的节点分类问题。可以看到，我们会使用一些和前面DeepWalk作业中相似的代码。同学可以讲DeepWalk和GAE/VGAE的结果做一个比较，看看谁在这个任务上效果更好。