# 图神经网络入门代码集
作者：丁雨山、彬斌
## 实战案例：搭建影视作品的图网络

In [None]:
import pandas as pd

# Load the raw Netflix catalogue.
df = pd.read_csv('/home/jovyan/kernel/netflix_titles.csv')


def _split_names(cell):
    """Turn a comma-separated cell into a list of stripped names ([] if missing)."""
    if pd.isna(cell):
        return []
    return [part.strip() for part in cell.split(",")]


# Every multi-valued text column becomes a list-valued column:
#   director  -> directors   (all directors of the title)
#   cast      -> actors      (all actors of the title)
#   listed_in -> categories  (all genres of the title)
#   country   -> countries   (all countries of the title)
for _raw_col, _list_col in (
    ("director", "directors"),
    ("cast", "actors"),
    ("listed_in", "categories"),
    ("country", "countries"),
):
    df[_list_col] = df[_raw_col].apply(_split_names)

# Keep only the five attributes used to build the graph.
df = df[["title", "directors", "actors", "categories", "countries"]]

df

In [None]:
import networkx as nx

# Create the graph network (gn) that will hold the catalogue.
gn = nx.Graph(label="Netflix")

# (list-valued column, node label, edge label), in the order in which they are
# added for every title. The "DERECTED_BY" spelling is kept as is because the
# drawing code compares against exactly this string.
_RELATIONS = (
    ("actors", "PERSON", "ACTED_IN"),
    ("directors", "PERSON", "DERECTED_BY"),
    ("categories", "CATEGORY", "CATEGORY_IN"),
    ("countries", "COUNTRY", "COUNTRY_IN"),
)

# Walk over the rows and add the nodes and edges of each title.
for _, row in df.iterrows():
    title = row['title']
    # One MOVIE node per row.
    gn.add_node(title, label="MOVIE")

    for column, node_label, edge_label in _RELATIONS:
        for name in row[column]:
            # Node for the related entity, then the edge linking it to the title.
            gn.add_node(name, label=node_label)
            gn.add_edge(title, name, label=edge_label)


In [None]:
import matplotlib.pyplot as plt
# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in Matplotlib 3.6 and
# the old name was removed in later releases. Try the original name first so
# older installations behave as before, then fall back to the new name.
try:
    plt.style.use('seaborn')
except OSError:
    plt.style.use('seaborn-v0_8')
# Use large square figures by default for the graph drawings.
plt.rcParams['figure.figsize'] = [14,14]



def get_adjacent_nodes(G, nodes):
    """Return the given nodes together with all of their direct neighbours.

    G only needs to provide ``neighbors(node)``. The result is a list without
    duplicates, in no particular order.
    """
    collected = set()
    for node in nodes:
        collected.add(node)
        collected.update(G.neighbors(node))
    return list(collected)

def draw_sub_graph(G, sub_graph):
    """Draw the subgraph of G induced by the nodes listed in ``sub_graph``.

    Nodes are coloured and sized according to their 'label' attribute, and
    every edge whose 'label' is one of the four known relation names is
    annotated with that name.
    """
    # Induced subgraph and a spring layout for it.
    subgraph = G.subgraph(sub_graph)
    pos = nx.spring_layout(subgraph)

    # Node label -> (colour, size). A node with any other label contributes no
    # entry to the two lists.
    node_style = {
        "MOVIE": ('blue', 700),
        "PERSON": ('red', 600),
        "CATEGORY": ('green', 500),
        "COUNTRY": ('yellow', 400),
    }
    node_colors = []
    node_sizes = []
    for n in subgraph.nodes():
        style = node_style.get(G.nodes[n]['label'])
        if style is not None:
            colour, size = style
            node_colors.append(colour)
            node_sizes.append(size)

    nx.draw(subgraph, pos, with_labels=True, node_color=node_colors, node_size=node_sizes, width=2, font_size=15)

    # Annotate each edge with its relation name.
    known_relations = ("ACTED_IN", "DERECTED_BY", "CATEGORY_IN", "COUNTRY_IN")
    edge_labels = {
        e: G.edges[e]['label']
        for e in subgraph.edges()
        if G.edges[e]['label'] in known_relations
    }
    nx.draw_networkx_edge_labels(subgraph, pos, edge_labels=edge_labels, font_color="red")
    

In [None]:
# Pick two titles and collect them together with all of their direct neighbours.
nodes = ["Ocean's Twelve", "Ocean's Thirteen"]
sub_graph = get_adjacent_nodes(gn, nodes)

# Draw the resulting neighbourhood subgraph.
draw_sub_graph(gn, sub_graph)

In [None]:
# Same visualisation for two titles from different genres.
nodes = ["Superman Returns", "Tom and Jerry: The Magic Ring"]
sub_graph = get_adjacent_nodes(gn, nodes)

# Draw the resulting neighbourhood subgraph.
draw_sub_graph(gn, sub_graph)

In [None]:
# Shortest path between the two titles, as a list of node names.
path = nx.shortest_path(gn, "Superman Returns", "Tom and Jerry: The Magic Ring")
print("最短路径图节点：")
print(path)
print("\n")

# Walk over consecutive node pairs and print the relation stored on each edge.
print("最短路径图节点以及它们直接的关系：")
for src, dst in zip(path, path[1:]):
    relation = gn.edges[(src, dst)]["label"]
    print("{}\t{}\t{}".format(src, dst, relation))

## 四步理解图网络模型

In [None]:
# Install the node2vec package into the notebook environment (IPython shell escape).
! pip install node2vec

import pandas as pd
import networkx as nx


# Load the raw Netflix catalogue.
df = pd.read_csv('/home/jovyan/kernel/netflix_titles.csv')


def _split_names(cell):
    """Turn a comma-separated cell into a list of stripped names ([] if missing)."""
    if pd.isna(cell):
        return []
    return [part.strip() for part in cell.split(",")]


# Every multi-valued text column becomes a list-valued column:
#   director  -> directors   (all directors of the title)
#   cast      -> actors      (all actors of the title)
#   listed_in -> categories  (all genres of the title)
#   country   -> countries   (all countries of the title)
for _raw_col, _list_col in (
    ("director", "directors"),
    ("cast", "actors"),
    ("listed_in", "categories"),
    ("country", "countries"),
):
    df[_list_col] = df[_raw_col].apply(_split_names)

# Keep only the five attributes used to build the graph.
df = df[["title", "directors", "actors", "categories", "countries"]]


# Create the graph network (gn) that will hold the catalogue.
gn = nx.Graph(label="Netflix")

# (list-valued column, node label, edge label), in the order in which they are
# added for every title. The "DERECTED_BY" spelling is kept as is because other
# cells compare against exactly this string.
_RELATIONS = (
    ("actors", "PERSON", "ACTED_IN"),
    ("directors", "PERSON", "DERECTED_BY"),
    ("categories", "CATEGORY", "CATEGORY_IN"),
    ("countries", "COUNTRY", "COUNTRY_IN"),
)

# Walk over the rows and add the nodes and edges of each title.
for _, row in df.iterrows():
    title = row['title']
    # One MOVIE node per row.
    gn.add_node(title, label="MOVIE")

    for column, node_label, edge_label in _RELATIONS:
        for name in row[column]:
            # Node for the related entity, then the edge linking it to the title.
            gn.add_node(name, label=node_label)
            gn.add_edge(title, name, label=edge_label)

        
# Node2Vec

from node2vec import Node2Vec

# Configure Node2Vec on the graph: 100-dimensional embeddings, with the random
# walks controlled by walk_length=16 and num_walks=10.
n2v = Node2Vec(gn, dimensions=100, walk_length=16, num_walks=10)
# Train the embedding model. window/min_count are presumably forwarded to
# gensim's Word2Vec — confirm against the node2vec documentation.
model = n2v.fit(window=5, min_count=1)

In [None]:
# Print the nodes (titles, people, genres, ...) most similar to a given node.
def print_similiar(name):
    """Print the nodes whose Node2Vec embeddings are closest to ``name``.

    Uses the module-level ``model``. The lookup goes through ``model.wv``
    (the keyed vectors): calling ``most_similar`` on the Word2Vec model itself
    is deprecated in gensim 3.x and not available in gensim 4.
    """
    for node, _ in model.wv.most_similar(name):
        print(node)

In [None]:
from sklearn.manifold import TSNE

# Define t-SNE, projecting into a 2-D space.
tsne = TSNE(n_components=2)
# Take the node embeddings out of the trained Node2Vec model.
node_embeddings = model.wv.vectors
node_embeddings_2d = tsne.fit_transform(node_embeddings)

# Before plotting, give every node a colour according to its label.
# gensim 4 renamed ``index2word`` to ``index_to_key``; support both. The ids
# are assumed to be in the same order as the rows of ``model.wv.vectors``.
node_ids = getattr(model.wv, "index_to_key", None)
if node_ids is None:
    node_ids = model.wv.index2word
node_labels = [gn.nodes[node_id]["label"] for node_id in node_ids]
label_map = {
    "MOVIE":"blue",
    "PERSON":"red",
    "CATEGORY":"green",
    "COUNTRY":"yellow"
}

node_colours = [label_map[lab] for lab in node_labels]

plt.figure()
# Colours are given explicitly per point, so no colormap is passed (Matplotlib
# ignores ``cmap`` in that case and warns about it).
plt.scatter(node_embeddings_2d[:,0],
            node_embeddings_2d[:,1],
            c=node_colours)

In [None]:
# Colour used for each node label.
label_map = {
    "MOVIE":"blue",
    "PERSON":"red",
    "CATEGORY":"green",
    "COUNTRY":"yellow"
}


def draw_n2v_sub_graph(nodes, markers):
    '''
    Plot the 2-D embeddings of each given title and of its direct neighbours.

    nodes: list of str, titles whose neighbourhoods are plotted
    markers: list of str, one matplotlib marker per title so the
        neighbourhoods can be told apart

    Relies on the module-level ``gn``, ``model``, ``node_embeddings_2d``,
    ``label_map`` and ``get_adjacent_nodes``. Raises KeyError when a node has
    no embedding row.
    '''
    fig = plt.figure()
    ax = fig.add_subplot(111)

    # Row of every node in the embedding matrix, built once instead of doing a
    # linear ``list.index`` search per plotted node. gensim 4 renamed
    # ``index2word`` to ``index_to_key``; support both.
    vocab = getattr(model.wv, "index_to_key", None)
    if vocab is None:
        vocab = model.wv.index2word
    row_of = {key: row for row, key in enumerate(vocab)}

    for node, marker in zip(nodes, markers):
        # The title itself plus everything directly connected to it.
        neighbourhood = get_adjacent_nodes(gn, [node])
        rows = [row_of[n] for n in neighbourhood]
        xs = [node_embeddings_2d[r, 0] for r in rows]
        ys = [node_embeddings_2d[r, 1] for r in rows]
        colours = [label_map[gn.nodes[n]["label"]] for n in neighbourhood]

        # Explicit per-point colours, so no colormap is passed.
        ax.scatter(xs, ys, s=300, c=colours, marker=marker, label=node)

        # Write the node name slightly above each point.
        for x, y, title in zip(xs, ys, neighbourhood):
            ax.annotate(title, (x, y), textcoords="offset points", xytext=(0,10), ha='center', size=16)

In [None]:
# Embedding neighbourhoods of two closely related titles (circle vs. diamond markers).
draw_n2v_sub_graph(["Ocean's Twelve", "Ocean's Thirteen"], ["o", "d"]) 

In [None]:
# Embedding neighbourhoods of two titles from different genres (circle vs. diamond markers).
draw_n2v_sub_graph(["Superman Returns", "Tom and Jerry: The Magic Ring"], ["o", "d"])

## node2vec 模型搭建影视作品推荐系统

In [None]:
import math
import pandas as pd
import numpy as np


def get_recommendation(G, root, label):
    """Rank nodes carrying ``label`` by their Adamic/Adar similarity to ``root``.

    Every node two hops away from ``root`` whose 'label' attribute equals
    ``label`` is a candidate. Its score is the sum of ``1 / log(degree)`` over
    the neighbours it shares with ``root``.

    Returns a pandas Series indexed by candidate node, sorted by score in
    descending order.
    """
    # candidate -> list of neighbours it has in common with root
    shared = {}
    for via in G.neighbors(root):
        for candidate in G.neighbors(via):
            # The root itself is never recommended.
            if candidate == root:
                continue
            if G.nodes[candidate]['label'] == label:
                shared.setdefault(candidate, []).append(via)

    # Adamic/Adar: shared neighbours with few connections count more.
    candidates = list(shared)
    scores = [
        sum((1 / math.log(G.degree(via)) for via in vias), 0.0)
        for vias in shared.values()
    ]

    # Highest similarity first.
    ranking = pd.Series(data=np.array(scores), index=candidates)
    return ranking.sort_values(ascending=False)

In [None]:
# Recommend movies for "Ocean's Twelve" using the graph-based scoring.
recommends = get_recommendation(gn, "Ocean's Twelve", label="MOVIE")
print("*"*40+"\n Recommendation for 'Ocean's Twelve'\n"+"*"*40)
# Show the top of the ranking.
print(recommends.head())

In [None]:
# Recommend movies for "Superman Returns".
recommends = get_recommendation(gn, "Superman Returns", label="MOVIE")
print("*"*40+"\n Recommendation for 'Superman Returns'\n"+"*"*40)
# Show the top of the ranking.
print(recommends.head())

In [None]:
# Recommend movies for "Tom and Jerry: The Magic Ring".
recommends = get_recommendation(gn, "Tom and Jerry: The Magic Ring", label="MOVIE")
print("*"*40+"\n Recommendation for 'Tom and Jerry: The Magic Ring'\n"+"*"*40)
# Show the top of the ranking.
print(recommends.head())

In [None]:
# The same scoring works for people: rank PERSON nodes that share titles with "Brad Pitt".
recommends = get_recommendation(gn, "Brad Pitt", label="PERSON")
print("*"*40+"\n Recommendation for 'Brad Pitt'\n"+"*"*40)
# Show the top of the ranking.
print(recommends.head())

In [None]:
def get_recommendation_by_n2v(name):
    """Print the nodes closest to ``name`` in the Node2Vec embedding space.

    Uses the module-level ``model``. The lookup goes through ``model.wv``
    (the keyed vectors): calling ``most_similar`` on the Word2Vec model itself
    is deprecated in gensim 3.x and not available in gensim 4.
    """
    for node, _ in model.wv.most_similar(name):
        print(node)

get_recommendation_by_n2v("Ocean's Twelve")

In [None]:
# Persist the trained embeddings (word2vec format) and the full model.
# The target directory is created first: saving fails when ./models is missing.
import os

os.makedirs("./models", exist_ok=True)
model.wv.save_word2vec_format("./models/netflix_n2v")
model.save("./models/netflix_n2v_model")

## 基于图卷积神经网络的图节点分类

In [None]:
# Download the Cora citation dataset archive and unpack it (IPython shell escapes).
! wget https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz
! tar -xf cora.tgz

# List the extracted files.
! ls cora

In [None]:
import pandas as pd
# Read the content file into a DataFrame (tab-separated, no header row).
raw_data = pd.read_csv('cora/cora.content',sep = '\t',header = None)
 # Number of samples (2708 according to the original note).
num = raw_data.shape[0]
print("样本点数", num)
raw_data

In [None]:
# Column 0 holds the paper id; show the id of the first row.
print('论文id：', raw_data[0][0])

In [None]:
# Bag-of-words vector of the first paper (columns 1..1433).
raw_data.iloc[0:1, 1:1434]

In [None]:
# Row sum of the first paper's bag-of-words columns, i.e. how many vocabulary words it contains.
print('论文出现词语数量：', raw_data.iloc[0:1, 1:1434].sum(1)[0])

In [None]:
# Column 1434 holds the class label; show the label of the first paper.
print('论文类型：', raw_data.iloc[0, 1434])

In [None]:
# Citation data: one pair of paper ids per row (tab-separated, no header row).
raw_data_cites = pd.read_csv('cora/cora.cites',sep = '\t',header = None)
# Preview the first ten rows.
raw_data_cites[:10]

In [None]:
import numpy as np
import scipy.sparse as sp
import torch


def encode_onehot(labels):
    """
    Convert a sequence of class labels into one-hot row vectors.

    Classes are numbered in sorted order, so the column assigned to a class is
    the same on every run (iterating a ``set`` of strings directly is not
    reproducible across interpreter runs).

    Parameters
    ----------
    labels : sequence of hashable, mutually comparable labels

    Returns
    -------
    numpy.ndarray of dtype int32 and shape (len(labels), number of classes)
    """
    classes = sorted(set(labels))
    class_index = {c: i for i, c in enumerate(classes)}
    # Explicit integer dtype so that an empty label list still indexes cleanly.
    codes = np.array([class_index[c] for c in labels], dtype=np.intp)
    # Row i of the identity matrix is the one-hot vector of class i.
    return np.eye(len(classes), dtype=np.int32)[codes]

def load_data(path="./cora/", dataset="cora"):
    """Load a citation network dataset (cora only for now).

    Reads ``<path><dataset>.content`` and ``<path><dataset>.cites`` and
    returns ``(adj, features, labels, idx_train, idx_val, idx_test)``:
    the row-normalized adjacency matrix (with self-loops) as a torch sparse
    tensor, the row-normalized feature matrix as a dense float tensor, the
    class index of every node, and three index tensors with fixed ranges.
    """
    print('Loading {} dataset...'.format(dataset))

    # Every row: node id, feature columns, class label (all read as strings).
    idx_features_labels = np.genfromtxt("{}{}.content".format(path, dataset),
                                        dtype=np.dtype(str))
    features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
    labels = encode_onehot(idx_features_labels[:, -1])

    # build graph
    idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
    # Map the original node ids to consecutive row indices.
    idx_map = {j: i for i, j in enumerate(idx)}
    edges_unordered = np.genfromtxt("{}{}.cites".format(path, dataset),
                                    dtype=np.int32)
    # Rewrite both endpoints of every edge in terms of row indices.
    edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                     dtype=np.int32).reshape(edges_unordered.shape)
    adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                        shape=(labels.shape[0], labels.shape[0]),
                        dtype=np.float32)

    # build symmetric adjacency matrix: wherever the transposed entry is
    # larger, use it instead of the original entry
    adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

    features = normalize(features)
    # Add self-loops before normalizing the adjacency matrix.
    adj = normalize(adj + sp.eye(adj.shape[0]))

    # Fixed index ranges for the training / validation / test nodes.
    idx_train = range(140)
    idx_val = range(200, 500)
    idx_test = range(500, 1500)

    features = torch.FloatTensor(np.array(features.todense()))
    # Turn the one-hot rows back into class indices.
    labels = torch.LongTensor(np.where(labels)[1])
    adj = sparse_mx_to_torch_sparse_tensor(adj)

    idx_train = torch.LongTensor(idx_train)
    idx_val = torch.LongTensor(idx_val)
    idx_test = torch.LongTensor(idx_test)

    return adj, features, labels, idx_train, idx_val, idx_test

def normalize(mx):
    """
    Row-normalize a (sparse) matrix so that every non-empty row sums to 1.

    Rows whose sum is zero are left as all zeros. Integer-typed input is
    supported: the row sums are converted to float before being inverted
    (NumPy refuses to raise integers to a negative power).

    Returns the product ``diag(1 / rowsum) @ mx``.
    """
    rowsum = np.array(mx.sum(1)).flatten()
    if not np.issubdtype(rowsum.dtype, np.floating):
        rowsum = rowsum.astype(np.float64)
    # 1/0 yields inf for empty rows; silence the warning and zero them below.
    with np.errstate(divide="ignore"):
        r_inv = np.power(rowsum, -1)
    r_inv[np.isinf(r_inv)] = 0.
    r_mat_inv = sp.diags(r_inv)
    return r_mat_inv.dot(mx)

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """
    Convert a scipy sparse matrix into a float32 torch sparse COO tensor.

    Built with ``torch.sparse_coo_tensor``, which replaces the deprecated
    ``torch.sparse.FloatTensor`` constructor.
    """
    coo = sparse_mx.tocoo().astype(np.float32)
    # 2 x nnz index matrix: first row holds row indices, second row column indices.
    indices = torch.from_numpy(
        np.vstack((coo.row, coo.col)).astype(np.int64))
    values = torch.from_numpy(coo.data)
    return torch.sparse_coo_tensor(indices, values, torch.Size(coo.shape))

def accuracy(output, labels):
    """
    Fraction of rows of ``output`` whose highest-scoring column equals the label.

    Returns a double-precision tensor holding ``correct / len(labels)``.
    """
    _, predicted = output.max(1)
    hits = predicted.type_as(labels).eq(labels).double().sum()
    return hits / len(labels)

In [None]:
# Convert the content DataFrame into a numpy array.
idx_features_labels = np.array(raw_data)
# Take each paper's bag-of-words vector as its feature vector, stored as a sparse matrix.
features = sp.csr_matrix(idx_features_labels[:, 1:-1], dtype=np.float32)
# Take each paper's class (last column) as its label and one-hot encode it.
labels = encode_onehot(idx_features_labels[:, -1])

# Paper ids (first column).
idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
# Map every paper id to its row index, i.e. into the range [0, number of papers).
idx_map = {j: i for i, j in enumerate(idx)}

# Convert the citation DataFrame into a numpy array.
edges_unordered = np.array(raw_data_cites)
# Rewrite the ids in the citation data as row indices.
edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                 dtype=np.int32).reshape(edges_unordered.shape)
# Store the citation relation between papers as a sparse matrix.
adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                    shape=(labels.shape[0], labels.shape[0]),
                    dtype=np.float32)

# Build the symmetric adjacency matrix: wherever the transposed entry is
# larger, use it instead of the original entry.
adj = adj + adj.T.multiply(adj.T > adj) - adj.multiply(adj.T > adj)

# Row-normalize the paper features.
features = normalize(features)
# Add the identity matrix (self-loops) to the adjacency matrix, then row-normalize it.
adj = normalize(adj + sp.eye(adj.shape[0]))

# Fixed index ranges for the training / validation / test nodes.
idx_train = range(140)
idx_val = range(200, 500)
idx_test = range(500, 1500)

# Convert everything to torch tensors; labels go from one-hot rows back to class indices.
features = torch.FloatTensor(np.array(features.todense()))
labels = torch.LongTensor(np.where(labels)[1])
adj = sparse_mx_to_torch_sparse_tensor(adj)

idx_train = torch.LongTensor(idx_train)
idx_val = torch.LongTensor(idx_val)
idx_test = torch.LongTensor(idx_test)

In [None]:
import math
import torch

from torch.nn.parameter import Parameter
from torch.nn.modules.module import Module


class GraphConvolution(Module):
    """
    A simple graph convolution layer, see https://arxiv.org/abs/1609.02907

    Attributes
    ----------
    in_features : int
        Size of the input feature vectors, i.e. $|H^{(l)}|$
    out_features : int
        Size of the output vectors, i.e. $|H^{(l+1)}|$
    weight : Parameter
        Trainable weight matrix of shape (in_features, out_features)
    bias : Parameter or None
        Trainable bias vector of size out_features; None when the layer is
        created with ``bias=False`` (the default is to use a bias)

    Methods
    -------
    __init__(self, in_features, out_features, bias=True)
        Store the sizes, create the parameters and initialise them
    reset_parameters(self)
        Initialise the parameters
    forward(self, input, adj)
        Compute $N(A) H^{(l)} W^{(l)}$ (plus the bias, if any), where
        ``input`` is the feature matrix and ``adj`` is the transformed
        adjacency matrix $N(A)=D^{-1}\tilde{A}$
    __repr__(self)
        Short text form: class name and the two sizes
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.FloatTensor(in_features, out_features))
        if not bias:
            # Register the name so that ``self.bias`` exists and is None.
            self.register_parameter('bias', None)
        else:
            self.bias = Parameter(torch.FloatTensor(out_features))
        self.reset_parameters()

    def reset_parameters(self):
        # Uniform initialisation in [-1/sqrt(out_features), 1/sqrt(out_features)].
        bound = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-bound, bound)
        if self.bias is not None:
            self.bias.data.uniform_(-bound, bound)

    def forward(self, input, adj):
        # N(A) * (H * W)
        propagated = torch.spmm(adj, torch.mm(input, self.weight))
        if self.bias is None:
            return propagated
        # N(A) * H * W + b
        return propagated + self.bias

    def __repr__(self):
        return "{} ({} -> {})".format(
            self.__class__.__name__, self.in_features, self.out_features)

In [None]:
import torch.nn as nn
import torch.nn.functional as F


class GCN(nn.Module):
    '''
    Two-layer graph convolutional network for node classification.

    Attributes
    ----------
    gc1 : GraphConvolution
        First layer, maps n_feat input features to n_hid hidden units
    gc2 : GraphConvolution
        Second layer, maps n_hid hidden units to n_class outputs
    dropout : float
        Dropout rate applied between the two layers

    Methods
    -------
    __init__(self, n_feat, n_hid, n_class, dropout)
        Build the two layers from the input size, hidden size, number of
        classes and dropout rate
    forward(self, x, adj)
        x is the node feature matrix, adj the already transformed adjacency
        matrix $N(A)$; returns per-class log-probabilities for every node
    '''
    def __init__(self, n_feat, n_hid, n_class, dropout):
        super().__init__()
        # Input features -> hidden representation.
        self.gc1 = GraphConvolution(n_feat, n_hid)
        # Hidden representation -> one score per class.
        self.gc2 = GraphConvolution(n_hid, n_class)
        # Dropout rate used in forward().
        self.dropout = dropout

    def forward(self, x, adj):
        # First graph convolution followed by ReLU.
        hidden = F.relu(self.gc1(x, adj))
        # Dropout is only active in training mode.
        hidden = F.dropout(hidden, self.dropout, training=self.training)
        # Second graph convolution maps to the class dimension.
        scores = self.gc2(hidden, adj)
        # Log-softmax over the classes.
        return F.log_softmax(scores, dim=1)

In [None]:
import time
import argparse
import numpy as np

import torch
import torch.nn.functional as F
import torch.optim as optim


# Training hyper-parameters.
class Args:
    no_cuda = False     # set to True to disable cuda/gpu
    seed = 42           # random seed
    epochs = 200        # number of training epochs
    lr = 0.01           # learning rate
    weight_decay = 5e-4 # weight decay passed to the optimizer (not a learning-rate schedule)
    hidden = 16         # hidden layer size
    dropout = 0.5       # dropout rate



args = Args()
# Whether cuda is requested and available. NOTE(review): in the code shown here
# this flag only affects seeding; nothing is moved to the GPU.
args.cuda = not args.no_cuda and torch.cuda.is_available()
# Seed the random number generators.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Load the data: transformed adjacency matrix, input features, class labels,
# and the training / validation / test indices.
adj, features, labels, idx_train, idx_val, idx_test = load_data()

# Build a GCN instance from the two-layer model defined above:
# input feature size is features.shape[1]
# hidden size is args.hidden
# number of output classes is labels.max().item() + 1
# dropout rate is args.dropout
model = GCN(n_feat=features.shape[1],
            n_hid=args.hidden,
            n_class=labels.max().item() + 1,
            dropout=args.dropout)

# Build an Adam optimizer over the trainable parameters of the GCN model,
# with learning rate args.lr
# and weight decay args.weight_decay
optimizer = optim.Adam(model.parameters(),
                       lr=args.lr, 
                       weight_decay=args.weight_decay)


# Model training function; epoch is the (zero-based) index of the current iteration.
def train(epoch):
    """Run one full-batch training step and report train/validation metrics.

    Uses the module-level ``model``, ``optimizer``, ``features``, ``adj``,
    ``labels``, ``idx_train`` and ``idx_val``. Prints one line with the losses,
    accuracies and elapsed time; returns nothing.
    """
    # Start time of this epoch.
    t = time.time()
    # Put the model into training mode (enables dropout).
    model.train()
    # Clear the gradients left over from the previous step.
    optimizer.zero_grad()
    # Forward pass over the whole graph; the output holds per-class
    # log-probabilities for every node.
    output = model(features, adj)
    # Loss and accuracy on the training nodes only.
    loss_train = F.nll_loss(output[idx_train], labels[idx_train])
    acc_train = accuracy(output[idx_train], labels[idx_train])
    # Back-propagate and update the trainable parameters.
    loss_train.backward()
    optimizer.step()

    # Validate with the updated parameters in eval mode (dropout off). No
    # gradients are needed here, so the forward pass runs under no_grad to
    # avoid building an autograd graph.
    model.eval()
    with torch.no_grad():
        output = model(features, adj)
        loss_val = F.nll_loss(output[idx_val], labels[idx_val])
        acc_val = accuracy(output[idx_val], labels[idx_val])

    # Print all results together with the time this epoch took.
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'acc_train: {:.4f}'.format(acc_train.item()),
          'loss_val: {:.4f}'.format(loss_val.item()),
          'acc_val: {:.4f}'.format(acc_val.item()),
          'time: {:.4f}s'.format(time.time() - t))

# Record when training starts.
t_start = time.time()
# Train the GCN model for args.epochs iterations.
for epoch in range(args.epochs):
    train(epoch)
    
# Report completion and the total training time.
print("模型训练完成！")
print("模型训练总耗时: {:.4f}s".format(time.time() - t_start))

In [None]:
# Model evaluation function.
def test():
    """Evaluate the trained model on the test nodes and print loss and accuracy.

    Uses the module-level ``model``, ``features``, ``adj``, ``labels`` and
    ``idx_test``; returns nothing.
    """
    # Eval mode switches dropout off.
    model.eval()
    # Pure inference: run the forward pass without building an autograd graph.
    with torch.no_grad():
        output = model(features, adj)
        # Loss and accuracy on the test nodes only.
        loss_test = F.nll_loss(output[idx_test], labels[idx_test])
        acc_test = accuracy(output[idx_test], labels[idx_test])
    # Print the test results.
    print("Test set results:",
          "loss= {:.4f}".format(loss_test.item()),
          "accuracy= {:.4f}".format(acc_test.item()))


test()

## 基于图卷积神经网络的链接预测

In [None]:
import torch
import numpy as np
import pandas as pd
import networkx as nx
import scipy.sparse as sp

In [None]:
def encode_onehot(labels):
    """
    Convert a sequence of class labels into one-hot row vectors.

    Classes are numbered in sorted order, so the column assigned to a class is
    the same on every run (iterating a ``set`` of strings directly is not
    reproducible across interpreter runs).

    Parameters
    ----------
    labels : sequence of hashable, mutually comparable labels

    Returns
    -------
    numpy.ndarray of dtype int32 and shape (len(labels), number of classes)
    """
    classes = sorted(set(labels))
    class_index = {c: i for i, c in enumerate(classes)}
    # Explicit integer dtype so that an empty label list still indexes cleanly.
    codes = np.array([class_index[c] for c in labels], dtype=np.intp)
    # Row i of the identity matrix is the one-hot vector of class i.
    return np.eye(len(classes), dtype=np.int32)[codes]

def sparse_mx_to_torch_sparse_tensor(sparse_mx):
    """
    Convert a scipy sparse matrix into a float32 torch sparse COO tensor.

    Built with ``torch.sparse_coo_tensor``, which replaces the deprecated
    ``torch.sparse.FloatTensor`` constructor.
    """
    # COO form exposes the coordinates of every stored value.
    coo = sparse_mx.tocoo().astype(np.float32)
    # 2 x nnz index matrix: first row holds row indices, second row column indices.
    indices = torch.from_numpy(
        np.vstack((coo.row, coo.col)).astype(np.int64))
    # Stored values and the matrix shape, in torch form.
    values = torch.from_numpy(coo.data)
    shape = torch.Size(coo.shape)
    return torch.sparse_coo_tensor(indices, values, shape)

def sparse_to_tuple(sparse_mx):
    """
    Break a scipy sparse matrix into the triple (coordinates, values, shape).

    The coordinates come back as an (nnz, 2) array of (row, col) pairs.
    """
    coo = sparse_mx if sp.isspmatrix_coo(sparse_mx) else sparse_mx.tocoo()
    coords = np.vstack([coo.row, coo.col]).T
    return coords, coo.data, coo.shape

def preprocess_graph(adj):
    """
    Prepare an adjacency matrix for the GCN: add self-loops, scale by the
    inverse square root of the row sums on both sides, and return the result
    as a torch sparse tensor.
    """
    coo = sp.coo_matrix(adj)
    # Adjacency matrix with self-loops.
    with_self_loops = coo + sp.eye(coo.shape[0])
    # Diagonal matrix holding rowsum^(-1/2).
    row_sums = np.array(with_self_loops.sum(1))
    d_inv_sqrt = sp.diags(np.power(row_sums, -0.5).flatten())
    # (A~ D^-1/2)^T D^-1/2
    column_scaled = with_self_loops.dot(d_inv_sqrt)
    normalized = column_scaled.transpose().dot(d_inv_sqrt).tocoo()
    return sparse_mx_to_torch_sparse_tensor(normalized)

def mask_test_edges(adj):
    """
    Split the edges of an undirected graph into train / validation / test sets.

    Self-loops are removed first. Of the remaining undirected edges (taken
    from the upper triangle of ``adj``), 10% are held out for testing and 5%
    for validation, chosen at random through NumPy's global random state. For
    each held-out set an equally sized set of negative examples is sampled:
    node pairs that are not connected in the full graph.

    Parameters
    ----------
    adj : scipy sparse matrix, symmetric adjacency matrix of the graph

    Returns
    -------
    adj_train : symmetric scipy sparse matrix containing only training edges
    train_edges, val_edges, test_edges : (n, 2) arrays of node index pairs
    val_edges_false, test_edges_false : lists of [i, j] negative pairs
        (returned in the order adj_train, train_edges, val_edges,
        val_edges_false, test_edges, test_edges_false)

    Raises
    ------
    ValueError
        If the graph does not contain enough unconnected node pairs to sample
        the requested number of negative examples.
    """

    # Remove the diagonal (self-loops).
    adj = adj - sp.dia_matrix((adj.diagonal()[np.newaxis, :], [0]), shape=adj.shape)
    adj.eliminate_zeros()

    def edge_array(matrix):
        # (nnz, 2) array of (row, col) coordinates of the stored entries.
        coo = matrix.tocoo()
        return np.vstack((coo.row, coo.col)).transpose()

    # Undirected edges (upper triangle) and all stored entries (both directions).
    edges = edge_array(sp.triu(adj))
    edges_all = edge_array(adj)
    num_nodes = adj.shape[0]
    num_test = int(np.floor(edges.shape[0] / 10.))
    num_val = int(np.floor(edges.shape[0] / 20.))

    # Shuffle the edge indices; the first num_val go to validation, the next
    # num_test to testing, everything else stays in the training set.
    all_edge_idx = list(range(edges.shape[0]))
    np.random.shuffle(all_edge_idx)
    val_edge_idx = all_edge_idx[:num_val]
    test_edge_idx = all_edge_idx[num_val:(num_val + num_test)]
    test_edges = edges[test_edge_idx]
    val_edges = edges[val_edge_idx]
    # Explicit integer dtype keeps np.delete working when nothing is held out.
    held_out_idx = np.asarray(val_edge_idx + test_edge_idx, dtype=np.intp)
    train_edges = np.delete(edges, held_out_idx, axis=0)

    # Set of all connected pairs for O(1) membership tests.
    existing = {(int(u), int(v)) for u, v in edges_all}
    undirected_existing = {(min(u, v), max(u, v)) for u, v in existing}
    max_negatives = num_nodes * (num_nodes - 1) // 2 - len(undirected_existing)

    def sample_negative_edges(count):
        # Draw `count` distinct unconnected node pairs by rejection sampling.
        if count > max_negatives:
            raise ValueError(
                "cannot sample {} negative edges: only {} unconnected node "
                "pairs exist".format(count, max_negatives))
        chosen = []
        chosen_set = set()
        while len(chosen) < count:
            idx_i = int(np.random.randint(0, num_nodes))
            idx_j = int(np.random.randint(0, num_nodes))
            if idx_i == idx_j:
                continue
            pair = (idx_i, idx_j)
            reverse = (idx_j, idx_i)
            # A negative must not be an edge of the full graph in either
            # direction; this also keeps validation negatives from coinciding
            # with held-out test edges.
            if pair in existing or reverse in existing:
                continue
            if pair in chosen_set or reverse in chosen_set:
                continue
            chosen_set.add(pair)
            chosen.append([idx_i, idx_j])
        return chosen

    test_edges_false = sample_negative_edges(len(test_edges))
    val_edges_false = sample_negative_edges(len(val_edges))

    # Internal consistency checks on the split.
    train_set = set(map(tuple, train_edges.tolist()))
    val_set = set(map(tuple, val_edges.tolist()))
    test_set = set(map(tuple, test_edges.tolist()))
    assert train_set.isdisjoint(val_set)
    assert train_set.isdisjoint(test_set)
    assert val_set.isdisjoint(test_set)

    data = np.ones(train_edges.shape[0])

    # Rebuild a symmetric adjacency matrix from the training edges only.
    adj_train = sp.csr_matrix((data, (train_edges[:, 0], train_edges[:, 1])), shape=adj.shape)
    adj_train = adj_train + adj_train.T

    return adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false



In [None]:

# Read the Cora content file into a DataFrame (tab-separated, no header row).
raw_data = pd.read_csv('cora/cora.content', sep='\t', header=None)
# Number of samples (2708 according to the original note).
num = raw_data.shape[0]
raw_data_cites = pd.read_csv('cora/cora.cites', sep='\t', header=None)
# Convert the content data into a numpy array.
idx_features_labels = np.array(raw_data)
# Take each paper's bag-of-words vector as its feature vector, stored as a sparse matrix.
features = sp.csr_matrix(idx_features_labels[:, 1: -1], dtype=np.float32)

# Paper ids (first column).
idx = np.array(idx_features_labels[:, 0], dtype=np.int32)
# Map every paper id to its row index, i.e. into the range [0, number of papers).
idx_map = {j: i for i, j in enumerate(idx)}

# Convert the citation data into a numpy array.
edges_unordered = np.array(raw_data_cites)
# Rewrite the ids in the citation data as row indices.
edges = np.array(list(map(idx_map.get, edges_unordered.flatten())),
                 dtype=np.int32).reshape(edges_unordered.shape)

label = encode_onehot(idx_features_labels[:, -1])
# Store the citation relation between papers as a sparse matrix.
adj = sp.coo_matrix((np.ones(edges.shape[0]), (edges[:, 0], edges[:, 1])),
                    shape=(label.shape[0], label.shape[0]),
                    dtype=np.float32)

features = torch.FloatTensor(np.array(features.todense()))
# Round-trip through an undirected networkx graph to get the adjacency matrix
# of the undirected citation graph. networkx 3 removed
# ``from_scipy_sparse_matrix`` in favour of ``from_scipy_sparse_array``
# (available since 2.7); use whichever exists. The result is converted to a
# scipy csr_matrix so the code below sees the same type on every version.
_graph_from_sparse = getattr(nx, "from_scipy_sparse_array", None)
if _graph_from_sparse is None:
    _graph_from_sparse = nx.convert_matrix.from_scipy_sparse_matrix
adj = sp.csr_matrix(nx.adjacency_matrix(_graph_from_sparse(adj)))

# Keep the original adjacency matrix (without self-loops) for evaluation later.
adj_orig = adj
adj_orig = adj_orig - sp.dia_matrix((adj_orig.diagonal()[np.newaxis, :], [0]), shape=adj_orig.shape)
adj_orig.eliminate_zeros()

# Hold out validation and test edges; training continues on the remaining edges only.
adj_train, train_edges, val_edges, val_edges_false, test_edges, test_edges_false = mask_test_edges(adj)
adj = adj_train

# Graph preprocessing: normalized training adjacency as a torch sparse tensor,
# and the dense training target (training edges plus self-loops).
adj_norm = preprocess_graph(adj)
adj_label = adj_train + sp.eye(adj_train.shape[0])
adj_label = torch.FloatTensor(adj_label.toarray())

# Ratio of zero entries (non-edges) to non-zero entries (edges) in the training
# adjacency matrix; used to up-weight the positive class in the loss.
pos_weight = float(adj.shape[0] * adj.shape[0] - adj.sum()) / adj.sum()
# Scaling factor applied to the loss during training.
norm = adj.shape[0] * adj.shape[0] / float((adj.shape[0] * adj.shape[0] - adj.sum()) * 2)

In [None]:
import torch.nn as nn
import torch.nn.functional as F

class GCN4Link(nn.Module):
    '''
    Two-layer graph convolutional network for link prediction.

    Attributes
    ----------
    gc1 : GraphConvolution
        First layer, maps n_feat input features to n_hid hidden units
    gc2 : GraphConvolution
        Second layer, maps n_hid hidden units to n_hid-dimensional node embeddings
    dropout : float
        Dropout rate applied after each layer

    Methods
    -------
    __init__(self, n_feat, n_hid, dropout)
        Build the two layers from the input size, hidden size and dropout rate
    forward(self, x, adj)
        x is the node feature matrix, adj the already transformed adjacency
        matrix $N(A)$; returns an N x N matrix of edge logits
    '''

    def __init__(self, n_feat, n_hid, dropout):
        super(GCN4Link, self).__init__()
        # First graph convolution: input features (n_feat) -> hidden size (n_hid).
        self.gc1 = GraphConvolution(n_feat, n_hid)
        # Second graph convolution: n_hid -> n_hid. GraphConvolution's third
        # positional parameter is ``bias``, not a dropout rate, so the dropout
        # value must not be passed here (doing so silently disabled the bias
        # whenever dropout was 0).
        self.gc2 = GraphConvolution(n_hid, n_hid)
        # Dropout rate used in forward().
        self.dropout = dropout

    def forward(self, x, adj):
        # First graph convolution followed by ReLU.
        x = F.relu(self.gc1(x, adj))
        # Dropout is only active in training mode.
        x = F.dropout(x, self.dropout, training=self.training)
        # Second graph convolution produces the node embeddings.
        x = self.gc2(x, adj)

        # Dropout on the embeddings for robustness.
        x = F.dropout(x, self.dropout, training=self.training)

        # Dot product of every pair of node embeddings: the logit of an edge
        # existing between the two nodes.
        adj_preds = torch.mm(x, x.t())
        return adj_preds

In [None]:
# Training hyper-parameters.
class Args:
    no_cuda = False     # set to True to disable cuda/gpu
    seed = 42           # random seed
    epochs = 500        # number of training epochs
    lr = 0.01           # learning rate
    hidden = 64         # hidden layer size
    dropout = 0.       # dropout rate

In [None]:
from sklearn.metrics import average_precision_score

def get_acc_score(emb, adj_orig, edges_pos, edges_neg):
    """
    Score link predictions with average precision.

    Each candidate edge (i, j) is scored as sigmoid(emb[i] . emb[j]), i.e. the
    (i, j) entry of sigmoid(emb @ emb.T). Only the requested pairs are
    computed; the full N x N product is never built.

    Parameters
    ----------
    emb : array of shape (num_nodes, dim), one row per node
    adj_orig : unused; kept so that existing callers keep working
    edges_pos : sequence of (i, j) pairs that are true edges
    edges_neg : sequence of (i, j) pairs that are not edges

    Returns
    -------
    float, the average precision over positives (label 1) and negatives (label 0)
    """
    def sigmoid(x):
        return 1 / (1 + np.exp(-x))

    emb = np.asarray(emb)

    def edge_scores(pairs):
        # reshape(-1, 2) keeps an empty pair list usable as an index array.
        pairs = np.asarray(pairs, dtype=np.intp).reshape(-1, 2)
        # Row-wise dot product of the two endpoint rows.
        logits = np.einsum("ij,ij->i", emb[pairs[:, 0]], emb[pairs[:, 1]])
        return sigmoid(logits)

    # Predicted scores of the positive and of the negative examples.
    preds = edge_scores(edges_pos)
    preds_neg = edge_scores(edges_neg)

    # Merge scores and ground-truth labels and evaluate with sklearn's
    # average precision metric.
    preds_all = np.hstack([preds, preds_neg])
    labels_all = np.hstack([np.ones(len(preds)), np.zeros(len(preds_neg))])
    acc_score = average_precision_score(labels_all, preds_all)

    return acc_score

In [None]:
from torch import optim
import time

args = Args()
# Whether cuda is requested and available. NOTE(review): in the code shown here
# this flag only affects seeding; nothing is moved to the GPU.
args.cuda = not args.no_cuda and torch.cuda.is_available()
# Seed the random number generators.
np.random.seed(args.seed)
torch.manual_seed(args.seed)
if args.cuda:
    torch.cuda.manual_seed(args.seed)

# Create the model.
model = GCN4Link(n_feat=features.shape[1],
        n_hid=args.hidden,
        dropout=args.dropout)

# Create the optimizer.
optimizer = optim.Adam(model.parameters(), lr=args.lr)

# Training loop.
for epoch in range(args.epochs):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    # Forward pass: N x N matrix of edge logits.
    preds = model(features, adj_norm)
    # Binary cross-entropy on the logits; norm and pos_weight (computed
    # earlier) balance the many non-edges against the few edges.
    loss = norm * F.binary_cross_entropy_with_logits(preds, adj_label, pos_weight=torch.tensor(pos_weight))

    # Backward pass.
    loss.backward()
    cur_loss = loss.item()
    # Parameter update.
    optimizer.step()

    # Validate with the updated parameters. get_acc_score expects node
    # embeddings (it forms emb @ emb.T itself), not the N x N logits returned
    # by model(...), so the embeddings are computed from the two layers
    # directly, in eval mode and without gradient tracking.
    model.eval()
    with torch.no_grad():
        hidden_emb = model.gc2(F.relu(model.gc1(features, adj_norm)), adj_norm)
    acc_curr = get_acc_score(hidden_emb.numpy(), adj_orig, val_edges, val_edges_false)

    print("Epoch:", '%04d' % (epoch + 1), "train_loss=", "{:.5f}".format(cur_loss),
          "accurancy=", "{:.5f}".format(acc_curr),
          "time=", "{:.5f}".format(time.time() - t)
          )

print("Optimization Finished!")

In [None]:
model.eval()
# Evaluate on the test set. The node embeddings are recomputed from the trained
# layers in eval mode: get_acc_score expects embeddings (it forms emb @ emb.T
# itself), and `preds` left over from training is the train-mode N x N logit
# matrix from before the last update.
with torch.no_grad():
    hidden_emb = model.gc2(F.relu(model.gc1(features, adj_norm)), adj_norm)
acc = get_acc_score(hidden_emb.numpy(), adj_orig, test_edges, test_edges_false)
print('Test accurancy: ' + str(acc))