1. 超参数设置

In [None]:
# ---- Hyperparameters and run configuration --------------------------------
lr = 0.01                # learning rate shared by the DeepWalk and MLP optimizers
n_epoch = 200            # number of training epochs
epochs = n_epoch         # the training loops below iterate over `epochs`
hidden_dim = 16          # hidden width of the MLP classifier
emb_dim = 128            # embedding width; must match DeepWalk's emb_dim (default 128)
batch_size = 128         # start nodes per DeepWalk mini-batch
                         # NOTE(review): assumed value, not given elsewhere - confirm
l2_coef = 5e-4           # L2 coefficient (not read by the cells shown below)
dataset = 'cora'         # dataset name (rebound to the dataset object when it is loaded)
dataset_path = './examples/gcn/'
best_model_path = './'
self_loops = 1
gpu = -1                 # GPU index; any negative value selects the CPU

# Select the compute device through `tlx`.
if gpu >= 0:
    tlx.set_device("GPU", gpu)
else:
    tlx.set_device("CPU")

2. 数据集处理与加载

In [None]:
# Load the dataset and take the graph stored at index 0.
dataset = CoraGraphDataset()
g = dataset[0]

# Labels and train/test masks are read from the graph's node data.
y = g.ndata['label']
train_mask = g.ndata['train_mask']
test_mask = g.ndata['test_mask']

# DeepWalk model with its default hyperparameters.
model = DeepWalk(g)

# Node embeddings used later as classifier features. `detach()` returns a
# tensor that shares storage with the embedding weight, so in-place updates
# made to the weight during training remain visible through X.
X = model.node_embed.weight.detach()

# Each mini-batch is a shuffled set of node ids; `model.sample` (the collate
# function) turns those ids into random walks.
dataloader = DataLoader(
    torch.arange(g.num_nodes()),
    batch_size=batch_size,
    shuffle=True,
    collate_fn=model.sample,
)


3. DeepWalk模型实现

In [None]:
class DeepWalk(nn.Module):
    """Skip-gram style DeepWalk model trained with negative sampling.

    Two embedding tables are learned: ``node_embed`` (used for the "source"
    side of every pair) and ``context_embed`` (used for the "destination"
    side). ``forward`` takes a batch of random walks and returns a scalar
    loss built from in-window (positive) pairs and randomly paired
    (negative) positions of the same batch.

    Args:
        g: Graph object. Only ``g.num_nodes()`` is used here; the graph is
            also handed to ``random_walk`` in :meth:`sample`.
        emb_dim: Width of both embedding tables.
        walk_length: Number of nodes in every walk.
        window_size: Maximum distance (in walk positions) between the two
            members of a positive pair.
        neg_weight: Multiplier applied to the negative-sample loss term.
        negative_size: Number of negative samples drawn per positive pair.
        sparse: Passed to ``nn.Embedding``; when true the embeddings produce
            sparse gradients.
    """

    def __init__(self, g, emb_dim=128, walk_length=40, window_size=5,
                 neg_weight=1, negative_size=5, sparse=True):
        super().__init__()
        self.g = g
        self.emb_dim = emb_dim
        self.window_size = window_size
        self.walk_length = walk_length
        self.neg_weight = neg_weight
        self.negative_size = negative_size
        num_nodes = g.num_nodes()
        # Centre-node embeddings and context embeddings.
        self.node_embed = nn.Embedding(num_nodes, emb_dim, sparse=sparse)
        self.context_embed = nn.Embedding(num_nodes, emb_dim, sparse=sparse)
        self.reset_parameters()

        # Pre-compute, once, the (src, dst) walk-position pairs that count as
        # positives inside a single walk: for every position i, each other
        # position j within `window_size` steps yields the pair (j, i).
        idx_list_src = []
        idx_list_dst = []
        for i in range(walk_length):
            window_start = max(0, i - window_size)
            window_stop = min(walk_length, i + 1 + window_size)
            neighbours = [j for j in range(window_start, window_stop) if j != i]
            idx_list_src.extend(neighbours)
            idx_list_dst.extend([i] * len(neighbours))

        # Kept as plain attributes (not buffers); forward() moves the derived
        # indices to the batch's device.
        self.idx_list_src = torch.tensor(idx_list_src, dtype=torch.long)
        self.idx_list_dst = torch.tensor(idx_list_dst, dtype=torch.long)

    def reset_parameters(self):
        """Initialise node embeddings uniformly in +-1/emb_dim and zero the context table."""
        init_range = 1.0 / self.emb_dim
        init.uniform_(self.node_embed.weight.data, -init_range, init_range)
        init.constant_(self.context_embed.weight.data, 0)

    def sample(self, indices):
        """Collate function: turn a batch of start nodes into random walks.

        Returns the first element of ``random_walk``'s result, requested with
        ``length=walk_length - 1``.
        NOTE(review): presumably walks of ``walk_length`` nodes each; confirm
        against the ``random_walk`` implementation in use.
        """
        return random_walk(self.g, indices, length=self.walk_length - 1)[0]

    def forward(self, batch_walk):
        """Compute the negative-sampling loss for a batch of walks.

        Args:
            batch_walk: Integer tensor of node ids. The index arithmetic
                below assumes shape ``(batch, walk_length)``.

        Returns:
            Scalar tensor: mean positive loss plus the mean negative loss
            scaled by ``negative_size * neg_weight``.
        """
        batch_size = len(batch_walk)
        device = batch_walk.device

        # Embeddings of every walk position, flattened to
        # (batch * walk_length, emb_dim) so positions can be addressed by a
        # single flat index.
        batch_node_embed = self.node_embed(batch_walk).view(-1, self.emb_dim)
        batch_context_embed = self.context_embed(batch_walk).view(
            -1, self.emb_dim
        )

        # Shift the per-walk position pairs by each walk's offset in the
        # flattened layout.
        walk_offset = (torch.arange(batch_size) * self.walk_length).unsqueeze(1)
        idx_list_src = (walk_offset + self.idx_list_src.unsqueeze(0)).view(-1)
        idx_list_dst = (walk_offset + self.idx_list_dst.unsqueeze(0)).view(-1)
        idx_list_src = idx_list_src.to(device)
        idx_list_dst = idx_list_dst.to(device)

        # Positive pairs.
        pos_src_emb = batch_node_embed[idx_list_src]
        pos_dst_emb = batch_context_embed[idx_list_dst]

        # Negative pairs: every positive destination index is repeated
        # `negative_size` times. This stays in integer arithmetic; going
        # through a float tensor would stop being exact above 2**24.
        neg_idx_list_src = idx_list_dst.repeat_interleave(self.negative_size)
        neg_src_emb = batch_node_embed[neg_idx_list_src]

        # ...and paired with randomly chosen positions of the same batch.
        # Python's `random` module is used, so `random.seed` controls it.
        neg_idx_list_dst = list(range(batch_size * self.walk_length)) * (
            self.negative_size * self.window_size * 2
        )
        random.shuffle(neg_idx_list_dst)
        neg_idx_list_dst = neg_idx_list_dst[: len(neg_idx_list_src)]
        neg_idx_list_dst = torch.LongTensor(neg_idx_list_dst).to(device)
        neg_dst_emb = batch_context_embed[neg_idx_list_dst]

        # Positive term: dot products clamped to [-6, 6], then -log(sigmoid).
        pos_score = torch.sum(torch.mul(pos_src_emb, pos_dst_emb), dim=1)
        pos_score = torch.clamp(pos_score, max=6, min=-6)
        pos_loss = torch.mean(-F.logsigmoid(pos_score))

        # Negative term: same clamp, sign flipped inside the sigmoid.
        neg_score = torch.sum(torch.mul(neg_src_emb, neg_dst_emb), dim=1)
        neg_score = torch.clamp(neg_score, max=6, min=-6)
        neg_loss = (
            torch.mean(-F.logsigmoid(-neg_score))
            * self.negative_size
            * self.neg_weight
        )

        # Both terms are already scalars, so their sum is the loss.
        return pos_loss + neg_loss


4. 预测器实现

In [None]:
# 预测器
class MLPClassifier(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear.

    Args:
        in_feats: Width of the input features.
        hidden_size: Width of the hidden layer.
        out_feats: Number of output logits.
    """

    def __init__(self, in_feats, hidden_size, out_feats):
        super().__init__()
        # Sub-modules are registered in the order fc1, fc2, relu.
        self.fc1 = nn.Linear(in_features=in_feats, out_features=hidden_size)
        self.fc2 = nn.Linear(in_features=hidden_size, out_features=out_feats)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the raw (un-normalised) output of the second linear layer."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)


5. 设置优化器

In [None]:
# Optimizer for the DeepWalk embeddings, using the shared learning rate `lr`.
# `DeepWalk(g)` is built with its default `sparse=True`, so both embedding
# tables are created with sparse gradients enabled.
# NOTE(review): presumably this is torch.optim.SparseAdam - confirm against the import cell.
optimizer = SparseAdam(model.parameters(), lr=lr)

6. DeepWalk训练流程

In [None]:
# Train the DeepWalk embeddings: one optimizer step per mini-batch of walks.
for epoch in range(epochs):
    for batch_walk in dataloader:
        # Clear the gradients left over from the previous step, then run
        # forward / backward / update for this batch.
        optimizer.zero_grad()
        loss = model(batch_walk)
        loss.backward()
        optimizer.step()
    # Reports the loss of the last mini-batch of the epoch only.
    print(f"DeepWalk Epoch {epoch}, Loss: {loss.item()}")


7. 预测器训练与推理流程

In [None]:
# Downstream node classifier trained on the DeepWalk embeddings.
# The input width is read from X itself so it always equals the embedding
# width; the class count is the largest label plus one.
mlp_model = MLPClassifier(in_feats=X.shape[1],
                          hidden_size=hidden_dim,
                          out_feats=y.max().item() + 1)
optimizer_mlp = optim.Adam(mlp_model.parameters(), lr=lr)

# Training: every epoch is one full-batch step over the training nodes.
mlp_model.train()  # nothing inside the loop leaves training mode
for epoch in range(epochs):
    optimizer_mlp.zero_grad()
    # DeepWalk embeddings are the input features.
    output = mlp_model(X[train_mask])
    loss = F.cross_entropy(output, y[train_mask])
    loss.backward()
    optimizer_mlp.step()
# Printed once, after the loop: index and loss of the final epoch.
print(f'MLP Epoch {epoch}, Loss: {loss.item()}')

# Inference: accuracy on the test nodes, with gradient tracking disabled.
mlp_model.eval()
with torch.no_grad():
    output = mlp_model(X[test_mask])
    predicted = output.argmax(dim=1)
    accuracy = (predicted == y[test_mask]).sum().item() / len(y[test_mask])
    print(f'Accuracy: {accuracy:.4f}')
