# 数据准备

In [1]:
from utils import DGraphFin
from utils.utils import prepare_folder
from utils.evaluator import Evaluator

import torch
import torch.nn.functional as F
import torch.nn as nn

import torch_geometric.transforms as T

import numpy as np
from torch_geometric.data import Data
import os

# Select the compute device. The GPU selection below is disabled; everything runs on CPU.
# deviceId = 0
# device = f'cuda:{deviceId}' if torch.cuda.is_available() else 'cpu'
device = 'cpu'
device = torch.device(device)

path='./datasets/632d74d4e2843a53167ee9a1-momodel/' # where the dataset is stored
save_dir='./results/' # where trained models are saved
dataset_name='DGraph'
dataset = DGraphFin(root=path, name=dataset_name, transform=T.ToSparseTensor())

nlabels = dataset.num_classes
if dataset_name in ['DGraph']:
    nlabels = 2    # only classes 0 and 1 need to be predicted in this experiment

data = dataset[0]
data.adj_t = data.adj_t.to_symmetric() # turn the directed graph into an undirected one


if dataset_name in ['DGraph']:
    # Standardize every feature column: subtract the column mean and divide by
    # the column standard deviation (both reduced over dim 0).
    # NOTE(review): a column whose std is 0 would yield NaN/inf here - confirm
    # that the dataset has no constant feature.
    x = data.x
    x = (x - x.mean(0)) / x.std(0)
    data.x = x
if data.y.dim() == 2:
    # Flatten labels of shape (N, 1) to (N,).
    data.y = data.y.squeeze(1)

split_idx = {'train': data.train_mask, 'valid': data.valid_mask, 'test': data.test_mask}  # train / validation / test split

train_idx = split_idx['train']
valid_idx = split_idx['valid']
test_idx = split_idx['test']

data.to(device)
# Only the training split is moved to the device explicitly; train_idx is rebound here.
train_idx = split_idx['train'].to(device)

# NOTE(review): the folder is prepared under the model name 'mlp' although the
# model defined in this notebook is SAGE - confirm this is intended.
result_dir = prepare_folder(dataset_name,'mlp')

print(data)
print(data.x.shape)  # node features
print(data.y.shape)  # node labels

# Number of input features per node (second dimension of data.x).
nfeats = data.x.shape[1]


  from .autonotebook import tqdm as notebook_tqdm


111
Data(x=[3700550, 20], edge_attr=[4300999], y=[3700550], train_mask=[857899], valid_mask=[183862], test_mask=[183840], adj_t=[3700550, 3700550, nnz=7994520])
torch.Size([3700550, 20])
torch.Size([3700550])


In [2]:
# Show the class of the graph object obtained from dataset[0].
type(data)

torch_geometric.data.data.Data

# 模型结构定义

In [3]:
from typing import Union

from torch import Tensor
from torch_sparse import SparseTensor
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class SAGE(torch.nn.Module):
    """Stack of ``SAGEConv`` layers that outputs per-node log-probabilities.

    Layout: one ``in_channels -> hidden_channels`` convolution, then
    ``num_layers - 2`` hidden-to-hidden convolutions (none when that count is
    not positive), then one ``hidden_channels -> out_channels`` convolution.
    When ``batchnorm`` is true, a ``BatchNorm1d(hidden_channels)`` follows
    every convolution except the last one.

    :param in_channels: size of the input node features
    :param hidden_channels: width of every hidden representation
    :param out_channels: number of output classes
    :param num_layers: requested number of convolutions (at least two are built)
    :param dropout: dropout probability applied after each hidden activation
    :param batchnorm: whether to batch-normalize hidden representations
    """

    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_layers, dropout, batchnorm=True):
        super().__init__()

        # Number of convolutions whose output is a hidden representation.
        n_hidden_out = 1 + max(num_layers - 2, 0)
        widths = [in_channels] + [hidden_channels] * n_hidden_out + [out_channels]

        # convs is registered before bns so parameter ordering stays unchanged.
        self.convs = torch.nn.ModuleList(
            [SAGEConv(src, dst) for src, dst in zip(widths[:-1], widths[1:])]
        )
        self.bns = torch.nn.ModuleList(
            [torch.nn.BatchNorm1d(hidden_channels)
             for _ in range(n_hidden_out if batchnorm else 0)]
        )
        self.batchnorm = batchnorm
        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialize every convolution and, if enabled, every batch norm."""
        layers = list(self.convs)
        if self.batchnorm:
            layers.extend(self.bns)
        for layer in layers:
            layer.reset_parameters()

    def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
        """Return log-softmax class scores for every node.

        :param x: node feature matrix
        :param edge_index: graph connectivity, as an edge index tensor or a
            sparse adjacency
        """
        *hidden_convs, last_conv = self.convs
        h = x
        for layer_no, conv in enumerate(hidden_convs):
            h = conv(h, edge_index)
            if self.batchnorm:
                h = self.bns[layer_no](h)
            h = F.dropout(F.relu(h), p=self.dropout, training=self.training)
        return F.log_softmax(last_conv(h, edge_index), dim=-1)
    


In [4]:
# Hyper-parameters of the GraphSAGE experiment: the optimizer settings
# ('lr', 'weight_decay') plus the keyword arguments given to SAGE(...).
sage_parameters = dict(
    lr=0.01,
    num_layers=2,
    hidden_channels=128,
    dropout=0,
    batchnorm=False,
    weight_decay=5e-7,
)

In [5]:
# para_dict collects everything about the experiment (hyper-parameters, then
# training options, then final metrics). It must be a copy: if it aliased
# sage_parameters, the later para_dict.update(...) calls would add keys such
# as 'epoch' to sage_parameters, and re-running this cell would pass them to
# SAGE(...) and fail with a TypeError.
para_dict = dict(sage_parameters)

# Constructor arguments = hyper-parameters without the optimizer-only settings.
model_para = sage_parameters.copy()
model_para.pop('lr')
model_para.pop('weight_decay')
model = SAGE(in_channels=data.x.size(-1), out_channels=nlabels, **model_para).to(device)

# Evaluate with AUC.
eval_metric = 'auc'
evaluator = Evaluator(eval_metric)


# 训练

In [11]:
# Training-loop options (epochs per run, number of runs, logging interval),
# merged into the shared experiment parameter dict.
addition_para = dict(epoch=500, runs=10, log_steps=10)
para_dict.update(addition_para)

In [7]:
class Logger(object):
    """Collect per-epoch ``(train, valid)`` scores for several runs and report them.

    ``results[run]`` is the list of ``[train_score, valid_score]`` pairs that
    were added for that run, in insertion order. Scores are multiplied by 100
    when they are reported.

    :param runs: number of runs to allocate result slots for
    :param info: optional free-form payload, stored unchanged
    """

    def __init__(self, runs, info=None):
        self.info = info
        self.results = [[] for _ in range(runs)]

    def add_result(self, run, result):
        """Append one ``[train_score, valid_score]`` pair to run ``run``."""
        assert len(result) == 2
        assert run >= 0 and run < len(self.results)
        self.results[run].append(result)

    def print_statistics(self, run=None):
        """Print summary statistics for one run or for all recorded runs.

        :param run: index of the run to summarize; ``None`` summarizes all runs
        :return: ``None`` when ``run`` is given. Otherwise a dict with the
            mean/std over runs (rounded to 4 decimals) of the train score at
            each run's best-validation epoch (``'train'``, ``'train_std'``) and
            of each run's best validation score (``'valid'``, ``'valid_std'``).
        :raises ValueError: if ``run`` is ``None`` and no run has any result
        """
        if run is not None:
            result = 100 * torch.tensor(self.results[run])
            argmax = result[:, 1].argmax().item()
            print(f'Run {run + 1:02d}:')
            print(f'Highest Train: {result[:, 0].max():.2f}')
            print(f'Highest Valid: {result[:, 1].max():.2f}')
            print(f'  Final Train: {result[argmax, 0]:.2f}')
            return None

        # Summarize each run on its own tensor. Stacking all runs into a single
        # tensor would fail as soon as one run is still empty or the runs have
        # a different number of recorded epochs (e.g. interrupted training).
        best_results = []
        for run_results in self.results:
            if not run_results:
                continue  # slot allocated but nothing recorded yet
            r = 100 * torch.tensor(run_results)
            best_epoch = r[:, 1].argmax()
            best_results.append((
                r[:, 0].max().item(),      # highest train score of the run
                r[:, 1].max().item(),      # highest valid score of the run
                r[best_epoch, 0].item(),   # train score at the best-valid epoch
            ))

        if not best_results:
            raise ValueError('no results have been recorded for any run')

        best_result = torch.tensor(best_results)

        # Tensor.std() is the sample standard deviation, so it is NaN when only
        # a single run has results.
        print('All runs:')
        r = best_result[:, 0]
        print(f'Highest Train: {r.mean():.4f} ± {r.std():.4f}')
        r = best_result[:, 1]
        highest_valid, highest_valid_std = r.mean().item(), r.std().item()
        print(f'Highest Valid: {r.mean():.4f} ± {r.std():.4f}')
        r = best_result[:, 2]
        final_train, final_train_std = r.mean().item(), r.std().item()
        print(f'  Final Train: {r.mean():.4f} ± {r.std():.4f}')

        return {'train': round(final_train, 4)
                , 'train_std': round(final_train_std, 4)
                , 'valid': round(highest_valid, 4)
                , 'valid_std': round(highest_valid_std, 4)
               }


In [8]:
# Size the logger from the configured number of runs so that it cannot drift
# from the training loop (add_result asserts that the run index is in range).
# Falls back to 10 when 'runs' has not been configured yet.
logger = Logger(runs=para_dict.get('runs', 10))

In [9]:
def train(model, data, train_idx, optimizer, no_conv=False):
    """Perform a single optimization step on the training nodes.

    :param model: module to train; called as ``model(x)`` when ``no_conv`` is
        true and as ``model(x, adj_t)`` otherwise
    :param data: object exposing ``x``, ``y`` (labels of shape (N,)) and ``adj_t``
    :param train_idx: index selecting the training nodes
    :param optimizer: optimizer holding the model parameters
    :param no_conv: if true, feed only the training rows of ``data.x`` to the
        model; otherwise run on the whole graph and select the training rows
        from the output
    :return: the loss of this step as a Python float
    """
    model.train()
    optimizer.zero_grad()

    log_probs = (
        model(data.x[train_idx])
        if no_conv
        else model(data.x, data.adj_t)[train_idx]
    )
    targets = data.y[train_idx]

    step_loss = F.nll_loss(log_probs, targets)
    step_loss.backward()
    optimizer.step()
    return step_loss.item()


@torch.no_grad()
def test(model, data, split_idx, evaluator, no_conv=False):
    """Evaluate the model on the 'train' and 'valid' splits without gradients.

    Reads the module-level ``eval_metric`` to pick the score out of the
    evaluator's result.

    :param model: module to evaluate; called as ``model(x)`` when ``no_conv``
        is true and as ``model(x, adj_t)`` otherwise
    :param data: object exposing ``x``, ``y`` (labels of shape (N,)) and ``adj_t``
    :param split_idx: mapping with at least the keys 'train' and 'valid'
    :param evaluator: object whose ``eval(y_true, y_pred)`` result is indexed
        by ``eval_metric``
    :param no_conv: whether the model takes node features only
    :return: ``(eval_results, losses, y_pred)`` - two dicts keyed by split name
        and the exponentiated model output for all nodes
    """
    model.eval()

    out = model(data.x) if no_conv else model(data.x, data.adj_t)
    # The model output is exponentiated to obtain the prediction matrix,
    # shape (N, num_classes).
    y_pred = out.exp()

    losses = {}
    eval_results = {}
    for split in ('train', 'valid'):
        nodes = split_idx[split]
        labels = data.y[nodes]
        losses[split] = F.nll_loss(out[nodes], labels).item()
        eval_results[split] = evaluator.eval(labels, y_pred[nodes])[eval_metric]

    return eval_results, losses, y_pred

In [None]:
import copy
import gc

best_model_dict = None
# torch.save does not create missing directories.
os.makedirs(save_dir, exist_ok=True)

for run in range(para_dict['runs']):
    gc.collect()
    print(sum(p.numel() for p in model.parameters()))

    # Every run starts from freshly initialized weights and a new optimizer.
    model.reset_parameters()
    optimizer = torch.optim.Adam(model.parameters(), lr=para_dict['lr'], weight_decay=para_dict['weight_decay'])
    best_valid = 0          # validation score at the checkpointed epoch
    min_valid_loss = 1e8
    best_out = None
    # Reset per run so a checkpoint of an earlier run can never be saved
    # under this run's name.
    best_model_dict = None

    for epoch in range(1, para_dict['epoch'] + 1):
        loss = train(model, data, train_idx, optimizer)
        eval_results, losses, out = test(model, data, split_idx, evaluator)
        train_eval, valid_eval = eval_results['train'], eval_results['valid']
        train_loss, valid_loss = losses['train'], losses['valid']

        # Model selection is driven by the validation loss.
        if valid_loss < min_valid_loss:
            min_valid_loss = valid_loss
            best_valid = valid_eval
            best_out = out.cpu()
            # state_dict() hands back references to the live parameters, which
            # keep changing with every optimizer step. Snapshot them, otherwise
            # the "best" checkpoint silently becomes the last-epoch weights.
            best_model_dict = copy.deepcopy(model.state_dict())
        if epoch % para_dict['log_steps'] == 0:
            print(f'Run: {run + 1:02d}, '
                  f'Epoch: {epoch:02d}, '
                  f'Loss: {loss:.4f}, '
                  f'Train: {100 * train_eval:.3f}%, '
                  f'Valid: {100 * valid_eval:.3f}% ')
        logger.add_result(run, [train_eval, valid_eval])

    logger.print_statistics(run)
    if best_model_dict is None:
        # Happens only if the validation loss never dropped below the initial
        # bound (e.g. it was NaN throughout) or no epoch was run.
        print(f'Run {run + 1:02d}: no checkpoint to save')
        continue
    # The score in the file name is the validation score of the epoch whose
    # weights are stored, not the score of the last epoch.
    torch.save(best_model_dict, save_dir + f'epoch500-SAGE{run}-{100 * best_valid:.3f}.pt', _use_new_zipfile_serialization=False)

In [None]:
# Summarize all runs (called without a run index, print_statistics returns a
# dict of rounded statistics) and store the summary with the other parameters.
final_results = logger.print_statistics()
print('final_results:', final_results)
para_dict.update(final_results)

In [11]:
## 生成 main.py 时请勾选此 cell
from utils import DGraphFin
from utils.evaluator import Evaluator
import torch
import torch.nn.functional as F
import torch.nn as nn
import torch_geometric.transforms as T
from torch_geometric.data import Data
import numpy as np
import os

# Only classes 0 and 1 need to be predicted in this experiment.
nlabels = 2

# Keyword arguments passed to SAGE(...) when the prediction model is built.
sage_parameters = dict(
    num_layers=2,
    hidden_channels=128,
    dropout=0,
    batchnorm=False,
)
from typing import Union

from torch import Tensor
from torch_sparse import SparseTensor
import torch
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv

class SAGE(torch.nn.Module):
    """Stack of ``SAGEConv`` layers that outputs per-node log-probabilities.

    Layout: one ``in_channels -> hidden_channels`` convolution, then
    ``num_layers - 2`` hidden-to-hidden convolutions (none when that count is
    not positive), then one ``hidden_channels -> out_channels`` convolution.
    When ``batchnorm`` is true, a ``BatchNorm1d(hidden_channels)`` follows
    every convolution except the last one.

    :param in_channels: size of the input node features
    :param hidden_channels: width of every hidden representation
    :param out_channels: number of output classes
    :param num_layers: requested number of convolutions (at least two are built)
    :param dropout: dropout probability applied after each hidden activation
    :param batchnorm: whether to batch-normalize hidden representations
    """

    def __init__(self, in_channels, hidden_channels, out_channels,
                 num_layers, dropout, batchnorm=True):
        super().__init__()

        # Number of convolutions whose output is a hidden representation.
        n_hidden_out = 1 + max(num_layers - 2, 0)
        widths = [in_channels] + [hidden_channels] * n_hidden_out + [out_channels]

        # convs is registered before bns so parameter ordering stays unchanged.
        self.convs = torch.nn.ModuleList(
            [SAGEConv(src, dst) for src, dst in zip(widths[:-1], widths[1:])]
        )
        self.bns = torch.nn.ModuleList(
            [torch.nn.BatchNorm1d(hidden_channels)
             for _ in range(n_hidden_out if batchnorm else 0)]
        )
        self.batchnorm = batchnorm
        self.dropout = dropout

    def reset_parameters(self):
        """Re-initialize every convolution and, if enabled, every batch norm."""
        layers = list(self.convs)
        if self.batchnorm:
            layers.extend(self.bns)
        for layer in layers:
            layer.reset_parameters()

    def forward(self, x, edge_index: Union[Tensor, SparseTensor]):
        """Return log-softmax class scores for every node.

        :param x: node feature matrix
        :param edge_index: graph connectivity, as an edge index tensor or a
            sparse adjacency
        """
        *hidden_convs, last_conv = self.convs
        h = x
        for layer_no, conv in enumerate(hidden_convs):
            h = conv(h, edge_index)
            if self.batchnorm:
                h = self.bns[layer_no](h)
            h = F.dropout(F.relu(h), p=self.dropout, training=self.training)
        return F.log_softmax(last_conv(h, edge_index), dim=-1)
    
# Module-level caches filled on the first call to predict().
predict_model = None
y_pred = None


def predict(data, node_id):
    """Load the trained model and return the predicted class probabilities.

    The model is built and its weights are loaded on the first call only; the
    prediction matrix for all nodes is likewise computed once and cached.
    The cache is not keyed on ``data``: later calls reuse the predictions that
    were computed from the ``data`` of the first call.

    :param data: graph object providing ``x`` (node features) and ``adj_t``
    :param node_id: int, index of the node to predict
    :return: tensor, the entry of the cached probability matrix selected by
        ``node_id`` (probabilities of class 0 and class 1)
    """
    global predict_model
    global y_pred
    if predict_model is None:
        model = SAGE(in_channels=data.x.size(-1), out_channels=nlabels, **sage_parameters)
        # The model above lives on the CPU, so map the stored tensors there as
        # well; this also lets a checkpoint written on a GPU be loaded.
        state = torch.load('./results/epoch500-SAGE1-76.654.pt', map_location='cpu')
        model.load_state_dict(state)
        # Publish the model only after the weights were loaded successfully, so
        # a failed load cannot leave a randomly initialized model in the cache.
        predict_model = model

    predict_model.eval()
    if y_pred is None:
        # Inference only: skip building the autograd graph for the whole graph.
        with torch.no_grad():
            out = predict_model(data.x, data.adj_t)
            # The model returns log-probabilities; exponentiate to get probabilities.
            y_pred = out.exp()

    return y_pred[node_id]