# MPNN训练 Lipo数据集

## 添加包环境路径

In [1]:
import sys
sys.path.append('/Users/yuekong/Desktop/Github/Kinase-Transformer-GNN')
from molecular_network.mol_dataset import Dataset
from molecular_network.util import modelEvaluator

In [2]:
import numpy as np
import pandas as pd 
import torch
import torch.nn.functional as F
from torch.nn import Sequential, Linear, ReLU, GRU

import torch_geometric.transforms as T
from torch_geometric.nn import NNConv, Set2Set
from torch_geometric.data import DataLoader
from torch_geometric.utils import remove_self_loops
from molecular_network.util import modelEvaluator

# 文件路径定义

In [3]:
dataset_name = 'Lipo'
root = '/Users/yuekong/Desktop/Github/Kinase-Transformer-GNN/data/%s'%(dataset_name)
path_ckpt = '../checkpoints/%s.ckpt'%(dataset_name)
path_prds = '../prds_saved/%s.pkl'%(dataset_name)

# 模型超参

In [4]:
dim = 64

# 载入数据集

In [5]:
dataset = Dataset(root,'Lipophilicity.csv').shuffle()
dataset.data.y.shape

torch.Size([4200])

# 归一化数据集

In [6]:
# Normalize targets to mean = 0 and std = 1.
mean = dataset.data.y.mean(dim=0, keepdim=True)
std = dataset.data.y.std(dim=0, keepdim=True)
dataset.data.y = (dataset.data.y - mean) / std ## 只考虑第1个性质
length = dataset.data.y.shape[0]
print('数据集含%i个化合物'%(length))

数据集含4200个化合物


In [7]:
dataset.data

Data(edge_attr=[247798, 5], edge_index=[2, 247798], x=[113565, 31], y=[4200])

# 划分训练集

In [8]:
# 划分比例 tr: val : te = 8:1:1 
split = length//10
print('test set 含 %i 样本'%(split))

test set 含 420 样本


In [9]:
# Split datasets.

test_dataset = dataset[:split]
val_dataset = dataset[split:2*split]
train_dataset = dataset[2*split:]
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
print('------loader loaded------')


------loader loaded------


# 定义模型

In [10]:
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.lin0 = torch.nn.Linear(dataset.num_features, dim)

        # 这个nn是每个节点在message中用到的网络,5对应边特征数 
        # 注：此脚本中transform部分把edge_attr变成了5，而不是本来的4
        nn = Sequential(Linear(5, 128), ReLU(), Linear(128, dim * dim))
        self.conv = NNConv(dim, dim, nn, aggr='mean')
        self.gru = GRU(dim, dim)

        self.set2set = Set2Set(dim, processing_steps=3)
        self.lin1 = torch.nn.Linear(2 * dim, dim)
        self.lin2 = torch.nn.Linear(dim, 1)

    def forward(self, data):
        out = F.relu(self.lin0(data.x))
        h = out.unsqueeze(0)

        for i in range(3):
            m = F.relu(self.conv(out, data.edge_index, data.edge_attr))
            out, h = self.gru(m.unsqueeze(0), h)
            out = out.squeeze(0)

        out = self.set2set(out, data.batch)
        out = F.relu(self.lin1(out))
        out = self.lin2(out)
        return out.view(-1)

# 训练前

In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Net().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                                       factor=0.7, patience=5,
                                                       min_lr=0.00001)

# 定义训练+测试

In [12]:
def train(epoch):
    model.train()
    loss_all = 0

    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
     
        x = model(data)
        x = x.to(torch.float64)
        loss = F.mse_loss(x, data.y)
        loss.backward()
        loss_all += loss.item() * data.num_graphs
        optimizer.step()
    return loss_all / len(train_loader.dataset)


def test(loader):
    test_prds = []
    model.eval()
    error = 0

    for data in loader:
        data = data.to(device)
        y_prd = model(data)
        error += (y_prd * std - data.y * std).abs().sum().item()  # MAE
        test_prds.append(y_prd)
    test_prds = torch.cat(test_prds)

    return error / len(loader.dataset), test_prds # 把预测的结果保存出来

# 训练！

In [18]:
best_val_error = None
for epoch in range(1, 51):
    print('-----------training begin-------------')
    lr = scheduler.optimizer.param_groups[0]['lr']
    loss = train(epoch)
    val_error, val_prds = test(val_loader)
    scheduler.step(val_error)

    if best_val_error is None or val_error <= best_val_error:
        test_error, test_prds = test(test_loader)
        best_val_error = val_error

    print('Epoch: {:03d}, LR: {:7f}, Loss: {:.7f}, Validation MAE: {:.7f}, '
          'Test MAE: {:.7f}'.format(epoch, lr, loss, val_error, test_error))
          
# 保存模型
torch.save(model.state_dict(), path_ckpt)      

-----------training begin-------------
Epoch: 001, LR: 0.001000, Loss: 0.1343046, Validation MAE: 0.3302059, Test MAE: 0.3498425
-----------training begin-------------
Epoch: 002, LR: 0.001000, Loss: 0.1263561, Validation MAE: 0.3775732, Test MAE: 0.3498425
-----------training begin-------------
Epoch: 003, LR: 0.001000, Loss: 0.1111784, Validation MAE: 0.3278644, Test MAE: 0.3549231
-----------training begin-------------
Epoch: 004, LR: 0.001000, Loss: 0.0989907, Validation MAE: 0.3064264, Test MAE: 0.3283437
-----------training begin-------------
Epoch: 005, LR: 0.001000, Loss: 0.0949764, Validation MAE: 0.3202582, Test MAE: 0.3283437
-----------training begin-------------
Epoch: 006, LR: 0.001000, Loss: 0.0813118, Validation MAE: 0.3127012, Test MAE: 0.3283437
-----------training begin-------------
Epoch: 007, LR: 0.001000, Loss: 0.0758711, Validation MAE: 0.3191572, Test MAE: 0.3283437
-----------training begin-------------
Epoch: 008, LR: 0.001000, Loss: 0.0783347, Validation MAE:

KeyboardInterrupt: 

# 加载模型（中断时，重载模型，再运行额外epoch）

In [14]:
# 加载
model.load_state_dict(torch.load(path_ckpt))
# 测试时不启用 BatchNormalization 和 Dropout
#model.eval()

<All keys matched successfully>

### 注意：奇怪！   
原因是：test_dataset是一个包含了很多Data类型的Dataset类型（molecular_network.mol_dataset.dataset.Dataset）
此类型若直接调用XX.data 则得到原始Dataset的总data
解决方法，遍历test_dataset,获取每个y值，再合并，再转化成numpy类型

#  保存测试集真实值+预测值

In [20]:

# 读取预测值
test_prds = test_prds.detach().numpy() # 转成numpy

# 读取真实值
test_ys = [test_dataset[i].y for i in range(len(test_dataset))]
test_true = torch.cat(test_ys).numpy()

# 保存真实值和预测值文件
df = pd.DataFrame()
df['test_true'] = test_true
df['test_pred'] = test_prds

df.to_pickle(path_prds)

# 计算model performance

In [22]:
pfm = modelEvaluator(test_true,test_prds)
df = pd.DataFrame.from_dict(pfm.get_performance(),index=[0])

df.to_csv(path_prds)

model kind: rgs
performance dict: {'r2': 0.833, 'rmse': 0.393, 'mae': 0.277}


ValueError: If using all scalar values, you must pass an index

In [23]:
ll = pfm.get_performance()

model kind: rgs
performance dict: {'r2': 0.833, 'rmse': 0.393, 'mae': 0.277}


In [24]:
ll

{'r2': 0.833, 'rmse': 0.393, 'mae': 0.277}

In [39]:
df = pd.DataFrame(ll,index=[0]) # 若不指定index么，会报错

In [40]:
df

Unnamed: 0,r2,rmse,mae
0,0.833,0.393,0.277
