In [1]:
import pickle
import dgl
import ast
from model import MCDHGN
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
from dgl.nn.pytorch import GATConv
import logging 
import datetime
from model import MySampler
from sklearn.metrics import f1_score
from torch.utils.data import DataLoader
from utils import EarlyStopping, setup_seed, generate_traning_batch,HeteroDotProductPredictor
from sklearn.metrics import roc_auc_score, average_precision_score, accuracy_score,f1_score

In [2]:

# Seed every random number generator the notebook relies on (PyTorch CPU,
# PyTorch CUDA, NumPy, Python's `random`, and DGL) with the same value.
for seed_fn in (
    torch.manual_seed,
    torch.cuda.manual_seed_all,
    np.random.seed,
    random.seed,
    dgl.random.seed,
):
    seed_fn(42)
# Request deterministic cuDNN kernels.
torch.backends.cudnn.deterministic = True

In [3]:
# Meta-paths used by the sampler; each entry is a list of names
# (presumably edge types of the heterogeneous graph -- confirm against the graph schema).
meta_paths = [
    ['gg'],
    ['ga', 'ag'],
    ['gb', 'bg'],
    ['gc', 'cg'],
    ['gd', 'dg'],
    ['ge', 'eg'],
    ['gf', 'fg'],
    ['gh', 'hg'],
    ['gi', 'ig'],
]

# Load the saved graph and take the 'Gene' node features.
graph_path = './data/network/hetero/new_9nodes_graph.bin'
graphs, _ = dgl.data.utils.load_graphs(graph_path)
g = graphs[0]
features = g.ndata['feature']['Gene']


def _load_pickle(path):
    """Unpickle and return the object stored in the file at ``path``."""
    # NOTE(review): pickle.load can execute arbitrary code; only read trusted files.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# For reproducibility, the random seeds are fixed AND the intermediate
# random-walk sampling results and the train/validation/test split are loaded
# from disk instead of being regenerated.
# To draw a fresh random-walk sample instead, run:
# gene_ids = g.nodes('Gene')
# my_sampler = MySampler(g, meta_paths, 128)
# _,gs = my_sampler.sample_blocks(gene_ids)
# print(gs)
# NOTE(review): 9 is hard-coded; it equals len(meta_paths) here -- confirm the two are meant to track each other.
gs = [
    _load_pickle('./Intermediate/blocks/{}gs.bin'.format(block_idx))
    for block_idx in range(9)
]

# To generate a different train/validation/test split instead, run:
# postivefile = './data/label/now_pos427.pkl'
# negtivefile = './data/label/now_neg427.pkl'
# train_batch,test_mask,test_label = generate_traning_batch(postivefile,negtivefile)
test_mask = torch.load('./Intermediate/label_set/mydatatest_mask.pt')
test_label = torch.load('./Intermediate/label_set/mydatatest_label.pt')
train_batch = _load_pickle('./Intermediate/label_set/train_batch.pkl')

In [4]:
# Pick the compute device: the first CUDA GPU when one is available, else the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
    print(f'There are {torch.cuda.device_count()} GPU(s) available.')
    print('Device name:', torch.cuda.get_device_name(device))
else:
    # Fix: `device` used to be left undefined on this branch, so the
    # `.to(device)` calls below raised NameError on CPU-only machines.
    device = torch.device('cpu')
    print('No GPU available, using CPU instead.')
# Move every pre-sampled block and the feature tensor onto the chosen device.
gs1 = [block.to(device) for block in gs]
features = features.to(device)

There are 2 GPU(s) available.
Device name: Tesla V100-SXM2-32GB


In [5]:
def evaluate(g, model, mask, label):
    """Score ``model`` on the rows of its prediction selected by ``mask``.

    Switches the model to eval mode, runs one gradient-free forward pass on
    ``g`` and the module-level ``features`` tensor, and compares the masked
    predictions against ``label``.

    Args:
        g: Graph input forwarded unchanged to ``model`` (the notebook passes
            the list of pre-sampled blocks).
        model: Module whose call returns a 4-tuple ``(_, pred, alpha, beta)``;
            ``pred`` is treated as two-column class logits.
        mask: Index or mask used as ``pred[mask]``.
        label: Tensor of class indices for the selected rows (class 1 is the
            positive class for AUC/AUPR).

    Returns:
        Tuple ``(val_loss, acc, auc, aupr, pred_cpu, alpha, beta, micro_f1,
        macro_f1)`` where ``val_loss`` is the unweighted NLL loss tensor,
        ``pred_cpu`` is the full prediction moved to the CPU, and ``alpha`` /
        ``beta`` are passed through from the model unchanged.

    Note:
        The model is left in eval mode; callers must call ``model.train()``
        before resuming training.
    """
    model.eval()
    with torch.no_grad():
        _, pred, alpha, beta = model(g, features)
        log_probs = F.log_softmax(pred[mask], dim=1)
        val_loss = F.nll_loss(log_probs, label)

        log_probs_np = log_probs.cpu().numpy()
        label = label.cpu().numpy()

        predicted_labels = np.argmax(log_probs_np, axis=1)
        acc = accuracy_score(label, predicted_labels)
        micro_f1 = f1_score(label, predicted_labels, average='micro')
        macro_f1 = f1_score(label, predicted_labels, average='macro')

        # Fix: rank samples by the softmax probability of class 1. The previous
        # sigmoid(pred[:, 1]) ignored the class-0 logit, so the ranking used for
        # AUC/AUPR could disagree with the argmax decision used for the other
        # metrics and with the softmax-based training loss.
        positive_score = np.exp(log_probs_np[:, 1])
        auc = roc_auc_score(label, positive_score)
        aupr = average_precision_score(label, positive_score)
    return val_loss, acc, auc, aupr, pred.cpu(), alpha, beta, micro_f1, macro_f1
def train(g, model, train_mask, label, epochs):
    """Optimise ``model`` with Adam on the rows selected by ``train_mask``.

    Each step runs a forward pass on ``g`` and the module-level ``features``
    tensor, computes the NLL loss of the log-softmaxed masked predictions
    against ``label``, clips the gradient norm to 1.0 and applies one Adam
    update (lr=0.001, weight_decay=0.0001). Nothing is returned; the model is
    updated in place.

    Args:
        g: Graph input forwarded unchanged to ``model``.
        model: Module whose call returns a 4-tuple with the predictions in
            position 1.
        train_mask: Index or mask used as ``pred[train_mask]``.
        label: Tensor of class indices for the selected rows.
        epochs: Loop bound; the loop runs ``epochs + 1`` times.
    """
    model.train()
    # NOTE(review): constructed but never used below; kept so that any
    # construction side effects of the helper stay unchanged.
    link_pred = HeteroDotProductPredictor()
    adam = torch.optim.Adam(
        model.parameters(),
        lr=0.001,
        weight_decay=0.0001,
    )
    # NOTE(review): range(epochs + 1) performs one more step than `epochs` -- confirm this is intended.
    for _ in range(epochs + 1):
        _, logits, _, _ = model(g, features)
        adam.zero_grad()
        log_probs = F.log_softmax(logits[train_mask], dim=1)
        nll = F.nll_loss(log_probs, label)
        nll.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        adam.step()
       

In [6]:
# Per-class weights passed to the training NLL loss (class index 1 is weighted 1.904).
loss_weight = torch.tensor([1.0, 1.904]).to(device)
logging.basicConfig(filename='train_and_val2024.log', level=logging.INFO)
test_label = test_label.to(device)
start_time = datetime.datetime.now()
logging.info(f'Current datetime: {start_time}')

# Best validation value of each metric, one entry per element of train_batch.
auc = []
aupr = []
acc = []
micro = []
macro = []

for i, batch in enumerate(train_batch):
    # The split is fixed for the whole fold, so unpack it and move it to the
    # device once here instead of repeating the transfer on every epoch.
    train_mask, val_mask, train_label, val_label = (t.to(device) for t in batch)

    # NOTE(review): `cnt` counts evaluations since the last AUC/AUPR improvement
    # but is never used to stop training; every fold runs the full 1000 epochs.
    cnt = 0
    best_val_auc = 0
    best_val_aupr = 0
    best_val_acc = 0
    best_val_micro_f1 = 0
    best_val_macro_f1 = 0

    # A fresh model and optimizer are created for every fold.
    model = MCDHGN(
        num_meta_paths=len(gs1),
        in_size=features.shape[1],
        hidden_size=256,
        out_size=2,
        num_heads=[4],
        dropout=0.4,
    ).to(device)
    optimizer = torch.optim.Adam(
        model.parameters(), lr=0.01, weight_decay=0.0001)

    for epoch in range(1000):
        # evaluate() leaves the model in eval mode, so re-enable training mode each epoch.
        model.train()
        _, pred, _, _ = model(gs1, features)
        output = F.log_softmax(pred[train_mask], dim=1)
        loss = F.nll_loss(output, train_label, weight=loss_weight)
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        # The per-epoch GPU->CPU copies of `output` and `train_label` were never
        # read and forced a device sync each step; they have been removed.

        # Validate every 10th epoch and keep the best value of each metric
        # independently (the maxima may come from different epochs).
        if epoch % 10 == 0:
            cnt += 1
            val_loss, val_acc, val_auc, val_aupr, _, _, _, val_micro, val_macro = evaluate(
                gs1, model, val_mask, val_label)
            if val_auc > best_val_auc:
                best_val_auc = val_auc
                cnt = 0
            if val_aupr > best_val_aupr:
                best_val_aupr = val_aupr
                cnt = 0
            best_val_acc = max(best_val_acc, val_acc)
            best_val_micro_f1 = max(best_val_micro_f1, val_micro)
            best_val_macro_f1 = max(best_val_macro_f1, val_macro)

    # Elapsed time is measured from `start_time`, which is set once before the
    # fold loop, so the logged duration is cumulative across folds.
    end_time = datetime.datetime.now()
    elapsed_time = end_time - start_time
    hours, remainder = divmod(elapsed_time.total_seconds(), 3600)
    minutes, seconds = divmod(remainder, 60)
    logging.info(f"{i}fold程序运行时间：{int(hours)}小时{int(minutes)}分钟{int(seconds)}秒")
    logging.info(f"auc{best_val_auc},aupr{best_val_aupr}")
    auc.append(best_val_auc)
    aupr.append(best_val_aupr)
    acc.append(best_val_acc)
    macro.append(best_val_macro_f1)
    micro.append(best_val_micro_f1)


In [None]:
# Log the mean and standard deviation across folds of each best-validation metric.
for metric_name, fold_values in (
    ('auc', auc),
    ('aupr', aupr),
    ('acc', acc),
    ('micro_f1', micro),
    ('macro_f1', macro),
):
    logging.info(f"mean: {metric_name}{np.mean(fold_values)},+- {np.std(fold_values)}")