# train & valid & test

In [1]:
import torch
import numpy as np
import random
import os
from sklearn.model_selection import KFold
import itertools

In [2]:
# ---- hyper-parameters --------------------------------------------------
kfolds = 5          # number of cross-validation folds
train_ratio = 0.8   # fraction of samples kept for training (rest is test)
seed = 1            # seed shared by every random number generator below
no_cuda = False     # set to True to force CPU even when CUDA is available

# ---- reproducibility ---------------------------------------------------
os.environ['PYTHONHASHSEED'] = str(seed)
# Seed Python, NumPy and torch (CPU + current GPU + all GPUs) with the same value.
for _seeder in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seeder(seed)
# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

In [3]:
def GIP_sim(matrix):
    """Return a Gaussian-kernel similarity between the rows of ``matrix``.

    The input is cast to float. Entry ``(i, j)`` of the result is
    ``exp(-||row_i - row_j||^2 * gamma)`` where ``gamma`` is the reciprocal
    of the mean squared row norm. The result is square, with one row and
    one column per input row.
    """
    profiles = matrix.float()
    gram = profiles @ profiles.T
    sq_norm = (profiles * profiles).sum(dim=1, keepdim=True)
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b
    sq_dist = sq_norm + sq_norm.T - 2 * gram
    # Bandwidth: inverse of the average squared row norm (diagonal of the Gram matrix).
    gamma = 1 / torch.diag(gram).mean()
    return torch.exp(-1 * sq_dist * gamma)
def Functional_sim(ass,sim,device,batch=1): #(a,b) #(b,b)
    """Compute a pairwise functional similarity between the rows of ``ass``.

    For every unordered pair of rows ``(i, j)`` the similarity matrix ``sim``
    is masked by the two association rows. The best match along each axis
    is summed, and the total is divided by the number of associations of
    both rows (best-match-average style).

    Args:
        ass: association matrix, shape ``(a, b)``.
        sim: similarity between the ``b`` column entities, shape ``(b, b)``.
        device: torch device (or device string) used for the computation.
        batch: number of row pairs processed per step; must be >= 1.

    Returns:
        A symmetric ``(a, a)`` CPU tensor with ones on the diagonal. Pairs
        whose score is NaN/inf (e.g. both rows have no associations) get 0.

    Raises:
        ValueError: if ``batch`` is smaller than 1.
    """
    if batch < 1:
        raise ValueError("batch must be a positive integer, got %r" % (batch,))
    s1=ass.shape[0]      #a
    ass=ass.to(device)
    sim=sim.to(device)
    sim_m=torch.zeros(s1,s1).to(device)  #(a,a)
    # Strict upper-triangle indices, i.e. every unordered row pair (i < j),
    # in the same row-major order itertools.combinations would give.
    pair_rows,pair_cols=torch.triu_indices(s1,s1,offset=1,device=device)
    n_pairs=pair_rows.shape[0]
    # A single strided loop covers full batches and the final partial batch.
    # It also works when there are fewer pairs than ``batch``.
    for start in range(0,n_pairs,batch):
        idx1=pair_rows[start:start+batch]
        idx2=pair_cols[start:start+batch]
        m1=ass[idx1,:]
        m2=ass[idx2,:]
        sim1=m1[:,:,None]*sim*m2[:,None,:]  # (batch,b,b)
        best=sim1.max(dim=1)[0].sum(dim=-1)+sim1.max(dim=2)[0].sum(dim=-1)
        sim_m[idx1,idx2]=best/(m1.sum(dim=-1)+m2.sum(dim=-1))
    # Division by zero (rows without associations) yields inf/NaN: treat as 0.
    sim_m=torch.where(torch.isinf(sim_m),torch.zeros_like(sim_m),sim_m)
    sim_m=torch.where(torch.isnan(sim_m),torch.zeros_like(sim_m),sim_m)
    # Mirror the upper triangle and set self-similarity to 1.
    return (sim_m+sim_m.T+torch.eye(s1).to(device)).cpu()

In [4]:
# Containers filled by this script and written to disk with torch.save.
train_set, test_set, common_set = {}, {}, {}
# Use the GPU only when it exists and has not been disabled via `no_cuda`.
device = torch.device("cpu" if (no_cuda or not torch.cuda.is_available()) else "cuda")

# ---- load raw matrices -------------------------------------------------
md, mm, ml, dd, dl, ll = (
    np.load(path)
    for path in (
        '../miRNA_disease.npy',
        '../miRNA_miRNA.npy',
        '../miRNA_lncRNA.npy',
        '../disease_disease.npy',
        '../disease_lncRNA.npy',
        '../lncRNA_lncRNA.npy',
    )
)

# Association matrices are stored as long, similarity matrices as float.
common_set.update({
    'md': torch.tensor(md).long(),
    'mm_seq': torch.tensor(mm).float(),
    'ml': torch.tensor(ml).long(),
    'dd_sem': torch.tensor(dd).float(),
    'dl': torch.tensor(dl).long(),
    'll_seq': torch.tensor(ll).float(),
})

# ---- Gaussian-kernel similarities from the lncRNA association matrices --
# Transposing an association matrix switches which side is compared.
common_set.update({
    'mm_mlG': GIP_sim(common_set['ml']).float(),
    'dd_dlG': GIP_sim(common_set['dl']).float(),
    'll_lmG': GIP_sim(common_set['ml'].T).float(),
    'll_ldG': GIP_sim(common_set['dl'].T).float(),
})

# ---- functional similarities (batched on `device`) ---------------------
common_set.update({
    'mm_mlF': Functional_sim(common_set['ml'], common_set['ll_seq'], device, 128).float(),
    'dd_dlF': Functional_sim(common_set['dl'], common_set['ll_seq'], device, 128).float(),
    'll_ldF': Functional_sim(common_set['dl'].T, common_set['dd_sem'], device, 64).float(),
    'll_lmF': Functional_sim(common_set['ml'].T, common_set['mm_seq'], device, 128).float(),
})

torch.save(common_set, './common_set.pkl')

In [5]:
# Train / test split of the (miRNA, disease) index pairs, followed by
# K-fold index generation over the training part.
def _shuffle_split(pairs, ratio):
    """Shuffle the rows of `pairs` and cut them at `ratio`.

    Returns (shuffled, head, tail), where head holds the first
    int(len * ratio) shuffled rows and tail the remaining ones.
    """
    pairs = pairs[np.random.permutation(pairs.shape[0]), :]
    cut = int(pairs.shape[0] * ratio)
    return pairs, pairs[:cut, :], pairs[cut:, :]


# Positive samples: entries of md equal to 1.
pos_x, pos_y = np.where(md == 1)
pos_xy = np.column_stack([pos_x, pos_y])  # (n_pos, 2); original note: (23337, 2)
pos_xy, train_pos_xy, test_pos_xy = _shuffle_split(pos_xy, train_ratio)

# Negative samples: entries of md equal to 0.
neg_x, neg_y = np.where(md == 0)
neg_xy = np.column_stack([neg_x, neg_y])  # (n_neg, 2); original note: (2562528, 2)
neg_xy, train_neg_xy, test_neg_xy = _shuffle_split(neg_xy, train_ratio)

# Training pairs are shuffled so positives and negatives are interleaved.
train_xy = np.concatenate([train_pos_xy, train_neg_xy], axis=0)
train_label = np.concatenate(
    [np.ones(train_pos_xy.shape[0]), np.zeros(train_neg_xy.shape[0])], axis=0)
train_rd = np.random.permutation(train_xy.shape[0])
train_xy, train_label = train_xy[train_rd, :], train_label[train_rd]

# Test pairs keep the order: all positives first, then all negatives.
test_xy = np.concatenate([test_pos_xy, test_neg_xy], axis=0)
test_label = np.concatenate(
    [np.ones(test_pos_xy.shape[0]), np.zeros(test_neg_xy.shape[0])], axis=0)

# Use the shared `seed` rather than a hard-coded constant so that changing
# the hyper-parameter also changes the fold assignment.
kf = KFold(n_splits=kfolds, shuffle=True, random_state=seed)
train_idx, valid_idx = [], []
for train_index, valid_index in kf.split(train_xy):
    train_idx.append(train_index)
    valid_idx.append(valid_index)

# Association matrix used at test time: only the *training* positives are
# written into it, so the held-out test pairs are not present in it.
test_md = np.zeros(md.shape)
test_md[tuple(train_pos_xy.T)] = 1

test_set['edge'] = torch.tensor(test_xy).long()
test_set['label'] = torch.tensor(test_label).long()
test_set['md'] = torch.tensor(test_md).long()

# Similarities derived from the masked association matrix
# (rows as-is, then transposed for the other side).
test_set.update({
    'mm_mdG': GIP_sim(test_set['md']).float(),
    'dd_dmG': GIP_sim(test_set['md'].T).float(),
    'mm_mdF': Functional_sim(test_set['md'], common_set['dd_sem'], device, 64).float(),
    'dd_dmF': Functional_sim(test_set['md'].T, common_set['mm_seq'], device, 64).float(),
})

torch.save(test_set, './test_set.pkl')
print('test_set saved')

# Build one bundle of edges, labels and similarity matrices per CV fold.
for k in range(kfolds):
    fold_train_rows, fold_valid_rows = train_idx[k], valid_idx[k]
    xy_train, label_train = train_xy[fold_train_rows, :], train_label[fold_train_rows]
    xy_valid, label_valid = train_xy[fold_valid_rows, :], train_label[fold_valid_rows]

    # Association matrix restricted to this fold's training pairs: each
    # training pair gets its label, every other cell stays 0.
    train_md = np.zeros(md.shape)
    train_md[tuple(xy_train.T)] = label_train

    train_set['edge_train_%d' % k] = torch.tensor(xy_train).long()
    train_set['label_train_%d' % k] = torch.tensor(label_train).long()
    train_set['edge_valid_%d' % k] = torch.tensor(xy_valid).long()
    train_set['label_valid_%d' % k] = torch.tensor(label_valid).long()

    fold_md = torch.tensor(train_md).long()
    train_set['md_%d' % k] = fold_md
    # Similarities recomputed from the fold-specific association matrix.
    train_set['mm_mdG_%d' % k] = GIP_sim(fold_md).float()
    train_set['dd_dmG_%d' % k] = GIP_sim(fold_md.T).float()
    train_set['mm_mdF_%d' % k] = Functional_sim(fold_md, common_set['dd_sem'], device, 64).float()
    train_set['dd_dmF_%d' % k] = Functional_sim(fold_md.T, common_set['mm_seq'], device, 64).float()

torch.save(train_set, './train_set.pkl')
print('train_set saved')