In [2]:
import pickle
import numpy as np
import torch 
import dgl
import csv

In [3]:
# --- input directories -------------------------------------------------------
# Directory holding the gene-level pickles (gene_toids.pkl is read from here).
gene_path = '../data/step1/'
# Directory holding the per-collection pickles c1..c8 (c{}_gene_cpdb.pkl, c{}_feature.pkl).
Msig_path = '../data/msigdb/Msig_gene/cpdb/'

# --- output directories ------------------------------------------------------
# One <node type>.pkl per node type, mapping node name -> integer ID.
name_toids_path = '../data/network/hetero/name_to_ids/'
# One <node type>.pkl per node type, mapping integer ID -> node name.
ids_toname_path = '../data/network/hetero/ids_to_name/'

### Convert every node name to an integer ID to facilitate the construction of the heterogeneous network
### Node types in the list:
*['Gene','Pos','Pathway','M','CM','GO','HPO','F','Cell']*

In [4]:
# Build one name -> integer-ID lookup per node type.
# Slot 0 is the gene lookup loaded from disk; slots 1..8 are derived from the
# keys of the c1..c8 pickles, numbered 0, 1, 2, ... in the pickle's key order.
# NOTE: pickle.load executes arbitrary code; only use it on trusted files.
all_name_dict = []
with open(gene_path + 'gene_toids.pkl', 'rb') as gene_file:
    gene_ids = pickle.load(gene_file)
all_name_dict.append(gene_ids)

for collection_no in range(1, 9):
    with open(Msig_path + "c{}_gene_cpdb.pkl".format(collection_no), 'rb') as handle:
        collection_genes = pickle.load(handle)
    # Consecutive IDs starting at 0, following the key order of the pickle.
    name_to_id = {name: idx for idx, name in enumerate(collection_genes.keys())}
    print('the {}.th Msig Nodes\'num:  {}'.format(collection_no, len(name_to_id)))
    all_name_dict.append(name_to_id)

the 1.th Msig Nodes'num:  293
the 2.th Msig Nodes'num:  3089
the 3.th Msig Nodes'num:  3708
the 4.th Msig Nodes'num:  858
the 5.th Msig Nodes'num:  10525
the 6.th Msig Nodes'num:  5404
the 7.th Msig Nodes'num:  189
the 8.th Msig Nodes'num:  829


In [6]:
# Write each name -> ID lookup to <name_toids_path>/<node type>.pkl.
# The order of `nodetype` must match the slot order of `all_name_dict`.
nodetype = ['Gene','Pos','Pathway','M','CM','GO','HPO','F','Cell']
for type_idx, type_name in enumerate(nodetype):
    name_to_id = all_name_dict[type_idx]
    with open(name_toids_path + type_name + '.pkl', 'wb') as handle:
        pickle.dump(name_to_id, handle)

### Generate the heterograph's ID-to-name mapping for each node type, which is convenient for subsequent analysis


In [19]:
# Build the inverse lookups (integer ID -> node name) for the eight non-gene
# node types, using the same key order as the name -> ID lookups, then save them.
all_id_name = []
for collection_no in range(1, 9):
    with open(Msig_path + "c{}_gene_cpdb.pkl".format(collection_no), 'rb') as handle:
        collection_genes = pickle.load(handle)
    # ID k is the k-th key of the pickle.
    id_to_name = dict(enumerate(collection_genes.keys()))
    print('the {}.th Msig Nodes\'num:  {}'.format(collection_no, len(id_to_name)))
    all_id_name.append(id_to_name)

# Note: this rebinds `nodetype` to the eight non-gene types (no 'Gene' entry).
nodetype = ['Pos','Pathway','M','CM','GO','HPO','F','Cell']
for type_name, id_to_name in zip(nodetype, all_id_name):
    with open(ids_toname_path + type_name + '.pkl', 'wb') as handle:
        pickle.dump(id_to_name, handle)

the 1.th Msig Nodes'num:  293
the 2.th Msig Nodes'num:  3089
the 3.th Msig Nodes'num:  3708
the 4.th Msig Nodes'num:  858
the 5.th Msig Nodes'num:  10525
the 6.th Msig Nodes'num:  5404
the 7.th Msig Nodes'num:  189
the 8.th Msig Nodes'num:  829


### Generate all the edges between gene nodes and each of the other node types

In [7]:
# For each of the eight non-gene node types, translate the name-based
# "term -> member genes" table into (term_id, gene_id) pairs.
# edges[k] holds the pairs for node type k + 1 of all_name_dict.
edges = []
for type_idx in range(1, 9):
    with open(Msig_path + "c{}_gene_cpdb.pkl".format(type_idx), 'rb') as handle:
        collection_genes = pickle.load(handle)

    term_lookup = all_name_dict[type_idx]
    gene_lookup = all_name_dict[0]
    type_pairs = []
    for term, member_genes in collection_genes.items():
        term_id = term_lookup[term]
        type_pairs.extend((term_id, gene_lookup[gene]) for gene in member_genes)

    print('the {}.th Msig nodes have {} edges with gene nodes'.format(type_idx, len(type_pairs)))
    edges.append(type_pairs)

the 1.th Msig nodes have 12910 edges with gene nodes
the 2.th Msig nodes have 134089 edges with gene nodes
the 3.th Msig nodes have 596322 edges with gene nodes
the 4.th Msig nodes have 78744 edges with gene nodes
the 5.th Msig nodes have 701315 edges with gene nodes
the 6.th Msig nodes have 385629 edges with gene nodes
the 7.th Msig nodes have 24034 edges with gene nodes
the 8.th Msig nodes have 112393 edges with gene nodes


### Generate a heterogeneous graph with *9* node types and *16* edge types between genes and the other node types

In [8]:
# Human-readable labels: two directed edge labels per non-gene node type,
# followed by the labels of the eight non-gene node types.
edge_name_list = [
    'G_Pos', 'Pos_G',
    'G_pathway', 'Pathway_G',
    'G_M', 'M_G',
    'G_CM', 'CM_G',
    'G_GO', 'GO_G',
    'G_HPO', 'HPO_G',
    'G_F', 'F_G',
    'G_Cell', 'Cell_G',
]
node_name_list = [
    'Pos', 'Pathway', 'Microrna', 'CM',
    'GO', 'HPO', 'F', 'Cell',
]

In [9]:
def _split_pairs(pairs):
    """Split a sequence of (term_id, gene_id) pairs into two parallel lists."""
    return [pair[0] for pair in pairs], [pair[1] for pair in pairs]


# X_G holds the non-gene endpoint IDs, G_X the gene endpoint IDs of the same edges.
Pos_G, G_Pos = _split_pairs(edges[0])
Pathway_G, G_Pathway = _split_pairs(edges[1])
M_G, G_M = _split_pairs(edges[2])
CM_G, G_CM = _split_pairs(edges[3])
GO_G, G_GO = _split_pairs(edges[4])
HPO_G, G_HPO = _split_pairs(edges[5])
F_G, G_F = _split_pairs(edges[6])
Cell_G, G_Cell = _split_pairs(edges[7])

### The graph generated in our research also contains a homogeneous gene-gene part, which is obtained from the associations provided by the PPI network

In [10]:
# Load the gene-gene pairs and split them into source / destination ID lists.
gene_edge_file = '../data/step1/gene_relations_undirected.pkl'
with open(gene_edge_file, 'rb') as handle:
    gene_edges = pickle.load(handle)

g1 = [pair[0] for pair in gene_edges]
g2 = [pair[1] for pair in gene_edges]

In [11]:
# Relation triples (source type, edge type, destination type) -> (source IDs, destination IDs).
# Every gene <-> annotation relation is added in both directions; 'gg' is the gene-gene relation.
hetero_relations = {
    ('a_Pos', 'ag', 'Gene'): (Pos_G, G_Pos),
    ('Gene', 'ga', 'a_Pos'): (G_Pos, Pos_G),
    ('b_Pathway', 'bg', 'Gene'): (Pathway_G, G_Pathway),
    ('Gene', 'gb', 'b_Pathway'): (G_Pathway, Pathway_G),
    ('c_Microrna', 'cg', 'Gene'): (M_G, G_M),
    ('Gene', 'gc', 'c_Microrna'): (G_M, M_G),
    ('e_GO', 'eg', 'Gene'): (GO_G, G_GO),
    ('Gene', 'ge', 'e_GO'): (G_GO, GO_G),
    ('d_CM', 'dg', 'Gene'): (CM_G, G_CM),
    ('Gene', 'gd', 'd_CM'): (G_CM, CM_G),
    ('f_HPO', 'fg', 'Gene'): (HPO_G, G_HPO),
    ('Gene', 'gf', 'f_HPO'): (G_HPO, HPO_G),
    ('h_F', 'hg', 'Gene'): (F_G, G_F),
    ('Gene', 'gh', 'h_F'): (G_F, F_G),
    ('i_Cell', 'ig', 'Gene'): (Cell_G, G_Cell),
    ('Gene', 'gi', 'i_Cell'): (G_Cell, Cell_G),
    ('Gene', 'gg', 'Gene'): (g1, g2),
}

# Pin the node count of every type to the size of its name -> ID lookup.
# Without num_nodes_dict DGL sizes each type as (largest ID seen in an edge) + 1,
# so trailing nodes that have no edge would be dropped and the graph would no
# longer line up with the saved ID <-> name tables or with per-node features.
# Graph node types are listed in the slot order of all_name_dict.
# NOTE(review): assumes the gene lookup uses IDs 0..len-1 like the other lookups - confirm.
hetero_ntypes = ['Gene', 'a_Pos', 'b_Pathway', 'c_Microrna', 'd_CM',
                 'e_GO', 'f_HPO', 'h_F', 'i_Cell']
hetero_num_nodes = {
    ntype: len(all_name_dict[slot]) for slot, ntype in enumerate(hetero_ntypes)
}

g = dgl.heterograph(hetero_relations, num_nodes_dict=hetero_num_nodes)

### My Heterogeneous Graph 

# Ablation variant: give every Gene node a random, L2-normalised feature vector
# and save the resulting graph.
RANDOM_FEAT_DIM = 48   # width of the random feature vectors
RANDOM_FEAT_SEED = 0   # fixed so the saved ablation graph can be regenerated exactly

# A dedicated generator keeps the draw reproducible without touching torch's global RNG state.
feat_rng = torch.Generator().manual_seed(RANDOM_FEAT_SEED)
feat = torch.randn(len(gene_ids), RANDOM_FEAT_DIM, generator=feat_rng)

# Scale each row to unit Euclidean length.
feat_norm = torch.norm(feat, p=2, dim=1, keepdim=True)
feat_normalized = feat / feat_norm

g.nodes['Gene'].data['feature'] = feat_normalized
save_path = '../ablation_experiment/randomfeat_9nodes_graph.bin'
dgl.save_graphs(save_path, [g])

In [18]:
# Attach the features stored in gene_feature_bio.pkl to the Gene nodes of `g`
# (replacing any 'feature' already set on them) and save the graph.
# NOTE(review): rows are stacked in the pickle's own iteration order; presumably
# that order matches the gene ID order - confirm.
bio_feature_file = '../data/step1/gene_feature_bio.pkl'
with open(bio_feature_file, 'rb') as handle:
    gene_feat = pickle.load(handle)

gene_feat_tensor = torch.stack(list(gene_feat.values()))
g.nodes['Gene'].data['feature'] = gene_feat_tensor

save_path = '../data/network/hetero/new_9nodes_graph.bin'
dgl.save_graphs(save_path, [g])

In [13]:
# Show the node type names registered in the graph.
graph_node_types = g.ntypes
print(graph_node_types)

['Gene', 'a_Pos', 'b_Pathway', 'c_Microrna', 'd_CM', 'e_GO', 'f_HPO', 'h_F', 'i_Cell']


### The following steps generate the graph for MAGNN

In [20]:
# Collect one feature matrix per node type: slot 0 is the gene matrix, slots
# 1..8 are stacked from the c1..c8 feature pickles.
feature_list = [gene_feat_tensor]

for collection_no in range(1, 9):
    with open(Msig_path + "c{}_feature.pkl".format(collection_no), 'rb') as handle:
        name_to_feature = pickle.load(handle)

    # ID -> name table of this node type; it was filled in ID order 0, 1, 2, ...,
    # so iterating its values yields the names in node-ID order.
    id_to_name = all_id_name[collection_no - 1]
    ordered_rows = [name_to_feature[name] for name in id_to_name.values()]

    feature_list.append(torch.stack(ordered_rows))

In [28]:
# Relation triples (source type, edge type, destination type) -> (source IDs, destination IDs)
# for the MAGNN graph. Every gene <-> annotation relation is added in both
# directions; 'gg' is the gene-gene relation.
magnn_relations = {
    ('a_Pos', 'ag', 'Gene'): (Pos_G, G_Pos),
    ('Gene', 'ga', 'a_Pos'): (G_Pos, Pos_G),
    ('b_Pathway', 'bg', 'Gene'): (Pathway_G, G_Pathway),
    ('Gene', 'gb', 'b_Pathway'): (G_Pathway, Pathway_G),
    ('c_Microrna', 'cg', 'Gene'): (M_G, G_M),
    ('Gene', 'gc', 'c_Microrna'): (G_M, M_G),
    ('e_GO', 'eg', 'Gene'): (GO_G, G_GO),
    ('Gene', 'ge', 'e_GO'): (G_GO, GO_G),
    ('d_CM', 'dg', 'Gene'): (CM_G, G_CM),
    ('Gene', 'gd', 'd_CM'): (G_CM, CM_G),
    ('f_HPO', 'fg', 'Gene'): (HPO_G, G_HPO),
    ('Gene', 'gf', 'f_HPO'): (G_HPO, HPO_G),
    ('h_F', 'hg', 'Gene'): (F_G, G_F),
    ('Gene', 'gh', 'h_F'): (G_F, F_G),
    ('i_Cell', 'ig', 'Gene'): (Cell_G, G_Cell),
    ('Gene', 'gi', 'i_Cell'): (G_Cell, Cell_G),
    ('Gene', 'gg', 'Gene'): (g1, g2),
}

# Pin the node count of every type to the size of its name -> ID lookup.
# Without num_nodes_dict DGL sizes each type as (largest ID seen in an edge) + 1,
# so trailing nodes without any edge would be dropped and the per-type feature
# matrices (one row per lookup entry) could no longer be attached.
# Graph node types are listed in the slot order of all_name_dict.
# NOTE(review): assumes the gene lookup uses IDs 0..len-1 like the other lookups - confirm.
magnn_ntypes = ['Gene', 'a_Pos', 'b_Pathway', 'c_Microrna', 'd_CM',
                'e_GO', 'f_HPO', 'h_F', 'i_Cell']
magnn_num_nodes = {
    ntype: len(all_name_dict[slot]) for slot, ntype in enumerate(magnn_ntypes)
}

magnn_graph = dgl.heterograph(magnn_relations, num_nodes_dict=magnn_num_nodes)

In [29]:

# Attach one feature matrix to every node type of the MAGNN graph, then save it.
# The node types are listed in the slot order of feature_list (slot 0 = Gene).
magnn_feature_ntypes = ['Gene', 'a_Pos', 'b_Pathway', 'c_Microrna', 'd_CM',
                        'e_GO', 'f_HPO', 'h_F', 'i_Cell']
for slot, ntype in enumerate(magnn_feature_ntypes):
    magnn_graph.nodes[ntype].data['feature'] = feature_list[slot]

# NOTE(review): this path ('../../data/Network/Hetero/') differs in depth and
# letter case from the '../data/network/hetero/' directory used by the other
# cells - confirm it is the intended output location.
save_path = '../../data/Network/Hetero/magnn_9nodes_graph.bin'
dgl.save_graphs(save_path, [magnn_graph])