In [1]:
import pandas as pd
import numpy as np

from tqdm import tqdm

# Reformat data for PyTorch and Save

## Edge List

Here, we create a new zero-based integer ID system: each Ensembl ID in our gene set is assigned a unique index, starting at 0. We then re-express the edge list, which is originally given in terms of Ensembl IDs, in terms of these indices. This is necessary for GNN modeling.

In [2]:
# Read the PPI network: a tab-separated, header-less file with one edge per row.

# NOTE: PPI_filtered.edg is a temporary dataset with a few edges removed while
# we figure out how to map them.
edges = (
    pd.read_csv('data/PPI_filtered.edg', sep='\t', header=None)
    .rename(columns={0: 'gene1', 1: 'gene2'})  # name the two endpoint columns
)

edges

Unnamed: 0,gene1,gene2
0,ENSG00000004059,ENSG00000154678
1,ENSG00000004059,ENSG00000180370
2,ENSG00000004059,ENSG00000100228
3,ENSG00000004059,ENSG00000107263
4,ENSG00000004059,ENSG00000184900
...,...,...
5638412,ENSG00000230549,ENSG00000280267
5638413,ENSG00000248167,ENSG00000177853
5638414,ENSG00000179412,ENSG00000279782
5638415,ENSG00000239900,ENSG00000100129


In [3]:
# Read the HPA feature/label table. The first CSV column is consumed as a
# throwaway index, which is then replaced by the `Ensembl` column so that rows
# can be looked up by Ensembl gene ID.
HPAv2 = pd.read_csv('data/HPA_Complete_v2.0.csv', index_col=0).set_index('Ensembl')
HPAv2

Unnamed: 0_level_0,Gene,Gene synonym,Uniprot,Tissue RNA - lung [NX],Single Cell Type RNA - Mucus-secreting cells [NX],0,1,2,3,4,...,96,97,98,99,OMIM_pos,PROG_F_pos,PROG_UF_pos,CANCER_FPKM_pos,NIH_pos,Total_pos
Ensembl,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000175899,A2M,"CPAMD5, FWP007, S863-7",P01023,11.014009,-0.105079,0.443661,0.314312,0.431142,-0.285434,-0.347850,...,0.323598,0.082054,0.288884,0.688563,0,0,0,0,0.0,0.0
ENSG00000128274,A4GALT,"A14GALT, Gb3S, P(k), P1",Q9NPC4,0.000432,-0.105079,0.532335,0.788968,-0.122645,0.019492,1.427088,...,0.150823,-0.140471,-0.089417,0.118545,0,0,0,0,0.0,0.0
ENSG00000094914,AAAS,,Q9NRG9,-0.030534,-0.086981,1.605534,0.936615,0.344810,0.184176,-1.003302,...,-0.249685,0.038119,0.083337,-0.139720,0,0,0,0,0.0,0.0
ENSG00000081760,AACS,"ACSF1, FLJ12389, SUR-5",Q86V21,-0.469219,-0.073447,0.467230,0.690114,-0.067368,0.335650,1.017134,...,-0.056235,0.069261,0.022944,-0.149674,0,0,0,0,0.0,0.0
ENSG00000114771,AADAC,"CES5A1, DAC",P22760,-0.634371,-0.105079,0.956745,0.201582,-0.489768,-0.524596,0.465925,...,-0.007228,-0.006816,0.012393,0.007455,0,0,0,0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000198455,ZXDB,ZNF905,P98169,-0.603405,-0.095479,1.436466,-1.455650,0.510789,0.224684,0.322919,...,-0.012030,0.025383,0.002025,0.015379,0,0,0,0,0.0,0.0
ENSG00000070476,ZXDC,"FLJ13861, MGC11349",Q2QGD7,-0.175042,-0.095794,1.139942,-1.162737,0.551856,0.127642,0.117517,...,-0.001672,0.041680,0.004443,0.009576,0,0,0,0,0.0,0.0
ENSG00000162378,ZYG11B,"FLJ13456, ZYG11",Q9C0D3,-0.324711,-0.087925,0.782716,-0.058278,-0.397427,0.017376,-0.136751,...,-0.035654,-0.023648,0.002322,0.026402,0,0,0,0,0.0,0.0
ENSG00000159840,ZYX,,Q15942,2.002901,-0.073761,1.042355,0.183004,-0.327829,0.065089,-0.459396,...,-0.017115,0.013158,0.019728,0.064606,0,0,0,0,0.0,0.0


In [4]:
# Distinct gene IDs that occur at either endpoint of any edge (np.unique also sorts them).
unique_genes_edges = np.unique(edges['gene1'].tolist() + edges['gene2'].tolist())
len(unique_genes_edges)

18617

In [5]:
# Distinct Ensembl IDs present in the index of the feature/label table.
unique_genes_HPA = np.unique(HPAv2.index.to_numpy())
len(unique_genes_HPA)

15021

In [6]:
# Keep only the genes that appear in BOTH datasets: they need feature/label
# rows in HPAv2 and at least one edge in the network.
# TODO: look into which genes are lost by using the intersection
unique_genes = np.intersect1d(unique_genes_HPA, unique_genes_edges)

len(unique_genes)

14552

In [7]:
# Zero-based node numbering (for the link list for pytorch, etc...), as two lookup tables:
#   gene_ids_dict:     new integer ID -> Ensembl ID
#   gene_ids_dict_inv: Ensembl ID     -> new integer ID
# IDs follow the order of `unique_genes`.
gene_ids_dict = dict(enumerate(unique_genes))
gene_ids_dict_inv = {ensembl_id: idx for idx, ensembl_id in gene_ids_dict.items()}

In [8]:
# create the link list in this new unique id system

# first, keep only the edges whose two endpoints both lie in the intersection
# of the two datasets. The mask is called `edge_mask` rather than `filter` so
# that the builtin `filter` is not shadowed for the rest of the kernel session.
edge_mask = edges['gene1'].isin(unique_genes) & edges['gene2'].isin(unique_genes)
edge_list_filtered = edges[edge_mask]

# now map to new index system, one column at a time. Series.map with a dict
# replaces DataFrame.applymap, which is deprecated since pandas 2.1.
edge_list_reindexed = edge_list_filtered.apply(lambda col: col.map(gene_ids_dict_inv))

# Series.map yields NaN (instead of raising KeyError) for values missing from
# the dict. The mask above should make that impossible; fail loudly if not.
assert edge_list_reindexed.notna().all().all(), 'edge endpoint missing from gene_ids_dict_inv'

In [9]:
# Write both artifacts to disk.
# The ID -> Ensembl dict is not an array, so np.save stores it as a pickled object.
np.save('data/protein_ids_dict.npy', gene_ids_dict, allow_pickle=True)
# The re-indexed link list is stored as a plain array with one row per edge.
np.save('data/edge_list.npy', edge_list_reindexed.to_numpy(), allow_pickle=True)

## Features and Labels

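Similarly, we map the features and labels to this new ID system: rows are selected in the order of `unique_genes`, so that row `i` of the feature matrix and of the label vector is meant to correspond to the gene with new ID `i`.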

In [None]:
# `Ensembl` is already made the index when HPAv2 is loaded, so an unconditional
# set_index('Ensembl') here raises KeyError on a fresh Restart & Run All.
# Only re-index if the table is not yet keyed by Ensembl ID, which keeps this
# cell safe to run any number of times.
if HPAv2.index.name != 'Ensembl':
    HPAv2 = HPAv2.set_index('Ensembl')
HPAv2

In [12]:
# Build the feature matrix X.
# Feature columns are chosen by position: columns 4 up to (not including) 107.
# NOTE(review): the printed selection ends with 'OMIM_pos' and 'PROG_F_pos',
# which look like label columns, and it skips 'Tissue RNA - lung [NX]'.
# Confirm that this positional range is the intended feature set.
all_features = HPAv2.columns[4:107]
print('features: ', all_features)

# Select rows in `unique_genes` order and the feature columns in a single .loc call.
features = HPAv2.loc[unique_genes, all_features]
X = features.to_numpy()

features:  Index(['Single Cell Type RNA - Mucus-secreting cells [NX]', '0', '1', '2', '3',
       '4', '5', '6', '7', '8',
       ...
       '92', '93', '94', '95', '96', '97', '98', '99', 'OMIM_pos',
       'PROG_F_pos'],
      dtype='object', length=103)


In [13]:
# Labels: the NIH_pos column, with rows taken in `unique_genes` order
# (the same row selection used for the feature matrix). Y stays a pandas Series.
Y = HPAv2.loc[unique_genes, 'NIH_pos']

In [14]:
# Bundle the feature matrix and the labels into one dict and store it with
# np.save. A dict is not an array, so it is written as a pickled object.
data = dict(X=X, Y=Y)
np.save('data/gene_feat_label_data.npy', data, allow_pickle=True)